From mboxrd@z Thu Jan  1 00:00:00 1970
Received: from smtp-1.sys.kth.se (smtp-1.sys.kth.se [130.237.32.175])
	by krisdoz.my.domain (8.14.3/8.14.3) with ESMTP id p38Co95u031612
	for <tech@mdocml.bsd.lv>; Fri, 8 Apr 2011 08:50:10 -0400 (EDT)
Received: from mailscan-1.sys.kth.se (mailscan-1.sys.kth.se [130.237.32.91])
	by smtp-1.sys.kth.se (Postfix) with ESMTP id 3815B15588C
	for <tech@mdocml.bsd.lv>; Fri,  8 Apr 2011 14:50:04 +0200 (CEST)
X-Virus-Scanned: by amavisd-new at kth.se
Received: from smtp-1.sys.kth.se ([130.237.32.175])
	by mailscan-1.sys.kth.se (mailscan-1.sys.kth.se [130.237.32.91]) (amavisd-new, port 10024)
	with LMTP id qx6JDxcbjNdc for <tech@mdocml.bsd.lv>;
	Fri,  8 Apr 2011 14:50:01 +0200 (CEST)
X-KTH-Auth: kristaps [193.10.49.5]
X-KTH-mail-from: kristaps@bsd.lv
X-KTH-rcpt-to: tech@mdocml.bsd.lv
Received: from [172.16.18.84] (unknown [193.10.49.5])
	by smtp-1.sys.kth.se (Postfix) with ESMTP id E651F156B4B
	for <tech@mdocml.bsd.lv>; Fri,  8 Apr 2011 14:50:00 +0200 (CEST)
Message-ID: <4D9F0478.40006@bsd.lv>
Date: Fri, 08 Apr 2011 14:50:00 +0200
From: Kristaps Dzonsons <kristaps@bsd.lv>
User-Agent: Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.16) Gecko/20110303 Icedove/3.0.11
X-Mailinglist: mdocml-tech
Reply-To: tech@mdocml.bsd.lv
MIME-Version: 1.0
To: tech@mdocml.bsd.lv
Subject: Re: Unifying the escape-sequence parser.
References: <4D9DC396.9010504@bsd.lv> <4D9EEF2F.2030307@bsd.lv> <4D9EFC85.3040301@bsd.lv>
In-Reply-To: <4D9EFC85.3040301@bsd.lv>
Content-Type: multipart/mixed;
 boundary="------------080508010603020000010006"

This is a multi-part message in MIME format.
--------------080508010603020000010006
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit

On 04/08/2011 02:16 PM, Kristaps Dzonsons wrote:
>> Step 2.
>>
>> This finishes off the new escape-sequence parser and puts it into
>> mandoc_escape (mandoc.c, mandoc.h), then makes it the underlying engine
>> for a2roffdeco (out.c) (requiring a tiny change to term.c and html.c for
>> bailing out on bad sequences) and roff_res (roff.c).
>>
>> Now all escape-sequences are being parsed with the same engine! This
>> logic was being repeated in THREE different places, earlier (mandoc.c
>> for validation, out.c for output, and roff.c for predefined escapes).
>>
>> I've run this over all manuals I know of without problems, but it can
>> really use a close look-over with border cases.
>>
>> The next step is to clean out the out.c code, completely removing enum
>> roffdeco (putting that logic into mandoc.c, perhaps).
>
> Step 2b. I rolled back the roff part: the search/replace of predefined
> strings must happen prior to escape processing. I'd also forgotten to
> include mdoc_validate.c in the patch.

Step 3: fixed the spot where I forgot to convert for->while in 
mdoc_validate.c's check_text() loop; removed DECO_SSPECIAL; cleaned up 
DECO_NOSPACE; cleaned up print_encode() in html.c.  The next step will be 
removing the DECO stuff entirely.

--------------080508010603020000010006
Content-Type: text/plain;
 name="patch.escapes.txt"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="patch.escapes.txt"

Index: html.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/html.c,v
retrieving revision 1.131
diff -u -r1.131 html.c
--- html.c	22 Mar 2011 14:05:45 -0000	1.131
+++ html.c	8 Apr 2011 12:47:36 -0000
@@ -230,7 +230,7 @@
 	if ((cp = chars_spec2cp(h->symtab, p, len)) > 0) {
 		printf("&#%d;", cp);
 		return;
-	} else if (-1 == cp && DECO_SSPECIAL == d) {
+	} else if (-1 == cp && 1 == len) {
 		fwrite(p, 1, len, stdout);
 		return;
 	} else if (-1 == cp)
@@ -304,40 +304,41 @@
 	int		 len, nospace;
 	const char	*seq;
 	enum roffdeco	 deco;
-	static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' };
+	static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH };
 
 	nospace = 0;
 
-	for (; *p; p++) {
+	while ('\0' != *p) {
 		sz = strcspn(p, rejs);
 
 		fwrite(p, 1, sz, stdout);
-		p += /* LINTED */
-			sz;
+		p += (int)sz;
 
-		if ('<' == *p) {
+		if ('\0' == *p)
+			break;
+
+		switch (*p++) {
+		case ('<'):
 			printf("&lt;");
 			continue;
-		} else if ('>' == *p) {
+		case ('>'):
 			printf("&gt;");
 			continue;
-		} else if ('&' == *p) {
+		case ('&'):
 			printf("&amp;");
 			continue;
-		} else if (ASCII_HYPH == *p) {
-			/*
-			 * Note: "soft hyphens" aren't graphically
-			 * displayed when not breaking the text; we want
-			 * them to be displayed.
-			 */
-			/*printf("&#173;");*/
+		case (ASCII_HYPH):
 			putchar('-');
 			continue;
-		} else if ('\0' == *p)
+		default:
+			break;
+		}
+
+		seq = p;
+		if (0 == (len = a2roffdeco(&deco, &seq, &sz)))
 			break;
 
-		seq = ++p;
-		len = a2roffdeco(&deco, &seq, &sz);
+		p += len;
 
 		switch (deco) {
 		case (DECO_NUMBERED):
@@ -346,8 +347,6 @@
 		case (DECO_RESERVED):
 			print_res(h, seq, sz);
 			break;
-		case (DECO_SSPECIAL):
-			/* FALLTHROUGH */
 		case (DECO_SPECIAL):
 			print_spec(h, deco, seq, sz);
 			break;
@@ -362,14 +361,13 @@
 				break;
 			print_metaf(h, deco);
 			break;
+		case (DECO_NOSPACE):
+			if ('\0' == *p)
+				nospace = 1;
+			break;
 		default:
 			break;
 		}
-
-		p += len - 1;
-
-		if (DECO_NOSPACE == deco && '\0' == *(p + 1))
-			nospace = 1;
 	}
 
 	return(nospace);
Index: libmandoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/libmandoc.h,v
retrieving revision 1.17
diff -u -r1.17 libmandoc.h
--- libmandoc.h	28 Mar 2011 23:52:13 -0000	1.17
+++ libmandoc.h	8 Apr 2011 12:47:37 -0000
@@ -73,7 +73,6 @@
 			int, int, const char *);
 void		 mandoc_vmsg(enum mandocerr, struct mparse *, 
 			int, int, const char *, ...);
-int		 mandoc_special(char *);
 char		*mandoc_strdup(const char *);
 char		*mandoc_getarg(struct mparse *, char **, int, int *);
 char		*mandoc_normdate(struct mparse *, char *, int, int);
Index: man_validate.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/man_validate.c,v
retrieving revision 1.67
diff -u -r1.67 man_validate.c
--- man_validate.c	22 Mar 2011 15:30:30 -0000	1.67
+++ man_validate.c	8 Apr 2011 12:47:37 -0000
@@ -54,7 +54,7 @@
 static	int	  check_part(CHKARGS);
 static	int	  check_root(CHKARGS);
 static	int	  check_sec(CHKARGS);
-static	int	  check_text(CHKARGS);
+static	void	  check_text(CHKARGS);
 
 static	int	  post_AT(CHKARGS);
 static	int	  post_fi(CHKARGS);
@@ -151,7 +151,8 @@
 
 	switch (m->last->type) {
 	case (MAN_TEXT): 
-		return(check_text(m, m->last));
+		check_text(m, m->last);
+		return(1);
 	case (MAN_ROOT):
 		return(check_root(m, m->last));
 	case (MAN_EQN):
@@ -204,43 +205,48 @@
 	return(1);
 }
 
-
-static int
+static void
 check_text(CHKARGS) 
 {
-	char		*p;
-	int		 pos, c;
+	char		*p, *pp, *cpp;
+	int		 pos;
 	size_t		 sz;
 
-	for (p = n->string, pos = n->pos + 1; *p; p++, pos++) {
-		sz = strcspn(p, "\t\\");
-		p += (int)sz;
+	p = n->string;
+	pos = n->pos + 1;
 
-		if ('\0' == *p)
-			break;
+	while ('\0' != *p) {
+		sz = strcspn(p, "\t\\");
 
+		p += (int)sz;
 		pos += (int)sz;
 
 		if ('\t' == *p) {
-			if (MAN_LITERAL & m->flags)
-				continue;
-			man_pmsg(m, n->line, pos, MANDOCERR_BADTAB);
+			if ( ! (MAN_LITERAL & m->flags))
+				man_pmsg(m, n->line, pos, MANDOCERR_BADTAB);
+			p++;
+			pos++;
 			continue;
-		}
+		} else if ('\0' == *p)
+			break;
 
-		/* Check the special character. */
+		pos++;
+		pp = ++p;
 
-		c = mandoc_special(p);
-		if (c) {
-			p += c - 1;
-			pos += c - 1;
-		} else
+		if (ESCAPE_ERROR == mandoc_escape
+				((const char **)&pp, NULL, NULL)) {
 			man_pmsg(m, n->line, pos, MANDOCERR_BADESCAPE);
-	}
+			break;
+		}
 
-	return(1);
-}
+		cpp = p;
+		while (NULL != (cpp = memchr(cpp, ASCII_HYPH, pp - cpp)))
+			*cpp = '-';
 
+		pos += pp - p;
+		p = pp;
+	}
+}
 
 #define	INEQ_DEFINE(x, ineq, name) \
 static int \
Index: mandoc.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.c,v
retrieving revision 1.44
diff -u -r1.44 mandoc.c
--- mandoc.c	28 Mar 2011 23:52:13 -0000	1.44
+++ mandoc.c	8 Apr 2011 12:47:37 -0000
@@ -35,198 +35,315 @@
 
 static	int	 a2time(time_t *, const char *, const char *);
 static	char	*time2a(time_t);
+static	int	 numescape(const char *);
 
-int
-mandoc_special(char *p)
+/*
+ * Pass over recursive numerical expressions.  The context of this
+ * function is important: it's only called within character-terminating
+ * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
+ * recursion: we don't care about what's in these blocks. 
+ * This returns the number of characters skipped or -1 if an error
+ * occurs (the caller should bail).
+ */
+static int
+numescape(const char *start)
 {
-	int		 len, i;
-	char		 term;
-	char		*sv;
-	
-	len = 0;
+	int		 i;
+	size_t		 sz;
+	const char	*cp;
+
+	i = 0;
+
+	/* The expression consists of a subexpression. */
+
+	if ('\\' == start[i]) {
+		cp = &start[++i];
+		/*
+		 * Read past the end of the subexpression.
+		 * Bail immediately on errors.
+		 */
+		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+			return(-1);
+		return(i + cp - &start[i]);
+	} 
+
+	if ('(' != start[i++])
+		return(0);
+
+	/*
+	 * A parenthesised subexpression.  Read until the closing
+	 * parenthesis, making sure to handle any nested subexpressions
+	 * that might ruin our parse.
+	 */
+
+	while (')' != start[i]) {
+		sz = strcspn(&start[i], ")\\");
+		i += (int)sz;
+
+		if ('\0' == start[i])
+			return(-1);
+		else if ('\\' != start[i])
+			continue;
+
+		cp = &start[++i];
+		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+			return(-1);
+		i += cp - &start[i];
+	}
+
+	/* Read past the terminating ')'. */
+	return(++i);
+}
+
+/*
+ * Handle an escaped sequence.  This should be called with any
+ * string subsequent to a `\'.  Pass a pointer to this substring as "end";
+ * it will be set to the supremum of the parsed escape sequence.  If
+ * this returns ESCAPE_ERROR, the string is bogus and should be thrown
+ * away.  If not ESCAPE_ERROR or ESCAPE_IGNORE, "start" is set to the
+ * first relevant character of the substring (font, glyph, whatever) of
+ * length sz.  Both "start" and "sz" may be NULL.
+ */
+enum mandoc_esc
+mandoc_escape(const char **end, const char **start, int *sz)
+{
+	char		 c, term, numeric;
+	int		 i, lim, ssz;
+	const char	*cp;
+	enum mandoc_esc	 gly; 
+
+	cp = *end;
+	if (start)
+		*start = cp;
+	i = 0;
+	gly = ESCAPE_ERROR;
 	term = '\0';
-	sv = p;
+	numeric = 0;
 
-	assert('\\' == *p);
-	p++;
+	switch ((c = cp[i++])) {
+	/*
+	 * First the glyphs.  There are several different forms of
+	 * these, but each eventually returns a substring of the glyph
+	 * name.
+	 */
+	case ('('):
+		gly = ESCAPE_SPECIAL;
+		lim = 2;
+		break;
+	case ('['):
+		gly = ESCAPE_SPECIAL;
+		term = ']';
+		break;
+	case ('C'):
+		if ('\'' != cp[i])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_SPECIAL;
+		term = '\'';
+		break;
 
-	switch (*p++) {
-#if 0
-	case ('Z'):
-		/* FALLTHROUGH */
-	case ('X'):
-		/* FALLTHROUGH */
-	case ('x'):
-		/* FALLTHROUGH */
-	case ('S'):
-		/* FALLTHROUGH */
-	case ('R'):
-		/* FALLTHROUGH */
-	case ('N'):
-		/* FALLTHROUGH */
-	case ('l'):
-		/* FALLTHROUGH */
-	case ('L'):
-		/* FALLTHROUGH */
-	case ('H'):
+	/*
+	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
+	 * 'X' is the trigger.  These have opaque sub-strings.
+	 */
+	case ('g'):
 		/* FALLTHROUGH */
-	case ('h'):
+	case ('k'):
 		/* FALLTHROUGH */
-	case ('D'):
+	case ('M'):
 		/* FALLTHROUGH */
-	case ('C'):
+	case ('m'):
 		/* FALLTHROUGH */
-	case ('b'):
+	case ('n'):
 		/* FALLTHROUGH */
-	case ('B'):
+	case ('V'):
 		/* FALLTHROUGH */
-	case ('a'):
+	case ('Y'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_IGNORE;
 		/* FALLTHROUGH */
-	case ('A'):
-		if (*p++ != '\'')
-			return(0);
-		term = '\'';
-		break;
-#endif
-	case ('h'):
+	case ('*'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_PREDEF;
 		/* FALLTHROUGH */
-	case ('v'):
+	case ('F'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_FONTFAM;
 		/* FALLTHROUGH */
-	case ('s'):
-		if (ASCII_HYPH == *p)
-			*p = '-';
+	case ('f'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_FONT;
 
-		i = 0;
-		if ('+' == *p || '-' == *p) {
-			p++;
-			i = 1;
-		}
+		if (start) 
+			*start = &cp[i];
 
-		switch (*p++) {
+		switch (cp[i++]) {
 		case ('('):
-			len = 2;
+			lim = 2;
 			break;
 		case ('['):
 			term = ']';
 			break;
-		case ('\''):
-			term = '\'';
-			break;
-		case ('0'):
-			i = 1;
-			/* FALLTHROUGH */
 		default:
-			len = 1;
-			p--;
+			lim = 1;
+			i--;
 			break;
 		}
+		break;
 
-		if (ASCII_HYPH == *p)
-			*p = '-';
-		if ('+' == *p || '-' == *p) {
-			if (i)
-				return(0);
-			p++;
-		} 
-		
-		/* Handle embedded numerical subexp or escape. */
-
-		if ('(' == *p) {
-			while (*p && ')' != *p)
-				if ('\\' == *p++) {
-					i = mandoc_special(--p);
-					if (0 == i)
-						return(0);
-					p += i;
-				}
-
-			if (')' == *p++)
-				break;
-
-			return(0);
-		} else if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
-		}
-
+	/*
+	 * These escapes are of the form \X'Y', where 'X' is the trigger
+	 * and 'Y' is any string.  These have opaque sub-strings.
+	 */
+	case ('A'):
+		/* FALLTHROUGH */
+	case ('b'):
+		/* FALLTHROUGH */
+	case ('D'):
+		/* FALLTHROUGH */
+	case ('o'):
+		/* FALLTHROUGH */
+	case ('R'):
+		/* FALLTHROUGH */
+	case ('X'):
+		/* FALLTHROUGH */
+	case ('Z'):
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_IGNORE;
+		term = '\'';
 		break;
-#if 0
-	case ('Y'):
+
+	/*
+	 * These escapes are of the form \X'N', where 'X' is the trigger
+	 * and 'N' resolves to a numerical expression.
+	 */
+	case ('B'):
 		/* FALLTHROUGH */
-	case ('V'):
+	case ('h'):
 		/* FALLTHROUGH */
-	case ('$'):
+	case ('H'):
 		/* FALLTHROUGH */
-	case ('n'):
+	case ('L'):
 		/* FALLTHROUGH */
-#endif
-	case ('k'):
+	case ('l'):
 		/* FALLTHROUGH */
-	case ('M'):
+	case ('N'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_NUMBERED;
 		/* FALLTHROUGH */
-	case ('m'):
+	case ('S'):
 		/* FALLTHROUGH */
-	case ('f'):
+	case ('v'):
 		/* FALLTHROUGH */
-	case ('F'):
+	case ('w'):
 		/* FALLTHROUGH */
-	case ('*'):
-		switch (*p++) {
+	case ('x'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_IGNORE;
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		term = numeric = '\'';
+		break;
+
+	/* 
+	 * Sizes get a special category of their own.
+	 */
+	case ('s'):
+		gly = ESCAPE_IGNORE;
+
+		if (start) 
+			*start = &cp[i];
+
+		/* See +/- counts as a sign. */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
+
+		switch (cp[i++]) {
 		case ('('):
-			len = 2;
+			lim = 2;
 			break;
 		case ('['):
-			term = ']';
+			term = numeric = ']';
+			break;
+		case ('\''):
+			term = numeric = '\'';
 			break;
 		default:
-			len = 1;
-			p--;
+			lim = 1;
+			i--;
 			break;
 		}
+
+		/* See +/- counts as a sign. */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
+
 		break;
-	case ('('):
-		len = 2;
-		break;
-	case ('['):
-		term = ']';
-		break;
-	case ('z'):
-		len = 1;
-		if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
-			return(*p ? (int)(p - sv) : 0);
-		}
-		break;
-	case ('o'):
-		/* FALLTHROUGH */
-	case ('w'):
-		if ('\'' == *p++) {
-			term = '\'';
-			break;
-		}
-		/* FALLTHROUGH */
+
+	/*
+	 * Anything else is assumed to be a glyph.
+	 */
 	default:
-		len = 1;
-		p--;
+		gly = ESCAPE_SPECIAL;
+		lim = 1;
+		i--;
 		break;
 	}
 
-	if (term) {
-		for ( ; *p && term != *p; p++)
-			if (ASCII_HYPH == *p)
-				*p = '-';
-		return(*p ? (int)(p - sv) : 0);
+	assert(ESCAPE_ERROR != gly);
+
+	if (start)
+		*start = &cp[i];
+
+	/*
+	 * If a terminating block has been specified, we need to
+	 * handle the case of recursion, which could have their
+	 * own terminating blocks that mess up our parse.  This, by the
+	 * way, means that the "start" and "size" values will be
+	 * effectively meaningless.
+	 */
+
+	ssz = 0;
+	if (numeric && -1 == (ssz = numescape(&cp[i])))
+		return(ESCAPE_ERROR);
+
+	i += ssz;
+
+	/*
+	 * We have a character terminator.  Try to read up to that
+	 * character.  If we can't (i.e., we hit the nil), then return
+	 * an error; if we can, calculate our length, read past the
+	 * terminating character, and exit.
+	 */
+
+	if ('\0' != term) {
+		*end = strchr(&cp[i], term);
+		if ('\0' == *end)
+			return(ESCAPE_ERROR);
+		if (sz)
+			*sz = *end - &cp[i];
+		(*end)++;
+		return(gly);
 	}
 
-	for (i = 0; *p && i < len; i++, p++)
-		if (ASCII_HYPH == *p)
-			*p = '-';
-	return(i == len ? (int)(p - sv) : 0);
-}
+	assert(lim > 0);
 
+	/*
+	 * We have a numeric limit.  If the string is shorter than that,
+	 * stop and return an error.  Else adjust our endpoint, length,
+	 * and return the current glyph.
+	 */
+
+	if ((size_t)lim > strlen(&cp[i]))
+		return(ESCAPE_ERROR);
+
+	if (sz)
+		*sz = lim;
+	*end = &cp[i] + lim;
+	return(gly);
+}
 
 void *
 mandoc_calloc(size_t num, size_t size)
Index: mandoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.h,v
retrieving revision 1.69
diff -u -r1.69 mandoc.h
--- mandoc.h	28 Mar 2011 21:49:42 -0000	1.69
+++ mandoc.h	8 Apr 2011 12:47:37 -0000
@@ -288,6 +288,16 @@
 	MPARSE_MAN /* assume -man */
 };
 
+enum	mandoc_esc {
+	ESCAPE_ERROR = 0,
+	ESCAPE_IGNORE, /* escape to be ignored */
+	ESCAPE_SPECIAL, /* a regular special character */
+	ESCAPE_PREDEF, /* a predefined special character */
+	ESCAPE_FONT, /* a font mode */
+	ESCAPE_FONTFAM, /* a font family */
+	ESCAPE_NUMBERED /* a numbered glyph */
+};
+
 typedef	void	(*mandocmsg)(enum mandocerr, enum mandoclevel,
 			const char *, int, int, const char *);
 
@@ -309,6 +319,8 @@
 void		 *mandoc_calloc(size_t, size_t);
 void		 *mandoc_malloc(size_t);
 void		 *mandoc_realloc(void *, size_t);
+
+enum mandoc_esc	  mandoc_escape(const char **, const char **, int *);
 
 __END_DECLS
 
Index: mdoc_validate.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mdoc_validate.c,v
retrieving revision 1.166
diff -u -r1.166 mdoc_validate.c
--- mdoc_validate.c	3 Apr 2011 09:53:50 -0000	1.166
+++ mdoc_validate.c	8 Apr 2011 12:47:37 -0000
@@ -545,31 +545,39 @@
 static void
 check_text(struct mdoc *m, int ln, int pos, char *p)
 {
-	int		 c;
+	char		*cpp, *pp;
 	size_t		 sz;
 
-	for ( ; *p; p++, pos++) {
+	while ('\0' != *p) {
 		sz = strcspn(p, "\t\\");
-		p += (int)sz;
-
-		if ('\0' == *p)
-			break;
 
+		p += (int)sz;
 		pos += (int)sz;
 
 		if ('\t' == *p) {
 			if ( ! (MDOC_LITERAL & m->flags))
 				mdoc_pmsg(m, ln, pos, MANDOCERR_BADTAB);
+			p++;
+			pos++;
 			continue;
-		}
+		} else if ('\0' == *p)
+			break;
+
+		pos++;
+		pp = ++p;
 
-		if (0 == (c = mandoc_special(p))) {
+		if (ESCAPE_ERROR == mandoc_escape
+				((const char **)&pp, NULL, NULL)) {
 			mdoc_pmsg(m, ln, pos, MANDOCERR_BADESCAPE);
-			continue;
+			break;
 		}
 
-		p += c - 1;
-		pos += c - 1;
+		cpp = p;
+		while (NULL != (cpp = memchr(cpp, ASCII_HYPH, pp - cpp)))
+			*cpp = '-';
+
+		pos += pp - p;
+		p = pp;
 	}
 }
 
Index: out.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/out.c,v
retrieving revision 1.39
diff -u -r1.39 out.c
--- out.c	17 Mar 2011 08:49:34 -0000	1.39
+++ out.c	8 Apr 2011 12:47:37 -0000
@@ -178,237 +178,70 @@
 int
 a2roffdeco(enum roffdeco *d, const char **word, size_t *sz)
 {
-	int		 i, j, lim;
-	char		 term, c;
-	const char	*wp;
-	enum roffdeco	 dd;
+	const char	*cp, *start;
+	int		 ssz;
+	enum mandoc_esc	 esc;
 
 	*d = DECO_NONE;
-	lim = i = 0;
-	term = '\0';
-	wp = *word;
 
-	switch ((c = wp[i++])) {
-	case ('('):
+	cp = start = *word;
+
+	esc = mandoc_escape(&cp, word, &ssz);
+
+	switch (esc) {
+	case (ESCAPE_ERROR):
+		return(0);
+	case (ESCAPE_IGNORE):
+		break;
+	case (ESCAPE_NUMBERED):
+		*d = DECO_NUMBERED;
+		break;
+	case (ESCAPE_FONT):
+		*d = DECO_FONT;
+		break;
+	case (ESCAPE_FONTFAM):
+		*d = DECO_FFONT;
+		break;
+	case (ESCAPE_SPECIAL):
 		*d = DECO_SPECIAL;
-		lim = 2;
 		break;
-	case ('F'):
-		/* FALLTHROUGH */
-	case ('f'):
-		*d = 'F' == c ? DECO_FFONT : DECO_FONT;
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
+	case (ESCAPE_PREDEF):
+		*d = DECO_RESERVED;
+		break;
+	}
+
+	assert(ssz >= 0);
+	*sz = (size_t)ssz;
+	ssz = cp - start;
+
+	if (1 == *sz && (DECO_FONT == *d || DECO_FFONT == *d))
+		switch (**word) {
 		case ('3'):
 			/* FALLTHROUGH */
 		case ('B'):
 			*d = DECO_BOLD;
-			return(i);
+			break;
 		case ('2'):
 			/* FALLTHROUGH */
 		case ('I'):
 			*d = DECO_ITALIC;
-			return(i);
+			break;
 		case ('P'):
 			*d = DECO_PREVIOUS;
-			return(i);
+			break;
 		case ('1'):
 			/* FALLTHROUGH */
 		case ('R'):
 			*d = DECO_ROMAN;
-			return(i);
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-		break;
-	case ('k'):
-		/* FALLTHROUGH */
-	case ('M'):
-		/* FALLTHROUGH */
-	case ('m'):
-		/* FALLTHROUGH */
-	case ('*'):
-		if ('*' == c)
-			*d = DECO_RESERVED;
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-		break;
-
-	case ('N'):
-
-		/*
-		 * Sequence of characters:  backslash,  'N' (i = 0),
-		 * starting delimiter (i = 1), character number (i = 2).
-		 */
-
-		*word = wp + 2;
-		*sz = 0;
-
-		/*
-		 * Cannot use a digit as a starting delimiter;
-		 * but skip the digit anyway.
-		 */
-
-		if (isdigit((int)wp[1]))
-			return(2);
-
-		/*
-		 * Any non-digit terminates the character number.
-		 * That is, the terminating delimiter need not
-		 * match the starting delimiter.
-		 */
-
-		for (i = 2; isdigit((int)wp[i]); i++)
-			(*sz)++;
-
-		/*
-		 * This is only a numbered character
-		 * if the character number has at least one digit.
-		 */
-
-		if (*sz)
-			*d = DECO_NUMBERED;
-
-		/*
-		 * Skip the terminating delimiter, even if it does not
-		 * match, and even if there is no character number.
-		 */
-
-		return(++i);
-
-	case ('h'):
-		/* FALLTHROUGH */
-	case ('v'):
-		/* FALLTHROUGH */
-	case ('s'):
-		j = 0;
-		if ('+' == wp[i] || '-' == wp[i]) {
-			i++;
-			j = 1;
-		}
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
 			break;
-		case ('\''):
-			term = '\'';
-			break;
-		case ('0'):
-			j = 1;
-			/* FALLTHROUGH */
 		default:
-			i--;
-			lim = 1;
 			break;
 		}
 
-		if ('+' == wp[i] || '-' == wp[i]) {
-			if (j)
-				return(i);
-			i++;
-		} 
-
-		/* Handle embedded numerical subexp or escape. */
-
-		if ('(' == wp[i]) {
-			while (wp[i] && ')' != wp[i])
-				if ('\\' == wp[i++]) {
-					/* Handle embedded escape. */
-					*word = &wp[i];
-					i += a2roffdeco(&dd, word, sz);
-				}
-
-			if (')' == wp[i++])
-				break;
-
-			*d = DECO_NONE;
-			return(i - 1);
-		} else if ('\\' == wp[i]) {
-			*word = &wp[++i];
-			i += a2roffdeco(&dd, word, sz);
-		}
-
-		break;
-	case ('['):
-		*d = DECO_SPECIAL;
-		term = ']';
-		break;
-	case ('c'):
+	if (1 == *sz && DECO_SPECIAL == *d && 'c' == **word)
 		*d = DECO_NOSPACE;
-		return(i);
-	case ('z'):
-		*d = DECO_NONE;
-		if ('\\' == wp[i]) {
-			*word = &wp[++i];
-			return(i + a2roffdeco(&dd, word, sz));
-		} else
-			lim = 1;
-		break;
-	case ('o'):
-		/* FALLTHROUGH */
-	case ('w'):
-		if ('\'' == wp[i++]) {
-			term = '\'';
-			break;
-		} 
-		/* FALLTHROUGH */
-	default:
-		*d = DECO_SSPECIAL;
-		i--;
-		lim = 1;
-		break;
-	}
-
-	assert(term || lim);
-	*word = &wp[i];
-
-	if (term) {
-		j = i;
-		while (wp[i] && wp[i] != term)
-			i++;
-		if ('\0' == wp[i]) {
-			*d = DECO_NONE;
-			return(i);
-		}
-
-		assert(i >= j);
-		*sz = (size_t)(i - j);
-
-		return(i + 1);
-	}
-
-	assert(lim > 0);
-	*sz = (size_t)lim;
-
-	for (j = 0; wp[i] && j < lim; j++)
-		i++;
-	if (j < lim)
-		*d = DECO_NONE;
 
-	return(i);
+	return(ssz);
 }
 
 /*
Index: out.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/out.h,v
retrieving revision 1.18
diff -u -r1.18 out.h
--- out.h	22 Mar 2011 10:13:01 -0000	1.18
+++ out.h	8 Apr 2011 12:47:37 -0000
@@ -35,7 +35,6 @@
 	DECO_NONE,
 	DECO_NUMBERED, /* numbered character */
 	DECO_SPECIAL, /* special character */
-	DECO_SSPECIAL, /* single-char special */
 	DECO_RESERVED, /* reserved word */
 	DECO_BOLD, /* bold font */
 	DECO_ITALIC, /* italic font */
Index: read.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/read.c,v
retrieving revision 1.11
diff -u -r1.11 read.c
--- read.c	4 Apr 2011 23:04:38 -0000	1.11
+++ read.c	8 Apr 2011 12:47:37 -0000
@@ -142,7 +142,7 @@
 	"tab in non-literal context",
 	"end of line whitespace",
 	"bad comment style",
-	"unknown escape sequence",
+	"bad escape sequence",
 	"unterminated quoted string",
 	
 	"generic error",
Index: term.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/term.c,v
retrieving revision 1.183
diff -u -r1.183 term.c
--- term.c	4 Apr 2011 21:14:12 -0000	1.183
+++ term.c	8 Apr 2011 12:47:37 -0000
@@ -366,7 +366,7 @@
 	rhs = chars_spec2str(p->symtab, word, len, &sz);
 	if (rhs) 
 		encode(p, rhs, sz);
-	else if (DECO_SSPECIAL == d)
+	else if (1 == len)
 		encode(p, word, len);
 }
 
@@ -457,6 +457,7 @@
 term_word(struct termp *p, const char *word)
 {
 	const char	*seq;
+	int		 sz;
 	size_t		 ssz;
 	enum roffdeco	 deco;
 
@@ -487,7 +488,9 @@
 			continue;
 
 		seq = ++word;
-		word += a2roffdeco(&deco, &seq, &ssz);
+		if (0 == (sz = a2roffdeco(&deco, &seq, &ssz)))
+			break;
+		word += sz;
 
 		switch (deco) {
 		case (DECO_NUMBERED):
@@ -497,8 +500,6 @@
 			res(p, seq, ssz);
 			break;
 		case (DECO_SPECIAL):
-			/* FALLTHROUGH */
-		case (DECO_SSPECIAL):
 			spec(p, deco, seq, ssz);
 			break;
 		case (DECO_BOLD):
@@ -513,12 +514,13 @@
 		case (DECO_PREVIOUS):
 			term_fontlast(p);
 			break;
+		case (DECO_NOSPACE):
+			if ('\0' == *word)
+				p->flags |= TERMP_NOSPACE;
+			break;
 		default:
 			break;
 		}
-
-		if (DECO_NOSPACE == deco && '\0' == *word)
-			p->flags |= TERMP_NOSPACE;
 	}
 }
 
@@ -620,13 +622,11 @@
 					(p->symtab, seq, ssz, &rsz);
 				break;
 			case (DECO_SPECIAL):
-				/* FALLTHROUGH */
-			case (DECO_SSPECIAL):
 				rhs = chars_spec2str
 					(p->symtab, seq, ssz, &rsz);
 
 				/* Allow for one-char escapes. */
-				if (DECO_SSPECIAL != d || rhs)
+				if (ssz != 1 || rhs)
 					break;
 
 				rhs = seq;

--------------080508010603020000010006--
--
 To unsubscribe send an email to tech+unsubscribe@mdocml.bsd.lv