Unifying the escape-sequence parser.

tech@mandoc.bsd.lv
 help / color / mirror / Atom feed

* Unifying the escape-sequence parser.
@ 2011-04-07 14:00 Kristaps Dzonsons
  2011-04-08 11:19 ` Kristaps Dzonsons
  0 siblings, 1 reply; 6+ messages in thread
From: Kristaps Dzonsons @ 2011-04-07 14:00 UTC (permalink / raw)
  To: tech

[-- Attachment #1: Type: text/plain, Size: 719 bytes --]

Hi,

Enclosed are my efforts to unify the escaped-sequence functions in out.c 
(a2roffdeco()) and mandoc.c (mandoc_special()).

This handles, as far as I can see, all syntaxes of the groff(7) escapes.

When called during libmandoc validation, it will check for GLYPH_ERROR 
and be followed by a search-and-replace of ASCII_HYPH for `-' in the 
substring.  When invoked from term.c or html.c, it will switch on the 
returned type and substring value.

This will clear up a nice big chunk of code, but it's a pretty delicate 
area, so please look it over!

If you compile this file, you can test escapes by running, e.g.,

   % ./a.out s+\'\(\\f\[asdf\]\)\'123

to see the values of "start" and "end".

Thanks,

Kristaps

[-- Attachment #2: glyph.c --]
[-- Type: text/x-csrc, Size: 6971 bytes --]

#include <assert.h>
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#define	ASCII_HYPH	30

enum mandoc_gly {
	GLYPH_ERROR = 0,
	GLYPH_IGNORE, /* glyph to be ignored */
	GLYPH_SPECIAL, /* a regular special character */
	GLYPH_PREDEF, /* a predefined special character */
	GLYPH_FONT, /* a font mode */
	GLYPH_FONTFAM, /* a font family */
	GLYPH__MAX
};

enum mandoc_gly	mandoc_glyph(const char **, const char **, int *);
static int	mandoc_glyphexp(const char *);

static const char * const glyphs[GLYPH__MAX] = {
	"error", 
	"ignore", 
	"special", 
	"predefined", 
	"font", 
	"font family"
};

/*
 * Pass over recursive numerical expressions.  This context of this
 * function is important: it's only called within character-terminating
 * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
 * recursion: we don't care about what's in these blocks. 
 * This returns the number of characters skipped or -1 if an error
 * occurs (the caller should bail).
 */
static int
mandoc_glyphexp(const char *start)
{
	int		 i;
	size_t		 sz;
	const char	*cp;

	i = 0;

	/* The expression consists of a subexpression. */

	if ('\\' == start[i]) {
		cp = &start[++i];
		/*
		 * Read past the end of the subexpression.
		 * Bail immediately on errors.
		 */
		if (GLYPH_ERROR == mandoc_glyph(&cp, NULL, NULL))
			return(-1);
		return(i + cp - &start[i]);
	} 

	if ('(' != start[i++])
		return(0);

	/*
	 * A parenthesised subexpression.  Read until the closing
	 * parenthesis, making sure to handle any nested subexpressions
	 * that might ruin our parse.
	 */

	while (')' != start[i]) {
		sz = strcspn(&start[i], ")\\");
		i += (int)sz;

		if ('\0' == start[i])
			return(-1);
		else if ('\\' != start[i])
			continue;

		cp = &start[++i];
		if (GLYPH_ERROR == mandoc_glyph(&cp, NULL, NULL))
			return(-1);
		i += cp - &start[i];
	}

	/* Read past the terminating ')'. */
	return(++i);
}

/*
 * Handle an escaped sequeence.  This should be called with any
 * string subsequent a `\'.  Pass a pointer to this substring as "end";
 * it will be set to the supremum of the parsed escape sequence.  If
 * this returns GLYPH_ERROR, the string is bogus and should be thrown
 * away.  If not GLYPH_ERROR or GLYPH_IGNORE, "start" is set to the
 * first relevant character of the substring (font, glyph, whatever) of
 * length sz.  Both "start" and "sz" may be NULL.
 */
enum mandoc_gly
mandoc_glyph(const char **end, const char **start, int *sz)
{
	char		 c, term, numeric;
	int		 i, lim, ssz;
	const char	*cp;
	enum mandoc_gly	 gly; 

	cp = *end;
	if (start)
		*start = cp;
	i = 0;
	gly = GLYPH_ERROR;
	term = '\0';
	numeric = 0;

	switch ((c = cp[i++])) {
	/*
	 * First the glyphs.  There are several different forms of
	 * these, but each eventually returns a substring of the glyph
	 * name.
	 */
	case ('('):
		gly = GLYPH_SPECIAL;
		lim = 2;
		break;
	case ('['):
		gly = GLYPH_SPECIAL;
		term = ']';
		break;
	case ('C'):
		if ('\'' != cp[i])
			return(GLYPH_ERROR);
		gly = GLYPH_SPECIAL;
		term = '\'';
		break;

	/*
	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
	 * 'X' is the trigger.  These have opaque sub-strings.
	 */
	case ('g'):
		/* FALLTHROUGH */
	case ('k'):
		/* FALLTHROUGH */
	case ('M'):
		/* FALLTHROUGH */
	case ('m'):
		/* FALLTHROUGH */
	case ('n'):
		/* FALLTHROUGH */
	case ('V'):
		/* FALLTHROUGH */
	case ('Y'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_IGNORE;
		/* FALLTHROUGH */
	case ('*'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_PREDEF;
		/* FALLTHROUGH */
	case ('F'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_FONTFAM;
		/* FALLTHROUGH */
	case ('f'):
		if (GLYPH_ERROR == gly)
			gly = GLYPH_FONT;

		if (start) 
			*start = &cp[i];

		switch (cp[i++]) {
		case ('('):
			lim = 2;
			break;
		case ('['):
			term = ']';
			break;
		default:
			lim = 1;
			i--;
			break;
		}
		break;

	/*
	 * These escapes are of the form \X'Y', where 'X' is the trigger
	 * and 'Y' is any string.  These have opaque sub-strings.
	 */
	case ('A'):
		/* FALLTHROUGH */
	case ('b'):
		/* FALLTHROUGH */
	case ('D'):
		/* FALLTHROUGH */
	case ('o'):
		/* FALLTHROUGH */
	case ('R'):
		/* FALLTHROUGH */
	case ('X'):
		/* FALLTHROUGH */
	case ('Z'):
		if ('\'' != cp[i++])
			return(GLYPH_ERROR);
		gly = GLYPH_IGNORE;
		term = '\'';
		break;

	/*
	 * These escapes are of the form \X'N', where 'X' is the trigger
	 * and 'N' resolves to a numerical expression.
	 */
	case ('B'):
		/* FALLTHROUGH */
	case ('h'):
		/* FALLTHROUGH */
	case ('H'):
		/* FALLTHROUGH */
	case ('L'):
		/* FALLTHROUGH */
	case ('l'):
		/* FALLTHROUGH */
	case ('N'):
		/* FALLTHROUGH */
	case ('S'):
		/* FALLTHROUGH */
	case ('v'):
		/* FALLTHROUGH */
	case ('w'):
		/* FALLTHROUGH */
	case ('x'):
		if ('\'' != cp[i++])
			return(GLYPH_ERROR);
		gly = GLYPH_IGNORE;
		term = numeric = '\'';
		break;

	/* 
	 * Sizes get a special category of their own.
	 */
	case ('s'):
		gly = GLYPH_IGNORE;

		if (start) 
			*start = &cp[i];

		/* See +/- counts as a sign. */
		c = cp[i];
		if ('+' == c || '-' == c || ASCII_HYPH == c)
			++i;

		switch (cp[i++]) {
		case ('('):
			lim = 2;
			break;
		case ('['):
			term = numeric = ']';
			break;
		case ('\''):
			term = numeric = '\'';
			break;
		default:
			lim = 1;
			i--;
			break;
		}

		/* See +/- counts as a sign. */
		c = cp[i];
		if ('+' == c || '-' == c || ASCII_HYPH == c)
			++i;

		break;

	/*
	 * Anything else is assumed to be a glyph.
	 */
	default:
		gly = GLYPH_SPECIAL;
		lim = 1;
		i--;
		break;
	}

	assert(GLYPH_ERROR != gly);

	if (start)
		*start = &cp[i];

	/*
	 * If a terminating block has been specified, we need to
	 * handle the case of recursion, which could have their
	 * own terminating blocks that mess up our parse.  This, by the
	 * way, means that the "start" and "size" values will be
	 * effectively meaningless.
	 */

	ssz = 0;
	if (numeric && -1 == (ssz = mandoc_glyphexp(&cp[i])))
		return(GLYPH_ERROR);

	i += ssz;

	/*
	 * We have a character terminator.  Try to read up to that
	 * character.  If we can't (i.e., we hit the nil), then return
	 * an error; if we can, calculate our length, read past the
	 * terminating character, and exit.
	 */

	if ('\0' != term) {
		*end = strchr(&cp[i], term);
		if ('\0' == *end)
			return(GLYPH_ERROR);
		if (sz)
			*sz = *end - &cp[i];
		(*end)++;
		return(gly);
	}

	assert(lim > 0);

	/*
	 * We have a numeric limit.  If the string is shorter than that,
	 * stop and return an error.  Else adjust our endpoint, length,
	 * and return the current glyph.
	 */

	if ((size_t)lim > strlen(&cp[i]))
		return(GLYPH_ERROR);

	if (sz)
		*sz = lim;
	*end = &cp[i] + lim;
	return(gly);
}

int
main(int argc, char *argv[])
{
	const char	*v, *start;
	enum mandoc_gly	 gly;
	int		 sz;

	if (2 != argc)
		return(EXIT_FAILURE);

	v = argv[1];
	printf("input: %s\n", v);

	gly = mandoc_glyph(&v, &start, &sz);
	printf("glyph=%s, end=%s, start=%s, sz=%d\n", glyphs[gly], v, start, sz);

	return(EXIT_SUCCESS);
}

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Unifying the escape-sequence parser.
  2011-04-07 14:00 Unifying the escape-sequence parser Kristaps Dzonsons
@ 2011-04-08 11:19 ` Kristaps Dzonsons
  2011-04-08 12:16   ` Kristaps Dzonsons
  0 siblings, 1 reply; 6+ messages in thread
From: Kristaps Dzonsons @ 2011-04-08 11:19 UTC (permalink / raw)
  To: tech

[-- Attachment #1: Type: text/plain, Size: 1474 bytes --]

> Enclosed are my efforts to unify the escaped-sequence functions in out.c
> (a2roffdeco()) and mandoc.c (mandoc_special()).
>
> This handles, as far as I can see, all syntaxes of the groff(7) escapes.
>
> When called during libmandoc validation, it will check for GLYPH_ERROR
> and be followed by a search-and-replace of ASCII_HYPH for `-' in the
> substring. When invoked from term.c or html.c, it will switch on the
> returned type and substring value.
>
> This will clear up a nice big chunk of code, but it's a pretty delicate
> area, so please look it over!
>
> If you compile this file, you can test escapes by running, e.g.,
>
> % ./a.out s+\'\(\\f\[asdf\]\)\'123
>
> to see the values of "start" and "end".

Step 2.

This finishes off the new escape-sequence parser and puts it into 
mandoc_escape (mandoc.c, mandoc.h), then makes it the underlying engine 
for a2roffdeco (out.c) (requiring a tiny change to term.c and html.c for 
bailing out on bad sequences) and roff_res (roff.c).

Now all escape-sequences are being parsed with the same engine!  This 
logic was being repeated in THREE different places, earlier (mandoc.c 
for validation, out.c for output, and roff.c for predefined escapes).

I've run this over all manuals I know of without problems, but it can 
really use a close look-over with border cases.

The next step is to clean out the out.c code, completely removing enum 
roffdeco (putting that logic into mandoc.c, perhaps).

Thoughts?

Kristaps

[-- Attachment #2: patch.escapes.txt --]
[-- Type: text/plain, Size: 21158 bytes --]

Index: html.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/html.c,v
retrieving revision 1.131
diff -u -r1.131 html.c
--- html.c	22 Mar 2011 14:05:45 -0000	1.131
+++ html.c	8 Apr 2011 11:16:10 -0000
@@ -337,7 +337,8 @@
 			break;
 
 		seq = ++p;
-		len = a2roffdeco(&deco, &seq, &sz);
+		if (0 == (len = a2roffdeco(&deco, &seq, &sz)))
+			break;
 
 		switch (deco) {
 		case (DECO_NUMBERED):
Index: libmandoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/libmandoc.h,v
retrieving revision 1.17
diff -u -r1.17 libmandoc.h
--- libmandoc.h	28 Mar 2011 23:52:13 -0000	1.17
+++ libmandoc.h	8 Apr 2011 11:16:10 -0000
@@ -73,7 +73,6 @@
 			int, int, const char *);
 void		 mandoc_vmsg(enum mandocerr, struct mparse *, 
 			int, int, const char *, ...);
-int		 mandoc_special(char *);
 char		*mandoc_strdup(const char *);
 char		*mandoc_getarg(struct mparse *, char **, int, int *);
 char		*mandoc_normdate(struct mparse *, char *, int, int);
Index: man_validate.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/man_validate.c,v
retrieving revision 1.67
diff -u -r1.67 man_validate.c
--- man_validate.c	22 Mar 2011 15:30:30 -0000	1.67
+++ man_validate.c	8 Apr 2011 11:16:11 -0000
@@ -54,7 +54,7 @@
 static	int	  check_part(CHKARGS);
 static	int	  check_root(CHKARGS);
 static	int	  check_sec(CHKARGS);
-static	int	  check_text(CHKARGS);
+static	void	  check_text(CHKARGS);
 
 static	int	  post_AT(CHKARGS);
 static	int	  post_fi(CHKARGS);
@@ -151,7 +151,8 @@
 
 	switch (m->last->type) {
 	case (MAN_TEXT): 
-		return(check_text(m, m->last));
+		check_text(m, m->last);
+		return(1);
 	case (MAN_ROOT):
 		return(check_root(m, m->last));
 	case (MAN_EQN):
@@ -204,43 +205,48 @@
 	return(1);
 }
 
-
-static int
+static void
 check_text(CHKARGS) 
 {
-	char		*p;
-	int		 pos, c;
+	char		*p, *pp, *cpp;
+	int		 pos;
 	size_t		 sz;
 
-	for (p = n->string, pos = n->pos + 1; *p; p++, pos++) {
-		sz = strcspn(p, "\t\\");
-		p += (int)sz;
+	p = n->string;
+	pos = n->pos + 1;
 
-		if ('\0' == *p)
-			break;
+	while ('\0' != *p) {
+		sz = strcspn(p, "\t\\");
 
+		p += (int)sz;
 		pos += (int)sz;
 
 		if ('\t' == *p) {
-			if (MAN_LITERAL & m->flags)
-				continue;
-			man_pmsg(m, n->line, pos, MANDOCERR_BADTAB);
+			if ( ! (MAN_LITERAL & m->flags))
+				man_pmsg(m, n->line, pos, MANDOCERR_BADTAB);
+			p++;
+			pos++;
 			continue;
-		}
+		} else if ('\0' == *p)
+			break;
 
-		/* Check the special character. */
+		pos++;
+		pp = ++p;
 
-		c = mandoc_special(p);
-		if (c) {
-			p += c - 1;
-			pos += c - 1;
-		} else
+		if (ESCAPE_ERROR == mandoc_escape
+				((const char **)&pp, NULL, NULL)) {
 			man_pmsg(m, n->line, pos, MANDOCERR_BADESCAPE);
-	}
+			break;
+		}
 
-	return(1);
-}
+		cpp = p;
+		while (NULL != (cpp = memchr(cpp, ASCII_HYPH, pp - cpp)))
+			*cpp = '-';
 
+		pos += pp - p;
+		p = pp;
+	}
+}
 
 #define	INEQ_DEFINE(x, ineq, name) \
 static int \
Index: mandoc.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.c,v
retrieving revision 1.44
diff -u -r1.44 mandoc.c
--- mandoc.c	28 Mar 2011 23:52:13 -0000	1.44
+++ mandoc.c	8 Apr 2011 11:16:11 -0000
@@ -35,198 +35,315 @@
 
 static	int	 a2time(time_t *, const char *, const char *);
 static	char	*time2a(time_t);
+static	int	 numescape(const char *);
 
-int
-mandoc_special(char *p)
+/*
+ * Pass over recursive numerical expressions.  This context of this
+ * function is important: it's only called within character-terminating
+ * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
+ * recursion: we don't care about what's in these blocks. 
+ * This returns the number of characters skipped or -1 if an error
+ * occurs (the caller should bail).
+ */
+static int
+numescape(const char *start)
 {
-	int		 len, i;
-	char		 term;
-	char		*sv;
-	
-	len = 0;
+	int		 i;
+	size_t		 sz;
+	const char	*cp;
+
+	i = 0;
+
+	/* The expression consists of a subexpression. */
+
+	if ('\\' == start[i]) {
+		cp = &start[++i];
+		/*
+		 * Read past the end of the subexpression.
+		 * Bail immediately on errors.
+		 */
+		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+			return(-1);
+		return(i + cp - &start[i]);
+	} 
+
+	if ('(' != start[i++])
+		return(0);
+
+	/*
+	 * A parenthesised subexpression.  Read until the closing
+	 * parenthesis, making sure to handle any nested subexpressions
+	 * that might ruin our parse.
+	 */
+
+	while (')' != start[i]) {
+		sz = strcspn(&start[i], ")\\");
+		i += (int)sz;
+
+		if ('\0' == start[i])
+			return(-1);
+		else if ('\\' != start[i])
+			continue;
+
+		cp = &start[++i];
+		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+			return(-1);
+		i += cp - &start[i];
+	}
+
+	/* Read past the terminating ')'. */
+	return(++i);
+}
+
+/*
+ * Handle an escaped sequeence.  This should be called with any
+ * string subsequent a `\'.  Pass a pointer to this substring as "end";
+ * it will be set to the supremum of the parsed escape sequence.  If
+ * this returns ESCAPE_ERROR, the string is bogus and should be thrown
+ * away.  If not ESCAPE_ERROR or ESCAPE_IGNORE, "start" is set to the
+ * first relevant character of the substring (font, glyph, whatever) of
+ * length sz.  Both "start" and "sz" may be NULL.
+ */
+enum mandoc_esc
+mandoc_escape(const char **end, const char **start, int *sz)
+{
+	char		 c, term, numeric;
+	int		 i, lim, ssz;
+	const char	*cp;
+	enum mandoc_esc	 gly; 
+
+	cp = *end;
+	if (start)
+		*start = cp;
+	i = 0;
+	gly = ESCAPE_ERROR;
 	term = '\0';
-	sv = p;
+	numeric = 0;
 
-	assert('\\' == *p);
-	p++;
+	switch ((c = cp[i++])) {
+	/*
+	 * First the glyphs.  There are several different forms of
+	 * these, but each eventually returns a substring of the glyph
+	 * name.
+	 */
+	case ('('):
+		gly = ESCAPE_SPECIAL;
+		lim = 2;
+		break;
+	case ('['):
+		gly = ESCAPE_SPECIAL;
+		term = ']';
+		break;
+	case ('C'):
+		if ('\'' != cp[i])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_SPECIAL;
+		term = '\'';
+		break;
 
-	switch (*p++) {
-#if 0
-	case ('Z'):
-		/* FALLTHROUGH */
-	case ('X'):
-		/* FALLTHROUGH */
-	case ('x'):
-		/* FALLTHROUGH */
-	case ('S'):
-		/* FALLTHROUGH */
-	case ('R'):
-		/* FALLTHROUGH */
-	case ('N'):
-		/* FALLTHROUGH */
-	case ('l'):
-		/* FALLTHROUGH */
-	case ('L'):
-		/* FALLTHROUGH */
-	case ('H'):
+	/*
+	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
+	 * 'X' is the trigger.  These have opaque sub-strings.
+	 */
+	case ('g'):
 		/* FALLTHROUGH */
-	case ('h'):
+	case ('k'):
 		/* FALLTHROUGH */
-	case ('D'):
+	case ('M'):
 		/* FALLTHROUGH */
-	case ('C'):
+	case ('m'):
 		/* FALLTHROUGH */
-	case ('b'):
+	case ('n'):
 		/* FALLTHROUGH */
-	case ('B'):
+	case ('V'):
 		/* FALLTHROUGH */
-	case ('a'):
+	case ('Y'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_IGNORE;
 		/* FALLTHROUGH */
-	case ('A'):
-		if (*p++ != '\'')
-			return(0);
-		term = '\'';
-		break;
-#endif
-	case ('h'):
+	case ('*'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_PREDEF;
 		/* FALLTHROUGH */
-	case ('v'):
+	case ('F'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_FONTFAM;
 		/* FALLTHROUGH */
-	case ('s'):
-		if (ASCII_HYPH == *p)
-			*p = '-';
+	case ('f'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_FONT;
 
-		i = 0;
-		if ('+' == *p || '-' == *p) {
-			p++;
-			i = 1;
-		}
+		if (start) 
+			*start = &cp[i];
 
-		switch (*p++) {
+		switch (cp[i++]) {
 		case ('('):
-			len = 2;
+			lim = 2;
 			break;
 		case ('['):
 			term = ']';
 			break;
-		case ('\''):
-			term = '\'';
-			break;
-		case ('0'):
-			i = 1;
-			/* FALLTHROUGH */
 		default:
-			len = 1;
-			p--;
+			lim = 1;
+			i--;
 			break;
 		}
+		break;
 
-		if (ASCII_HYPH == *p)
-			*p = '-';
-		if ('+' == *p || '-' == *p) {
-			if (i)
-				return(0);
-			p++;
-		} 
-		
-		/* Handle embedded numerical subexp or escape. */
-
-		if ('(' == *p) {
-			while (*p && ')' != *p)
-				if ('\\' == *p++) {
-					i = mandoc_special(--p);
-					if (0 == i)
-						return(0);
-					p += i;
-				}
-
-			if (')' == *p++)
-				break;
-
-			return(0);
-		} else if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
-		}
-
+	/*
+	 * These escapes are of the form \X'Y', where 'X' is the trigger
+	 * and 'Y' is any string.  These have opaque sub-strings.
+	 */
+	case ('A'):
+		/* FALLTHROUGH */
+	case ('b'):
+		/* FALLTHROUGH */
+	case ('D'):
+		/* FALLTHROUGH */
+	case ('o'):
+		/* FALLTHROUGH */
+	case ('R'):
+		/* FALLTHROUGH */
+	case ('X'):
+		/* FALLTHROUGH */
+	case ('Z'):
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_IGNORE;
+		term = '\'';
 		break;
-#if 0
-	case ('Y'):
+
+	/*
+	 * These escapes are of the form \X'N', where 'X' is the trigger
+	 * and 'N' resolves to a numerical expression.
+	 */
+	case ('B'):
 		/* FALLTHROUGH */
-	case ('V'):
+	case ('h'):
 		/* FALLTHROUGH */
-	case ('$'):
+	case ('H'):
 		/* FALLTHROUGH */
-	case ('n'):
+	case ('L'):
 		/* FALLTHROUGH */
-#endif
-	case ('k'):
+	case ('l'):
 		/* FALLTHROUGH */
-	case ('M'):
+	case ('N'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_NUMBERED;
 		/* FALLTHROUGH */
-	case ('m'):
+	case ('S'):
 		/* FALLTHROUGH */
-	case ('f'):
+	case ('v'):
 		/* FALLTHROUGH */
-	case ('F'):
+	case ('w'):
 		/* FALLTHROUGH */
-	case ('*'):
-		switch (*p++) {
+	case ('x'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_IGNORE;
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		term = numeric = '\'';
+		break;
+
+	/* 
+	 * Sizes get a special category of their own.
+	 */
+	case ('s'):
+		gly = ESCAPE_IGNORE;
+
+		if (start) 
+			*start = &cp[i];
+
+		/* See +/- counts as a sign. */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
+
+		switch (cp[i++]) {
 		case ('('):
-			len = 2;
+			lim = 2;
 			break;
 		case ('['):
-			term = ']';
+			term = numeric = ']';
+			break;
+		case ('\''):
+			term = numeric = '\'';
 			break;
 		default:
-			len = 1;
-			p--;
+			lim = 1;
+			i--;
 			break;
 		}
+
+		/* See +/- counts as a sign. */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
+
 		break;
-	case ('('):
-		len = 2;
-		break;
-	case ('['):
-		term = ']';
-		break;
-	case ('z'):
-		len = 1;
-		if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
-			return(*p ? (int)(p - sv) : 0);
-		}
-		break;
-	case ('o'):
-		/* FALLTHROUGH */
-	case ('w'):
-		if ('\'' == *p++) {
-			term = '\'';
-			break;
-		}
-		/* FALLTHROUGH */
+
+	/*
+	 * Anything else is assumed to be a glyph.
+	 */
 	default:
-		len = 1;
-		p--;
+		gly = ESCAPE_SPECIAL;
+		lim = 1;
+		i--;
 		break;
 	}
 
-	if (term) {
-		for ( ; *p && term != *p; p++)
-			if (ASCII_HYPH == *p)
-				*p = '-';
-		return(*p ? (int)(p - sv) : 0);
+	assert(ESCAPE_ERROR != gly);
+
+	if (start)
+		*start = &cp[i];
+
+	/*
+	 * If a terminating block has been specified, we need to
+	 * handle the case of recursion, which could have their
+	 * own terminating blocks that mess up our parse.  This, by the
+	 * way, means that the "start" and "size" values will be
+	 * effectively meaningless.
+	 */
+
+	ssz = 0;
+	if (numeric && -1 == (ssz = numescape(&cp[i])))
+		return(ESCAPE_ERROR);
+
+	i += ssz;
+
+	/*
+	 * We have a character terminator.  Try to read up to that
+	 * character.  If we can't (i.e., we hit the nil), then return
+	 * an error; if we can, calculate our length, read past the
+	 * terminating character, and exit.
+	 */
+
+	if ('\0' != term) {
+		*end = strchr(&cp[i], term);
+		if ('\0' == *end)
+			return(ESCAPE_ERROR);
+		if (sz)
+			*sz = *end - &cp[i];
+		(*end)++;
+		return(gly);
 	}
 
-	for (i = 0; *p && i < len; i++, p++)
-		if (ASCII_HYPH == *p)
-			*p = '-';
-	return(i == len ? (int)(p - sv) : 0);
-}
+	assert(lim > 0);
 
+	/*
+	 * We have a numeric limit.  If the string is shorter than that,
+	 * stop and return an error.  Else adjust our endpoint, length,
+	 * and return the current glyph.
+	 */
+
+	if ((size_t)lim > strlen(&cp[i]))
+		return(ESCAPE_ERROR);
+
+	if (sz)
+		*sz = lim;
+	*end = &cp[i] + lim;
+	return(gly);
+}
 
 void *
 mandoc_calloc(size_t num, size_t size)
Index: mandoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.h,v
retrieving revision 1.69
diff -u -r1.69 mandoc.h
--- mandoc.h	28 Mar 2011 21:49:42 -0000	1.69
+++ mandoc.h	8 Apr 2011 11:16:11 -0000
@@ -288,6 +288,16 @@
 	MPARSE_MAN /* assume -man */
 };
 
+enum	mandoc_esc {
+	ESCAPE_ERROR = 0,
+	ESCAPE_IGNORE, /* escape to be ignored */
+	ESCAPE_SPECIAL, /* a regular special character */
+	ESCAPE_PREDEF, /* a predefined special character */
+	ESCAPE_FONT, /* a font mode */
+	ESCAPE_FONTFAM, /* a font family */
+	ESCAPE_NUMBERED /* a numbered glyph */
+};
+
 typedef	void	(*mandocmsg)(enum mandocerr, enum mandoclevel,
 			const char *, int, int, const char *);
 
@@ -309,6 +319,8 @@
 void		 *mandoc_calloc(size_t, size_t);
 void		 *mandoc_malloc(size_t);
 void		 *mandoc_realloc(void *, size_t);
+
+enum mandoc_esc	  mandoc_escape(const char **, const char **, int *);
 
 __END_DECLS
 
Index: out.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/out.c,v
retrieving revision 1.39
diff -u -r1.39 out.c
--- out.c	17 Mar 2011 08:49:34 -0000	1.39
+++ out.c	8 Apr 2011 11:16:11 -0000
@@ -178,237 +178,70 @@
 int
 a2roffdeco(enum roffdeco *d, const char **word, size_t *sz)
 {
-	int		 i, j, lim;
-	char		 term, c;
-	const char	*wp;
-	enum roffdeco	 dd;
+	const char	*cp, *start;
+	int		 ssz;
+	enum mandoc_esc	 esc;
 
 	*d = DECO_NONE;
-	lim = i = 0;
-	term = '\0';
-	wp = *word;
 
-	switch ((c = wp[i++])) {
-	case ('('):
+	cp = start = *word;
+
+	esc = mandoc_escape(&cp, word, &ssz);
+
+	switch (esc) {
+	case (ESCAPE_ERROR):
+		return(0);
+	case (ESCAPE_IGNORE):
+		break;
+	case (ESCAPE_NUMBERED):
+		*d = DECO_NUMBERED;
+		break;
+	case (ESCAPE_FONT):
+		*d = DECO_FONT;
+		break;
+	case (ESCAPE_FONTFAM):
+		*d = DECO_FFONT;
+		break;
+	case (ESCAPE_SPECIAL):
 		*d = DECO_SPECIAL;
-		lim = 2;
 		break;
-	case ('F'):
-		/* FALLTHROUGH */
-	case ('f'):
-		*d = 'F' == c ? DECO_FFONT : DECO_FONT;
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
+	case (ESCAPE_PREDEF):
+		*d = DECO_RESERVED;
+		break;
+	}
+
+	assert(ssz >= 0);
+	*sz = (size_t)ssz;
+	ssz = cp - start;
+
+	if (1 == *sz && (DECO_FONT == *d || DECO_FFONT == *d))
+		switch (**word) {
 		case ('3'):
 			/* FALLTHROUGH */
 		case ('B'):
 			*d = DECO_BOLD;
-			return(i);
+			break;
 		case ('2'):
 			/* FALLTHROUGH */
 		case ('I'):
 			*d = DECO_ITALIC;
-			return(i);
+			break;
 		case ('P'):
 			*d = DECO_PREVIOUS;
-			return(i);
+			break;
 		case ('1'):
 			/* FALLTHROUGH */
 		case ('R'):
 			*d = DECO_ROMAN;
-			return(i);
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-		break;
-	case ('k'):
-		/* FALLTHROUGH */
-	case ('M'):
-		/* FALLTHROUGH */
-	case ('m'):
-		/* FALLTHROUGH */
-	case ('*'):
-		if ('*' == c)
-			*d = DECO_RESERVED;
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-		break;
-
-	case ('N'):
-
-		/*
-		 * Sequence of characters:  backslash,  'N' (i = 0),
-		 * starting delimiter (i = 1), character number (i = 2).
-		 */
-
-		*word = wp + 2;
-		*sz = 0;
-
-		/*
-		 * Cannot use a digit as a starting delimiter;
-		 * but skip the digit anyway.
-		 */
-
-		if (isdigit((int)wp[1]))
-			return(2);
-
-		/*
-		 * Any non-digit terminates the character number.
-		 * That is, the terminating delimiter need not
-		 * match the starting delimiter.
-		 */
-
-		for (i = 2; isdigit((int)wp[i]); i++)
-			(*sz)++;
-
-		/*
-		 * This is only a numbered character
-		 * if the character number has at least one digit.
-		 */
-
-		if (*sz)
-			*d = DECO_NUMBERED;
-
-		/*
-		 * Skip the terminating delimiter, even if it does not
-		 * match, and even if there is no character number.
-		 */
-
-		return(++i);
-
-	case ('h'):
-		/* FALLTHROUGH */
-	case ('v'):
-		/* FALLTHROUGH */
-	case ('s'):
-		j = 0;
-		if ('+' == wp[i] || '-' == wp[i]) {
-			i++;
-			j = 1;
-		}
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
 			break;
-		case ('\''):
-			term = '\'';
-			break;
-		case ('0'):
-			j = 1;
-			/* FALLTHROUGH */
 		default:
-			i--;
-			lim = 1;
-			break;
-		}
-
-		if ('+' == wp[i] || '-' == wp[i]) {
-			if (j)
-				return(i);
-			i++;
-		} 
-
-		/* Handle embedded numerical subexp or escape. */
-
-		if ('(' == wp[i]) {
-			while (wp[i] && ')' != wp[i])
-				if ('\\' == wp[i++]) {
-					/* Handle embedded escape. */
-					*word = &wp[i];
-					i += a2roffdeco(&dd, word, sz);
-				}
-
-			if (')' == wp[i++])
-				break;
-
-			*d = DECO_NONE;
-			return(i - 1);
-		} else if ('\\' == wp[i]) {
-			*word = &wp[++i];
-			i += a2roffdeco(&dd, word, sz);
-		}
-
-		break;
-	case ('['):
-		*d = DECO_SPECIAL;
-		term = ']';
-		break;
-	case ('c'):
-		*d = DECO_NOSPACE;
-		return(i);
-	case ('z'):
-		*d = DECO_NONE;
-		if ('\\' == wp[i]) {
-			*word = &wp[++i];
-			return(i + a2roffdeco(&dd, word, sz));
-		} else
-			lim = 1;
-		break;
-	case ('o'):
-		/* FALLTHROUGH */
-	case ('w'):
-		if ('\'' == wp[i++]) {
-			term = '\'';
 			break;
-		} 
-		/* FALLTHROUGH */
-	default:
-		*d = DECO_SSPECIAL;
-		i--;
-		lim = 1;
-		break;
-	}
-
-	assert(term || lim);
-	*word = &wp[i];
-
-	if (term) {
-		j = i;
-		while (wp[i] && wp[i] != term)
-			i++;
-		if ('\0' == wp[i]) {
-			*d = DECO_NONE;
-			return(i);
 		}
 
-		assert(i >= j);
-		*sz = (size_t)(i - j);
-
-		return(i + 1);
-	}
-
-	assert(lim > 0);
-	*sz = (size_t)lim;
-
-	for (j = 0; wp[i] && j < lim; j++)
-		i++;
-	if (j < lim)
-		*d = DECO_NONE;
+	if (1 == *sz && DECO_SPECIAL == *d)
+		*d = 'c' == **word ? DECO_NOSPACE : DECO_SSPECIAL;
 
-	return(i);
+	return(ssz);
 }
 
 /*
Index: read.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/read.c,v
retrieving revision 1.11
diff -u -r1.11 read.c
--- read.c	4 Apr 2011 23:04:38 -0000	1.11
+++ read.c	8 Apr 2011 11:16:11 -0000
@@ -142,7 +142,7 @@
 	"tab in non-literal context",
 	"end of line whitespace",
 	"bad comment style",
-	"unknown escape sequence",
+	"bad escape sequence",
 	"unterminated quoted string",
 	
 	"generic error",
Index: roff.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/roff.c,v
retrieving revision 1.131
diff -u -r1.131 roff.c
--- roff.c	5 Apr 2011 22:22:33 -0000	1.131
+++ roff.c	8 Apr 2011 11:16:11 -0000
@@ -382,9 +382,10 @@
 	const char	*stnam;	/* start of the name, after "[(*" */
 	const char	*cp;	/* end of the name, e.g. before ']' */
 	const char	*res;	/* the string to be substituted */
-	int		 i, maxl;
+	int		 i;
 	size_t		 nsz;
 	char		*n;
+	enum mandoc_esc	 esc;
 
 	/* Search for a leading backslash and save a pointer to it. */
 
@@ -392,48 +393,11 @@
 	while (NULL != (cp = strchr(cp, '\\'))) {
 		stesc = cp++;
 
-		/*
-		 * The second character must be an asterisk.
-		 * If it isn't, skip it anyway:  It is escaped,
-		 * so it can't start another escape sequence.
-		 */
-
-		if ('\0' == *cp)
-			return(1);
-		if ('*' != *cp++)
-			continue;
-
-		/*
-		 * The third character decides the length
-		 * of the name of the string.
-		 * Save a pointer to the name.
-		 */
-
-		switch (*cp) {
-		case ('\0'):
-			return(1);
-		case ('('):
-			cp++;
-			maxl = 2;
-			break;
-		case ('['):
-			cp++;
-			maxl = 0;
-			break;
-		default:
-			maxl = 1;
+		esc = mandoc_escape(&cp, &stnam, &i);
+		if (ESCAPE_ERROR == esc)
 			break;
-		}
-		stnam = cp;
-
-		/* Advance to the end of the name. */
-
-		for (i = 0; 0 == maxl || i < maxl; i++, cp++) {
-			if ('\0' == *cp)
-				return(1); /* Error. */
-			if (0 == maxl && ']' == *cp)
-				break;
-		}
+		else if (ESCAPE_PREDEF != esc)
+			continue;
 
 		/*
 		 * Retrieve the replacement string; if it is
@@ -442,10 +406,8 @@
 
 		res = roff_getstrn(r, stnam, (size_t)i);
 
-		if (NULL == res) {
-			cp -= maxl ? 1 : 0;
+		if (NULL == res)
 			continue;
-		}
 
 		/* Replace the escape sequence by the string. */
 
@@ -454,7 +416,7 @@
 
 		strlcpy(n, *bufp, (size_t)(stesc - *bufp + 1));
 		strlcat(n, res, nsz);
-		strlcat(n, cp + (maxl ? 0 : 1), nsz);
+		strlcat(n, cp, nsz);
 
 		free(*bufp);
 
Index: term.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/term.c,v
retrieving revision 1.183
diff -u -r1.183 term.c
--- term.c	4 Apr 2011 21:14:12 -0000	1.183
+++ term.c	8 Apr 2011 11:16:11 -0000
@@ -457,6 +457,7 @@
 term_word(struct termp *p, const char *word)
 {
 	const char	*seq;
+	int		 sz;
 	size_t		 ssz;
 	enum roffdeco	 deco;
 
@@ -487,7 +488,9 @@
 			continue;
 
 		seq = ++word;
-		word += a2roffdeco(&deco, &seq, &ssz);
+		if (0 == (sz = a2roffdeco(&deco, &seq, &ssz)))
+			break;
+		word += sz;
 
 		switch (deco) {
 		case (DECO_NUMBERED):

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Unifying the escape-sequence parser.
  2011-04-08 11:19 ` Kristaps Dzonsons
@ 2011-04-08 12:16   ` Kristaps Dzonsons
  2011-04-08 12:50     ` Kristaps Dzonsons
  0 siblings, 1 reply; 6+ messages in thread
From: Kristaps Dzonsons @ 2011-04-08 12:16 UTC (permalink / raw)
  To: tech

[-- Attachment #1: Type: text/plain, Size: 940 bytes --]

> Step 2.
>
> This finishes off the new escape-sequence parser and puts it into
> mandoc_escape (mandoc.c, mandoc.h), then makes it the underlying engine
> for a2roffdeco (out.c) (requiring a tiny change to term.c and html.c for
> bailing out on bad sequences) and roff_res (roff.c).
>
> Now all escape-sequences are being parsed with the same engine! This
> logic was being repeated in THREE different places, earlier (mandoc.c
> for validation, out.c for output, and roff.c for predefined escapes).
>
> I've run this over all manuals I know of without problems, but it can
> really use a close look-over with border cases.
>
> The next step is to clean out the out.c code, completely removing enum
> roffdeco (putting that logic into mandoc.c, perhaps).

Step 2b.  I rolled back the roff part: the search/replace of predefined 
strings must happen prior to escape processing.  I'd also forgotten to 
include mdoc_validate.c in the patch.

[-- Attachment #2: patch.escapes.txt --]
[-- Type: text/plain, Size: 20299 bytes --]

Index: html.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/html.c,v
retrieving revision 1.131
diff -u -r1.131 html.c
--- html.c	22 Mar 2011 14:05:45 -0000	1.131
+++ html.c	8 Apr 2011 12:15:23 -0000
@@ -337,7 +337,8 @@
 			break;
 
 		seq = ++p;
-		len = a2roffdeco(&deco, &seq, &sz);
+		if (0 == (len = a2roffdeco(&deco, &seq, &sz)))
+			break;
 
 		switch (deco) {
 		case (DECO_NUMBERED):
Index: libmandoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/libmandoc.h,v
retrieving revision 1.17
diff -u -r1.17 libmandoc.h
--- libmandoc.h	28 Mar 2011 23:52:13 -0000	1.17
+++ libmandoc.h	8 Apr 2011 12:15:23 -0000
@@ -73,7 +73,6 @@
 			int, int, const char *);
 void		 mandoc_vmsg(enum mandocerr, struct mparse *, 
 			int, int, const char *, ...);
-int		 mandoc_special(char *);
 char		*mandoc_strdup(const char *);
 char		*mandoc_getarg(struct mparse *, char **, int, int *);
 char		*mandoc_normdate(struct mparse *, char *, int, int);
Index: man_validate.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/man_validate.c,v
retrieving revision 1.67
diff -u -r1.67 man_validate.c
--- man_validate.c	22 Mar 2011 15:30:30 -0000	1.67
+++ man_validate.c	8 Apr 2011 12:15:23 -0000
@@ -54,7 +54,7 @@
 static	int	  check_part(CHKARGS);
 static	int	  check_root(CHKARGS);
 static	int	  check_sec(CHKARGS);
-static	int	  check_text(CHKARGS);
+static	void	  check_text(CHKARGS);
 
 static	int	  post_AT(CHKARGS);
 static	int	  post_fi(CHKARGS);
@@ -151,7 +151,8 @@
 
 	switch (m->last->type) {
 	case (MAN_TEXT): 
-		return(check_text(m, m->last));
+		check_text(m, m->last);
+		return(1);
 	case (MAN_ROOT):
 		return(check_root(m, m->last));
 	case (MAN_EQN):
@@ -204,43 +205,48 @@
 	return(1);
 }
 
-
-static int
+static void
 check_text(CHKARGS) 
 {
-	char		*p;
-	int		 pos, c;
+	char		*p, *pp, *cpp;
+	int		 pos;
 	size_t		 sz;
 
-	for (p = n->string, pos = n->pos + 1; *p; p++, pos++) {
-		sz = strcspn(p, "\t\\");
-		p += (int)sz;
+	p = n->string;
+	pos = n->pos + 1;
 
-		if ('\0' == *p)
-			break;
+	while ('\0' != *p) {
+		sz = strcspn(p, "\t\\");
 
+		p += (int)sz;
 		pos += (int)sz;
 
 		if ('\t' == *p) {
-			if (MAN_LITERAL & m->flags)
-				continue;
-			man_pmsg(m, n->line, pos, MANDOCERR_BADTAB);
+			if ( ! (MAN_LITERAL & m->flags))
+				man_pmsg(m, n->line, pos, MANDOCERR_BADTAB);
+			p++;
+			pos++;
 			continue;
-		}
+		} else if ('\0' == *p)
+			break;
 
-		/* Check the special character. */
+		pos++;
+		pp = ++p;
 
-		c = mandoc_special(p);
-		if (c) {
-			p += c - 1;
-			pos += c - 1;
-		} else
+		if (ESCAPE_ERROR == mandoc_escape
+				((const char **)&pp, NULL, NULL)) {
 			man_pmsg(m, n->line, pos, MANDOCERR_BADESCAPE);
-	}
+			break;
+		}
 
-	return(1);
-}
+		cpp = p;
+		while (NULL != (cpp = memchr(cpp, ASCII_HYPH, pp - cpp)))
+			*cpp = '-';
 
+		pos += pp - p;
+		p = pp;
+	}
+}
 
 #define	INEQ_DEFINE(x, ineq, name) \
 static int \
Index: mandoc.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.c,v
retrieving revision 1.44
diff -u -r1.44 mandoc.c
--- mandoc.c	28 Mar 2011 23:52:13 -0000	1.44
+++ mandoc.c	8 Apr 2011 12:15:23 -0000
@@ -35,198 +35,315 @@
 
 static	int	 a2time(time_t *, const char *, const char *);
 static	char	*time2a(time_t);
+static	int	 numescape(const char *);
 
-int
-mandoc_special(char *p)
+/*
+ * Pass over recursive numerical expressions.  This context of this
+ * function is important: it's only called within character-terminating
+ * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
+ * recursion: we don't care about what's in these blocks. 
+ * This returns the number of characters skipped or -1 if an error
+ * occurs (the caller should bail).
+ */
+static int
+numescape(const char *start)
 {
-	int		 len, i;
-	char		 term;
-	char		*sv;
-	
-	len = 0;
+	int		 i;
+	size_t		 sz;
+	const char	*cp;
+
+	i = 0;
+
+	/* The expression consists of a subexpression. */
+
+	if ('\\' == start[i]) {
+		cp = &start[++i];
+		/*
+		 * Read past the end of the subexpression.
+		 * Bail immediately on errors.
+		 */
+		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+			return(-1);
+		return(i + cp - &start[i]);
+	} 
+
+	if ('(' != start[i++])
+		return(0);
+
+	/*
+	 * A parenthesised subexpression.  Read until the closing
+	 * parenthesis, making sure to handle any nested subexpressions
+	 * that might ruin our parse.
+	 */
+
+	while (')' != start[i]) {
+		sz = strcspn(&start[i], ")\\");
+		i += (int)sz;
+
+		if ('\0' == start[i])
+			return(-1);
+		else if ('\\' != start[i])
+			continue;
+
+		cp = &start[++i];
+		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+			return(-1);
+		i += cp - &start[i];
+	}
+
+	/* Read past the terminating ')'. */
+	return(++i);
+}
+
+/*
+ * Handle an escaped sequeence.  This should be called with any
+ * string subsequent a `\'.  Pass a pointer to this substring as "end";
+ * it will be set to the supremum of the parsed escape sequence.  If
+ * this returns ESCAPE_ERROR, the string is bogus and should be thrown
+ * away.  If not ESCAPE_ERROR or ESCAPE_IGNORE, "start" is set to the
+ * first relevant character of the substring (font, glyph, whatever) of
+ * length sz.  Both "start" and "sz" may be NULL.
+ */
+enum mandoc_esc
+mandoc_escape(const char **end, const char **start, int *sz)
+{
+	char		 c, term, numeric;
+	int		 i, lim, ssz;
+	const char	*cp;
+	enum mandoc_esc	 gly; 
+
+	cp = *end;
+	if (start)
+		*start = cp;
+	i = 0;
+	gly = ESCAPE_ERROR;
 	term = '\0';
-	sv = p;
+	numeric = 0;
 
-	assert('\\' == *p);
-	p++;
+	switch ((c = cp[i++])) {
+	/*
+	 * First the glyphs.  There are several different forms of
+	 * these, but each eventually returns a substring of the glyph
+	 * name.
+	 */
+	case ('('):
+		gly = ESCAPE_SPECIAL;
+		lim = 2;
+		break;
+	case ('['):
+		gly = ESCAPE_SPECIAL;
+		term = ']';
+		break;
+	case ('C'):
+		if ('\'' != cp[i])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_SPECIAL;
+		term = '\'';
+		break;
 
-	switch (*p++) {
-#if 0
-	case ('Z'):
-		/* FALLTHROUGH */
-	case ('X'):
-		/* FALLTHROUGH */
-	case ('x'):
-		/* FALLTHROUGH */
-	case ('S'):
-		/* FALLTHROUGH */
-	case ('R'):
-		/* FALLTHROUGH */
-	case ('N'):
-		/* FALLTHROUGH */
-	case ('l'):
-		/* FALLTHROUGH */
-	case ('L'):
-		/* FALLTHROUGH */
-	case ('H'):
+	/*
+	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
+	 * 'X' is the trigger.  These have opaque sub-strings.
+	 */
+	case ('g'):
 		/* FALLTHROUGH */
-	case ('h'):
+	case ('k'):
 		/* FALLTHROUGH */
-	case ('D'):
+	case ('M'):
 		/* FALLTHROUGH */
-	case ('C'):
+	case ('m'):
 		/* FALLTHROUGH */
-	case ('b'):
+	case ('n'):
 		/* FALLTHROUGH */
-	case ('B'):
+	case ('V'):
 		/* FALLTHROUGH */
-	case ('a'):
+	case ('Y'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_IGNORE;
 		/* FALLTHROUGH */
-	case ('A'):
-		if (*p++ != '\'')
-			return(0);
-		term = '\'';
-		break;
-#endif
-	case ('h'):
+	case ('*'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_PREDEF;
 		/* FALLTHROUGH */
-	case ('v'):
+	case ('F'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_FONTFAM;
 		/* FALLTHROUGH */
-	case ('s'):
-		if (ASCII_HYPH == *p)
-			*p = '-';
+	case ('f'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_FONT;
 
-		i = 0;
-		if ('+' == *p || '-' == *p) {
-			p++;
-			i = 1;
-		}
+		if (start) 
+			*start = &cp[i];
 
-		switch (*p++) {
+		switch (cp[i++]) {
 		case ('('):
-			len = 2;
+			lim = 2;
 			break;
 		case ('['):
 			term = ']';
 			break;
-		case ('\''):
-			term = '\'';
-			break;
-		case ('0'):
-			i = 1;
-			/* FALLTHROUGH */
 		default:
-			len = 1;
-			p--;
+			lim = 1;
+			i--;
 			break;
 		}
+		break;
 
-		if (ASCII_HYPH == *p)
-			*p = '-';
-		if ('+' == *p || '-' == *p) {
-			if (i)
-				return(0);
-			p++;
-		} 
-		
-		/* Handle embedded numerical subexp or escape. */
-
-		if ('(' == *p) {
-			while (*p && ')' != *p)
-				if ('\\' == *p++) {
-					i = mandoc_special(--p);
-					if (0 == i)
-						return(0);
-					p += i;
-				}
-
-			if (')' == *p++)
-				break;
-
-			return(0);
-		} else if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
-		}
-
+	/*
+	 * These escapes are of the form \X'Y', where 'X' is the trigger
+	 * and 'Y' is any string.  These have opaque sub-strings.
+	 */
+	case ('A'):
+		/* FALLTHROUGH */
+	case ('b'):
+		/* FALLTHROUGH */
+	case ('D'):
+		/* FALLTHROUGH */
+	case ('o'):
+		/* FALLTHROUGH */
+	case ('R'):
+		/* FALLTHROUGH */
+	case ('X'):
+		/* FALLTHROUGH */
+	case ('Z'):
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_IGNORE;
+		term = '\'';
 		break;
-#if 0
-	case ('Y'):
+
+	/*
+	 * These escapes are of the form \X'N', where 'X' is the trigger
+	 * and 'N' resolves to a numerical expression.
+	 */
+	case ('B'):
 		/* FALLTHROUGH */
-	case ('V'):
+	case ('h'):
 		/* FALLTHROUGH */
-	case ('$'):
+	case ('H'):
 		/* FALLTHROUGH */
-	case ('n'):
+	case ('L'):
 		/* FALLTHROUGH */
-#endif
-	case ('k'):
+	case ('l'):
 		/* FALLTHROUGH */
-	case ('M'):
+	case ('N'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_NUMBERED;
 		/* FALLTHROUGH */
-	case ('m'):
+	case ('S'):
 		/* FALLTHROUGH */
-	case ('f'):
+	case ('v'):
 		/* FALLTHROUGH */
-	case ('F'):
+	case ('w'):
 		/* FALLTHROUGH */
-	case ('*'):
-		switch (*p++) {
+	case ('x'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_IGNORE;
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		term = numeric = '\'';
+		break;
+
+	/* 
+	 * Sizes get a special category of their own.
+	 */
+	case ('s'):
+		gly = ESCAPE_IGNORE;
+
+		if (start) 
+			*start = &cp[i];
+
+		/* See +/- counts as a sign. */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
+
+		switch (cp[i++]) {
 		case ('('):
-			len = 2;
+			lim = 2;
 			break;
 		case ('['):
-			term = ']';
+			term = numeric = ']';
+			break;
+		case ('\''):
+			term = numeric = '\'';
 			break;
 		default:
-			len = 1;
-			p--;
+			lim = 1;
+			i--;
 			break;
 		}
+
+		/* See +/- counts as a sign. */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
+
 		break;
-	case ('('):
-		len = 2;
-		break;
-	case ('['):
-		term = ']';
-		break;
-	case ('z'):
-		len = 1;
-		if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
-			return(*p ? (int)(p - sv) : 0);
-		}
-		break;
-	case ('o'):
-		/* FALLTHROUGH */
-	case ('w'):
-		if ('\'' == *p++) {
-			term = '\'';
-			break;
-		}
-		/* FALLTHROUGH */
+
+	/*
+	 * Anything else is assumed to be a glyph.
+	 */
 	default:
-		len = 1;
-		p--;
+		gly = ESCAPE_SPECIAL;
+		lim = 1;
+		i--;
 		break;
 	}
 
-	if (term) {
-		for ( ; *p && term != *p; p++)
-			if (ASCII_HYPH == *p)
-				*p = '-';
-		return(*p ? (int)(p - sv) : 0);
+	assert(ESCAPE_ERROR != gly);
+
+	if (start)
+		*start = &cp[i];
+
+	/*
+	 * If a terminating block has been specified, we need to
+	 * handle the case of recursion, which could have their
+	 * own terminating blocks that mess up our parse.  This, by the
+	 * way, means that the "start" and "size" values will be
+	 * effectively meaningless.
+	 */
+
+	ssz = 0;
+	if (numeric && -1 == (ssz = numescape(&cp[i])))
+		return(ESCAPE_ERROR);
+
+	i += ssz;
+
+	/*
+	 * We have a character terminator.  Try to read up to that
+	 * character.  If we can't (i.e., we hit the nil), then return
+	 * an error; if we can, calculate our length, read past the
+	 * terminating character, and exit.
+	 */
+
+	if ('\0' != term) {
+		*end = strchr(&cp[i], term);
+		if ('\0' == *end)
+			return(ESCAPE_ERROR);
+		if (sz)
+			*sz = *end - &cp[i];
+		(*end)++;
+		return(gly);
 	}
 
-	for (i = 0; *p && i < len; i++, p++)
-		if (ASCII_HYPH == *p)
-			*p = '-';
-	return(i == len ? (int)(p - sv) : 0);
-}
+	assert(lim > 0);
 
+	/*
+	 * We have a numeric limit.  If the string is shorter than that,
+	 * stop and return an error.  Else adjust our endpoint, length,
+	 * and return the current glyph.
+	 */
+
+	if ((size_t)lim > strlen(&cp[i]))
+		return(ESCAPE_ERROR);
+
+	if (sz)
+		*sz = lim;
+	*end = &cp[i] + lim;
+	return(gly);
+}
 
 void *
 mandoc_calloc(size_t num, size_t size)
Index: mandoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.h,v
retrieving revision 1.69
diff -u -r1.69 mandoc.h
--- mandoc.h	28 Mar 2011 21:49:42 -0000	1.69
+++ mandoc.h	8 Apr 2011 12:15:23 -0000
@@ -288,6 +288,16 @@
 	MPARSE_MAN /* assume -man */
 };
 
+enum	mandoc_esc {
+	ESCAPE_ERROR = 0,
+	ESCAPE_IGNORE, /* escape to be ignored */
+	ESCAPE_SPECIAL, /* a regular special character */
+	ESCAPE_PREDEF, /* a predefined special character */
+	ESCAPE_FONT, /* a font mode */
+	ESCAPE_FONTFAM, /* a font family */
+	ESCAPE_NUMBERED /* a numbered glyph */
+};
+
 typedef	void	(*mandocmsg)(enum mandocerr, enum mandoclevel,
 			const char *, int, int, const char *);
 
@@ -309,6 +319,8 @@
 void		 *mandoc_calloc(size_t, size_t);
 void		 *mandoc_malloc(size_t);
 void		 *mandoc_realloc(void *, size_t);
+
+enum mandoc_esc	  mandoc_escape(const char **, const char **, int *);
 
 __END_DECLS
 
Index: mdoc_validate.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mdoc_validate.c,v
retrieving revision 1.166
diff -u -r1.166 mdoc_validate.c
--- mdoc_validate.c	3 Apr 2011 09:53:50 -0000	1.166
+++ mdoc_validate.c	8 Apr 2011 12:15:24 -0000
@@ -545,31 +545,39 @@
 static void
 check_text(struct mdoc *m, int ln, int pos, char *p)
 {
-	int		 c;
+	char		*cpp, *pp;
 	size_t		 sz;
 
 	for ( ; *p; p++, pos++) {
 		sz = strcspn(p, "\t\\");
-		p += (int)sz;
-
-		if ('\0' == *p)
-			break;
 
+		p += (int)sz;
 		pos += (int)sz;
 
 		if ('\t' == *p) {
 			if ( ! (MDOC_LITERAL & m->flags))
 				mdoc_pmsg(m, ln, pos, MANDOCERR_BADTAB);
+			p++;
+			pos++;
 			continue;
-		}
+		} else if ('\0' == *p)
+			break;
+
+		pos++;
+		pp = ++p;
 
-		if (0 == (c = mandoc_special(p))) {
+		if (ESCAPE_ERROR == mandoc_escape
+				((const char **)&pp, NULL, NULL)) {
 			mdoc_pmsg(m, ln, pos, MANDOCERR_BADESCAPE);
-			continue;
+			break;
 		}
 
-		p += c - 1;
-		pos += c - 1;
+		cpp = p;
+		while (NULL != (cpp = memchr(cpp, ASCII_HYPH, pp - cpp)))
+			*cpp = '-';
+
+		pos += pp - p;
+		p = pp;
 	}
 }
 
Index: out.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/out.c,v
retrieving revision 1.39
diff -u -r1.39 out.c
--- out.c	17 Mar 2011 08:49:34 -0000	1.39
+++ out.c	8 Apr 2011 12:15:24 -0000
@@ -178,237 +178,70 @@
 int
 a2roffdeco(enum roffdeco *d, const char **word, size_t *sz)
 {
-	int		 i, j, lim;
-	char		 term, c;
-	const char	*wp;
-	enum roffdeco	 dd;
+	const char	*cp, *start;
+	int		 ssz;
+	enum mandoc_esc	 esc;
 
 	*d = DECO_NONE;
-	lim = i = 0;
-	term = '\0';
-	wp = *word;
 
-	switch ((c = wp[i++])) {
-	case ('('):
+	cp = start = *word;
+
+	esc = mandoc_escape(&cp, word, &ssz);
+
+	switch (esc) {
+	case (ESCAPE_ERROR):
+		return(0);
+	case (ESCAPE_IGNORE):
+		break;
+	case (ESCAPE_NUMBERED):
+		*d = DECO_NUMBERED;
+		break;
+	case (ESCAPE_FONT):
+		*d = DECO_FONT;
+		break;
+	case (ESCAPE_FONTFAM):
+		*d = DECO_FFONT;
+		break;
+	case (ESCAPE_SPECIAL):
 		*d = DECO_SPECIAL;
-		lim = 2;
 		break;
-	case ('F'):
-		/* FALLTHROUGH */
-	case ('f'):
-		*d = 'F' == c ? DECO_FFONT : DECO_FONT;
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
+	case (ESCAPE_PREDEF):
+		*d = DECO_RESERVED;
+		break;
+	}
+
+	assert(ssz >= 0);
+	*sz = (size_t)ssz;
+	ssz = cp - start;
+
+	if (1 == *sz && (DECO_FONT == *d || DECO_FFONT == *d))
+		switch (**word) {
 		case ('3'):
 			/* FALLTHROUGH */
 		case ('B'):
 			*d = DECO_BOLD;
-			return(i);
+			break;
 		case ('2'):
 			/* FALLTHROUGH */
 		case ('I'):
 			*d = DECO_ITALIC;
-			return(i);
+			break;
 		case ('P'):
 			*d = DECO_PREVIOUS;
-			return(i);
+			break;
 		case ('1'):
 			/* FALLTHROUGH */
 		case ('R'):
 			*d = DECO_ROMAN;
-			return(i);
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-		break;
-	case ('k'):
-		/* FALLTHROUGH */
-	case ('M'):
-		/* FALLTHROUGH */
-	case ('m'):
-		/* FALLTHROUGH */
-	case ('*'):
-		if ('*' == c)
-			*d = DECO_RESERVED;
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-		break;
-
-	case ('N'):
-
-		/*
-		 * Sequence of characters:  backslash,  'N' (i = 0),
-		 * starting delimiter (i = 1), character number (i = 2).
-		 */
-
-		*word = wp + 2;
-		*sz = 0;
-
-		/*
-		 * Cannot use a digit as a starting delimiter;
-		 * but skip the digit anyway.
-		 */
-
-		if (isdigit((int)wp[1]))
-			return(2);
-
-		/*
-		 * Any non-digit terminates the character number.
-		 * That is, the terminating delimiter need not
-		 * match the starting delimiter.
-		 */
-
-		for (i = 2; isdigit((int)wp[i]); i++)
-			(*sz)++;
-
-		/*
-		 * This is only a numbered character
-		 * if the character number has at least one digit.
-		 */
-
-		if (*sz)
-			*d = DECO_NUMBERED;
-
-		/*
-		 * Skip the terminating delimiter, even if it does not
-		 * match, and even if there is no character number.
-		 */
-
-		return(++i);
-
-	case ('h'):
-		/* FALLTHROUGH */
-	case ('v'):
-		/* FALLTHROUGH */
-	case ('s'):
-		j = 0;
-		if ('+' == wp[i] || '-' == wp[i]) {
-			i++;
-			j = 1;
-		}
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
 			break;
-		case ('\''):
-			term = '\'';
-			break;
-		case ('0'):
-			j = 1;
-			/* FALLTHROUGH */
 		default:
-			i--;
-			lim = 1;
-			break;
-		}
-
-		if ('+' == wp[i] || '-' == wp[i]) {
-			if (j)
-				return(i);
-			i++;
-		} 
-
-		/* Handle embedded numerical subexp or escape. */
-
-		if ('(' == wp[i]) {
-			while (wp[i] && ')' != wp[i])
-				if ('\\' == wp[i++]) {
-					/* Handle embedded escape. */
-					*word = &wp[i];
-					i += a2roffdeco(&dd, word, sz);
-				}
-
-			if (')' == wp[i++])
-				break;
-
-			*d = DECO_NONE;
-			return(i - 1);
-		} else if ('\\' == wp[i]) {
-			*word = &wp[++i];
-			i += a2roffdeco(&dd, word, sz);
-		}
-
-		break;
-	case ('['):
-		*d = DECO_SPECIAL;
-		term = ']';
-		break;
-	case ('c'):
-		*d = DECO_NOSPACE;
-		return(i);
-	case ('z'):
-		*d = DECO_NONE;
-		if ('\\' == wp[i]) {
-			*word = &wp[++i];
-			return(i + a2roffdeco(&dd, word, sz));
-		} else
-			lim = 1;
-		break;
-	case ('o'):
-		/* FALLTHROUGH */
-	case ('w'):
-		if ('\'' == wp[i++]) {
-			term = '\'';
 			break;
-		} 
-		/* FALLTHROUGH */
-	default:
-		*d = DECO_SSPECIAL;
-		i--;
-		lim = 1;
-		break;
-	}
-
-	assert(term || lim);
-	*word = &wp[i];
-
-	if (term) {
-		j = i;
-		while (wp[i] && wp[i] != term)
-			i++;
-		if ('\0' == wp[i]) {
-			*d = DECO_NONE;
-			return(i);
 		}
 
-		assert(i >= j);
-		*sz = (size_t)(i - j);
-
-		return(i + 1);
-	}
-
-	assert(lim > 0);
-	*sz = (size_t)lim;
-
-	for (j = 0; wp[i] && j < lim; j++)
-		i++;
-	if (j < lim)
-		*d = DECO_NONE;
+	if (1 == *sz && DECO_SPECIAL == *d)
+		*d = 'c' == **word ? DECO_NOSPACE : DECO_SSPECIAL;
 
-	return(i);
+	return(ssz);
 }
 
 /*
Index: read.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/read.c,v
retrieving revision 1.11
diff -u -r1.11 read.c
--- read.c	4 Apr 2011 23:04:38 -0000	1.11
+++ read.c	8 Apr 2011 12:15:24 -0000
@@ -142,7 +142,7 @@
 	"tab in non-literal context",
 	"end of line whitespace",
 	"bad comment style",
-	"unknown escape sequence",
+	"bad escape sequence",
 	"unterminated quoted string",
 	
 	"generic error",
Index: term.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/term.c,v
retrieving revision 1.183
diff -u -r1.183 term.c
--- term.c	4 Apr 2011 21:14:12 -0000	1.183
+++ term.c	8 Apr 2011 12:15:24 -0000
@@ -457,6 +457,7 @@
 term_word(struct termp *p, const char *word)
 {
 	const char	*seq;
+	int		 sz;
 	size_t		 ssz;
 	enum roffdeco	 deco;
 
@@ -487,7 +488,9 @@
 			continue;
 
 		seq = ++word;
-		word += a2roffdeco(&deco, &seq, &ssz);
+		if (0 == (sz = a2roffdeco(&deco, &seq, &ssz)))
+			break;
+		word += sz;
 
 		switch (deco) {
 		case (DECO_NUMBERED):

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Unifying the escape-sequence parser.
  2011-04-08 12:16   ` Kristaps Dzonsons
@ 2011-04-08 12:50     ` Kristaps Dzonsons
  2011-04-08 13:15       ` Kristaps Dzonsons
  0 siblings, 1 reply; 6+ messages in thread
From: Kristaps Dzonsons @ 2011-04-08 12:50 UTC (permalink / raw)
  To: tech

[-- Attachment #1: Type: text/plain, Size: 1232 bytes --]

On 04/08/2011 02:16 PM, Kristaps Dzonsons wrote:
>> Step 2.
>>
>> This finishes off the new escape-sequence parser and puts it into
>> mandoc_escape (mandoc.c, mandoc.h), then makes it the underlying engine
>> for a2roffdeco (out.c) (requiring a tiny change to term.c and html.c for
>> bailing out on bad sequences) and roff_res (roff.c).
>>
>> Now all escape-sequences are being parsed with the same engine! This
>> logic was being repeated in THREE different places, earlier (mandoc.c
>> for validation, out.c for output, and roff.c for predefined escapes).
>>
>> I've run this over all manuals I know of without problems, but it can
>> really use a close look-over with border cases.
>>
>> The next step is to clean out the out.c code, completely removing enum
>> roffdeco (putting that logic into mandoc.c, perhaps).
>
> Step 2b. I rolled back the roff part: the search/replace of predefined
> strings must happen prior to escape processing. I'd also forgotten to
> include mdoc_validate.c in the patch.

Step 3: fixed where I forgot to for->while in mdoc_validate.c's 
check_text() loop; removed DECO_SSPECIAL; cleaned up DECO_NOSPACE; 
cleaned up print_encode() in html.c.  The next will be removing the DECO 
stuff entirely.

[-- Attachment #2: patch.escapes.txt --]
[-- Type: text/plain, Size: 23510 bytes --]

Index: html.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/html.c,v
retrieving revision 1.131
diff -u -r1.131 html.c
--- html.c	22 Mar 2011 14:05:45 -0000	1.131
+++ html.c	8 Apr 2011 12:47:36 -0000
@@ -230,7 +230,7 @@
 	if ((cp = chars_spec2cp(h->symtab, p, len)) > 0) {
 		printf("&#%d;", cp);
 		return;
-	} else if (-1 == cp && DECO_SSPECIAL == d) {
+	} else if (-1 == cp && 1 == len) {
 		fwrite(p, 1, len, stdout);
 		return;
 	} else if (-1 == cp)
@@ -304,40 +304,41 @@
 	int		 len, nospace;
 	const char	*seq;
 	enum roffdeco	 deco;
-	static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' };
+	static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH };
 
 	nospace = 0;
 
-	for (; *p; p++) {
+	while ('\0' != *p) {
 		sz = strcspn(p, rejs);
 
 		fwrite(p, 1, sz, stdout);
-		p += /* LINTED */
-			sz;
+		p += (int)sz;
 
-		if ('<' == *p) {
+		if ('\0' == *p)
+			break;
+
+		switch (*p++) {
+		case ('<'):
 			printf("&lt;");
 			continue;
-		} else if ('>' == *p) {
+		case ('>'):
 			printf("&gt;");
 			continue;
-		} else if ('&' == *p) {
+		case ('&'):
 			printf("&amp;");
 			continue;
-		} else if (ASCII_HYPH == *p) {
-			/*
-			 * Note: "soft hyphens" aren't graphically
-			 * displayed when not breaking the text; we want
-			 * them to be displayed.
-			 */
-			/*printf("&#173;");*/
+		case (ASCII_HYPH):
 			putchar('-');
 			continue;
-		} else if ('\0' == *p)
+		default:
+			break;
+		}
+
+		seq = p;
+		if (0 == (len = a2roffdeco(&deco, &seq, &sz)))
 			break;
 
-		seq = ++p;
-		len = a2roffdeco(&deco, &seq, &sz);
+		p += len;
 
 		switch (deco) {
 		case (DECO_NUMBERED):
@@ -346,8 +347,6 @@
 		case (DECO_RESERVED):
 			print_res(h, seq, sz);
 			break;
-		case (DECO_SSPECIAL):
-			/* FALLTHROUGH */
 		case (DECO_SPECIAL):
 			print_spec(h, deco, seq, sz);
 			break;
@@ -362,14 +361,13 @@
 				break;
 			print_metaf(h, deco);
 			break;
+		case (DECO_NOSPACE):
+			if ('\0' == *p)
+				nospace = 1;
+			break;
 		default:
 			break;
 		}
-
-		p += len - 1;
-
-		if (DECO_NOSPACE == deco && '\0' == *(p + 1))
-			nospace = 1;
 	}
 
 	return(nospace);
Index: libmandoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/libmandoc.h,v
retrieving revision 1.17
diff -u -r1.17 libmandoc.h
--- libmandoc.h	28 Mar 2011 23:52:13 -0000	1.17
+++ libmandoc.h	8 Apr 2011 12:47:37 -0000
@@ -73,7 +73,6 @@
 			int, int, const char *);
 void		 mandoc_vmsg(enum mandocerr, struct mparse *, 
 			int, int, const char *, ...);
-int		 mandoc_special(char *);
 char		*mandoc_strdup(const char *);
 char		*mandoc_getarg(struct mparse *, char **, int, int *);
 char		*mandoc_normdate(struct mparse *, char *, int, int);
Index: man_validate.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/man_validate.c,v
retrieving revision 1.67
diff -u -r1.67 man_validate.c
--- man_validate.c	22 Mar 2011 15:30:30 -0000	1.67
+++ man_validate.c	8 Apr 2011 12:47:37 -0000
@@ -54,7 +54,7 @@
 static	int	  check_part(CHKARGS);
 static	int	  check_root(CHKARGS);
 static	int	  check_sec(CHKARGS);
-static	int	  check_text(CHKARGS);
+static	void	  check_text(CHKARGS);
 
 static	int	  post_AT(CHKARGS);
 static	int	  post_fi(CHKARGS);
@@ -151,7 +151,8 @@
 
 	switch (m->last->type) {
 	case (MAN_TEXT): 
-		return(check_text(m, m->last));
+		check_text(m, m->last);
+		return(1);
 	case (MAN_ROOT):
 		return(check_root(m, m->last));
 	case (MAN_EQN):
@@ -204,43 +205,48 @@
 	return(1);
 }
 
-
-static int
+static void
 check_text(CHKARGS) 
 {
-	char		*p;
-	int		 pos, c;
+	char		*p, *pp, *cpp;
+	int		 pos;
 	size_t		 sz;
 
-	for (p = n->string, pos = n->pos + 1; *p; p++, pos++) {
-		sz = strcspn(p, "\t\\");
-		p += (int)sz;
+	p = n->string;
+	pos = n->pos + 1;
 
-		if ('\0' == *p)
-			break;
+	while ('\0' != *p) {
+		sz = strcspn(p, "\t\\");
 
+		p += (int)sz;
 		pos += (int)sz;
 
 		if ('\t' == *p) {
-			if (MAN_LITERAL & m->flags)
-				continue;
-			man_pmsg(m, n->line, pos, MANDOCERR_BADTAB);
+			if ( ! (MAN_LITERAL & m->flags))
+				man_pmsg(m, n->line, pos, MANDOCERR_BADTAB);
+			p++;
+			pos++;
 			continue;
-		}
+		} else if ('\0' == *p)
+			break;
 
-		/* Check the special character. */
+		pos++;
+		pp = ++p;
 
-		c = mandoc_special(p);
-		if (c) {
-			p += c - 1;
-			pos += c - 1;
-		} else
+		if (ESCAPE_ERROR == mandoc_escape
+				((const char **)&pp, NULL, NULL)) {
 			man_pmsg(m, n->line, pos, MANDOCERR_BADESCAPE);
-	}
+			break;
+		}
 
-	return(1);
-}
+		cpp = p;
+		while (NULL != (cpp = memchr(cpp, ASCII_HYPH, pp - cpp)))
+			*cpp = '-';
 
+		pos += pp - p;
+		p = pp;
+	}
+}
 
 #define	INEQ_DEFINE(x, ineq, name) \
 static int \
Index: mandoc.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.c,v
retrieving revision 1.44
diff -u -r1.44 mandoc.c
--- mandoc.c	28 Mar 2011 23:52:13 -0000	1.44
+++ mandoc.c	8 Apr 2011 12:47:37 -0000
@@ -35,198 +35,315 @@
 
 static	int	 a2time(time_t *, const char *, const char *);
 static	char	*time2a(time_t);
+static	int	 numescape(const char *);
 
-int
-mandoc_special(char *p)
+/*
+ * Pass over recursive numerical expressions.  This context of this
+ * function is important: it's only called within character-terminating
+ * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
+ * recursion: we don't care about what's in these blocks. 
+ * This returns the number of characters skipped or -1 if an error
+ * occurs (the caller should bail).
+ */
+static int
+numescape(const char *start)
 {
-	int		 len, i;
-	char		 term;
-	char		*sv;
-	
-	len = 0;
+	int		 i;
+	size_t		 sz;
+	const char	*cp;
+
+	i = 0;
+
+	/* The expression consists of a subexpression. */
+
+	if ('\\' == start[i]) {
+		cp = &start[++i];
+		/*
+		 * Read past the end of the subexpression.
+		 * Bail immediately on errors.
+		 */
+		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+			return(-1);
+		return(i + cp - &start[i]);
+	} 
+
+	if ('(' != start[i++])
+		return(0);
+
+	/*
+	 * A parenthesised subexpression.  Read until the closing
+	 * parenthesis, making sure to handle any nested subexpressions
+	 * that might ruin our parse.
+	 */
+
+	while (')' != start[i]) {
+		sz = strcspn(&start[i], ")\\");
+		i += (int)sz;
+
+		if ('\0' == start[i])
+			return(-1);
+		else if ('\\' != start[i])
+			continue;
+
+		cp = &start[++i];
+		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+			return(-1);
+		i += cp - &start[i];
+	}
+
+	/* Read past the terminating ')'. */
+	return(++i);
+}
+
+/*
+ * Handle an escaped sequeence.  This should be called with any
+ * string subsequent a `\'.  Pass a pointer to this substring as "end";
+ * it will be set to the supremum of the parsed escape sequence.  If
+ * this returns ESCAPE_ERROR, the string is bogus and should be thrown
+ * away.  If not ESCAPE_ERROR or ESCAPE_IGNORE, "start" is set to the
+ * first relevant character of the substring (font, glyph, whatever) of
+ * length sz.  Both "start" and "sz" may be NULL.
+ */
+enum mandoc_esc
+mandoc_escape(const char **end, const char **start, int *sz)
+{
+	char		 c, term, numeric;
+	int		 i, lim, ssz;
+	const char	*cp;
+	enum mandoc_esc	 gly; 
+
+	cp = *end;
+	if (start)
+		*start = cp;
+	i = 0;
+	gly = ESCAPE_ERROR;
 	term = '\0';
-	sv = p;
+	numeric = 0;
 
-	assert('\\' == *p);
-	p++;
+	switch ((c = cp[i++])) {
+	/*
+	 * First the glyphs.  There are several different forms of
+	 * these, but each eventually returns a substring of the glyph
+	 * name.
+	 */
+	case ('('):
+		gly = ESCAPE_SPECIAL;
+		lim = 2;
+		break;
+	case ('['):
+		gly = ESCAPE_SPECIAL;
+		term = ']';
+		break;
+	case ('C'):
+		if ('\'' != cp[i])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_SPECIAL;
+		term = '\'';
+		break;
 
-	switch (*p++) {
-#if 0
-	case ('Z'):
-		/* FALLTHROUGH */
-	case ('X'):
-		/* FALLTHROUGH */
-	case ('x'):
-		/* FALLTHROUGH */
-	case ('S'):
-		/* FALLTHROUGH */
-	case ('R'):
-		/* FALLTHROUGH */
-	case ('N'):
-		/* FALLTHROUGH */
-	case ('l'):
-		/* FALLTHROUGH */
-	case ('L'):
-		/* FALLTHROUGH */
-	case ('H'):
+	/*
+	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
+	 * 'X' is the trigger.  These have opaque sub-strings.
+	 */
+	case ('g'):
 		/* FALLTHROUGH */
-	case ('h'):
+	case ('k'):
 		/* FALLTHROUGH */
-	case ('D'):
+	case ('M'):
 		/* FALLTHROUGH */
-	case ('C'):
+	case ('m'):
 		/* FALLTHROUGH */
-	case ('b'):
+	case ('n'):
 		/* FALLTHROUGH */
-	case ('B'):
+	case ('V'):
 		/* FALLTHROUGH */
-	case ('a'):
+	case ('Y'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_IGNORE;
 		/* FALLTHROUGH */
-	case ('A'):
-		if (*p++ != '\'')
-			return(0);
-		term = '\'';
-		break;
-#endif
-	case ('h'):
+	case ('*'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_PREDEF;
 		/* FALLTHROUGH */
-	case ('v'):
+	case ('F'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_FONTFAM;
 		/* FALLTHROUGH */
-	case ('s'):
-		if (ASCII_HYPH == *p)
-			*p = '-';
+	case ('f'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_FONT;
 
-		i = 0;
-		if ('+' == *p || '-' == *p) {
-			p++;
-			i = 1;
-		}
+		if (start) 
+			*start = &cp[i];
 
-		switch (*p++) {
+		switch (cp[i++]) {
 		case ('('):
-			len = 2;
+			lim = 2;
 			break;
 		case ('['):
 			term = ']';
 			break;
-		case ('\''):
-			term = '\'';
-			break;
-		case ('0'):
-			i = 1;
-			/* FALLTHROUGH */
 		default:
-			len = 1;
-			p--;
+			lim = 1;
+			i--;
 			break;
 		}
+		break;
 
-		if (ASCII_HYPH == *p)
-			*p = '-';
-		if ('+' == *p || '-' == *p) {
-			if (i)
-				return(0);
-			p++;
-		} 
-		
-		/* Handle embedded numerical subexp or escape. */
-
-		if ('(' == *p) {
-			while (*p && ')' != *p)
-				if ('\\' == *p++) {
-					i = mandoc_special(--p);
-					if (0 == i)
-						return(0);
-					p += i;
-				}
-
-			if (')' == *p++)
-				break;
-
-			return(0);
-		} else if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
-		}
-
+	/*
+	 * These escapes are of the form \X'Y', where 'X' is the trigger
+	 * and 'Y' is any string.  These have opaque sub-strings.
+	 */
+	case ('A'):
+		/* FALLTHROUGH */
+	case ('b'):
+		/* FALLTHROUGH */
+	case ('D'):
+		/* FALLTHROUGH */
+	case ('o'):
+		/* FALLTHROUGH */
+	case ('R'):
+		/* FALLTHROUGH */
+	case ('X'):
+		/* FALLTHROUGH */
+	case ('Z'):
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_IGNORE;
+		term = '\'';
 		break;
-#if 0
-	case ('Y'):
+
+	/*
+	 * These escapes are of the form \X'N', where 'X' is the trigger
+	 * and 'N' resolves to a numerical expression.
+	 */
+	case ('B'):
 		/* FALLTHROUGH */
-	case ('V'):
+	case ('h'):
 		/* FALLTHROUGH */
-	case ('$'):
+	case ('H'):
 		/* FALLTHROUGH */
-	case ('n'):
+	case ('L'):
 		/* FALLTHROUGH */
-#endif
-	case ('k'):
+	case ('l'):
 		/* FALLTHROUGH */
-	case ('M'):
+	case ('N'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_NUMBERED;
 		/* FALLTHROUGH */
-	case ('m'):
+	case ('S'):
 		/* FALLTHROUGH */
-	case ('f'):
+	case ('v'):
 		/* FALLTHROUGH */
-	case ('F'):
+	case ('w'):
 		/* FALLTHROUGH */
-	case ('*'):
-		switch (*p++) {
+	case ('x'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_IGNORE;
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		term = numeric = '\'';
+		break;
+
+	/* 
+	 * Sizes get a special category of their own.
+	 */
+	case ('s'):
+		gly = ESCAPE_IGNORE;
+
+		if (start) 
+			*start = &cp[i];
+
+		/* See +/- counts as a sign. */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
+
+		switch (cp[i++]) {
 		case ('('):
-			len = 2;
+			lim = 2;
 			break;
 		case ('['):
-			term = ']';
+			term = numeric = ']';
+			break;
+		case ('\''):
+			term = numeric = '\'';
 			break;
 		default:
-			len = 1;
-			p--;
+			lim = 1;
+			i--;
 			break;
 		}
+
+		/* See +/- counts as a sign. */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
+
 		break;
-	case ('('):
-		len = 2;
-		break;
-	case ('['):
-		term = ']';
-		break;
-	case ('z'):
-		len = 1;
-		if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
-			return(*p ? (int)(p - sv) : 0);
-		}
-		break;
-	case ('o'):
-		/* FALLTHROUGH */
-	case ('w'):
-		if ('\'' == *p++) {
-			term = '\'';
-			break;
-		}
-		/* FALLTHROUGH */
+
+	/*
+	 * Anything else is assumed to be a glyph.
+	 */
 	default:
-		len = 1;
-		p--;
+		gly = ESCAPE_SPECIAL;
+		lim = 1;
+		i--;
 		break;
 	}
 
-	if (term) {
-		for ( ; *p && term != *p; p++)
-			if (ASCII_HYPH == *p)
-				*p = '-';
-		return(*p ? (int)(p - sv) : 0);
+	assert(ESCAPE_ERROR != gly);
+
+	if (start)
+		*start = &cp[i];
+
+	/*
+	 * If a terminating block has been specified, we need to
+	 * handle the case of recursion, which could have their
+	 * own terminating blocks that mess up our parse.  This, by the
+	 * way, means that the "start" and "size" values will be
+	 * effectively meaningless.
+	 */
+
+	ssz = 0;
+	if (numeric && -1 == (ssz = numescape(&cp[i])))
+		return(ESCAPE_ERROR);
+
+	i += ssz;
+
+	/*
+	 * We have a character terminator.  Try to read up to that
+	 * character.  If we can't (i.e., we hit the nil), then return
+	 * an error; if we can, calculate our length, read past the
+	 * terminating character, and exit.
+	 */
+
+	if ('\0' != term) {
+		*end = strchr(&cp[i], term);
+		if ('\0' == *end)
+			return(ESCAPE_ERROR);
+		if (sz)
+			*sz = *end - &cp[i];
+		(*end)++;
+		return(gly);
 	}
 
-	for (i = 0; *p && i < len; i++, p++)
-		if (ASCII_HYPH == *p)
-			*p = '-';
-	return(i == len ? (int)(p - sv) : 0);
-}
+	assert(lim > 0);
 
+	/*
+	 * We have a numeric limit.  If the string is shorter than that,
+	 * stop and return an error.  Else adjust our endpoint, length,
+	 * and return the current glyph.
+	 */
+
+	if ((size_t)lim > strlen(&cp[i]))
+		return(ESCAPE_ERROR);
+
+	if (sz)
+		*sz = lim;
+	*end = &cp[i] + lim;
+	return(gly);
+}
 
 void *
 mandoc_calloc(size_t num, size_t size)
Index: mandoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.h,v
retrieving revision 1.69
diff -u -r1.69 mandoc.h
--- mandoc.h	28 Mar 2011 21:49:42 -0000	1.69
+++ mandoc.h	8 Apr 2011 12:47:37 -0000
@@ -288,6 +288,16 @@
 	MPARSE_MAN /* assume -man */
 };
 
+enum	mandoc_esc {
+	ESCAPE_ERROR = 0,
+	ESCAPE_IGNORE, /* escape to be ignored */
+	ESCAPE_SPECIAL, /* a regular special character */
+	ESCAPE_PREDEF, /* a predefined special character */
+	ESCAPE_FONT, /* a font mode */
+	ESCAPE_FONTFAM, /* a font family */
+	ESCAPE_NUMBERED /* a numbered glyph */
+};
+
 typedef	void	(*mandocmsg)(enum mandocerr, enum mandoclevel,
 			const char *, int, int, const char *);
 
@@ -309,6 +319,8 @@
 void		 *mandoc_calloc(size_t, size_t);
 void		 *mandoc_malloc(size_t);
 void		 *mandoc_realloc(void *, size_t);
+
+enum mandoc_esc	  mandoc_escape(const char **, const char **, int *);
 
 __END_DECLS
 
Index: mdoc_validate.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mdoc_validate.c,v
retrieving revision 1.166
diff -u -r1.166 mdoc_validate.c
--- mdoc_validate.c	3 Apr 2011 09:53:50 -0000	1.166
+++ mdoc_validate.c	8 Apr 2011 12:47:37 -0000
@@ -545,31 +545,39 @@
 static void
 check_text(struct mdoc *m, int ln, int pos, char *p)
 {
-	int		 c;
+	char		*cpp, *pp;
 	size_t		 sz;
 
-	for ( ; *p; p++, pos++) {
+	while ('\0' != *p) {
 		sz = strcspn(p, "\t\\");
-		p += (int)sz;
-
-		if ('\0' == *p)
-			break;
 
+		p += (int)sz;
 		pos += (int)sz;
 
 		if ('\t' == *p) {
 			if ( ! (MDOC_LITERAL & m->flags))
 				mdoc_pmsg(m, ln, pos, MANDOCERR_BADTAB);
+			p++;
+			pos++;
 			continue;
-		}
+		} else if ('\0' == *p)
+			break;
+
+		pos++;
+		pp = ++p;
 
-		if (0 == (c = mandoc_special(p))) {
+		if (ESCAPE_ERROR == mandoc_escape
+				((const char **)&pp, NULL, NULL)) {
 			mdoc_pmsg(m, ln, pos, MANDOCERR_BADESCAPE);
-			continue;
+			break;
 		}
 
-		p += c - 1;
-		pos += c - 1;
+		cpp = p;
+		while (NULL != (cpp = memchr(cpp, ASCII_HYPH, pp - cpp)))
+			*cpp = '-';
+
+		pos += pp - p;
+		p = pp;
 	}
 }
 
Index: out.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/out.c,v
retrieving revision 1.39
diff -u -r1.39 out.c
--- out.c	17 Mar 2011 08:49:34 -0000	1.39
+++ out.c	8 Apr 2011 12:47:37 -0000
@@ -178,237 +178,70 @@
 int
 a2roffdeco(enum roffdeco *d, const char **word, size_t *sz)
 {
-	int		 i, j, lim;
-	char		 term, c;
-	const char	*wp;
-	enum roffdeco	 dd;
+	const char	*cp, *start;
+	int		 ssz;
+	enum mandoc_esc	 esc;
 
 	*d = DECO_NONE;
-	lim = i = 0;
-	term = '\0';
-	wp = *word;
 
-	switch ((c = wp[i++])) {
-	case ('('):
+	cp = start = *word;
+
+	esc = mandoc_escape(&cp, word, &ssz);
+
+	switch (esc) {
+	case (ESCAPE_ERROR):
+		return(0);
+	case (ESCAPE_IGNORE):
+		break;
+	case (ESCAPE_NUMBERED):
+		*d = DECO_NUMBERED;
+		break;
+	case (ESCAPE_FONT):
+		*d = DECO_FONT;
+		break;
+	case (ESCAPE_FONTFAM):
+		*d = DECO_FFONT;
+		break;
+	case (ESCAPE_SPECIAL):
 		*d = DECO_SPECIAL;
-		lim = 2;
 		break;
-	case ('F'):
-		/* FALLTHROUGH */
-	case ('f'):
-		*d = 'F' == c ? DECO_FFONT : DECO_FONT;
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
+	case (ESCAPE_PREDEF):
+		*d = DECO_RESERVED;
+		break;
+	}
+
+	assert(ssz >= 0);
+	*sz = (size_t)ssz;
+	ssz = cp - start;
+
+	if (1 == *sz && (DECO_FONT == *d || DECO_FFONT == *d))
+		switch (**word) {
 		case ('3'):
 			/* FALLTHROUGH */
 		case ('B'):
 			*d = DECO_BOLD;
-			return(i);
+			break;
 		case ('2'):
 			/* FALLTHROUGH */
 		case ('I'):
 			*d = DECO_ITALIC;
-			return(i);
+			break;
 		case ('P'):
 			*d = DECO_PREVIOUS;
-			return(i);
+			break;
 		case ('1'):
 			/* FALLTHROUGH */
 		case ('R'):
 			*d = DECO_ROMAN;
-			return(i);
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-		break;
-	case ('k'):
-		/* FALLTHROUGH */
-	case ('M'):
-		/* FALLTHROUGH */
-	case ('m'):
-		/* FALLTHROUGH */
-	case ('*'):
-		if ('*' == c)
-			*d = DECO_RESERVED;
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-		break;
-
-	case ('N'):
-
-		/*
-		 * Sequence of characters:  backslash,  'N' (i = 0),
-		 * starting delimiter (i = 1), character number (i = 2).
-		 */
-
-		*word = wp + 2;
-		*sz = 0;
-
-		/*
-		 * Cannot use a digit as a starting delimiter;
-		 * but skip the digit anyway.
-		 */
-
-		if (isdigit((int)wp[1]))
-			return(2);
-
-		/*
-		 * Any non-digit terminates the character number.
-		 * That is, the terminating delimiter need not
-		 * match the starting delimiter.
-		 */
-
-		for (i = 2; isdigit((int)wp[i]); i++)
-			(*sz)++;
-
-		/*
-		 * This is only a numbered character
-		 * if the character number has at least one digit.
-		 */
-
-		if (*sz)
-			*d = DECO_NUMBERED;
-
-		/*
-		 * Skip the terminating delimiter, even if it does not
-		 * match, and even if there is no character number.
-		 */
-
-		return(++i);
-
-	case ('h'):
-		/* FALLTHROUGH */
-	case ('v'):
-		/* FALLTHROUGH */
-	case ('s'):
-		j = 0;
-		if ('+' == wp[i] || '-' == wp[i]) {
-			i++;
-			j = 1;
-		}
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
 			break;
-		case ('\''):
-			term = '\'';
-			break;
-		case ('0'):
-			j = 1;
-			/* FALLTHROUGH */
 		default:
-			i--;
-			lim = 1;
 			break;
 		}
 
-		if ('+' == wp[i] || '-' == wp[i]) {
-			if (j)
-				return(i);
-			i++;
-		} 
-
-		/* Handle embedded numerical subexp or escape. */
-
-		if ('(' == wp[i]) {
-			while (wp[i] && ')' != wp[i])
-				if ('\\' == wp[i++]) {
-					/* Handle embedded escape. */
-					*word = &wp[i];
-					i += a2roffdeco(&dd, word, sz);
-				}
-
-			if (')' == wp[i++])
-				break;
-
-			*d = DECO_NONE;
-			return(i - 1);
-		} else if ('\\' == wp[i]) {
-			*word = &wp[++i];
-			i += a2roffdeco(&dd, word, sz);
-		}
-
-		break;
-	case ('['):
-		*d = DECO_SPECIAL;
-		term = ']';
-		break;
-	case ('c'):
+	if (1 == *sz && DECO_SPECIAL == *d && 'c' == **word)
 		*d = DECO_NOSPACE;
-		return(i);
-	case ('z'):
-		*d = DECO_NONE;
-		if ('\\' == wp[i]) {
-			*word = &wp[++i];
-			return(i + a2roffdeco(&dd, word, sz));
-		} else
-			lim = 1;
-		break;
-	case ('o'):
-		/* FALLTHROUGH */
-	case ('w'):
-		if ('\'' == wp[i++]) {
-			term = '\'';
-			break;
-		} 
-		/* FALLTHROUGH */
-	default:
-		*d = DECO_SSPECIAL;
-		i--;
-		lim = 1;
-		break;
-	}
-
-	assert(term || lim);
-	*word = &wp[i];
-
-	if (term) {
-		j = i;
-		while (wp[i] && wp[i] != term)
-			i++;
-		if ('\0' == wp[i]) {
-			*d = DECO_NONE;
-			return(i);
-		}
-
-		assert(i >= j);
-		*sz = (size_t)(i - j);
-
-		return(i + 1);
-	}
-
-	assert(lim > 0);
-	*sz = (size_t)lim;
-
-	for (j = 0; wp[i] && j < lim; j++)
-		i++;
-	if (j < lim)
-		*d = DECO_NONE;
 
-	return(i);
+	return(ssz);
 }
 
 /*
Index: out.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/out.h,v
retrieving revision 1.18
diff -u -r1.18 out.h
--- out.h	22 Mar 2011 10:13:01 -0000	1.18
+++ out.h	8 Apr 2011 12:47:37 -0000
@@ -35,7 +35,6 @@
 	DECO_NONE,
 	DECO_NUMBERED, /* numbered character */
 	DECO_SPECIAL, /* special character */
-	DECO_SSPECIAL, /* single-char special */
 	DECO_RESERVED, /* reserved word */
 	DECO_BOLD, /* bold font */
 	DECO_ITALIC, /* italic font */
Index: read.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/read.c,v
retrieving revision 1.11
diff -u -r1.11 read.c
--- read.c	4 Apr 2011 23:04:38 -0000	1.11
+++ read.c	8 Apr 2011 12:47:37 -0000
@@ -142,7 +142,7 @@
 	"tab in non-literal context",
 	"end of line whitespace",
 	"bad comment style",
-	"unknown escape sequence",
+	"bad escape sequence",
 	"unterminated quoted string",
 	
 	"generic error",
Index: term.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/term.c,v
retrieving revision 1.183
diff -u -r1.183 term.c
--- term.c	4 Apr 2011 21:14:12 -0000	1.183
+++ term.c	8 Apr 2011 12:47:37 -0000
@@ -366,7 +366,7 @@
 	rhs = chars_spec2str(p->symtab, word, len, &sz);
 	if (rhs) 
 		encode(p, rhs, sz);
-	else if (DECO_SSPECIAL == d)
+	else if (1 == len)
 		encode(p, word, len);
 }
 
@@ -457,6 +457,7 @@
 term_word(struct termp *p, const char *word)
 {
 	const char	*seq;
+	int		 sz;
 	size_t		 ssz;
 	enum roffdeco	 deco;
 
@@ -487,7 +488,9 @@
 			continue;
 
 		seq = ++word;
-		word += a2roffdeco(&deco, &seq, &ssz);
+		if (0 == (sz = a2roffdeco(&deco, &seq, &ssz)))
+			break;
+		word += sz;
 
 		switch (deco) {
 		case (DECO_NUMBERED):
@@ -497,8 +500,6 @@
 			res(p, seq, ssz);
 			break;
 		case (DECO_SPECIAL):
-			/* FALLTHROUGH */
-		case (DECO_SSPECIAL):
 			spec(p, deco, seq, ssz);
 			break;
 		case (DECO_BOLD):
@@ -513,12 +514,13 @@
 		case (DECO_PREVIOUS):
 			term_fontlast(p);
 			break;
+		case (DECO_NOSPACE):
+			if ('\0' == *word)
+				p->flags |= TERMP_NOSPACE;
+			break;
 		default:
 			break;
 		}
-
-		if (DECO_NOSPACE == deco && '\0' == *word)
-			p->flags |= TERMP_NOSPACE;
 	}
 }
 
@@ -620,13 +622,11 @@
 					(p->symtab, seq, ssz, &rsz);
 				break;
 			case (DECO_SPECIAL):
-				/* FALLTHROUGH */
-			case (DECO_SSPECIAL):
 				rhs = chars_spec2str
 					(p->symtab, seq, ssz, &rsz);
 
 				/* Allow for one-char escapes. */
-				if (DECO_SSPECIAL != d || rhs)
+				if (ssz != 1 || rhs)
 					break;
 
 				rhs = seq;

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Unifying the escape-sequence parser.
  2011-04-08 12:50     ` Kristaps Dzonsons
@ 2011-04-08 13:15       ` Kristaps Dzonsons
  2011-04-08 13:56         ` Finished: unifying " Kristaps Dzonsons
  0 siblings, 1 reply; 6+ messages in thread
From: Kristaps Dzonsons @ 2011-04-08 13:15 UTC (permalink / raw)
  To: tech

[-- Attachment #1: Type: text/plain, Size: 795 bytes --]

> Step 3: fixed where I forgot to for->while in mdoc_validate.c's
> check_text() loop; removed DECO_SSPECIAL; cleaned up DECO_NOSPACE;
> cleaned up print_encode() in html.c. The next will be removing the DECO
> stuff entirely.

Ok (step 4?), one last patch to show full fidelity between DECO and 
ESCAPE before I rip out DECO stuff.  This also comes with a fix of a 
yet-unnoticed bug: \F escapes were being treated as \f escapes in terms 
of recognising styles (bold, italic, etc.).  However, in the groff 
manual, \F accepts families (Times, Helvetica, etc.) while \f accepts 
styles (duh).  I've actually completely removed the FONTFAMILY notion, 
as we don't really support it (it can be put in later, if necessary, but 
it's unlikely as it can't be reliably displayed across output media).

[-- Attachment #2: patch.escapes.txt --]
[-- Type: text/plain, Size: 24678 bytes --]

? chat.8
? config.h
? config.log
? foo.1
? foo.1.html
? foo.3
? foo.ps
? ksh.1
? man.txt
? mandoc
? mandoc-db.bak.c
? mandoc.db
? mandoc.index
? manuals.txt
? patch.escapes.txt
? patch.mdoc.txt
? patch.roff.txt
? patch.txt
Index: html.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/html.c,v
retrieving revision 1.131
diff -u -r1.131 html.c
--- html.c	22 Mar 2011 14:05:45 -0000	1.131
+++ html.c	8 Apr 2011 13:14:21 -0000
@@ -230,7 +230,7 @@
 	if ((cp = chars_spec2cp(h->symtab, p, len)) > 0) {
 		printf("&#%d;", cp);
 		return;
-	} else if (-1 == cp && DECO_SSPECIAL == d) {
+	} else if (-1 == cp && 1 == len) {
 		fwrite(p, 1, len, stdout);
 		return;
 	} else if (-1 == cp)
@@ -304,40 +304,41 @@
 	int		 len, nospace;
 	const char	*seq;
 	enum roffdeco	 deco;
-	static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' };
+	static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH };
 
 	nospace = 0;
 
-	for (; *p; p++) {
+	while ('\0' != *p) {
 		sz = strcspn(p, rejs);
 
 		fwrite(p, 1, sz, stdout);
-		p += /* LINTED */
-			sz;
+		p += (int)sz;
 
-		if ('<' == *p) {
+		if ('\0' == *p)
+			break;
+
+		switch (*p++) {
+		case ('<'):
 			printf("&lt;");
 			continue;
-		} else if ('>' == *p) {
+		case ('>'):
 			printf("&gt;");
 			continue;
-		} else if ('&' == *p) {
+		case ('&'):
 			printf("&amp;");
 			continue;
-		} else if (ASCII_HYPH == *p) {
-			/*
-			 * Note: "soft hyphens" aren't graphically
-			 * displayed when not breaking the text; we want
-			 * them to be displayed.
-			 */
-			/*printf("&#173;");*/
+		case (ASCII_HYPH):
 			putchar('-');
 			continue;
-		} else if ('\0' == *p)
+		default:
+			break;
+		}
+
+		seq = p;
+		if (0 == (len = a2roffdeco(&deco, &seq, &sz)))
 			break;
 
-		seq = ++p;
-		len = a2roffdeco(&deco, &seq, &sz);
+		p += len;
 
 		switch (deco) {
 		case (DECO_NUMBERED):
@@ -346,8 +347,6 @@
 		case (DECO_RESERVED):
 			print_res(h, seq, sz);
 			break;
-		case (DECO_SSPECIAL):
-			/* FALLTHROUGH */
 		case (DECO_SPECIAL):
 			print_spec(h, deco, seq, sz);
 			break;
@@ -362,14 +361,13 @@
 				break;
 			print_metaf(h, deco);
 			break;
+		case (DECO_NOSPACE):
+			if ('\0' == *p)
+				nospace = 1;
+			break;
 		default:
 			break;
 		}
-
-		p += len - 1;
-
-		if (DECO_NOSPACE == deco && '\0' == *(p + 1))
-			nospace = 1;
 	}
 
 	return(nospace);
Index: libmandoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/libmandoc.h,v
retrieving revision 1.17
diff -u -r1.17 libmandoc.h
--- libmandoc.h	28 Mar 2011 23:52:13 -0000	1.17
+++ libmandoc.h	8 Apr 2011 13:14:21 -0000
@@ -73,7 +73,6 @@
 			int, int, const char *);
 void		 mandoc_vmsg(enum mandocerr, struct mparse *, 
 			int, int, const char *, ...);
-int		 mandoc_special(char *);
 char		*mandoc_strdup(const char *);
 char		*mandoc_getarg(struct mparse *, char **, int, int *);
 char		*mandoc_normdate(struct mparse *, char *, int, int);
Index: man_validate.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/man_validate.c,v
retrieving revision 1.67
diff -u -r1.67 man_validate.c
--- man_validate.c	22 Mar 2011 15:30:30 -0000	1.67
+++ man_validate.c	8 Apr 2011 13:14:21 -0000
@@ -54,7 +54,7 @@
 static	int	  check_part(CHKARGS);
 static	int	  check_root(CHKARGS);
 static	int	  check_sec(CHKARGS);
-static	int	  check_text(CHKARGS);
+static	void	  check_text(CHKARGS);
 
 static	int	  post_AT(CHKARGS);
 static	int	  post_fi(CHKARGS);
@@ -151,7 +151,8 @@
 
 	switch (m->last->type) {
 	case (MAN_TEXT): 
-		return(check_text(m, m->last));
+		check_text(m, m->last);
+		return(1);
 	case (MAN_ROOT):
 		return(check_root(m, m->last));
 	case (MAN_EQN):
@@ -204,43 +205,48 @@
 	return(1);
 }
 
-
-static int
+static void
 check_text(CHKARGS) 
 {
-	char		*p;
-	int		 pos, c;
+	char		*p, *pp, *cpp;
+	int		 pos;
 	size_t		 sz;
 
-	for (p = n->string, pos = n->pos + 1; *p; p++, pos++) {
-		sz = strcspn(p, "\t\\");
-		p += (int)sz;
+	p = n->string;
+	pos = n->pos + 1;
 
-		if ('\0' == *p)
-			break;
+	while ('\0' != *p) {
+		sz = strcspn(p, "\t\\");
 
+		p += (int)sz;
 		pos += (int)sz;
 
 		if ('\t' == *p) {
-			if (MAN_LITERAL & m->flags)
-				continue;
-			man_pmsg(m, n->line, pos, MANDOCERR_BADTAB);
+			if ( ! (MAN_LITERAL & m->flags))
+				man_pmsg(m, n->line, pos, MANDOCERR_BADTAB);
+			p++;
+			pos++;
 			continue;
-		}
+		} else if ('\0' == *p)
+			break;
 
-		/* Check the special character. */
+		pos++;
+		pp = ++p;
 
-		c = mandoc_special(p);
-		if (c) {
-			p += c - 1;
-			pos += c - 1;
-		} else
+		if (ESCAPE_ERROR == mandoc_escape
+				((const char **)&pp, NULL, NULL)) {
 			man_pmsg(m, n->line, pos, MANDOCERR_BADESCAPE);
-	}
+			break;
+		}
 
-	return(1);
-}
+		cpp = p;
+		while (NULL != (cpp = memchr(cpp, ASCII_HYPH, pp - cpp)))
+			*cpp = '-';
 
+		pos += pp - p;
+		p = pp;
+	}
+}
 
 #define	INEQ_DEFINE(x, ineq, name) \
 static int \
Index: mandoc.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.c,v
retrieving revision 1.44
diff -u -r1.44 mandoc.c
--- mandoc.c	28 Mar 2011 23:52:13 -0000	1.44
+++ mandoc.c	8 Apr 2011 13:14:22 -0000
@@ -35,198 +35,362 @@
 
 static	int	 a2time(time_t *, const char *, const char *);
 static	char	*time2a(time_t);
+static	int	 numescape(const char *);
 
-int
-mandoc_special(char *p)
+/*
+ * Pass over recursive numerical expressions.  This context of this
+ * function is important: it's only called within character-terminating
+ * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
+ * recursion: we don't care about what's in these blocks. 
+ * This returns the number of characters skipped or -1 if an error
+ * occurs (the caller should bail).
+ */
+static int
+numescape(const char *start)
 {
-	int		 len, i;
-	char		 term;
-	char		*sv;
-	
-	len = 0;
+	int		 i;
+	size_t		 sz;
+	const char	*cp;
+
+	i = 0;
+
+	/* The expression consists of a subexpression. */
+
+	if ('\\' == start[i]) {
+		cp = &start[++i];
+		/*
+		 * Read past the end of the subexpression.
+		 * Bail immediately on errors.
+		 */
+		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+			return(-1);
+		return(i + cp - &start[i]);
+	} 
+
+	if ('(' != start[i++])
+		return(0);
+
+	/*
+	 * A parenthesised subexpression.  Read until the closing
+	 * parenthesis, making sure to handle any nested subexpressions
+	 * that might ruin our parse.
+	 */
+
+	while (')' != start[i]) {
+		sz = strcspn(&start[i], ")\\");
+		i += (int)sz;
+
+		if ('\0' == start[i])
+			return(-1);
+		else if ('\\' != start[i])
+			continue;
+
+		cp = &start[++i];
+		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+			return(-1);
+		i += cp - &start[i];
+	}
+
+	/* Read past the terminating ')'. */
+	return(++i);
+}
+
+/*
+ * Handle an escaped sequeence.  This should be called with any
+ * string subsequent a `\'.  Pass a pointer to this substring as "end";
+ * it will be set to the supremum of the parsed escape sequence.  If
+ * this returns ESCAPE_ERROR, the string is bogus and should be thrown
+ * away.  If not ESCAPE_ERROR or ESCAPE_IGNORE, "start" is set to the
+ * first relevant character of the substring (font, glyph, whatever) of
+ * length sz.  Both "start" and "sz" may be NULL.
+ */
+enum mandoc_esc
+mandoc_escape(const char **end, const char **start, int *sz)
+{
+	char		 c, term, numeric;
+	int		 i, lim, ssz, rlim;
+	const char	*cp, *rstart;
+	enum mandoc_esc	 gly; 
+
+	cp = *end;
+	rstart = cp;
+	if (start)
+		*start = rstart;
+	i = 0;
+	gly = ESCAPE_ERROR;
 	term = '\0';
-	sv = p;
+	numeric = 0;
 
-	assert('\\' == *p);
-	p++;
+	switch ((c = cp[i++])) {
+	/*
+	 * First the glyphs.  There are several different forms of
+	 * these, but each eventually returns a substring of the glyph
+	 * name.
+	 */
+	case ('('):
+		gly = ESCAPE_SPECIAL;
+		lim = 2;
+		break;
+	case ('['):
+		gly = ESCAPE_SPECIAL;
+		term = ']';
+		break;
+	case ('C'):
+		if ('\'' != cp[i])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_SPECIAL;
+		term = '\'';
+		break;
 
-	switch (*p++) {
-#if 0
-	case ('Z'):
-		/* FALLTHROUGH */
-	case ('X'):
-		/* FALLTHROUGH */
-	case ('x'):
-		/* FALLTHROUGH */
-	case ('S'):
-		/* FALLTHROUGH */
-	case ('R'):
-		/* FALLTHROUGH */
-	case ('N'):
-		/* FALLTHROUGH */
-	case ('l'):
-		/* FALLTHROUGH */
-	case ('L'):
-		/* FALLTHROUGH */
-	case ('H'):
+	/*
+	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
+	 * 'X' is the trigger.  These have opaque sub-strings.
+	 */
+	case ('F'):
 		/* FALLTHROUGH */
-	case ('h'):
+	case ('g'):
 		/* FALLTHROUGH */
-	case ('D'):
+	case ('k'):
 		/* FALLTHROUGH */
-	case ('C'):
+	case ('M'):
 		/* FALLTHROUGH */
-	case ('b'):
+	case ('m'):
 		/* FALLTHROUGH */
-	case ('B'):
+	case ('n'):
 		/* FALLTHROUGH */
-	case ('a'):
+	case ('V'):
 		/* FALLTHROUGH */
-	case ('A'):
-		if (*p++ != '\'')
-			return(0);
-		term = '\'';
-		break;
-#endif
-	case ('h'):
+	case ('Y'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_IGNORE;
 		/* FALLTHROUGH */
-	case ('v'):
+	case ('*'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_PREDEF;
 		/* FALLTHROUGH */
-	case ('s'):
-		if (ASCII_HYPH == *p)
-			*p = '-';
+	case ('f'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_FONT;
 
-		i = 0;
-		if ('+' == *p || '-' == *p) {
-			p++;
-			i = 1;
-		}
+		rstart= &cp[i];
+		if (start) 
+			*start = rstart;
 
-		switch (*p++) {
+		switch (cp[i++]) {
 		case ('('):
-			len = 2;
+			lim = 2;
 			break;
 		case ('['):
 			term = ']';
 			break;
-		case ('\''):
-			term = '\'';
-			break;
-		case ('0'):
-			i = 1;
-			/* FALLTHROUGH */
 		default:
-			len = 1;
-			p--;
+			lim = 1;
+			i--;
 			break;
 		}
+		break;
 
-		if (ASCII_HYPH == *p)
-			*p = '-';
-		if ('+' == *p || '-' == *p) {
-			if (i)
-				return(0);
-			p++;
-		} 
-		
-		/* Handle embedded numerical subexp or escape. */
-
-		if ('(' == *p) {
-			while (*p && ')' != *p)
-				if ('\\' == *p++) {
-					i = mandoc_special(--p);
-					if (0 == i)
-						return(0);
-					p += i;
-				}
-
-			if (')' == *p++)
-				break;
-
-			return(0);
-		} else if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
-		}
-
+	/*
+	 * These escapes are of the form \X'Y', where 'X' is the trigger
+	 * and 'Y' is any string.  These have opaque sub-strings.
+	 */
+	case ('A'):
+		/* FALLTHROUGH */
+	case ('b'):
+		/* FALLTHROUGH */
+	case ('D'):
+		/* FALLTHROUGH */
+	case ('o'):
+		/* FALLTHROUGH */
+	case ('R'):
+		/* FALLTHROUGH */
+	case ('X'):
+		/* FALLTHROUGH */
+	case ('Z'):
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_IGNORE;
+		term = '\'';
 		break;
-#if 0
-	case ('Y'):
+
+	/*
+	 * These escapes are of the form \X'N', where 'X' is the trigger
+	 * and 'N' resolves to a numerical expression.
+	 */
+	case ('B'):
 		/* FALLTHROUGH */
-	case ('V'):
+	case ('h'):
 		/* FALLTHROUGH */
-	case ('$'):
+	case ('H'):
 		/* FALLTHROUGH */
-	case ('n'):
+	case ('L'):
 		/* FALLTHROUGH */
-#endif
-	case ('k'):
+	case ('l'):
 		/* FALLTHROUGH */
-	case ('M'):
+	case ('N'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_NUMBERED;
 		/* FALLTHROUGH */
-	case ('m'):
+	case ('S'):
 		/* FALLTHROUGH */
-	case ('f'):
+	case ('v'):
 		/* FALLTHROUGH */
-	case ('F'):
+	case ('w'):
 		/* FALLTHROUGH */
-	case ('*'):
-		switch (*p++) {
+	case ('x'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_IGNORE;
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		term = numeric = '\'';
+		break;
+
+	/* 
+	 * Sizes get a special category of their own.
+	 */
+	case ('s'):
+		gly = ESCAPE_IGNORE;
+
+		rstart = &cp[i];
+		if (start) 
+			*start = rstart;
+
+		/* See +/- counts as a sign. */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
+
+		switch (cp[i++]) {
 		case ('('):
-			len = 2;
+			lim = 2;
 			break;
 		case ('['):
-			term = ']';
+			term = numeric = ']';
+			break;
+		case ('\''):
+			term = numeric = '\'';
 			break;
 		default:
-			len = 1;
-			p--;
+			lim = 1;
+			i--;
 			break;
 		}
+
+		/* See +/- counts as a sign. */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
+
 		break;
-	case ('('):
-		len = 2;
-		break;
-	case ('['):
-		term = ']';
-		break;
-	case ('z'):
-		len = 1;
-		if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
-			return(*p ? (int)(p - sv) : 0);
-		}
+
+	/*
+	 * Anything else is assumed to be a glyph.
+	 */
+	default:
+		gly = ESCAPE_SPECIAL;
+		lim = 1;
+		i--;
 		break;
-	case ('o'):
-		/* FALLTHROUGH */
-	case ('w'):
-		if ('\'' == *p++) {
-			term = '\'';
+	}
+
+	assert(ESCAPE_ERROR != gly);
+
+	rstart = &cp[i];
+	if (start)
+		*start = rstart;
+
+	/*
+	 * If a terminating block has been specified, we need to
+	 * handle the case of recursion, which could have their
+	 * own terminating blocks that mess up our parse.  This, by the
+	 * way, means that the "start" and "size" values will be
+	 * effectively meaningless.
+	 */
+
+	ssz = 0;
+	if (numeric && -1 == (ssz = numescape(&cp[i])))
+		return(ESCAPE_ERROR);
+
+	i += ssz;
+	rlim = -1;
+
+	/*
+	 * We have a character terminator.  Try to read up to that
+	 * character.  If we can't (i.e., we hit the nil), then return
+	 * an error; if we can, calculate our length, read past the
+	 * terminating character, and exit.
+	 */
+
+	if ('\0' != term) {
+		*end = strchr(&cp[i], term);
+		if ('\0' == *end)
+			return(ESCAPE_ERROR);
+
+		rlim = *end - &cp[i];
+		if (sz)
+			*sz = rlim;
+		(*end)++;
+		goto out;
+	}
+
+	assert(lim > 0);
+
+	/*
+	 * We have a numeric limit.  If the string is shorter than that,
+	 * stop and return an error.  Else adjust our endpoint, length,
+	 * and return the current glyph.
+	 */
+
+	if ((size_t)lim > strlen(&cp[i]))
+		return(ESCAPE_ERROR);
+
+	rlim = lim;
+	if (sz)
+		*sz = rlim;
+
+	*end = &cp[i] + lim;
+
+out:
+	assert(rlim >= 0 && rstart);
+
+	/* Run post-processors. */
+
+	switch (gly) {
+	case (ESCAPE_FONT):
+		if (1 != rlim)
+			break;
+		switch (*rstart) {
+		case ('3'):
+			/* FALLTHROUGH */
+		case ('B'):
+			gly = ESCAPE_FONTBOLD;
+			break;
+		case ('2'):
+			/* FALLTHROUGH */
+		case ('I'):
+			gly = ESCAPE_FONTITALIC;
+			break;
+		case ('P'):
+			gly = ESCAPE_FONTPREV;
+			break;
+		case ('1'):
+			/* FALLTHROUGH */
+		case ('R'):
+			gly = ESCAPE_FONTROMAN;
 			break;
 		}
-		/* FALLTHROUGH */
+	case (ESCAPE_SPECIAL):
+		if (1 != rlim)
+			break;
+		if ('c' == *rstart)
+			gly = ESCAPE_NOSPACE;
+		break;
 	default:
-		len = 1;
-		p--;
 		break;
 	}
 
-	if (term) {
-		for ( ; *p && term != *p; p++)
-			if (ASCII_HYPH == *p)
-				*p = '-';
-		return(*p ? (int)(p - sv) : 0);
-	}
-
-	for (i = 0; *p && i < len; i++, p++)
-		if (ASCII_HYPH == *p)
-			*p = '-';
-	return(i == len ? (int)(p - sv) : 0);
+	return(gly);
 }
-
 
 void *
 mandoc_calloc(size_t num, size_t size)
Index: mandoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.h,v
retrieving revision 1.69
diff -u -r1.69 mandoc.h
--- mandoc.h	28 Mar 2011 21:49:42 -0000	1.69
+++ mandoc.h	8 Apr 2011 13:14:22 -0000
@@ -288,6 +288,20 @@
 	MPARSE_MAN /* assume -man */
 };
 
+enum	mandoc_esc {
+	ESCAPE_ERROR = 0,
+	ESCAPE_IGNORE, /* escape to be ignored */
+	ESCAPE_SPECIAL, /* a regular special character */
+	ESCAPE_PREDEF, /* a predefined special character */
+	ESCAPE_FONT, /* a font mode */
+	ESCAPE_FONTBOLD,
+	ESCAPE_FONTITALIC,
+	ESCAPE_FONTROMAN,
+	ESCAPE_FONTPREV,
+	ESCAPE_NUMBERED, /* a numbered glyph */
+	ESCAPE_NOSPACE
+};
+
 typedef	void	(*mandocmsg)(enum mandocerr, enum mandoclevel,
 			const char *, int, int, const char *);
 
@@ -309,6 +323,8 @@
 void		 *mandoc_calloc(size_t, size_t);
 void		 *mandoc_malloc(size_t);
 void		 *mandoc_realloc(void *, size_t);
+
+enum mandoc_esc	  mandoc_escape(const char **, const char **, int *);
 
 __END_DECLS
 
Index: mdoc_validate.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mdoc_validate.c,v
retrieving revision 1.166
diff -u -r1.166 mdoc_validate.c
--- mdoc_validate.c	3 Apr 2011 09:53:50 -0000	1.166
+++ mdoc_validate.c	8 Apr 2011 13:14:22 -0000
@@ -545,31 +545,39 @@
 static void
 check_text(struct mdoc *m, int ln, int pos, char *p)
 {
-	int		 c;
+	char		*cpp, *pp;
 	size_t		 sz;
 
-	for ( ; *p; p++, pos++) {
+	while ('\0' != *p) {
 		sz = strcspn(p, "\t\\");
-		p += (int)sz;
-
-		if ('\0' == *p)
-			break;
 
+		p += (int)sz;
 		pos += (int)sz;
 
 		if ('\t' == *p) {
 			if ( ! (MDOC_LITERAL & m->flags))
 				mdoc_pmsg(m, ln, pos, MANDOCERR_BADTAB);
+			p++;
+			pos++;
 			continue;
-		}
+		} else if ('\0' == *p)
+			break;
+
+		pos++;
+		pp = ++p;
 
-		if (0 == (c = mandoc_special(p))) {
+		if (ESCAPE_ERROR == mandoc_escape
+				((const char **)&pp, NULL, NULL)) {
 			mdoc_pmsg(m, ln, pos, MANDOCERR_BADESCAPE);
-			continue;
+			break;
 		}
 
-		p += c - 1;
-		pos += c - 1;
+		cpp = p;
+		while (NULL != (cpp = memchr(cpp, ASCII_HYPH, pp - cpp)))
+			*cpp = '-';
+
+		pos += pp - p;
+		p = pp;
 	}
 }
 
Index: out.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/out.c,v
retrieving revision 1.39
diff -u -r1.39 out.c
--- out.c	17 Mar 2011 08:49:34 -0000	1.39
+++ out.c	8 Apr 2011 13:14:22 -0000
@@ -178,237 +178,55 @@
 int
 a2roffdeco(enum roffdeco *d, const char **word, size_t *sz)
 {
-	int		 i, j, lim;
-	char		 term, c;
-	const char	*wp;
-	enum roffdeco	 dd;
+	const char	*cp, *start;
+	int		 ssz;
+	enum mandoc_esc	 esc;
 
 	*d = DECO_NONE;
-	lim = i = 0;
-	term = '\0';
-	wp = *word;
 
-	switch ((c = wp[i++])) {
-	case ('('):
-		*d = DECO_SPECIAL;
-		lim = 2;
-		break;
-	case ('F'):
-		/* FALLTHROUGH */
-	case ('f'):
-		*d = 'F' == c ? DECO_FFONT : DECO_FONT;
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
-		case ('3'):
-			/* FALLTHROUGH */
-		case ('B'):
-			*d = DECO_BOLD;
-			return(i);
-		case ('2'):
-			/* FALLTHROUGH */
-		case ('I'):
-			*d = DECO_ITALIC;
-			return(i);
-		case ('P'):
-			*d = DECO_PREVIOUS;
-			return(i);
-		case ('1'):
-			/* FALLTHROUGH */
-		case ('R'):
-			*d = DECO_ROMAN;
-			return(i);
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-		break;
-	case ('k'):
-		/* FALLTHROUGH */
-	case ('M'):
-		/* FALLTHROUGH */
-	case ('m'):
-		/* FALLTHROUGH */
-	case ('*'):
-		if ('*' == c)
-			*d = DECO_RESERVED;
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-		break;
-
-	case ('N'):
-
-		/*
-		 * Sequence of characters:  backslash,  'N' (i = 0),
-		 * starting delimiter (i = 1), character number (i = 2).
-		 */
-
-		*word = wp + 2;
-		*sz = 0;
-
-		/*
-		 * Cannot use a digit as a starting delimiter;
-		 * but skip the digit anyway.
-		 */
-
-		if (isdigit((int)wp[1]))
-			return(2);
-
-		/*
-		 * Any non-digit terminates the character number.
-		 * That is, the terminating delimiter need not
-		 * match the starting delimiter.
-		 */
-
-		for (i = 2; isdigit((int)wp[i]); i++)
-			(*sz)++;
-
-		/*
-		 * This is only a numbered character
-		 * if the character number has at least one digit.
-		 */
-
-		if (*sz)
-			*d = DECO_NUMBERED;
-
-		/*
-		 * Skip the terminating delimiter, even if it does not
-		 * match, and even if there is no character number.
-		 */
-
-		return(++i);
-
-	case ('h'):
-		/* FALLTHROUGH */
-	case ('v'):
-		/* FALLTHROUGH */
-	case ('s'):
-		j = 0;
-		if ('+' == wp[i] || '-' == wp[i]) {
-			i++;
-			j = 1;
-		}
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
-		case ('\''):
-			term = '\'';
-			break;
-		case ('0'):
-			j = 1;
-			/* FALLTHROUGH */
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-
-		if ('+' == wp[i] || '-' == wp[i]) {
-			if (j)
-				return(i);
-			i++;
-		} 
-
-		/* Handle embedded numerical subexp or escape. */
-
-		if ('(' == wp[i]) {
-			while (wp[i] && ')' != wp[i])
-				if ('\\' == wp[i++]) {
-					/* Handle embedded escape. */
-					*word = &wp[i];
-					i += a2roffdeco(&dd, word, sz);
-				}
-
-			if (')' == wp[i++])
-				break;
-
-			*d = DECO_NONE;
-			return(i - 1);
-		} else if ('\\' == wp[i]) {
-			*word = &wp[++i];
-			i += a2roffdeco(&dd, word, sz);
-		}
+	cp = start = *word;
 
+	esc = mandoc_escape(&cp, word, &ssz);
+
+	switch (esc) {
+	case (ESCAPE_ERROR):
+		return(0);
+	case (ESCAPE_IGNORE):
+		break;
+	case (ESCAPE_NUMBERED):
+		*d = DECO_NUMBERED;
 		break;
-	case ('['):
+	case (ESCAPE_FONT):
+		*d = DECO_FONT;
+		break;
+	case (ESCAPE_SPECIAL):
 		*d = DECO_SPECIAL;
-		term = ']';
 		break;
-	case ('c'):
+	case (ESCAPE_PREDEF):
+		*d = DECO_RESERVED;
+		break;
+	case (ESCAPE_FONTBOLD):
+		*d = DECO_BOLD;
+		break;
+	case (ESCAPE_FONTITALIC):
+		*d = DECO_ITALIC;
+		break;
+	case (ESCAPE_FONTROMAN):
+		*d = DECO_ROMAN;
+		break;
+	case (ESCAPE_FONTPREV):
+		*d = DECO_PREVIOUS;
+		break;
+	case (ESCAPE_NOSPACE):
 		*d = DECO_NOSPACE;
-		return(i);
-	case ('z'):
-		*d = DECO_NONE;
-		if ('\\' == wp[i]) {
-			*word = &wp[++i];
-			return(i + a2roffdeco(&dd, word, sz));
-		} else
-			lim = 1;
-		break;
-	case ('o'):
-		/* FALLTHROUGH */
-	case ('w'):
-		if ('\'' == wp[i++]) {
-			term = '\'';
-			break;
-		} 
-		/* FALLTHROUGH */
-	default:
-		*d = DECO_SSPECIAL;
-		i--;
-		lim = 1;
 		break;
 	}
 
-	assert(term || lim);
-	*word = &wp[i];
-
-	if (term) {
-		j = i;
-		while (wp[i] && wp[i] != term)
-			i++;
-		if ('\0' == wp[i]) {
-			*d = DECO_NONE;
-			return(i);
-		}
-
-		assert(i >= j);
-		*sz = (size_t)(i - j);
-
-		return(i + 1);
-	}
-
-	assert(lim > 0);
-	*sz = (size_t)lim;
-
-	for (j = 0; wp[i] && j < lim; j++)
-		i++;
-	if (j < lim)
-		*d = DECO_NONE;
+	assert(ssz >= 0);
+	*sz = (size_t)ssz;
+	ssz = cp - start;
 
-	return(i);
+	return(ssz);
 }
 
 /*
Index: out.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/out.h,v
retrieving revision 1.18
diff -u -r1.18 out.h
--- out.h	22 Mar 2011 10:13:01 -0000	1.18
+++ out.h	8 Apr 2011 13:14:22 -0000
@@ -35,7 +35,6 @@
 	DECO_NONE,
 	DECO_NUMBERED, /* numbered character */
 	DECO_SPECIAL, /* special character */
-	DECO_SSPECIAL, /* single-char special */
 	DECO_RESERVED, /* reserved word */
 	DECO_BOLD, /* bold font */
 	DECO_ITALIC, /* italic font */
@@ -43,7 +42,6 @@
 	DECO_PREVIOUS, /* revert to previous font */
 	DECO_NOSPACE, /* suppress spacing */
 	DECO_FONT, /* font */
-	DECO_FFONT, /* font family */
 	DECO_MAX
 };
 
Index: read.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/read.c,v
retrieving revision 1.11
diff -u -r1.11 read.c
--- read.c	4 Apr 2011 23:04:38 -0000	1.11
+++ read.c	8 Apr 2011 13:14:22 -0000
@@ -142,7 +142,7 @@
 	"tab in non-literal context",
 	"end of line whitespace",
 	"bad comment style",
-	"unknown escape sequence",
+	"bad escape sequence",
 	"unterminated quoted string",
 	
 	"generic error",
Index: term.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/term.c,v
retrieving revision 1.183
diff -u -r1.183 term.c
--- term.c	4 Apr 2011 21:14:12 -0000	1.183
+++ term.c	8 Apr 2011 13:14:22 -0000
@@ -366,7 +366,7 @@
 	rhs = chars_spec2str(p->symtab, word, len, &sz);
 	if (rhs) 
 		encode(p, rhs, sz);
-	else if (DECO_SSPECIAL == d)
+	else if (1 == len)
 		encode(p, word, len);
 }
 
@@ -457,6 +457,7 @@
 term_word(struct termp *p, const char *word)
 {
 	const char	*seq;
+	int		 sz;
 	size_t		 ssz;
 	enum roffdeco	 deco;
 
@@ -487,7 +488,9 @@
 			continue;
 
 		seq = ++word;
-		word += a2roffdeco(&deco, &seq, &ssz);
+		if (0 == (sz = a2roffdeco(&deco, &seq, &ssz)))
+			break;
+		word += sz;
 
 		switch (deco) {
 		case (DECO_NUMBERED):
@@ -497,8 +500,6 @@
 			res(p, seq, ssz);
 			break;
 		case (DECO_SPECIAL):
-			/* FALLTHROUGH */
-		case (DECO_SSPECIAL):
 			spec(p, deco, seq, ssz);
 			break;
 		case (DECO_BOLD):
@@ -513,12 +514,13 @@
 		case (DECO_PREVIOUS):
 			term_fontlast(p);
 			break;
+		case (DECO_NOSPACE):
+			if ('\0' == *word)
+				p->flags |= TERMP_NOSPACE;
+			break;
 		default:
 			break;
 		}
-
-		if (DECO_NOSPACE == deco && '\0' == *word)
-			p->flags |= TERMP_NOSPACE;
 	}
 }
 
@@ -620,13 +622,11 @@
 					(p->symtab, seq, ssz, &rsz);
 				break;
 			case (DECO_SPECIAL):
-				/* FALLTHROUGH */
-			case (DECO_SSPECIAL):
 				rhs = chars_spec2str
 					(p->symtab, seq, ssz, &rsz);
 
 				/* Allow for one-char escapes. */
-				if (DECO_SSPECIAL != d || rhs)
+				if (ssz != 1 || rhs)
 					break;
 
 				rhs = seq;

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Finished: unifying the escape-sequence parser.
  2011-04-08 13:15       ` Kristaps Dzonsons
@ 2011-04-08 13:56         ` Kristaps Dzonsons
  0 siblings, 0 replies; 6+ messages in thread
From: Kristaps Dzonsons @ 2011-04-08 13:56 UTC (permalink / raw)
  To: tech

[-- Attachment #1: Type: text/plain, Size: 1290 bytes --]

>> Step 3: fixed where I forgot to for->while in mdoc_validate.c's
>> check_text() loop; removed DECO_SSPECIAL; cleaned up DECO_NOSPACE;
>> cleaned up print_encode() in html.c. The next will be removing the DECO
>> stuff entirely.
>
> Ok (step 4?), one last patch to show full fidelity between DECO and
> ESCAPE before I rip out DECO stuff. This also comes with a fix of a
> yet-unnoticed bug: \F escapes were being treated as \f escapes in terms
> of recognising styles (bold, italic, etc.). However, in the groff
> manual, \F accepts families (Times, Helvetica, etc.) while \f accepts
> styles (duh). I've actually completely removed the FONTFAMILY notion, as
> we don't really support it (it can be put in later, if necessary, but
> it's unlikely as it can't be reliably displayed across output media).

Hi,

Here's the finished patch: enum roffdeco has been completely removed and 
all components are dipping into mandoc_escape() to get the correct 
subsequences.

This has resulted in some clean-up throughout the code, as all the mess 
of escapes is focussed in one place.  It's also given me a chance to 
clean up other areas in the code that danced around string handling 
(e.g., print_encode() in html.c, term_strlen() and term_word() in 
term.c, etc.).

Oks?  Comments?

Kristaps

[-- Attachment #2: patch.escapes.txt --]
[-- Type: text/plain, Size: 30273 bytes --]

? chat.8
? config.h
? config.log
? eqn.7.html
? eqn.7.pdf
? eqn.7.ps
? eqn.7.txt
? eqn.7.xhtml
? foo.1
? foo.1.html
? foo.3
? foo.ps
? index.html
? ksh.1
? man.7.html
? man.7.pdf
? man.7.ps
? man.7.txt
? man.7.xhtml
? man.h.html
? man.txt
? mandoc
? mandoc-db.bak.c
? mandoc.1.html
? mandoc.1.pdf
? mandoc.1.ps
? mandoc.1.txt
? mandoc.1.xhtml
? mandoc.3.html
? mandoc.3.pdf
? mandoc.3.ps
? mandoc.3.txt
? mandoc.3.xhtml
? mandoc.db
? mandoc.h.html
? mandoc.index
? mandoc_char.7.html
? mandoc_char.7.pdf
? mandoc_char.7.ps
? mandoc_char.7.txt
? mandoc_char.7.xhtml
? manuals.txt
? mdoc.7.html
? mdoc.7.pdf
? mdoc.7.ps
? mdoc.7.txt
? mdoc.7.xhtml
? mdoc.h.html
? mdocml.md5
? mdocml.tar.gz
? patch.escapes.txt
? patch.mdoc.txt
? patch.roff.txt
? patch.txt
? roff.7.html
? roff.7.pdf
? roff.7.ps
? roff.7.txt
? roff.7.xhtml
? tbl.7.html
? tbl.7.pdf
? tbl.7.ps
? tbl.7.txt
? tbl.7.xhtml
Index: html.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/html.c,v
retrieving revision 1.131
diff -u -r1.131 html.c
--- html.c	22 Mar 2011 14:05:45 -0000	1.131
+++ html.c	8 Apr 2011 13:54:20 -0000
@@ -94,14 +94,13 @@
 };
 
 static	void		  print_num(struct html *, const char *, size_t);
-static	void		  print_spec(struct html *, enum roffdeco,
-				const char *, size_t);
+static	void		  print_spec(struct html *, const char *, size_t);
 static	void		  print_res(struct html *, const char *, size_t);
 static	void		  print_ctag(struct html *, enum htmltag);
 static	void		  print_doctype(struct html *);
 static	void		  print_xmltype(struct html *);
 static	int		  print_encode(struct html *, const char *, int);
-static	void		  print_metaf(struct html *, enum roffdeco);
+static	void		  print_metaf(struct html *, enum mandoc_esc);
 static	void		  print_attr(struct html *, 
 				const char *, const char *);
 static	void		 *ml_alloc(char *, enum htmltype);
@@ -221,7 +220,7 @@
 }
 
 static void
-print_spec(struct html *h, enum roffdeco d, const char *p, size_t len)
+print_spec(struct html *h, const char *p, size_t len)
 {
 	int		 cp;
 	const char	*rhs;
@@ -230,7 +229,7 @@
 	if ((cp = chars_spec2cp(h->symtab, p, len)) > 0) {
 		printf("&#%d;", cp);
 		return;
-	} else if (-1 == cp && DECO_SSPECIAL == d) {
+	} else if (-1 == cp && 1 == len) {
 		fwrite(p, 1, len, stdout);
 		return;
 	} else if (-1 == cp)
@@ -260,21 +259,21 @@
 
 
 static void
-print_metaf(struct html *h, enum roffdeco deco)
+print_metaf(struct html *h, enum mandoc_esc deco)
 {
 	enum htmlfont	 font;
 
 	switch (deco) {
-	case (DECO_PREVIOUS):
+	case (ESCAPE_FONTPREV):
 		font = h->metal;
 		break;
-	case (DECO_ITALIC):
+	case (ESCAPE_FONTITALIC):
 		font = HTMLFONT_ITALIC;
 		break;
-	case (DECO_BOLD):
+	case (ESCAPE_FONTBOLD):
 		font = HTMLFONT_BOLD;
 		break;
-	case (DECO_ROMAN):
+	case (ESCAPE_FONTROMAN):
 		font = HTMLFONT_NONE;
 		break;
 	default:
@@ -303,73 +302,69 @@
 	size_t		 sz;
 	int		 len, nospace;
 	const char	*seq;
-	enum roffdeco	 deco;
+	enum mandoc_esc	 esc;
 	static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' };
 
 	nospace = 0;
 
-	for (; *p; p++) {
+	while ('\0' != *p) {
 		sz = strcspn(p, rejs);
 
 		fwrite(p, 1, sz, stdout);
-		p += /* LINTED */
-			sz;
+		p += (int)sz;
 
-		if ('<' == *p) {
+		if ('\0' == *p)
+			break;
+
+		switch (*p++) {
+		case ('<'):
 			printf("&lt;");
 			continue;
-		} else if ('>' == *p) {
+		case ('>'):
 			printf("&gt;");
 			continue;
-		} else if ('&' == *p) {
+		case ('&'):
 			printf("&amp;");
 			continue;
-		} else if (ASCII_HYPH == *p) {
-			/*
-			 * Note: "soft hyphens" aren't graphically
-			 * displayed when not breaking the text; we want
-			 * them to be displayed.
-			 */
-			/*printf("&#173;");*/
+		case (ASCII_HYPH):
 			putchar('-');
 			continue;
-		} else if ('\0' == *p)
+		default:
 			break;
+		}
 
-		seq = ++p;
-		len = a2roffdeco(&deco, &seq, &sz);
+		esc = mandoc_escape(&p, &seq, &len);
+		if (ESCAPE_ERROR == esc)
+			break;
 
-		switch (deco) {
-		case (DECO_NUMBERED):
-			print_num(h, seq, sz);
+		switch (esc) {
+		case (ESCAPE_NUMBERED):
+			print_num(h, seq, len);
 			break;
-		case (DECO_RESERVED):
-			print_res(h, seq, sz);
+		case (ESCAPE_PREDEF):
+			print_res(h, seq, len);
 			break;
-		case (DECO_SSPECIAL):
-			/* FALLTHROUGH */
-		case (DECO_SPECIAL):
-			print_spec(h, deco, seq, sz);
+		case (ESCAPE_SPECIAL):
+			print_spec(h, seq, len);
 			break;
-		case (DECO_PREVIOUS):
+		case (ESCAPE_FONTPREV):
 			/* FALLTHROUGH */
-		case (DECO_BOLD):
+		case (ESCAPE_FONTBOLD):
 			/* FALLTHROUGH */
-		case (DECO_ITALIC):
+		case (ESCAPE_FONTITALIC):
 			/* FALLTHROUGH */
-		case (DECO_ROMAN):
+		case (ESCAPE_FONTROMAN):
 			if (norecurse)
 				break;
-			print_metaf(h, deco);
+			print_metaf(h, esc);
+			break;
+		case (ESCAPE_NOSPACE):
+			if ('\0' == *p)
+				nospace = 1;
 			break;
 		default:
 			break;
 		}
-
-		p += len - 1;
-
-		if (DECO_NOSPACE == deco && '\0' == *(p + 1))
-			nospace = 1;
 	}
 
 	return(nospace);
Index: libmandoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/libmandoc.h,v
retrieving revision 1.17
diff -u -r1.17 libmandoc.h
--- libmandoc.h	28 Mar 2011 23:52:13 -0000	1.17
+++ libmandoc.h	8 Apr 2011 13:54:20 -0000
@@ -73,7 +73,6 @@
 			int, int, const char *);
 void		 mandoc_vmsg(enum mandocerr, struct mparse *, 
 			int, int, const char *, ...);
-int		 mandoc_special(char *);
 char		*mandoc_strdup(const char *);
 char		*mandoc_getarg(struct mparse *, char **, int, int *);
 char		*mandoc_normdate(struct mparse *, char *, int, int);
Index: man_validate.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/man_validate.c,v
retrieving revision 1.67
diff -u -r1.67 man_validate.c
--- man_validate.c	22 Mar 2011 15:30:30 -0000	1.67
+++ man_validate.c	8 Apr 2011 13:54:20 -0000
@@ -54,7 +54,7 @@
 static	int	  check_part(CHKARGS);
 static	int	  check_root(CHKARGS);
 static	int	  check_sec(CHKARGS);
-static	int	  check_text(CHKARGS);
+static	void	  check_text(CHKARGS);
 
 static	int	  post_AT(CHKARGS);
 static	int	  post_fi(CHKARGS);
@@ -151,7 +151,8 @@
 
 	switch (m->last->type) {
 	case (MAN_TEXT): 
-		return(check_text(m, m->last));
+		check_text(m, m->last);
+		return(1);
 	case (MAN_ROOT):
 		return(check_root(m, m->last));
 	case (MAN_EQN):
@@ -204,43 +205,48 @@
 	return(1);
 }
 
-
-static int
+static void
 check_text(CHKARGS) 
 {
-	char		*p;
-	int		 pos, c;
+	char		*p, *pp, *cpp;
+	int		 pos;
 	size_t		 sz;
 
-	for (p = n->string, pos = n->pos + 1; *p; p++, pos++) {
-		sz = strcspn(p, "\t\\");
-		p += (int)sz;
+	p = n->string;
+	pos = n->pos + 1;
 
-		if ('\0' == *p)
-			break;
+	while ('\0' != *p) {
+		sz = strcspn(p, "\t\\");
 
+		p += (int)sz;
 		pos += (int)sz;
 
 		if ('\t' == *p) {
-			if (MAN_LITERAL & m->flags)
-				continue;
-			man_pmsg(m, n->line, pos, MANDOCERR_BADTAB);
+			if ( ! (MAN_LITERAL & m->flags))
+				man_pmsg(m, n->line, pos, MANDOCERR_BADTAB);
+			p++;
+			pos++;
 			continue;
-		}
+		} else if ('\0' == *p)
+			break;
 
-		/* Check the special character. */
+		pos++;
+		pp = ++p;
 
-		c = mandoc_special(p);
-		if (c) {
-			p += c - 1;
-			pos += c - 1;
-		} else
+		if (ESCAPE_ERROR == mandoc_escape
+				((const char **)&pp, NULL, NULL)) {
 			man_pmsg(m, n->line, pos, MANDOCERR_BADESCAPE);
-	}
+			break;
+		}
 
-	return(1);
-}
+		cpp = p;
+		while (NULL != (cpp = memchr(cpp, ASCII_HYPH, pp - cpp)))
+			*cpp = '-';
 
+		pos += pp - p;
+		p = pp;
+	}
+}
 
 #define	INEQ_DEFINE(x, ineq, name) \
 static int \
Index: mandoc.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.c,v
retrieving revision 1.44
diff -u -r1.44 mandoc.c
--- mandoc.c	28 Mar 2011 23:52:13 -0000	1.44
+++ mandoc.c	8 Apr 2011 13:54:20 -0000
@@ -35,198 +35,362 @@
 
 static	int	 a2time(time_t *, const char *, const char *);
 static	char	*time2a(time_t);
+static	int	 numescape(const char *);
 
-int
-mandoc_special(char *p)
+/*
+ * Pass over recursive numerical expressions.  This context of this
+ * function is important: it's only called within character-terminating
+ * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
+ * recursion: we don't care about what's in these blocks. 
+ * This returns the number of characters skipped or -1 if an error
+ * occurs (the caller should bail).
+ */
+static int
+numescape(const char *start)
 {
-	int		 len, i;
-	char		 term;
-	char		*sv;
-	
-	len = 0;
+	int		 i;
+	size_t		 sz;
+	const char	*cp;
+
+	i = 0;
+
+	/* The expression consists of a subexpression. */
+
+	if ('\\' == start[i]) {
+		cp = &start[++i];
+		/*
+		 * Read past the end of the subexpression.
+		 * Bail immediately on errors.
+		 */
+		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+			return(-1);
+		return(i + cp - &start[i]);
+	} 
+
+	if ('(' != start[i++])
+		return(0);
+
+	/*
+	 * A parenthesised subexpression.  Read until the closing
+	 * parenthesis, making sure to handle any nested subexpressions
+	 * that might ruin our parse.
+	 */
+
+	while (')' != start[i]) {
+		sz = strcspn(&start[i], ")\\");
+		i += (int)sz;
+
+		if ('\0' == start[i])
+			return(-1);
+		else if ('\\' != start[i])
+			continue;
+
+		cp = &start[++i];
+		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+			return(-1);
+		i += cp - &start[i];
+	}
+
+	/* Read past the terminating ')'. */
+	return(++i);
+}
+
+/*
+ * Handle an escaped sequeence.  This should be called with any
+ * string subsequent a `\'.  Pass a pointer to this substring as "end";
+ * it will be set to the supremum of the parsed escape sequence.  If
+ * this returns ESCAPE_ERROR, the string is bogus and should be thrown
+ * away.  If not ESCAPE_ERROR or ESCAPE_IGNORE, "start" is set to the
+ * first relevant character of the substring (font, glyph, whatever) of
+ * length sz.  Both "start" and "sz" may be NULL.
+ */
+enum mandoc_esc
+mandoc_escape(const char **end, const char **start, int *sz)
+{
+	char		 c, term, numeric;
+	int		 i, lim, ssz, rlim;
+	const char	*cp, *rstart;
+	enum mandoc_esc	 gly; 
+
+	cp = *end;
+	rstart = cp;
+	if (start)
+		*start = rstart;
+	i = 0;
+	gly = ESCAPE_ERROR;
 	term = '\0';
-	sv = p;
+	numeric = 0;
 
-	assert('\\' == *p);
-	p++;
+	switch ((c = cp[i++])) {
+	/*
+	 * First the glyphs.  There are several different forms of
+	 * these, but each eventually returns a substring of the glyph
+	 * name.
+	 */
+	case ('('):
+		gly = ESCAPE_SPECIAL;
+		lim = 2;
+		break;
+	case ('['):
+		gly = ESCAPE_SPECIAL;
+		term = ']';
+		break;
+	case ('C'):
+		if ('\'' != cp[i])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_SPECIAL;
+		term = '\'';
+		break;
 
-	switch (*p++) {
-#if 0
-	case ('Z'):
-		/* FALLTHROUGH */
-	case ('X'):
-		/* FALLTHROUGH */
-	case ('x'):
-		/* FALLTHROUGH */
-	case ('S'):
-		/* FALLTHROUGH */
-	case ('R'):
-		/* FALLTHROUGH */
-	case ('N'):
-		/* FALLTHROUGH */
-	case ('l'):
-		/* FALLTHROUGH */
-	case ('L'):
-		/* FALLTHROUGH */
-	case ('H'):
+	/*
+	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
+	 * 'X' is the trigger.  These have opaque sub-strings.
+	 */
+	case ('F'):
 		/* FALLTHROUGH */
-	case ('h'):
+	case ('g'):
 		/* FALLTHROUGH */
-	case ('D'):
+	case ('k'):
 		/* FALLTHROUGH */
-	case ('C'):
+	case ('M'):
 		/* FALLTHROUGH */
-	case ('b'):
+	case ('m'):
 		/* FALLTHROUGH */
-	case ('B'):
+	case ('n'):
 		/* FALLTHROUGH */
-	case ('a'):
+	case ('V'):
 		/* FALLTHROUGH */
-	case ('A'):
-		if (*p++ != '\'')
-			return(0);
-		term = '\'';
-		break;
-#endif
-	case ('h'):
+	case ('Y'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_IGNORE;
 		/* FALLTHROUGH */
-	case ('v'):
+	case ('*'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_PREDEF;
 		/* FALLTHROUGH */
-	case ('s'):
-		if (ASCII_HYPH == *p)
-			*p = '-';
+	case ('f'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_FONT;
 
-		i = 0;
-		if ('+' == *p || '-' == *p) {
-			p++;
-			i = 1;
-		}
+		rstart= &cp[i];
+		if (start) 
+			*start = rstart;
 
-		switch (*p++) {
+		switch (cp[i++]) {
 		case ('('):
-			len = 2;
+			lim = 2;
 			break;
 		case ('['):
 			term = ']';
 			break;
-		case ('\''):
-			term = '\'';
-			break;
-		case ('0'):
-			i = 1;
-			/* FALLTHROUGH */
 		default:
-			len = 1;
-			p--;
+			lim = 1;
+			i--;
 			break;
 		}
+		break;
 
-		if (ASCII_HYPH == *p)
-			*p = '-';
-		if ('+' == *p || '-' == *p) {
-			if (i)
-				return(0);
-			p++;
-		} 
-		
-		/* Handle embedded numerical subexp or escape. */
-
-		if ('(' == *p) {
-			while (*p && ')' != *p)
-				if ('\\' == *p++) {
-					i = mandoc_special(--p);
-					if (0 == i)
-						return(0);
-					p += i;
-				}
-
-			if (')' == *p++)
-				break;
-
-			return(0);
-		} else if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
-		}
-
+	/*
+	 * These escapes are of the form \X'Y', where 'X' is the trigger
+	 * and 'Y' is any string.  These have opaque sub-strings.
+	 */
+	case ('A'):
+		/* FALLTHROUGH */
+	case ('b'):
+		/* FALLTHROUGH */
+	case ('D'):
+		/* FALLTHROUGH */
+	case ('o'):
+		/* FALLTHROUGH */
+	case ('R'):
+		/* FALLTHROUGH */
+	case ('X'):
+		/* FALLTHROUGH */
+	case ('Z'):
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_IGNORE;
+		term = '\'';
 		break;
-#if 0
-	case ('Y'):
+
+	/*
+	 * These escapes are of the form \X'N', where 'X' is the trigger
+	 * and 'N' resolves to a numerical expression.
+	 */
+	case ('B'):
 		/* FALLTHROUGH */
-	case ('V'):
+	case ('h'):
 		/* FALLTHROUGH */
-	case ('$'):
+	case ('H'):
 		/* FALLTHROUGH */
-	case ('n'):
+	case ('L'):
 		/* FALLTHROUGH */
-#endif
-	case ('k'):
+	case ('l'):
 		/* FALLTHROUGH */
-	case ('M'):
+	case ('N'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_NUMBERED;
 		/* FALLTHROUGH */
-	case ('m'):
+	case ('S'):
 		/* FALLTHROUGH */
-	case ('f'):
+	case ('v'):
 		/* FALLTHROUGH */
-	case ('F'):
+	case ('w'):
 		/* FALLTHROUGH */
-	case ('*'):
-		switch (*p++) {
+	case ('x'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_IGNORE;
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		term = numeric = '\'';
+		break;
+
+	/* 
+	 * Sizes get a special category of their own.
+	 */
+	case ('s'):
+		gly = ESCAPE_IGNORE;
+
+		rstart = &cp[i];
+		if (start) 
+			*start = rstart;
+
+		/* See +/- counts as a sign. */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
+
+		switch (cp[i++]) {
 		case ('('):
-			len = 2;
+			lim = 2;
 			break;
 		case ('['):
-			term = ']';
+			term = numeric = ']';
+			break;
+		case ('\''):
+			term = numeric = '\'';
 			break;
 		default:
-			len = 1;
-			p--;
+			lim = 1;
+			i--;
 			break;
 		}
+
+		/* See +/- counts as a sign. */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
+
 		break;
-	case ('('):
-		len = 2;
-		break;
-	case ('['):
-		term = ']';
-		break;
-	case ('z'):
-		len = 1;
-		if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
-			return(*p ? (int)(p - sv) : 0);
-		}
+
+	/*
+	 * Anything else is assumed to be a glyph.
+	 */
+	default:
+		gly = ESCAPE_SPECIAL;
+		lim = 1;
+		i--;
 		break;
-	case ('o'):
-		/* FALLTHROUGH */
-	case ('w'):
-		if ('\'' == *p++) {
-			term = '\'';
+	}
+
+	assert(ESCAPE_ERROR != gly);
+
+	rstart = &cp[i];
+	if (start)
+		*start = rstart;
+
+	/*
+	 * If a terminating block has been specified, we need to
+	 * handle the case of recursion, which could have their
+	 * own terminating blocks that mess up our parse.  This, by the
+	 * way, means that the "start" and "size" values will be
+	 * effectively meaningless.
+	 */
+
+	ssz = 0;
+	if (numeric && -1 == (ssz = numescape(&cp[i])))
+		return(ESCAPE_ERROR);
+
+	i += ssz;
+	rlim = -1;
+
+	/*
+	 * We have a character terminator.  Try to read up to that
+	 * character.  If we can't (i.e., we hit the nil), then return
+	 * an error; if we can, calculate our length, read past the
+	 * terminating character, and exit.
+	 */
+
+	if ('\0' != term) {
+		*end = strchr(&cp[i], term);
+		if ('\0' == *end)
+			return(ESCAPE_ERROR);
+
+		rlim = *end - &cp[i];
+		if (sz)
+			*sz = rlim;
+		(*end)++;
+		goto out;
+	}
+
+	assert(lim > 0);
+
+	/*
+	 * We have a numeric limit.  If the string is shorter than that,
+	 * stop and return an error.  Else adjust our endpoint, length,
+	 * and return the current glyph.
+	 */
+
+	if ((size_t)lim > strlen(&cp[i]))
+		return(ESCAPE_ERROR);
+
+	rlim = lim;
+	if (sz)
+		*sz = rlim;
+
+	*end = &cp[i] + lim;
+
+out:
+	assert(rlim >= 0 && rstart);
+
+	/* Run post-processors. */
+
+	switch (gly) {
+	case (ESCAPE_FONT):
+		if (1 != rlim)
+			break;
+		switch (*rstart) {
+		case ('3'):
+			/* FALLTHROUGH */
+		case ('B'):
+			gly = ESCAPE_FONTBOLD;
+			break;
+		case ('2'):
+			/* FALLTHROUGH */
+		case ('I'):
+			gly = ESCAPE_FONTITALIC;
+			break;
+		case ('P'):
+			gly = ESCAPE_FONTPREV;
+			break;
+		case ('1'):
+			/* FALLTHROUGH */
+		case ('R'):
+			gly = ESCAPE_FONTROMAN;
 			break;
 		}
-		/* FALLTHROUGH */
+	case (ESCAPE_SPECIAL):
+		if (1 != rlim)
+			break;
+		if ('c' == *rstart)
+			gly = ESCAPE_NOSPACE;
+		break;
 	default:
-		len = 1;
-		p--;
 		break;
 	}
 
-	if (term) {
-		for ( ; *p && term != *p; p++)
-			if (ASCII_HYPH == *p)
-				*p = '-';
-		return(*p ? (int)(p - sv) : 0);
-	}
-
-	for (i = 0; *p && i < len; i++, p++)
-		if (ASCII_HYPH == *p)
-			*p = '-';
-	return(i == len ? (int)(p - sv) : 0);
+	return(gly);
 }
-
 
 void *
 mandoc_calloc(size_t num, size_t size)
Index: mandoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.h,v
retrieving revision 1.69
diff -u -r1.69 mandoc.h
--- mandoc.h	28 Mar 2011 21:49:42 -0000	1.69
+++ mandoc.h	8 Apr 2011 13:54:21 -0000
@@ -288,6 +288,20 @@
 	MPARSE_MAN /* assume -man */
 };
 
+enum	mandoc_esc {
+	ESCAPE_ERROR = 0, /* bail! unparsable escape */
+	ESCAPE_IGNORE, /* escape to be ignored */
+	ESCAPE_SPECIAL, /* a regular special character */
+	ESCAPE_PREDEF, /* a predefined special character */
+	ESCAPE_FONT, /* a generic font mode */
+	ESCAPE_FONTBOLD, /* bold font mode */
+	ESCAPE_FONTITALIC, /* italic font mode */
+	ESCAPE_FONTROMAN, /* roman font mode */
+	ESCAPE_FONTPREV, /* previous font mode */
+	ESCAPE_NUMBERED, /* a numbered glyph */
+	ESCAPE_NOSPACE /* suppress space if the last on a line */
+};
+
 typedef	void	(*mandocmsg)(enum mandocerr, enum mandoclevel,
 			const char *, int, int, const char *);
 
@@ -309,6 +323,8 @@
 void		 *mandoc_calloc(size_t, size_t);
 void		 *mandoc_malloc(size_t);
 void		 *mandoc_realloc(void *, size_t);
+
+enum mandoc_esc	  mandoc_escape(const char **, const char **, int *);
 
 __END_DECLS
 
Index: mdoc_validate.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mdoc_validate.c,v
retrieving revision 1.166
diff -u -r1.166 mdoc_validate.c
--- mdoc_validate.c	3 Apr 2011 09:53:50 -0000	1.166
+++ mdoc_validate.c	8 Apr 2011 13:54:21 -0000
@@ -545,31 +545,39 @@
 static void
 check_text(struct mdoc *m, int ln, int pos, char *p)
 {
-	int		 c;
+	char		*cpp, *pp;
 	size_t		 sz;
 
-	for ( ; *p; p++, pos++) {
+	while ('\0' != *p) {
 		sz = strcspn(p, "\t\\");
-		p += (int)sz;
-
-		if ('\0' == *p)
-			break;
 
+		p += (int)sz;
 		pos += (int)sz;
 
 		if ('\t' == *p) {
 			if ( ! (MDOC_LITERAL & m->flags))
 				mdoc_pmsg(m, ln, pos, MANDOCERR_BADTAB);
+			p++;
+			pos++;
 			continue;
-		}
+		} else if ('\0' == *p)
+			break;
+
+		pos++;
+		pp = ++p;
 
-		if (0 == (c = mandoc_special(p))) {
+		if (ESCAPE_ERROR == mandoc_escape
+				((const char **)&pp, NULL, NULL)) {
 			mdoc_pmsg(m, ln, pos, MANDOCERR_BADESCAPE);
-			continue;
+			break;
 		}
 
-		p += c - 1;
-		pos += c - 1;
+		cpp = p;
+		while (NULL != (cpp = memchr(cpp, ASCII_HYPH, pp - cpp)))
+			*cpp = '-';
+
+		pos += pp - p;
+		p = pp;
 	}
 }
 
Index: out.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/out.c,v
retrieving revision 1.39
diff -u -r1.39 out.c
--- out.c	17 Mar 2011 08:49:34 -0000	1.39
+++ out.c	8 Apr 2011 13:54:21 -0000
@@ -174,243 +174,6 @@
 	(void)strftime(p, sz, "%Y", &tm);
 }
 
-
-int
-a2roffdeco(enum roffdeco *d, const char **word, size_t *sz)
-{
-	int		 i, j, lim;
-	char		 term, c;
-	const char	*wp;
-	enum roffdeco	 dd;
-
-	*d = DECO_NONE;
-	lim = i = 0;
-	term = '\0';
-	wp = *word;
-
-	switch ((c = wp[i++])) {
-	case ('('):
-		*d = DECO_SPECIAL;
-		lim = 2;
-		break;
-	case ('F'):
-		/* FALLTHROUGH */
-	case ('f'):
-		*d = 'F' == c ? DECO_FFONT : DECO_FONT;
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
-		case ('3'):
-			/* FALLTHROUGH */
-		case ('B'):
-			*d = DECO_BOLD;
-			return(i);
-		case ('2'):
-			/* FALLTHROUGH */
-		case ('I'):
-			*d = DECO_ITALIC;
-			return(i);
-		case ('P'):
-			*d = DECO_PREVIOUS;
-			return(i);
-		case ('1'):
-			/* FALLTHROUGH */
-		case ('R'):
-			*d = DECO_ROMAN;
-			return(i);
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-		break;
-	case ('k'):
-		/* FALLTHROUGH */
-	case ('M'):
-		/* FALLTHROUGH */
-	case ('m'):
-		/* FALLTHROUGH */
-	case ('*'):
-		if ('*' == c)
-			*d = DECO_RESERVED;
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-		break;
-
-	case ('N'):
-
-		/*
-		 * Sequence of characters:  backslash,  'N' (i = 0),
-		 * starting delimiter (i = 1), character number (i = 2).
-		 */
-
-		*word = wp + 2;
-		*sz = 0;
-
-		/*
-		 * Cannot use a digit as a starting delimiter;
-		 * but skip the digit anyway.
-		 */
-
-		if (isdigit((int)wp[1]))
-			return(2);
-
-		/*
-		 * Any non-digit terminates the character number.
-		 * That is, the terminating delimiter need not
-		 * match the starting delimiter.
-		 */
-
-		for (i = 2; isdigit((int)wp[i]); i++)
-			(*sz)++;
-
-		/*
-		 * This is only a numbered character
-		 * if the character number has at least one digit.
-		 */
-
-		if (*sz)
-			*d = DECO_NUMBERED;
-
-		/*
-		 * Skip the terminating delimiter, even if it does not
-		 * match, and even if there is no character number.
-		 */
-
-		return(++i);
-
-	case ('h'):
-		/* FALLTHROUGH */
-	case ('v'):
-		/* FALLTHROUGH */
-	case ('s'):
-		j = 0;
-		if ('+' == wp[i] || '-' == wp[i]) {
-			i++;
-			j = 1;
-		}
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
-		case ('\''):
-			term = '\'';
-			break;
-		case ('0'):
-			j = 1;
-			/* FALLTHROUGH */
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-
-		if ('+' == wp[i] || '-' == wp[i]) {
-			if (j)
-				return(i);
-			i++;
-		} 
-
-		/* Handle embedded numerical subexp or escape. */
-
-		if ('(' == wp[i]) {
-			while (wp[i] && ')' != wp[i])
-				if ('\\' == wp[i++]) {
-					/* Handle embedded escape. */
-					*word = &wp[i];
-					i += a2roffdeco(&dd, word, sz);
-				}
-
-			if (')' == wp[i++])
-				break;
-
-			*d = DECO_NONE;
-			return(i - 1);
-		} else if ('\\' == wp[i]) {
-			*word = &wp[++i];
-			i += a2roffdeco(&dd, word, sz);
-		}
-
-		break;
-	case ('['):
-		*d = DECO_SPECIAL;
-		term = ']';
-		break;
-	case ('c'):
-		*d = DECO_NOSPACE;
-		return(i);
-	case ('z'):
-		*d = DECO_NONE;
-		if ('\\' == wp[i]) {
-			*word = &wp[++i];
-			return(i + a2roffdeco(&dd, word, sz));
-		} else
-			lim = 1;
-		break;
-	case ('o'):
-		/* FALLTHROUGH */
-	case ('w'):
-		if ('\'' == wp[i++]) {
-			term = '\'';
-			break;
-		} 
-		/* FALLTHROUGH */
-	default:
-		*d = DECO_SSPECIAL;
-		i--;
-		lim = 1;
-		break;
-	}
-
-	assert(term || lim);
-	*word = &wp[i];
-
-	if (term) {
-		j = i;
-		while (wp[i] && wp[i] != term)
-			i++;
-		if ('\0' == wp[i]) {
-			*d = DECO_NONE;
-			return(i);
-		}
-
-		assert(i >= j);
-		*sz = (size_t)(i - j);
-
-		return(i + 1);
-	}
-
-	assert(lim > 0);
-	*sz = (size_t)lim;
-
-	for (j = 0; wp[i] && j < lim; j++)
-		i++;
-	if (j < lim)
-		*d = DECO_NONE;
-
-	return(i);
-}
-
 /*
  * Calculate the abstract widths and decimal positions of columns in a
  * table.  This routine allocates the columns structures then runs over
Index: out.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/out.h,v
retrieving revision 1.18
diff -u -r1.18 out.h
--- out.h	22 Mar 2011 10:13:01 -0000	1.18
+++ out.h	8 Apr 2011 13:54:21 -0000
@@ -31,22 +31,6 @@
 	SCALE_MAX
 };
 
-enum	roffdeco {
-	DECO_NONE,
-	DECO_NUMBERED, /* numbered character */
-	DECO_SPECIAL, /* special character */
-	DECO_SSPECIAL, /* single-char special */
-	DECO_RESERVED, /* reserved word */
-	DECO_BOLD, /* bold font */
-	DECO_ITALIC, /* italic font */
-	DECO_ROMAN, /* "normal" undecorated font */
-	DECO_PREVIOUS, /* revert to previous font */
-	DECO_NOSPACE, /* suppress spacing */
-	DECO_FONT, /* font */
-	DECO_FFONT, /* font family */
-	DECO_MAX
-};
-
 enum	chars {
 	CHARS_ASCII, /* 7-bit ascii representation */
 	CHARS_HTML /* unicode values */
@@ -85,7 +69,6 @@
 	while (/* CONSTCOND */ 0)
 
 int	  	  a2roffsu(const char *, struct roffsu *, enum roffscale);
-int	  	  a2roffdeco(enum roffdeco *, const char **, size_t *);
 void	  	  time2a(time_t, char *, size_t);
 void	  	  tblcalc(struct rofftbl *tbl, const struct tbl_span *);
 
Index: read.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/read.c,v
retrieving revision 1.11
diff -u -r1.11 read.c
--- read.c	4 Apr 2011 23:04:38 -0000	1.11
+++ read.c	8 Apr 2011 13:54:21 -0000
@@ -142,7 +142,7 @@
 	"tab in non-literal context",
 	"end of line whitespace",
 	"bad comment style",
-	"unknown escape sequence",
+	"bad escape sequence",
 	"unterminated quoted string",
 	
 	"generic error",
Index: term.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/term.c,v
retrieving revision 1.183
diff -u -r1.183 term.c
--- term.c	4 Apr 2011 21:14:12 -0000	1.183
+++ term.c	8 Apr 2011 13:54:21 -0000
@@ -33,8 +33,7 @@
 #include "term.h"
 #include "main.h"
 
-static	void		  spec(struct termp *, enum roffdeco,
-				const char *, size_t);
+static	void		  spec(struct termp *, const char *, size_t);
 static	void		  res(struct termp *, const char *, size_t);
 static	void		  bufferc(struct termp *, char);
 static	void		  adjbuf(struct termp *p, size_t);
@@ -358,7 +357,7 @@
 
 
 static void
-spec(struct termp *p, enum roffdeco d, const char *word, size_t len)
+spec(struct termp *p, const char *word, size_t len)
 {
 	const char	*rhs;
 	size_t		 sz;
@@ -366,7 +365,7 @@
 	rhs = chars_spec2str(p->symtab, word, len, &sz);
 	if (rhs) 
 		encode(p, rhs, sz);
-	else if (DECO_SSPECIAL == d)
+	else if (1 == len)
 		encode(p, word, len);
 }
 
@@ -457,8 +456,9 @@
 term_word(struct termp *p, const char *word)
 {
 	const char	*seq;
+	int		 sz;
 	size_t		 ssz;
-	enum roffdeco	 deco;
+	enum mandoc_esc	 esc;
 
 	if ( ! (TERMP_NOSPACE & p->flags)) {
 		if ( ! (TERMP_KEEP & p->flags)) {
@@ -478,7 +478,7 @@
 
 	p->flags &= ~(TERMP_SENTENCE | TERMP_IGNDELIM);
 
-	while (*word) {
+	while ('\0' != *word) {
 		if ((ssz = strcspn(word, "\\")) > 0)
 			encode(p, word, ssz);
 
@@ -486,39 +486,40 @@
 		if ('\\' != *word)
 			continue;
 
-		seq = ++word;
-		word += a2roffdeco(&deco, &seq, &ssz);
+		word++;
+		esc = mandoc_escape(&word, &seq, &sz);
+		if (ESCAPE_ERROR == esc)
+			break;
 
-		switch (deco) {
-		case (DECO_NUMBERED):
-			numbered(p, seq, ssz);
-			break;
-		case (DECO_RESERVED):
-			res(p, seq, ssz);
-			break;
-		case (DECO_SPECIAL):
-			/* FALLTHROUGH */
-		case (DECO_SSPECIAL):
-			spec(p, deco, seq, ssz);
+		switch (esc) {
+		case (ESCAPE_NUMBERED):
+			numbered(p, seq, sz);
+			break;
+		case (ESCAPE_PREDEF):
+			res(p, seq, sz);
+			break;
+		case (ESCAPE_SPECIAL):
+			spec(p, seq, sz);
 			break;
-		case (DECO_BOLD):
+		case (ESCAPE_FONTBOLD):
 			term_fontrepl(p, TERMFONT_BOLD);
 			break;
-		case (DECO_ITALIC):
+		case (ESCAPE_FONTITALIC):
 			term_fontrepl(p, TERMFONT_UNDER);
 			break;
-		case (DECO_ROMAN):
+		case (ESCAPE_FONTROMAN):
 			term_fontrepl(p, TERMFONT_NONE);
 			break;
-		case (DECO_PREVIOUS):
+		case (ESCAPE_FONTPREV):
 			term_fontlast(p);
 			break;
+		case (ESCAPE_NOSPACE):
+			if ('\0' == *word)
+				p->flags |= TERMP_NOSPACE;
+			break;
 		default:
 			break;
 		}
-
-		if (DECO_NOSPACE == deco && '\0' == *word)
-			p->flags |= TERMP_NOSPACE;
 	}
 }
 
@@ -600,33 +601,36 @@
 size_t
 term_strlen(const struct termp *p, const char *cp)
 {
-	size_t		 sz, ssz, rsz, i;
-	enum roffdeco	 d;
+	size_t		 sz, rsz, i;
+	int		 ssz;
+	enum mandoc_esc	 esc;
 	const char	*seq, *rhs;
 
-	for (sz = 0; '\0' != *cp; )
-		/*
-		 * Account for escaped sequences within string length
-		 * calculations.  This follows the logic in term_word()
-		 * as we must calculate the width of produced strings.
-		 */
-		if ('\\' == *cp) {
-			seq = ++cp;
-			cp += a2roffdeco(&d, &seq, &ssz);
+	/*
+	 * Account for escaped sequences within string length
+	 * calculations.  This follows the logic in term_word() as we
+	 * must calculate the width of produced strings.
+	 */
+
+	sz = 0;
+	while ('\0' != *cp)
+		switch (*cp) {
+		case ('\\'):
+			++cp;
+			esc = mandoc_escape(&cp, &seq, &ssz);
+			if (ESCAPE_ERROR == esc)
+				return(sz);
 
-			switch (d) {
-			case (DECO_RESERVED):
+			switch (esc) {
+			case (ESCAPE_PREDEF):
 				rhs = chars_res2str
 					(p->symtab, seq, ssz, &rsz);
 				break;
-			case (DECO_SPECIAL):
-				/* FALLTHROUGH */
-			case (DECO_SSPECIAL):
+			case (ESCAPE_SPECIAL):
 				rhs = chars_spec2str
 					(p->symtab, seq, ssz, &rsz);
 
-				/* Allow for one-char escapes. */
-				if (DECO_SSPECIAL != d || rhs)
+				if (ssz != 1 || rhs)
 					break;
 
 				rhs = seq;
@@ -637,17 +641,24 @@
 				break;
 			}
 
-			if (rhs)
-				for (i = 0; i < rsz; i++)
-					sz += (*p->width)(p, *rhs++);
-		} else if (ASCII_NBRSP == *cp) {
+			if (NULL == rhs)
+				break;
+
+			for (i = 0; i < rsz; i++)
+				sz += (*p->width)(p, *rhs++);
+			break;
+		case (ASCII_NBRSP):
 			sz += (*p->width)(p, ' ');
 			cp++;
-		} else if (ASCII_HYPH == *cp) {
+			break;
+		case (ASCII_HYPH):
 			sz += (*p->width)(p, '-');
 			cp++;
-		} else
+			break;
+		default:
 			sz += (*p->width)(p, *cp++);
+			break;
+		}
 
 	return(sz);
 }

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2011-04-08 13:56 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-04-07 14:00 Unifying the escape-sequence parser Kristaps Dzonsons
2011-04-08 11:19 ` Kristaps Dzonsons
2011-04-08 12:16   ` Kristaps Dzonsons
2011-04-08 12:50     ` Kristaps Dzonsons
2011-04-08 13:15       ` Kristaps Dzonsons
2011-04-08 13:56         ` Finished: unifying " Kristaps Dzonsons

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).