Re: Unifying the escape-sequence parser.

tech@mandoc.bsd.lv
 help / color / mirror / Atom feed

From: Kristaps Dzonsons <kristaps@bsd.lv>
To: tech@mdocml.bsd.lv
Subject: Re: Unifying the escape-sequence parser.
Date: Fri, 08 Apr 2011 14:16:05 +0200	[thread overview]
Message-ID: <4D9EFC85.3040301@bsd.lv> (raw)
In-Reply-To: <4D9EEF2F.2030307@bsd.lv>

[-- Attachment #1: Type: text/plain, Size: 940 bytes --]

> Step 2.
>
> This finishes off the new escape-sequence parser and puts it into
> mandoc_escape (mandoc.c, mandoc.h), then makes it the underlying engine
> for a2roffdeco (out.c) (requiring a tiny change to term.c and html.c for
> bailing out on bad sequences) and roff_res (roff.c).
>
> Now all escape-sequences are being parsed with the same engine! This
> logic was being repeated in THREE different places, earlier (mandoc.c
> for validation, out.c for output, and roff.c for predefined escapes).
>
> I've run this over all manuals I know of without problems, but it can
> really use a close look-over with border cases.
>
> The next step is to clean out the out.c code, completely removing enum
> roffdeco (putting that logic into mandoc.c, perhaps).

Step 2b.  I rolled back the roff part: the search/replace of predefined 
strings must happen prior to escape processing.  I'd also forgotten to 
include mdoc_validate.c in the patch.

[-- Attachment #2: patch.escapes.txt --]
[-- Type: text/plain, Size: 20299 bytes --]

Index: html.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/html.c,v
retrieving revision 1.131
diff -u -r1.131 html.c
--- html.c	22 Mar 2011 14:05:45 -0000	1.131
+++ html.c	8 Apr 2011 12:15:23 -0000
@@ -337,7 +337,8 @@
 			break;
 
 		seq = ++p;
-		len = a2roffdeco(&deco, &seq, &sz);
+		if (0 == (len = a2roffdeco(&deco, &seq, &sz)))
+			break;
 
 		switch (deco) {
 		case (DECO_NUMBERED):
Index: libmandoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/libmandoc.h,v
retrieving revision 1.17
diff -u -r1.17 libmandoc.h
--- libmandoc.h	28 Mar 2011 23:52:13 -0000	1.17
+++ libmandoc.h	8 Apr 2011 12:15:23 -0000
@@ -73,7 +73,6 @@
 			int, int, const char *);
 void		 mandoc_vmsg(enum mandocerr, struct mparse *, 
 			int, int, const char *, ...);
-int		 mandoc_special(char *);
 char		*mandoc_strdup(const char *);
 char		*mandoc_getarg(struct mparse *, char **, int, int *);
 char		*mandoc_normdate(struct mparse *, char *, int, int);
Index: man_validate.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/man_validate.c,v
retrieving revision 1.67
diff -u -r1.67 man_validate.c
--- man_validate.c	22 Mar 2011 15:30:30 -0000	1.67
+++ man_validate.c	8 Apr 2011 12:15:23 -0000
@@ -54,7 +54,7 @@
 static	int	  check_part(CHKARGS);
 static	int	  check_root(CHKARGS);
 static	int	  check_sec(CHKARGS);
-static	int	  check_text(CHKARGS);
+static	void	  check_text(CHKARGS);
 
 static	int	  post_AT(CHKARGS);
 static	int	  post_fi(CHKARGS);
@@ -151,7 +151,8 @@
 
 	switch (m->last->type) {
 	case (MAN_TEXT): 
-		return(check_text(m, m->last));
+		check_text(m, m->last);
+		return(1);
 	case (MAN_ROOT):
 		return(check_root(m, m->last));
 	case (MAN_EQN):
@@ -204,43 +205,48 @@
 	return(1);
 }
 
-
-static int
+static void
 check_text(CHKARGS) 
 {
-	char		*p;
-	int		 pos, c;
+	char		*p, *pp, *cpp;
+	int		 pos;
 	size_t		 sz;
 
-	for (p = n->string, pos = n->pos + 1; *p; p++, pos++) {
-		sz = strcspn(p, "\t\\");
-		p += (int)sz;
+	p = n->string;
+	pos = n->pos + 1;
 
-		if ('\0' == *p)
-			break;
+	while ('\0' != *p) {
+		sz = strcspn(p, "\t\\");
 
+		p += (int)sz;
 		pos += (int)sz;
 
 		if ('\t' == *p) {
-			if (MAN_LITERAL & m->flags)
-				continue;
-			man_pmsg(m, n->line, pos, MANDOCERR_BADTAB);
+			if ( ! (MAN_LITERAL & m->flags))
+				man_pmsg(m, n->line, pos, MANDOCERR_BADTAB);
+			p++;
+			pos++;
 			continue;
-		}
+		} else if ('\0' == *p)
+			break;
 
-		/* Check the special character. */
+		pos++;
+		pp = ++p;
 
-		c = mandoc_special(p);
-		if (c) {
-			p += c - 1;
-			pos += c - 1;
-		} else
+		if (ESCAPE_ERROR == mandoc_escape
+				((const char **)&pp, NULL, NULL)) {
 			man_pmsg(m, n->line, pos, MANDOCERR_BADESCAPE);
-	}
+			break;
+		}
 
-	return(1);
-}
+		cpp = p;
+		while (NULL != (cpp = memchr(cpp, ASCII_HYPH, pp - cpp)))
+			*cpp = '-';
 
+		pos += pp - p;
+		p = pp;
+	}
+}
 
 #define	INEQ_DEFINE(x, ineq, name) \
 static int \
Index: mandoc.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.c,v
retrieving revision 1.44
diff -u -r1.44 mandoc.c
--- mandoc.c	28 Mar 2011 23:52:13 -0000	1.44
+++ mandoc.c	8 Apr 2011 12:15:23 -0000
@@ -35,198 +35,315 @@
 
 static	int	 a2time(time_t *, const char *, const char *);
 static	char	*time2a(time_t);
+static	int	 numescape(const char *);
 
-int
-mandoc_special(char *p)
+/*
+ * Pass over recursive numerical expressions.  This context of this
+ * function is important: it's only called within character-terminating
+ * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
+ * recursion: we don't care about what's in these blocks. 
+ * This returns the number of characters skipped or -1 if an error
+ * occurs (the caller should bail).
+ */
+static int
+numescape(const char *start)
 {
-	int		 len, i;
-	char		 term;
-	char		*sv;
-	
-	len = 0;
+	int		 i;
+	size_t		 sz;
+	const char	*cp;
+
+	i = 0;
+
+	/* The expression consists of a subexpression. */
+
+	if ('\\' == start[i]) {
+		cp = &start[++i];
+		/*
+		 * Read past the end of the subexpression.
+		 * Bail immediately on errors.
+		 */
+		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+			return(-1);
+		return(i + cp - &start[i]);
+	} 
+
+	if ('(' != start[i++])
+		return(0);
+
+	/*
+	 * A parenthesised subexpression.  Read until the closing
+	 * parenthesis, making sure to handle any nested subexpressions
+	 * that might ruin our parse.
+	 */
+
+	while (')' != start[i]) {
+		sz = strcspn(&start[i], ")\\");
+		i += (int)sz;
+
+		if ('\0' == start[i])
+			return(-1);
+		else if ('\\' != start[i])
+			continue;
+
+		cp = &start[++i];
+		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+			return(-1);
+		i += cp - &start[i];
+	}
+
+	/* Read past the terminating ')'. */
+	return(++i);
+}
+
+/*
+ * Handle an escaped sequeence.  This should be called with any
+ * string subsequent a `\'.  Pass a pointer to this substring as "end";
+ * it will be set to the supremum of the parsed escape sequence.  If
+ * this returns ESCAPE_ERROR, the string is bogus and should be thrown
+ * away.  If not ESCAPE_ERROR or ESCAPE_IGNORE, "start" is set to the
+ * first relevant character of the substring (font, glyph, whatever) of
+ * length sz.  Both "start" and "sz" may be NULL.
+ */
+enum mandoc_esc
+mandoc_escape(const char **end, const char **start, int *sz)
+{
+	char		 c, term, numeric;
+	int		 i, lim, ssz;
+	const char	*cp;
+	enum mandoc_esc	 gly; 
+
+	cp = *end;
+	if (start)
+		*start = cp;
+	i = 0;
+	gly = ESCAPE_ERROR;
 	term = '\0';
-	sv = p;
+	numeric = 0;
 
-	assert('\\' == *p);
-	p++;
+	switch ((c = cp[i++])) {
+	/*
+	 * First the glyphs.  There are several different forms of
+	 * these, but each eventually returns a substring of the glyph
+	 * name.
+	 */
+	case ('('):
+		gly = ESCAPE_SPECIAL;
+		lim = 2;
+		break;
+	case ('['):
+		gly = ESCAPE_SPECIAL;
+		term = ']';
+		break;
+	case ('C'):
+		if ('\'' != cp[i])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_SPECIAL;
+		term = '\'';
+		break;
 
-	switch (*p++) {
-#if 0
-	case ('Z'):
-		/* FALLTHROUGH */
-	case ('X'):
-		/* FALLTHROUGH */
-	case ('x'):
-		/* FALLTHROUGH */
-	case ('S'):
-		/* FALLTHROUGH */
-	case ('R'):
-		/* FALLTHROUGH */
-	case ('N'):
-		/* FALLTHROUGH */
-	case ('l'):
-		/* FALLTHROUGH */
-	case ('L'):
-		/* FALLTHROUGH */
-	case ('H'):
+	/*
+	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
+	 * 'X' is the trigger.  These have opaque sub-strings.
+	 */
+	case ('g'):
 		/* FALLTHROUGH */
-	case ('h'):
+	case ('k'):
 		/* FALLTHROUGH */
-	case ('D'):
+	case ('M'):
 		/* FALLTHROUGH */
-	case ('C'):
+	case ('m'):
 		/* FALLTHROUGH */
-	case ('b'):
+	case ('n'):
 		/* FALLTHROUGH */
-	case ('B'):
+	case ('V'):
 		/* FALLTHROUGH */
-	case ('a'):
+	case ('Y'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_IGNORE;
 		/* FALLTHROUGH */
-	case ('A'):
-		if (*p++ != '\'')
-			return(0);
-		term = '\'';
-		break;
-#endif
-	case ('h'):
+	case ('*'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_PREDEF;
 		/* FALLTHROUGH */
-	case ('v'):
+	case ('F'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_FONTFAM;
 		/* FALLTHROUGH */
-	case ('s'):
-		if (ASCII_HYPH == *p)
-			*p = '-';
+	case ('f'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_FONT;
 
-		i = 0;
-		if ('+' == *p || '-' == *p) {
-			p++;
-			i = 1;
-		}
+		if (start) 
+			*start = &cp[i];
 
-		switch (*p++) {
+		switch (cp[i++]) {
 		case ('('):
-			len = 2;
+			lim = 2;
 			break;
 		case ('['):
 			term = ']';
 			break;
-		case ('\''):
-			term = '\'';
-			break;
-		case ('0'):
-			i = 1;
-			/* FALLTHROUGH */
 		default:
-			len = 1;
-			p--;
+			lim = 1;
+			i--;
 			break;
 		}
+		break;
 
-		if (ASCII_HYPH == *p)
-			*p = '-';
-		if ('+' == *p || '-' == *p) {
-			if (i)
-				return(0);
-			p++;
-		} 
-		
-		/* Handle embedded numerical subexp or escape. */
-
-		if ('(' == *p) {
-			while (*p && ')' != *p)
-				if ('\\' == *p++) {
-					i = mandoc_special(--p);
-					if (0 == i)
-						return(0);
-					p += i;
-				}
-
-			if (')' == *p++)
-				break;
-
-			return(0);
-		} else if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
-		}
-
+	/*
+	 * These escapes are of the form \X'Y', where 'X' is the trigger
+	 * and 'Y' is any string.  These have opaque sub-strings.
+	 */
+	case ('A'):
+		/* FALLTHROUGH */
+	case ('b'):
+		/* FALLTHROUGH */
+	case ('D'):
+		/* FALLTHROUGH */
+	case ('o'):
+		/* FALLTHROUGH */
+	case ('R'):
+		/* FALLTHROUGH */
+	case ('X'):
+		/* FALLTHROUGH */
+	case ('Z'):
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_IGNORE;
+		term = '\'';
 		break;
-#if 0
-	case ('Y'):
+
+	/*
+	 * These escapes are of the form \X'N', where 'X' is the trigger
+	 * and 'N' resolves to a numerical expression.
+	 */
+	case ('B'):
 		/* FALLTHROUGH */
-	case ('V'):
+	case ('h'):
 		/* FALLTHROUGH */
-	case ('$'):
+	case ('H'):
 		/* FALLTHROUGH */
-	case ('n'):
+	case ('L'):
 		/* FALLTHROUGH */
-#endif
-	case ('k'):
+	case ('l'):
 		/* FALLTHROUGH */
-	case ('M'):
+	case ('N'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_NUMBERED;
 		/* FALLTHROUGH */
-	case ('m'):
+	case ('S'):
 		/* FALLTHROUGH */
-	case ('f'):
+	case ('v'):
 		/* FALLTHROUGH */
-	case ('F'):
+	case ('w'):
 		/* FALLTHROUGH */
-	case ('*'):
-		switch (*p++) {
+	case ('x'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_IGNORE;
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		term = numeric = '\'';
+		break;
+
+	/* 
+	 * Sizes get a special category of their own.
+	 */
+	case ('s'):
+		gly = ESCAPE_IGNORE;
+
+		if (start) 
+			*start = &cp[i];
+
+		/* See +/- counts as a sign. */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
+
+		switch (cp[i++]) {
 		case ('('):
-			len = 2;
+			lim = 2;
 			break;
 		case ('['):
-			term = ']';
+			term = numeric = ']';
+			break;
+		case ('\''):
+			term = numeric = '\'';
 			break;
 		default:
-			len = 1;
-			p--;
+			lim = 1;
+			i--;
 			break;
 		}
+
+		/* See +/- counts as a sign. */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
+
 		break;
-	case ('('):
-		len = 2;
-		break;
-	case ('['):
-		term = ']';
-		break;
-	case ('z'):
-		len = 1;
-		if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
-			return(*p ? (int)(p - sv) : 0);
-		}
-		break;
-	case ('o'):
-		/* FALLTHROUGH */
-	case ('w'):
-		if ('\'' == *p++) {
-			term = '\'';
-			break;
-		}
-		/* FALLTHROUGH */
+
+	/*
+	 * Anything else is assumed to be a glyph.
+	 */
 	default:
-		len = 1;
-		p--;
+		gly = ESCAPE_SPECIAL;
+		lim = 1;
+		i--;
 		break;
 	}
 
-	if (term) {
-		for ( ; *p && term != *p; p++)
-			if (ASCII_HYPH == *p)
-				*p = '-';
-		return(*p ? (int)(p - sv) : 0);
+	assert(ESCAPE_ERROR != gly);
+
+	if (start)
+		*start = &cp[i];
+
+	/*
+	 * If a terminating block has been specified, we need to
+	 * handle the case of recursion, which could have their
+	 * own terminating blocks that mess up our parse.  This, by the
+	 * way, means that the "start" and "size" values will be
+	 * effectively meaningless.
+	 */
+
+	ssz = 0;
+	if (numeric && -1 == (ssz = numescape(&cp[i])))
+		return(ESCAPE_ERROR);
+
+	i += ssz;
+
+	/*
+	 * We have a character terminator.  Try to read up to that
+	 * character.  If we can't (i.e., we hit the nil), then return
+	 * an error; if we can, calculate our length, read past the
+	 * terminating character, and exit.
+	 */
+
+	if ('\0' != term) {
+		*end = strchr(&cp[i], term);
+		if ('\0' == *end)
+			return(ESCAPE_ERROR);
+		if (sz)
+			*sz = *end - &cp[i];
+		(*end)++;
+		return(gly);
 	}
 
-	for (i = 0; *p && i < len; i++, p++)
-		if (ASCII_HYPH == *p)
-			*p = '-';
-	return(i == len ? (int)(p - sv) : 0);
-}
+	assert(lim > 0);
 
+	/*
+	 * We have a numeric limit.  If the string is shorter than that,
+	 * stop and return an error.  Else adjust our endpoint, length,
+	 * and return the current glyph.
+	 */
+
+	if ((size_t)lim > strlen(&cp[i]))
+		return(ESCAPE_ERROR);
+
+	if (sz)
+		*sz = lim;
+	*end = &cp[i] + lim;
+	return(gly);
+}
 
 void *
 mandoc_calloc(size_t num, size_t size)
Index: mandoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.h,v
retrieving revision 1.69
diff -u -r1.69 mandoc.h
--- mandoc.h	28 Mar 2011 21:49:42 -0000	1.69
+++ mandoc.h	8 Apr 2011 12:15:23 -0000
@@ -288,6 +288,16 @@
 	MPARSE_MAN /* assume -man */
 };
 
+enum	mandoc_esc {
+	ESCAPE_ERROR = 0,
+	ESCAPE_IGNORE, /* escape to be ignored */
+	ESCAPE_SPECIAL, /* a regular special character */
+	ESCAPE_PREDEF, /* a predefined special character */
+	ESCAPE_FONT, /* a font mode */
+	ESCAPE_FONTFAM, /* a font family */
+	ESCAPE_NUMBERED /* a numbered glyph */
+};
+
 typedef	void	(*mandocmsg)(enum mandocerr, enum mandoclevel,
 			const char *, int, int, const char *);
 
@@ -309,6 +319,8 @@
 void		 *mandoc_calloc(size_t, size_t);
 void		 *mandoc_malloc(size_t);
 void		 *mandoc_realloc(void *, size_t);
+
+enum mandoc_esc	  mandoc_escape(const char **, const char **, int *);
 
 __END_DECLS
 
Index: mdoc_validate.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mdoc_validate.c,v
retrieving revision 1.166
diff -u -r1.166 mdoc_validate.c
--- mdoc_validate.c	3 Apr 2011 09:53:50 -0000	1.166
+++ mdoc_validate.c	8 Apr 2011 12:15:24 -0000
@@ -545,31 +545,39 @@
 static void
 check_text(struct mdoc *m, int ln, int pos, char *p)
 {
-	int		 c;
+	char		*cpp, *pp;
 	size_t		 sz;
 
 	for ( ; *p; p++, pos++) {
 		sz = strcspn(p, "\t\\");
-		p += (int)sz;
-
-		if ('\0' == *p)
-			break;
 
+		p += (int)sz;
 		pos += (int)sz;
 
 		if ('\t' == *p) {
 			if ( ! (MDOC_LITERAL & m->flags))
 				mdoc_pmsg(m, ln, pos, MANDOCERR_BADTAB);
+			p++;
+			pos++;
 			continue;
-		}
+		} else if ('\0' == *p)
+			break;
+
+		pos++;
+		pp = ++p;
 
-		if (0 == (c = mandoc_special(p))) {
+		if (ESCAPE_ERROR == mandoc_escape
+				((const char **)&pp, NULL, NULL)) {
 			mdoc_pmsg(m, ln, pos, MANDOCERR_BADESCAPE);
-			continue;
+			break;
 		}
 
-		p += c - 1;
-		pos += c - 1;
+		cpp = p;
+		while (NULL != (cpp = memchr(cpp, ASCII_HYPH, pp - cpp)))
+			*cpp = '-';
+
+		pos += pp - p;
+		p = pp;
 	}
 }
 
Index: out.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/out.c,v
retrieving revision 1.39
diff -u -r1.39 out.c
--- out.c	17 Mar 2011 08:49:34 -0000	1.39
+++ out.c	8 Apr 2011 12:15:24 -0000
@@ -178,237 +178,70 @@
 int
 a2roffdeco(enum roffdeco *d, const char **word, size_t *sz)
 {
-	int		 i, j, lim;
-	char		 term, c;
-	const char	*wp;
-	enum roffdeco	 dd;
+	const char	*cp, *start;
+	int		 ssz;
+	enum mandoc_esc	 esc;
 
 	*d = DECO_NONE;
-	lim = i = 0;
-	term = '\0';
-	wp = *word;
 
-	switch ((c = wp[i++])) {
-	case ('('):
+	cp = start = *word;
+
+	esc = mandoc_escape(&cp, word, &ssz);
+
+	switch (esc) {
+	case (ESCAPE_ERROR):
+		return(0);
+	case (ESCAPE_IGNORE):
+		break;
+	case (ESCAPE_NUMBERED):
+		*d = DECO_NUMBERED;
+		break;
+	case (ESCAPE_FONT):
+		*d = DECO_FONT;
+		break;
+	case (ESCAPE_FONTFAM):
+		*d = DECO_FFONT;
+		break;
+	case (ESCAPE_SPECIAL):
 		*d = DECO_SPECIAL;
-		lim = 2;
 		break;
-	case ('F'):
-		/* FALLTHROUGH */
-	case ('f'):
-		*d = 'F' == c ? DECO_FFONT : DECO_FONT;
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
+	case (ESCAPE_PREDEF):
+		*d = DECO_RESERVED;
+		break;
+	}
+
+	assert(ssz >= 0);
+	*sz = (size_t)ssz;
+	ssz = cp - start;
+
+	if (1 == *sz && (DECO_FONT == *d || DECO_FFONT == *d))
+		switch (**word) {
 		case ('3'):
 			/* FALLTHROUGH */
 		case ('B'):
 			*d = DECO_BOLD;
-			return(i);
+			break;
 		case ('2'):
 			/* FALLTHROUGH */
 		case ('I'):
 			*d = DECO_ITALIC;
-			return(i);
+			break;
 		case ('P'):
 			*d = DECO_PREVIOUS;
-			return(i);
+			break;
 		case ('1'):
 			/* FALLTHROUGH */
 		case ('R'):
 			*d = DECO_ROMAN;
-			return(i);
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-		break;
-	case ('k'):
-		/* FALLTHROUGH */
-	case ('M'):
-		/* FALLTHROUGH */
-	case ('m'):
-		/* FALLTHROUGH */
-	case ('*'):
-		if ('*' == c)
-			*d = DECO_RESERVED;
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
-			break;
-		default:
-			i--;
-			lim = 1;
-			break;
-		}
-		break;
-
-	case ('N'):
-
-		/*
-		 * Sequence of characters:  backslash,  'N' (i = 0),
-		 * starting delimiter (i = 1), character number (i = 2).
-		 */
-
-		*word = wp + 2;
-		*sz = 0;
-
-		/*
-		 * Cannot use a digit as a starting delimiter;
-		 * but skip the digit anyway.
-		 */
-
-		if (isdigit((int)wp[1]))
-			return(2);
-
-		/*
-		 * Any non-digit terminates the character number.
-		 * That is, the terminating delimiter need not
-		 * match the starting delimiter.
-		 */
-
-		for (i = 2; isdigit((int)wp[i]); i++)
-			(*sz)++;
-
-		/*
-		 * This is only a numbered character
-		 * if the character number has at least one digit.
-		 */
-
-		if (*sz)
-			*d = DECO_NUMBERED;
-
-		/*
-		 * Skip the terminating delimiter, even if it does not
-		 * match, and even if there is no character number.
-		 */
-
-		return(++i);
-
-	case ('h'):
-		/* FALLTHROUGH */
-	case ('v'):
-		/* FALLTHROUGH */
-	case ('s'):
-		j = 0;
-		if ('+' == wp[i] || '-' == wp[i]) {
-			i++;
-			j = 1;
-		}
-
-		switch (wp[i++]) {
-		case ('('):
-			lim = 2;
-			break;
-		case ('['):
-			term = ']';
 			break;
-		case ('\''):
-			term = '\'';
-			break;
-		case ('0'):
-			j = 1;
-			/* FALLTHROUGH */
 		default:
-			i--;
-			lim = 1;
-			break;
-		}
-
-		if ('+' == wp[i] || '-' == wp[i]) {
-			if (j)
-				return(i);
-			i++;
-		} 
-
-		/* Handle embedded numerical subexp or escape. */
-
-		if ('(' == wp[i]) {
-			while (wp[i] && ')' != wp[i])
-				if ('\\' == wp[i++]) {
-					/* Handle embedded escape. */
-					*word = &wp[i];
-					i += a2roffdeco(&dd, word, sz);
-				}
-
-			if (')' == wp[i++])
-				break;
-
-			*d = DECO_NONE;
-			return(i - 1);
-		} else if ('\\' == wp[i]) {
-			*word = &wp[++i];
-			i += a2roffdeco(&dd, word, sz);
-		}
-
-		break;
-	case ('['):
-		*d = DECO_SPECIAL;
-		term = ']';
-		break;
-	case ('c'):
-		*d = DECO_NOSPACE;
-		return(i);
-	case ('z'):
-		*d = DECO_NONE;
-		if ('\\' == wp[i]) {
-			*word = &wp[++i];
-			return(i + a2roffdeco(&dd, word, sz));
-		} else
-			lim = 1;
-		break;
-	case ('o'):
-		/* FALLTHROUGH */
-	case ('w'):
-		if ('\'' == wp[i++]) {
-			term = '\'';
 			break;
-		} 
-		/* FALLTHROUGH */
-	default:
-		*d = DECO_SSPECIAL;
-		i--;
-		lim = 1;
-		break;
-	}
-
-	assert(term || lim);
-	*word = &wp[i];
-
-	if (term) {
-		j = i;
-		while (wp[i] && wp[i] != term)
-			i++;
-		if ('\0' == wp[i]) {
-			*d = DECO_NONE;
-			return(i);
 		}
 
-		assert(i >= j);
-		*sz = (size_t)(i - j);
-
-		return(i + 1);
-	}
-
-	assert(lim > 0);
-	*sz = (size_t)lim;
-
-	for (j = 0; wp[i] && j < lim; j++)
-		i++;
-	if (j < lim)
-		*d = DECO_NONE;
+	if (1 == *sz && DECO_SPECIAL == *d)
+		*d = 'c' == **word ? DECO_NOSPACE : DECO_SSPECIAL;
 
-	return(i);
+	return(ssz);
 }
 
 /*
Index: read.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/read.c,v
retrieving revision 1.11
diff -u -r1.11 read.c
--- read.c	4 Apr 2011 23:04:38 -0000	1.11
+++ read.c	8 Apr 2011 12:15:24 -0000
@@ -142,7 +142,7 @@
 	"tab in non-literal context",
 	"end of line whitespace",
 	"bad comment style",
-	"unknown escape sequence",
+	"bad escape sequence",
 	"unterminated quoted string",
 	
 	"generic error",
Index: term.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/term.c,v
retrieving revision 1.183
diff -u -r1.183 term.c
--- term.c	4 Apr 2011 21:14:12 -0000	1.183
+++ term.c	8 Apr 2011 12:15:24 -0000
@@ -457,6 +457,7 @@
 term_word(struct termp *p, const char *word)
 {
 	const char	*seq;
+	int		 sz;
 	size_t		 ssz;
 	enum roffdeco	 deco;
 
@@ -487,7 +488,9 @@
 			continue;
 
 		seq = ++word;
-		word += a2roffdeco(&deco, &seq, &ssz);
+		if (0 == (sz = a2roffdeco(&deco, &seq, &ssz)))
+			break;
+		word += sz;
 
 		switch (deco) {
 		case (DECO_NUMBERED):

next prev parent reply	other threads:[~2011-04-08 12:16 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-04-07 14:00 Kristaps Dzonsons
2011-04-08 11:19 ` Kristaps Dzonsons
2011-04-08 12:16   ` Kristaps Dzonsons [this message]
2011-04-08 12:50     ` Kristaps Dzonsons
2011-04-08 13:15       ` Kristaps Dzonsons
2011-04-08 13:56         ` Finished: unifying " Kristaps Dzonsons

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4D9EFC85.3040301@bsd.lv \
    --to=kristaps@bsd.lv \
    --cc=tech@mdocml.bsd.lv \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).