From mboxrd@z Thu Jan  1 00:00:00 1970
Received: from krisdoz.my.domain (schwarze@localhost [127.0.0.1])
	by krisdoz.my.domain (8.14.5/8.14.5) with ESMTP id q4VMY6gj008029
	for <source@mdocml.bsd.lv>; Thu, 31 May 2012 18:34:06 -0400 (EDT)
Received: (from schwarze@localhost)
	by krisdoz.my.domain (8.14.5/8.14.3/Submit) id q4VMY6Nj031815;
	Thu, 31 May 2012 18:34:06 -0400 (EDT)
Date: Thu, 31 May 2012 18:34:06 -0400 (EDT)
Message-Id: <201205312234.q4VMY6Nj031815@krisdoz.my.domain>
X-Mailinglist: mdocml-source
Reply-To: source@mdocml.bsd.lv
MIME-Version: 1.0
From: schwarze@mdocml.bsd.lv
To: source@mdocml.bsd.lv
Subject: mdocml: Make recursive parsing of roff(7) escapes actually work in the 
X-Mailer: activitymail 1.26, http://search.cpan.org/dist/activitymail/
Content-Type: text/plain; charset=utf-8

Log Message:
-----------
Make recursive parsing of roff(7) escapes actually work in the general case,
in particular when the inner escapes are preceded or followed by other terms.
While doing so, remove lots of bogus code that was trying to make pointless
distinctions between numeric and non-numeric escape sequences, while both
actually share the same syntax and we ignore the semantics anyway.

This prevents some of the strings defined in the pod2man(1) preamble
from producing garbage output, in particular in Scandinavian words.
Of course, proper rendering of Scandinavian national characters
cannot be expected even with these fixes.

"just commit" kristaps@

Modified Files:
--------------
    mdocml:
        mandoc.c

Revision Data
-------------
Index: mandoc.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandoc.c,v
retrieving revision 1.63
retrieving revision 1.64
diff -Lmandoc.c -Lmandoc.c -u -p -r1.63 -r1.64
--- mandoc.c
+++ mandoc.c
@@ -37,71 +37,13 @@
 
 static	int	 a2time(time_t *, const char *, const char *);
 static	char	*time2a(time_t);
-static	int	 numescape(const char *);
 
-/*
- * Pass over recursive numerical expressions.  This context of this
- * function is important: it's only called within character-terminating
- * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
- * recursion: we don't care about what's in these blocks. 
- * This returns the number of characters skipped or -1 if an error
- * occurs (the caller should bail).
- */
-static int
-numescape(const char *start)
-{
-	int		 i;
-	size_t		 sz;
-	const char	*cp;
-
-	i = 0;
-
-	/* The expression consists of a subexpression. */
-
-	if ('\\' == start[i]) {
-		cp = &start[++i];
-		/*
-		 * Read past the end of the subexpression.
-		 * Bail immediately on errors.
-		 */
-		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
-			return(-1);
-		return(i + cp - &start[i]);
-	} 
-
-	if ('(' != start[i++])
-		return(0);
-
-	/*
-	 * A parenthesised subexpression.  Read until the closing
-	 * parenthesis, making sure to handle any nested subexpressions
-	 * that might ruin our parse.
-	 */
-
-	while (')' != start[i]) {
-		sz = strcspn(&start[i], ")\\");
-		i += (int)sz;
-
-		if ('\0' == start[i])
-			return(-1);
-		else if ('\\' != start[i])
-			continue;
-
-		cp = &start[++i];
-		if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
-			return(-1);
-		i += cp - &start[i];
-	}
-
-	/* Read past the terminating ')'. */
-	return(++i);
-}
 
 enum mandoc_esc
 mandoc_escape(const char **end, const char **start, int *sz)
 {
-	char		 c, term, numeric;
-	int		 i, lim, ssz, rlim;
+	char		 c, term;
+	int		 i, rlim;
 	const char	*cp, *rstart;
 	enum mandoc_esc	 gly; 
 
@@ -109,9 +51,9 @@ mandoc_escape(const char **end, const ch
 	rstart = cp;
 	if (start)
 		*start = rstart;
-	i = lim = 0;
+	i = rlim = 0;
 	gly = ESCAPE_ERROR;
-	term = numeric = '\0';
+	term = '\0';
 
 	switch ((c = cp[i++])) {
 	/*
@@ -121,7 +63,7 @@ mandoc_escape(const char **end, const ch
 	 */
 	case ('('):
 		gly = ESCAPE_SPECIAL;
-		lim = 2;
+		rlim = 2;
 		break;
 	case ('['):
 		gly = ESCAPE_SPECIAL;
@@ -183,13 +125,13 @@ mandoc_escape(const char **end, const ch
 
 		switch (cp[i++]) {
 		case ('('):
-			lim = 2;
+			rlim = 2;
 			break;
 		case ('['):
 			term = ']';
 			break;
 		default:
-			lim = 1;
+			rlim = 1;
 			i--;
 			break;
 		}
@@ -244,7 +186,7 @@ mandoc_escape(const char **end, const ch
 			gly = ESCAPE_IGNORE;
 		if ('\'' != cp[i++])
 			return(ESCAPE_ERROR);
-		term = numeric = '\'';
+		term = '\'';
 		break;
 
 	/*
@@ -284,16 +226,16 @@ mandoc_escape(const char **end, const ch
 
 		switch (cp[i++]) {
 		case ('('):
-			lim = 2;
+			rlim = 2;
 			break;
 		case ('['):
-			term = numeric = ']';
+			term = ']';
 			break;
 		case ('\''):
-			term = numeric = '\'';
+			term = '\'';
 			break;
 		default:
-			lim = 1;
+			rlim = 1;
 			i--;
 			break;
 		}
@@ -310,70 +252,47 @@ mandoc_escape(const char **end, const ch
 	 */
 	default:
 		gly = ESCAPE_SPECIAL;
-		lim = 1;
+		rlim = 1;
 		i--;
 		break;
 	}
 
 	assert(ESCAPE_ERROR != gly);
 
-	rstart = &cp[i];
+	*end = rstart = &cp[i];
 	if (start)
 		*start = rstart;
 
 	/*
-	 * If a terminating block has been specified, we need to
-	 * handle the case of recursion, which could have their
-	 * own terminating blocks that mess up our parse.  This, by the
-	 * way, means that the "start" and "size" values will be
-	 * effectively meaningless.
-	 */
-
-	ssz = 0;
-	if (numeric && -1 == (ssz = numescape(&cp[i])))
-		return(ESCAPE_ERROR);
-
-	i += ssz;
-	rlim = -1;
-
-	/*
-	 * We have a character terminator.  Try to read up to that
-	 * character.  If we can't (i.e., we hit the nil), then return
-	 * an error; if we can, calculate our length, read past the
-	 * terminating character, and exit.
+	 * Read up to the terminating character,
+	 * paying attention to nested escapes.
 	 */
 
 	if ('\0' != term) {
-		*end = strchr(&cp[i], term);
-		if ('\0' == *end)
+		while (**end != term) {
+			switch (**end) {
+			case ('\0'):
+				return(ESCAPE_ERROR);
+			case ('\\'):
+				(*end)++;
+				if (ESCAPE_ERROR ==
+				    mandoc_escape(end, NULL, NULL))
+					return(ESCAPE_ERROR);
+				break;
+			default:
+				(*end)++;
+				break;
+			}
+		}
+		rlim = (*end)++ - rstart;
+	} else {
+		assert(rlim > 0);
+		if ((size_t)rlim > strlen(rstart))
 			return(ESCAPE_ERROR);
-
-		rlim = *end - &cp[i];
-		if (sz)
-			*sz = rlim;
-		(*end)++;
-		goto out;
+		*end += rlim;
 	}
-
-	assert(lim > 0);
-
-	/*
-	 * We have a numeric limit.  If the string is shorter than that,
-	 * stop and return an error.  Else adjust our endpoint, length,
-	 * and return the current glyph.
-	 */
-
-	if ((size_t)lim > strlen(&cp[i]))
-		return(ESCAPE_ERROR);
-
-	rlim = lim;
 	if (sz)
 		*sz = rlim;
-
-	*end = &cp[i] + lim;
-
-out:
-	assert(rlim >= 0 && rstart);
 
 	/* Run post-processors. */
 
--
 To unsubscribe send an email to source+unsubscribe@mdocml.bsd.lv