From mboxrd@z Thu Jan  1 00:00:00 1970
Received: from scc-mailout-kit-01.scc.kit.edu (scc-mailout-kit-01.scc.kit.edu [129.13.231.81])
	by fantadrom.bsd.lv (OpenSMTPD) with ESMTP id 2b216e00
	for <tech@mandoc.bsd.lv>;
	Tue, 2 Apr 2019 11:02:54 -0500 (EST)
Received: from asta-nat.asta.uni-karlsruhe.de ([172.22.63.82] helo=hekate.usta.de)
	by scc-mailout-kit-01.scc.kit.edu with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256)
	(envelope-from <schwarze@usta.de>)
	id 1hBLsC-0003nm-88; Tue, 02 Apr 2019 18:02:53 +0200
Received: from donnerwolke.usta.de ([172.24.96.3])
	by hekate.usta.de with esmtp (Exim 4.77)
	(envelope-from <schwarze@usta.de>)
	id 1hBLsB-000741-Az; Tue, 02 Apr 2019 18:02:51 +0200
Received: from athene.usta.de ([172.24.96.10])
	by donnerwolke.usta.de with esmtp (Exim 4.84_2)
	(envelope-from <schwarze@usta.de>)
	id 1hBLsB-0007Mf-2i; Tue, 02 Apr 2019 18:02:51 +0200
Received: from localhost (athene.usta.de [local])
	by athene.usta.de (OpenSMTPD) with ESMTPA id 453c4fb8;
	Tue, 2 Apr 2019 18:02:51 +0200 (CEST)
Date: Tue, 2 Apr 2019 18:02:51 +0200
From: Ingo Schwarze <schwarze@usta.de>
To: Stephen Gregoratto <dev@sgregoratto.me>
Cc: tech@mandoc.bsd.lv
Subject: Re: Parsing errors, output regressions with new XML parser
Message-ID: <20190402160251.GD6369@athene.usta.de>
References: <20190330001919.rrbc2xxrx47upalg@BlackBox>
X-Mailinglist: mandoc-tech
Reply-To: tech@mandoc.bsd.lv
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20190330001919.rrbc2xxrx47upalg@BlackBox>
User-Agent: Mutt/1.8.0 (2017-02-23)

Hi Stephen,

Stephen Gregoratto wrote on Sat, Mar 30, 2019 at 11:19:19AM +1100:

> -  escaped XML chars aren't converted back into ASCII:

The commit below implements the basic functionality.
It can be polished in the future.

> Also, I noticed that cvsweb was down for most of yesterday. Scheduled 
> maintenance?

No, slowcgi(8) crashed.  I updated and restarted slowcgi(8), and it
should be back to normal operation now.

Yours,
  Ingo


Log Message:
-----------
Translate XML character entity references to roff character escape sequences.
Missing feature reported by Stephen Gregoratto <dev at sgregoratto dot me>.

Remaining known issues:
* Whitespace handling isn't perfect yet.
* Numeric character references aren't handled yet.
* The list of entities is still very incomplete.
* When it grows longer, we may have to switch to binary search.
* Local entities declared in the DTD are not yet handled.

Modified Files:
--------------
    docbook2mdoc:
        docbook2mdoc.c
        macro.c
        node.h
        parse.c

Revision Data
-------------
Index: node.h
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/node.h,v
retrieving revision 1.6
retrieving revision 1.7
diff -Lnode.h -Lnode.h -u -p -r1.6 -r1.7
--- node.h
+++ node.h
@@ -54,6 +54,7 @@ enum	nodeid {
 	NODE_EMPHASIS,
 	NODE_ENTRY,
 	NODE_ENVAR,
+	NODE_ESCAPE,
 	NODE_FIELDSYNOPSIS,
 	NODE_FILENAME,
 	NODE_FIRSTTERM,
Index: macro.c
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/macro.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -Lmacro.c -Lmacro.c -u -p -r1.2 -r1.3
--- macro.c
+++ macro.c
@@ -161,9 +161,10 @@ macro_addnode(struct format *f, struct p
 	 * that text, letting macro_addarg() decide about quoting.
 	 */
 
-	if (pn->node == NODE_TEXT ||
+	if (pn->node == NODE_TEXT || pn->node == NODE_ESCAPE ||
 	    ((pn = TAILQ_FIRST(&pn->childq)) != NULL &&
-	     pn->node == NODE_TEXT && TAILQ_NEXT(pn, child) == NULL)) {
+	     (pn->node == NODE_TEXT || pn->node == NODE_ESCAPE) &&
+	     TAILQ_NEXT(pn, child) == NULL)) {
 		macro_addarg(f, pn->b, flags);
 		return;
 	}
@@ -239,7 +240,7 @@ print_textnode(struct format *f, struct 
 {
 	struct pnode	*nc;
 
-	if (n->node == NODE_TEXT)
+	if (n->node == NODE_TEXT || n->node == NODE_ESCAPE)
 		print_text(f, n->b, ARG_SPACE);
 	else
 		TAILQ_FOREACH(nc, &n->childq, child)
Index: docbook2mdoc.c
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/docbook2mdoc.c,v
retrieving revision 1.78
retrieving revision 1.79
diff -Ldocbook2mdoc.c -Ldocbook2mdoc.c -u -p -r1.78 -r1.79
--- docbook2mdoc.c
+++ docbook2mdoc.c
@@ -643,6 +643,13 @@ pnode_print(struct format *p, struct pno
 	case NODE_ENVAR:
 		macro_open(p, "Ev");
 		break;
+	case NODE_ESCAPE:
+		if (p->linestate == LINE_NEW)
+			p->linestate = LINE_TEXT;
+		else
+			putchar(' ');
+		fputs(pn->b, stdout);
+		break;
 	case NODE_FILENAME:
 		macro_open(p, "Pa");
 		break;
Index: parse.c
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/parse.c,v
retrieving revision 1.8
retrieving revision 1.9
diff -Lparse.c -Lparse.c -u -p -r1.8 -r1.9
--- parse.c
+++ parse.c
@@ -189,6 +189,62 @@ static	const struct element elements[] =
 	{ NULL,			NODE_IGNORE }
 };
 
+struct	entity {
+	const char	*name;
+	const char	*roff;
+};
+
+/*
+ * XML character entity references found in the wild.
+ * Those that don't have an exact mandoc_char(7) representation
+ * are approximated, and the desired codepoint is given as a comment.
+ * Encoding them as \\[u...] would leave -Tascii out in the cold.
+ */
+static	const struct entity entities[] = {
+	{ "alpha",	"\\(*a" },
+	{ "amp",	"&" },
+	{ "apos",	"'" },
+	{ "auml",	"\\(:a" },
+	{ "beta",	"\\(*b" },
+	{ "circ",	"^" },      /* U+02C6 */
+	{ "copy",	"\\(co" },
+	{ "dagger",	"\\(dg" },
+	{ "Delta",	"\\(*D" },
+	{ "eacute",	"\\('e" },
+	{ "emsp",	"\\ " },    /* U+2003 */
+	{ "gt",		">" },
+	{ "hairsp",	"\\^" },
+	{ "kappa",	"\\(*k" },
+	{ "larr",	"\\(<-" },
+	{ "ldquo",	"\\(lq" },
+	{ "le",		"\\(<=" },
+	{ "lowbar",	"_" },
+	{ "lsqb",	"[" },
+	{ "lt",		"<" },
+	{ "mdash",	"\\(em" },
+	{ "minus",	"\\-" },
+	{ "ndash",	"\\(en" },
+	{ "nbsp",	"\\ " },
+	{ "num",	"#" },
+	{ "oslash",	"\\(/o" },
+	{ "ouml",	"\\(:o" },
+	{ "percnt",	"%" },
+	{ "quot",	"\\(dq" },
+	{ "rarr",	"\\(->" },
+	{ "rArr",	"\\(rA" },
+	{ "rdquo",	"\\(rq" },
+	{ "reg",	"\\(rg" },
+	{ "rho",	"\\(*r" },
+	{ "rsqb",	"]" },
+	{ "sigma",	"\\(*s" },
+	{ "shy",	"\\&" },     /* U+00AD */
+	{ "tau",	"\\(*t" },
+	{ "tilde",	"\\[u02DC]" },
+	{ "times",	"\\[tmu]" },
+	{ "uuml",	"\\(:u" },
+	{ NULL,		NULL }
+};
+
 static void
 error_msg(struct parse *p, const char *fmt, ...)
 {
@@ -275,6 +331,52 @@ pnode_trim(struct pnode *pn)
 			break;
 }
 
+static void
+xml_entity(struct parse *p, const char *name)
+{
+	const struct entity	*entity;
+	struct pnode		*dat;
+
+	if (p->del > 0)
+		return;
+
+	if (p->cur == NULL) {
+		error_msg(p, "discarding entity before document: &%s;", name);
+		return;
+	}
+
+	/* Close out the text node, if there is one. */
+	if (p->cur->node == NODE_TEXT) {
+		pnode_trim(p->cur);
+		p->cur = p->cur->parent;
+	}
+
+	if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
+		warn_msg(p, "entity after end of document: &%s;", name);
+
+	for (entity = entities; entity->name != NULL; entity++)
+		if (strcmp(name, entity->name) == 0)
+			break;
+
+	if (entity->roff == NULL) {
+		error_msg(p, "unknown entity &%s;", name);
+		return;
+	}
+
+	/* Create, append, and close out an entity node. */
+	if ((dat = calloc(1, sizeof(*dat))) == NULL ||
+	    (dat->b = dat->real = strdup(entity->roff)) == NULL) {
+		perror(NULL);
+		exit(1);
+	}
+	dat->node = NODE_ESCAPE;
+	dat->bsz = strlen(dat->b);
+	dat->parent = p->cur;
+	TAILQ_INIT(&dat->childq);
+	TAILQ_INIT(&dat->attrq);
+	TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
+}
+
 /*
  * Begin an element.
  */
@@ -573,14 +675,14 @@ parse_file(struct parse *p, int fd, cons
 			}
 
 			/*
-			 * The following three cases (in_arg, in_tag,
-			 * and starting a tag) all parse a word or
-			 * quoted string.  If that extends beyond the
+			 * The following four cases (in_arg, in_tag, and
+			 * starting an entity or a tag) all parse a word
+			 * or quoted string.  If that extends beyond the
 			 * read buffer and the last read(2) still got
 			 * data, they all break out of the token loop
 			 * to request more data from the read loop.
 			 *
-			 * Also, they all detect self-closing tags,
+			 * Also, three of them detect self-closing tags,
 			 * those ending with "/>", setting the flag
 			 * elem_end and calling xml_elem_end() at the
 			 * very end, after handling the attribute value,
@@ -689,10 +791,21 @@ parse_file(struct parse *p, int fd, cons
 				if (elem_end)
 					xml_elem_end(p, b + poff);
 
-			/* Process text up to the next tag. */
+			/* Process an entity. */
+
+			} else if (b[poff] == '&') {
+				if (advance(p, b, rlen, &pend, ";") &&
+				    rsz > 0)
+					break;
+				b[pend] = '\0';
+				if (pend < rlen)
+					pend++;
+				xml_entity(p, b + poff + 1);
+
+			/* Process text up to the next tag or entity. */
 
 			} else {
-				if (advance(p, b, rlen, &pend, "<") == 0)
+				if (advance(p, b, rlen, &pend, "<&") == 0)
 					p->ncol--;
 				xml_char(p, b + poff, pend - poff);
 			}
--
 To unsubscribe send an email to tech+unsubscribe@mandoc.bsd.lv