From mboxrd@z Thu Jan 1 00:00:00 1970
Received: from localhost (fantadrom.bsd.lv [local])
	by fantadrom.bsd.lv (OpenSMTPD) with ESMTPA id c534f4d9
	for ; Tue, 2 Apr 2019 10:53:32 -0500 (EST)
Date: Tue, 2 Apr 2019 10:53:32 -0500 (EST)
X-Mailinglist: mandoc-source
Reply-To: source@mandoc.bsd.lv
MIME-Version: 1.0
From: schwarze@mandoc.bsd.lv
To: source@mandoc.bsd.lv
Subject: docbook2mdoc: Translate XML character entity references to roff
X-Mailer: activitymail 1.26, http://search.cpan.org/dist/activitymail/
Content-Type: text/plain; charset=utf-8
Message-ID:

Log Message:
-----------
Translate XML character entity references to roff character
escape sequences.
Missing feature reported by Stephen Gregoratto .

Remaining known issues:
* Whitespace handling isn't perfect yet.
* Numeric character references aren't handled yet.
* The list of entities is still very incomplete.
* When it grows longer, we may have to switch to binary search.
* Local entities declared in the DTD are not yet handled.

Modified Files:
--------------
    docbook2mdoc:
        docbook2mdoc.c
        macro.c
        node.h
        parse.c

Revision Data
-------------
Index: node.h
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/node.h,v
retrieving revision 1.6
retrieving revision 1.7
diff -Lnode.h -Lnode.h -u -p -r1.6 -r1.7
--- node.h
+++ node.h
@@ -54,6 +54,7 @@ enum nodeid {
 	NODE_EMPHASIS,
 	NODE_ENTRY,
 	NODE_ENVAR,
+	NODE_ESCAPE,
 	NODE_FIELDSYNOPSIS,
 	NODE_FILENAME,
 	NODE_FIRSTTERM,
Index: macro.c
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/macro.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -Lmacro.c -Lmacro.c -u -p -r1.2 -r1.3
--- macro.c
+++ macro.c
@@ -161,9 +161,10 @@ macro_addnode(struct format *f, struct p
 	 * that text, letting macro_addarg() decide about quoting.
 	 */
 
-	if (pn->node == NODE_TEXT ||
+	if (pn->node == NODE_TEXT || pn->node == NODE_ESCAPE ||
 	    ((pn = TAILQ_FIRST(&pn->childq)) != NULL &&
-	     pn->node == NODE_TEXT && TAILQ_NEXT(pn, child) == NULL)) {
+	     (pn->node == NODE_TEXT || pn->node == NODE_ESCAPE) &&
+	     TAILQ_NEXT(pn, child) == NULL)) {
 		macro_addarg(f, pn->b, flags);
 		return;
 	}
@@ -239,7 +240,7 @@ print_textnode(struct format *f, struct
 {
 	struct pnode *nc;
 
-	if (n->node == NODE_TEXT)
+	if (n->node == NODE_TEXT || n->node == NODE_ESCAPE)
 		print_text(f, n->b, ARG_SPACE);
 	else
 		TAILQ_FOREACH(nc, &n->childq, child)
Index: docbook2mdoc.c
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/docbook2mdoc.c,v
retrieving revision 1.78
retrieving revision 1.79
diff -Ldocbook2mdoc.c -Ldocbook2mdoc.c -u -p -r1.78 -r1.79
--- docbook2mdoc.c
+++ docbook2mdoc.c
@@ -643,6 +643,13 @@ pnode_print(struct format *p, struct pno
 	case NODE_ENVAR:
 		macro_open(p, "Ev");
 		break;
+	case NODE_ESCAPE:
+		if (p->linestate == LINE_NEW)
+			p->linestate = LINE_TEXT;
+		else
+			putchar(' ');
+		fputs(pn->b, stdout);
+		break;
 	case NODE_FILENAME:
 		macro_open(p, "Pa");
 		break;
Index: parse.c
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/parse.c,v
retrieving revision 1.8
retrieving revision 1.9
diff -Lparse.c -Lparse.c -u -p -r1.8 -r1.9
--- parse.c
+++ parse.c
@@ -189,6 +189,62 @@ static const struct element elements[] =
 	{ NULL, NODE_IGNORE }
 };
 
+struct entity {
+	const char *name;
+	const char *roff;
+};
+
+/*
+ * XML character entity references found in the wild.
+ * Those that don't have an exact mandoc_char(7) representation
+ * are approximated, and the desired codepoint is given as a comment.
+ * Encoding them as \\[u...] would leave -Tascii out in the cold.
+ */
+static const struct entity entities[] = {
+	{ "alpha", "\\(*a" },
+	{ "amp", "&" },
+	{ "apos", "'" },
+	{ "auml", "\\(:a" },
+	{ "beta", "\\(*b" },
+	{ "circ", "^" }, /* U+02C6 */
+	{ "copy", "\\(co" },
+	{ "dagger", "\\(dg" },
+	{ "Delta", "\\(*D" },
+	{ "eacute", "\\('e" },
+	{ "emsp", "\\ " }, /* U+2003 */
+	{ "gt", ">" },
+	{ "hairsp", "\\^" },
+	{ "kappa", "\\(*k" },
+	{ "larr", "\\(<-" },
+	{ "ldquo", "\\(lq" },
+	{ "le", "\\(<=" },
+	{ "lowbar", "_" },
+	{ "lsqb", "[" },
+	{ "lt", "<" },
+	{ "mdash", "\\(em" },
+	{ "minus", "\\-" },
+	{ "ndash", "\\(en" },
+	{ "nbsp", "\\ " },
+	{ "num", "#" },
+	{ "oslash", "\\(/o" },
+	{ "ouml", "\\(:o" },
+	{ "percnt", "%" },
+	{ "quot", "\\(dq" },
+	{ "rarr", "\\(->" },
+	{ "rArr", "\\(rA" },
+	{ "rdquo", "\\(rq" },
+	{ "reg", "\\(rg" },
+	{ "rho", "\\(*r" },
+	{ "rsqb", "]" },
+	{ "sigma", "\\(*s" },
+	{ "shy", "\\&" }, /* U+00AD */
+	{ "tau", "\\(*t" },
+	{ "tilde", "\\[u02DC]" },
+	{ "times", "\\[tmu]" },
+	{ "uuml", "\\(:u" },
+	{ NULL, NULL }
+};
+
 static void
 error_msg(struct parse *p, const char *fmt, ...)
 {
@@ -275,6 +331,52 @@ pnode_trim(struct pnode *pn)
 			break;
 }
 
+static void
+xml_entity(struct parse *p, const char *name)
+{
+	const struct entity *entity;
+	struct pnode *dat;
+
+	if (p->del > 0)
+		return;
+
+	if (p->cur == NULL) {
+		error_msg(p, "discarding entity before document: &%s;", name);
+		return;
+	}
+
+	/* Close out the text node, if there is one. */
+	if (p->cur->node == NODE_TEXT) {
+		pnode_trim(p->cur);
+		p->cur = p->cur->parent;
+	}
+
+	if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
+		warn_msg(p, "entity after end of document: &%s;", name);
+
+	for (entity = entities; entity->name != NULL; entity++)
+		if (strcmp(name, entity->name) == 0)
+			break;
+
+	if (entity->roff == NULL) {
+		error_msg(p, "unknown entity &%s;", name);
+		return;
+	}
+
+	/* Create, append, and close out an entity node.
*/
+	if ((dat = calloc(1, sizeof(*dat))) == NULL ||
+	    (dat->b = dat->real = strdup(entity->roff)) == NULL) {
+		perror(NULL);
+		exit(1);
+	}
+	dat->node = NODE_ESCAPE;
+	dat->bsz = strlen(dat->b);
+	dat->parent = p->cur;
+	TAILQ_INIT(&dat->childq);
+	TAILQ_INIT(&dat->attrq);
+	TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
+}
+
 /*
  * Begin an element.
  */
@@ -573,14 +675,14 @@ parse_file(struct parse *p, int fd, cons
 			}
 
 			/*
-			 * The following three cases (in_arg, in_tag,
-			 * and starting a tag) all parse a word or
-			 * quoted string. If that extends beyond the
+			 * The following four cases (in_arg, in_tag, and
+			 * starting an entity or a tag) all parse a word
+			 * or quoted string. If that extends beyond the
 			 * read buffer and the last read(2) still got
 			 * data, they all break out of the token loop
 			 * to request more data from the read loop.
 			 *
-			 * Also, they all detect self-closing tags,
+			 * Also, three of them detect self-closing tags,
 			 * those ending with "/>", setting the flag
 			 * elem_end and calling xml_elem_end() at the
 			 * very end, after handling the attribute value,
@@ -689,10 +791,21 @@ parse_file(struct parse *p, int fd, cons
 				if (elem_end)
 					xml_elem_end(p, b + poff);
 
-			/* Process text up to the next tag. */
+			/* Process an entity. */
+
+			} else if (b[poff] == '&') {
+				if (advance(p, b, rlen, &pend, ";") &&
+				    rsz > 0)
+					break;
+				b[pend] = '\0';
+				if (pend < rlen)
+					pend++;
+				xml_entity(p, b + poff + 1);
+
+			/* Process text up to the next tag or entity. */
 
 			} else {
-				if (advance(p, b, rlen, &pend, "<") == 0)
+				if (advance(p, b, rlen, &pend, "<&") == 0)
 					p->ncol--;
 				xml_char(p, b + poff, pend - poff);
 			}
--
To unsubscribe send an email to source+unsubscribe@mandoc.bsd.lv