* docbook2mdoc: The expat library aborts parsing as soon as it encounters
@ 2019-03-28 12:21 schwarze
0 siblings, 0 replies; only message in thread
From: schwarze @ 2019-03-28 12:21 UTC (permalink / raw)
To: source
Log Message:
-----------
The expat library aborts parsing as soon as it encounters invalid
input, and the basic design of the library practically precludes
fixing it. However, whether the input is well-formed XML or not
is totally irrelevant, and in fact, I have seen real-world documents
from X.org that expat rejects as not well-formed. Kristaps reports
the same from OpenGL.
We really want to parse *ANYTHING* whatsoever without ever throwing
a fatal error - after all, the point is to convert legacy documents
to a better format, and nitpicking about the syntax merely alienates
users (including myself).
Consequently, ditch expat and write a parser from scratch, optimized
for robustness on invalid input.
Oh, and by the way, it only requires 200 lines of code,
compared to 15,000 lines in expat - an economy of 98.5%
at the same time as being much more useful in practice.
Modified Files:
--------------
docbook2mdoc:
Makefile
index.xml
main.c
node.c
node.h
parse.c
Revision Data
-------------
Index: Makefile
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/Makefile,v
retrieving revision 1.20
retrieving revision 1.21
diff -LMakefile -LMakefile -u -p -r1.20 -r1.21
--- Makefile
+++ Makefile
@@ -10,7 +10,7 @@ OBJS = node.o parse.o macro.o docbook2md
all: docbook2mdoc
docbook2mdoc: $(OBJS)
- $(CC) -o $@ $(OBJS) -lexpat
+ $(CC) -o $@ $(OBJS)
www: index.html docbook2mdoc.1.html docbook2mdoc-$(VERSION).tgz README.txt
Index: node.c
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/node.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -Lnode.c -Lnode.c -u -p -r1.1 -r1.2
--- node.c
+++ node.c
@@ -143,8 +143,8 @@ pnode_getattr_raw(struct pnode *pn, enum
return defval;
TAILQ_FOREACH(ap, &pn->attrq, child)
if (ap->key == key)
- return ap->val == ATTRVAL__MAX ? ap->rawval :
- attrvals[ap->val];
+ return ap->val != ATTRVAL__MAX ? attrvals[ap->val] :
+ ap->rawval != NULL ? ap->rawval : defval;
return defval;
}
Index: parse.c
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/parse.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -Lparse.c -Lparse.c -u -p -r1.4 -r1.5
--- parse.c
+++ parse.c
@@ -17,8 +17,8 @@
*/
#include <assert.h>
#include <ctype.h>
-#include <expat.h>
#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
#include <unistd.h>
@@ -34,11 +34,16 @@
* Keep this as simple and small as possible.
*/
struct parse {
- XML_Parser xml;
const char *fname; /* Name of the input file. */
struct ptree *tree; /* Complete parse result. */
struct pnode *cur; /* Current node in the tree. */
+ enum nodeid ncur; /* Type of the current node. */
+ int line; /* Line number in the input file. */
+ int col; /* Column number in the input file. */
+ int nline; /* Line number of next token. */
+ int ncol; /* Column number of next token. */
int del; /* Levels of nested nodes being deleted. */
+ int attr; /* The most recent attribute is valid. */
int warn;
};
@@ -179,7 +184,7 @@ static const struct element elements[] =
{ "wordasword", NODE_WORDASWORD },
{ "xi:include", NODE_DELETE_WARN },
{ "year", NODE_YEAR },
- { NULL, NODE__MAX }
+ { NULL, NODE_IGNORE }
};
/*
@@ -188,29 +193,21 @@ static const struct element elements[] =
* Otherwise, create a new one as a child of the current node.
*/
static void
-xml_char(void *arg, const XML_Char *p, int sz)
+xml_char(struct parse *ps, const char *p, int sz)
{
- struct parse *ps;
struct pnode *dat;
- int i;
- ps = arg;
- if (ps->del > 0 || ps->tree->flags & TREE_FAIL)
+ if (ps->del > 0)
return;
- /*
- * Only create a new node if there is non-whitespace text.
- * Strip all leading whitespace.
- */
- if (ps->cur->node != NODE_TEXT) {
- for (i = 0; i < sz; i++)
- if (isspace((unsigned char)p[i]) == 0)
- break;
- if (i == sz)
- return;
- p += i;
- sz -= i;
+ if (ps->cur == NULL) {
+ fprintf(stderr, "%s:%d:%d: discarding text before docum"
+ "ent: %.*s\n", ps->fname, ps->line, ps->col, sz, p);
+ ps->tree->flags |= TREE_FAIL;
+ return;
+ }
+ if (ps->cur->node != NODE_TEXT) {
if ((dat = calloc(1, sizeof(*dat))) == NULL) {
perror(NULL);
exit(1);
@@ -223,6 +220,12 @@ xml_char(void *arg, const XML_Char *p, i
ps->cur = dat;
}
+ if (ps->tree->flags & TREE_CLOSED &&
+ ps->cur->parent == ps->tree->root && ps->warn)
+ fprintf(stderr, "%s:%d:%d: warning: "
+ "text after end of document: %.*s\n",
+ ps->fname, ps->line, ps->col, sz, p);
+
/* Append to the current text node. */
assert(sz >= 0);
@@ -248,20 +251,14 @@ pnode_trim(struct pnode *pn)
/*
* Begin an element.
- * If the name is unknown, abort parsing.
*/
static void
-xml_elem_start(void *arg, const XML_Char *name, const XML_Char **atts)
+xml_elem_start(struct parse *ps, const char *name)
{
- struct parse *ps;
- const struct element *elem;
- enum attrkey key;
- struct pnode *dat;
- struct pattr *pattr;
- const XML_Char **att;
+ const struct element *elem;
+ struct pnode *dat;
- ps = arg;
- if (ps->tree->flags & TREE_FAIL)
+ if (*name == '!' || *name == '?')
return;
/*
@@ -284,20 +281,18 @@ xml_elem_start(void *arg, const XML_Char
break;
if (elem->name == NULL) {
- fprintf(stderr, "%s:%zu:%zu: unknown element \"%s\"\n",
- ps->fname, XML_GetCurrentLineNumber(ps->xml),
- XML_GetCurrentColumnNumber(ps->xml), name);
+ fprintf(stderr, "%s:%d:%d: unknown element <%s>\n",
+ ps->fname, ps->line, ps->col, name);
ps->tree->flags |= TREE_FAIL;
- return;
}
+ ps->ncur = elem->node;
- switch (elem->node) {
+ switch (ps->ncur) {
case NODE_DELETE_WARN:
if (ps->warn)
- fprintf(stderr, "%s:%zu:%zu: warning: "
- "skipping element <%s>\n", ps->fname,
- XML_GetCurrentLineNumber(ps->xml),
- XML_GetCurrentColumnNumber(ps->xml), name);
+ fprintf(stderr, "%s:%d:%d: warning: "
+ "skipping element <%s>\n",
+ ps->fname, ps->line, ps->col, name);
/* FALLTHROUGH */
case NODE_DELETE:
ps->del = 1;
@@ -311,6 +306,12 @@ xml_elem_start(void *arg, const XML_Char
break;
}
+ if (ps->tree->flags & TREE_CLOSED &&
+ ps->cur->parent == NULL && ps->warn)
+ fprintf(stderr, "%s:%d:%d: warning: "
+ "element after end of document: %s\n",
+ ps->fname, ps->line, ps->col, name);
+
if ((dat = calloc(1, sizeof(*dat))) == NULL) {
perror(NULL);
exit(1);
@@ -326,26 +327,48 @@ xml_elem_start(void *arg, const XML_Char
ps->cur = dat;
if (ps->tree->root == NULL)
ps->tree->root = dat;
+}
- /*
- * Process attributes.
- */
- for (att = atts; *att != NULL; att += 2) {
- if ((key = attrkey_parse(*att)) == ATTRKEY__MAX) {
- if (ps->warn)
- fprintf(stderr, "%s:%zu:%zu: warning: "
- "unknown attribute \"%s\"\n",
- ps->fname,
- XML_GetCurrentLineNumber(ps->xml),
- XML_GetCurrentColumnNumber(ps->xml),
- *att);
- continue;
- }
- pattr = calloc(1, sizeof(*pattr));
- pattr->key = key;
- if ((pattr->val = attrval_parse(att[1])) == ATTRVAL__MAX)
- pattr->rawval = strdup(att[1]);
- TAILQ_INSERT_TAIL(&dat->attrq, pattr, child);
+static void
+xml_attrkey(struct parse *ps, const char *name)
+{
+ struct pattr *attr;
+ enum attrkey key;
+
+ if (ps->del > 0 || *name == '\0')
+ return;
+ if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
+ if (ps->warn)
+ fprintf(stderr, "%s:%d:%d: warning: "
+ "unknown attribute \"%s\"\n",
+ ps->fname, ps->line, ps->col, name);
+ ps->attr = 0;
+ return;
+ }
+ if ((attr = calloc(1, sizeof(*attr))) == NULL) {
+ perror(NULL);
+ exit(1);
+ }
+ attr->key = key;
+ attr->val = ATTRVAL__MAX;
+ attr->rawval = NULL;
+ TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
+ ps->attr = 1;
+}
+
+static void
+xml_attrval(struct parse *ps, const char *name)
+{
+ struct pattr *attr;
+
+ if (ps->del > 0 || ps->attr == 0)
+ return;
+ if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
+ return;
+ if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&
+ (attr->rawval = strdup(name)) == NULL) {
+ perror(NULL);
+ exit(1);
}
}
@@ -354,14 +377,10 @@ xml_elem_start(void *arg, const XML_Char
* If we're at a text node, roll that one up first.
*/
static void
-xml_elem_end(void *arg, const XML_Char *name)
+xml_elem_end(struct parse *ps, const char *name)
{
- struct parse *ps;
- const struct element *elem;
-
- ps = arg;
- if (ps->tree->flags & TREE_FAIL)
- return;
+ const struct element *elem;
+ enum nodeid node;
/*
* An ancestor is excluded from the tree;
@@ -373,25 +392,47 @@ xml_elem_end(void *arg, const XML_Char *
}
/* Close out the text node, if there is one. */
- if (ps->del == 0 && ps->cur->node == NODE_TEXT) {
+ if (ps->del == 0 && ps->cur != NULL && ps->cur->node == NODE_TEXT) {
pnode_trim(ps->cur);
ps->cur = ps->cur->parent;
}
- for (elem = elements; elem->name != NULL; elem++)
- if (strcmp(elem->name, name) == 0)
- break;
+ if (name != NULL) {
+ for (elem = elements; elem->name != NULL; elem++)
+ if (strcmp(elem->name, name) == 0)
+ break;
+ node = elem->node;
+ } else
+ node = ps->ncur;
- switch (elem->node) {
+ switch (node) {
case NODE_DELETE_WARN:
case NODE_DELETE:
- ps->del--;
+ if (ps->del > 0)
+ ps->del--;
break;
case NODE_IGNORE:
break;
default:
- assert(elem->node == ps->cur->node);
- ps->cur = ps->cur->parent;
+ if (ps->cur == NULL || node != ps->cur->node) {
+ if (ps->warn)
+ fprintf(stderr, "%s:%d:%d: warning: "
+ "element not open: </%s>\n",
+ ps->fname, ps->line, ps->col, name);
+ break;
+ }
+
+ /*
+ * Refrain from actually closing the document element.
+ * If no more content follows, no harm is done, but if
+ * some content still follows, simply processing it is
+ * obviously better than discarding it or crashing.
+ */
+
+ if (ps->cur->parent == NULL)
+ ps->tree->flags |= TREE_CLOSED;
+ else
+ ps->cur = ps->cur->parent;
break;
}
assert(ps->del == 0);
@@ -409,16 +450,7 @@ parse_alloc(int warn)
free(p);
return NULL;
}
-
- if ((p->xml = XML_ParserCreate(NULL)) == NULL) {
- free(p->tree);
- free(p);
- return NULL;
- }
p->warn = warn;
- XML_SetCharacterDataHandler(p->xml, xml_char);
- XML_SetElementHandler(p->xml, xml_elem_start, xml_elem_end);
- XML_SetUserData(p->xml, p);
return p;
}
@@ -427,7 +459,6 @@ parse_free(struct parse *p)
{
if (p == NULL)
return;
- XML_ParserFree(p->xml);
if (p->tree != NULL) {
pnode_unlink(p->tree->root);
free(p->tree);
@@ -435,28 +466,219 @@ parse_free(struct parse *p)
free(p);
}
+/*
+ * Advance the pend pointer to the next character in the charset.
+ * If the charset starts with a space, it stands for any whitespace.
+ * Update the new input file position, used for messages.
+ * Do not overrun the buffer b of length rlen.
+ * When reaching the end, NUL-terminate the buffer and return 1;
+ * otherwise, return 0.
+ */
+static int
+advance(struct parse *p, char *b, size_t rlen, size_t *pend,
+ const char *charset)
+{
+ int space;
+
+ if (*charset == ' ') {
+ space = 1;
+ charset++;
+ } else
+ space = 0;
+
+ p->nline = p->line;
+ p->ncol = p->col;
+ while (*pend < rlen) {
+ if (b[*pend] == '\n') {
+ p->nline++;
+ p->ncol = 1;
+ } else
+ p->ncol++;
+ if (space && isspace((unsigned char)b[*pend]))
+ break;
+ if (strchr(charset, b[*pend]) != NULL)
+ break;
+ ++*pend;
+ }
+ if (*pend == rlen) {
+ b[rlen] = '\0';
+ return 1;
+ } else
+ return 0;
+}
+
struct ptree *
parse_file(struct parse *p, int fd, const char *fname)
{
char b[4096];
- ssize_t ssz;
+ ssize_t rsz; /* Return value from read(2). */
+ size_t rlen; /* Number of bytes in b[]. */
+ size_t poff; /* Parse offset in b[]. */
+ size_t pend; /* Offset of the end of the current word. */
+ int in_tag, in_arg, in_quotes, elem_end;
p->fname = fname;
- do {
- if ((ssz = read(fd, b, sizeof(b))) < 0) {
- perror(fname);
- pnode_unlink(p->tree->root);
- p->tree->root = p->cur = NULL;
- p->tree->flags |= TREE_FAIL;
- return NULL;
- }
- if (XML_Parse(p->xml, b, ssz, ssz == 0) == 0) {
- fprintf(stderr, "%s:%zu:%zu: %s\n", fname,
- XML_GetCurrentLineNumber(p->xml),
- XML_GetCurrentColumnNumber(p->xml),
- XML_ErrorString(XML_GetErrorCode(p->xml)));
- p->tree->flags |= TREE_FAIL;
+ p->nline = 1;
+ p->ncol = 1;
+ rlen = 0;
+ in_tag = in_arg = in_quotes = 0;
+
+ /*
+ * Read loop.
+ *
+ * We have to enter the read loop once more even on EOF
+ * because the previous token may have been incomplete,
+ * such that it asked for more input.
+ * Once rsz is 0, incomplete tokens will no longer ask
+ * for more input but instead use whatever there is,
+ * and then exit the read loop.
+ * The minus one on the size limit for read(2) is needed
+ * such that advance() can set b[rlen] to NUL when needed.
+ */
+
+ while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
+ if ((rlen += rsz) == 0)
+ break;
+
+ /* Token loop. */
+
+ pend = 0;
+ for (;;) {
+
+ /* Proceed to the next token, skipping whitespace. */
+
+ p->line = p->nline;
+ p->col = p->ncol;
+ if ((poff = pend) == rlen)
+ break;
+ if (isspace((unsigned char)b[pend])) {
+ if (b[pend++] == '\n') {
+ p->nline++;
+ p->ncol = 1;
+ } else
+ p->ncol++;
+ continue;
+ }
+
+ /*
+ * The following three cases (in_arg, in_tag,
+ * and starting a tag) all parse a word or
+ * quoted string. If that extends beyond the
+ * read buffer and the last read(2) still got
+ * data, they all break out of the token loop
+ * to request more data from the read loop.
+ *
+ * Also, they all detect self-closing tags,
+ * those ending with "/>", setting the flag
+ * elem_end and calling xml_elem_end() at the
+ * very end, after handling the attribute value,
+ * attribute name, or tag name, respectively.
+ */
+
+ /* Parse an attribute value. */
+
+ if (in_arg) {
+ if (in_quotes == 0 && b[pend] == '"') {
+ in_quotes = 1;
+ p->ncol++;
+ pend++;
+ continue;
+ }
+ if (advance(p, b, rlen, &pend,
+ in_quotes ? "\"" : " >") && rsz > 0)
+ break;
+ in_arg = in_quotes = elem_end = 0;
+ if (b[pend] == '>') {
+ in_tag = 0;
+ if (pend > 0 && b[pend - 1] == '/') {
+ b[pend - 1] = '\0';
+ elem_end = 1;
+ }
+ }
+ b[pend] = '\0';
+ if (pend < rlen)
+ pend++;
+ xml_attrval(p, b + poff);
+ if (elem_end)
+ xml_elem_end(p, NULL);
+
+ /* Look for an attribute name. */
+
+ } else if (in_tag) {
+ if (advance(p, b, rlen, &pend, " =>") &&
+ rsz > 0)
+ break;
+ elem_end = 0;
+ switch (b[pend]) {
+ case '>':
+ in_tag = 0;
+ if (pend > 0 && b[pend - 1] == '/') {
+ b[pend - 1] = '\0';
+ elem_end = 1;
+ }
+ break;
+ case '=':
+ in_arg = 1;
+ break;
+ default:
+ break;
+ }
+ b[pend] = '\0';
+ if (pend < rlen)
+ pend++;
+ xml_attrkey(p, b + poff);
+ if (elem_end)
+ xml_elem_end(p, NULL);
+
+ /* Begin an opening or closing tag. */
+
+ } else if (b[poff] == '<') {
+ if (advance(p, b, rlen, &pend, " >") &&
+ rsz > 0)
+ break;
+ elem_end = 0;
+ if (b[pend] != '>')
+ in_tag = 1;
+ else if (pend > 0 && b[pend - 1] == '/') {
+ b[pend - 1] = '\0';
+ elem_end = 1;
+ }
+ b[pend] = '\0';
+ if (pend < rlen)
+ pend++;
+ if (b[++poff] == '/') {
+ elem_end = 1;
+ poff++;
+ } else
+ xml_elem_start(p, b + poff);
+ if (elem_end)
+ xml_elem_end(p, b + poff);
+
+ /* Process text up to the next tag. */
+
+ } else {
+ if (advance(p, b, rlen, &pend, "<") == 0)
+ p->ncol--;
+ xml_char(p, b + poff, pend - poff);
+ }
}
- } while (ssz > 0 && (p->tree->flags & TREE_FAIL) == 0);
+
+ /* Buffer exhausted; shift left and re-fill. */
+
+ assert(poff > 0);
+ memmove(b, b + poff, rlen - poff);
+ rlen -= poff;
+ }
+ if (rsz < 0) {
+ perror(fname);
+ p->tree->flags |= TREE_FAIL;
+ }
+ if (p->cur != NULL && p->cur->node == NODE_TEXT) {
+ pnode_trim(p->cur);
+ p->cur = p->cur->parent;
+ }
+ if ((p->tree->flags & TREE_CLOSED) == 0 && p->warn)
+ fprintf(stderr, "%s:%d:%d: warning: document not closed\n",
+ p->fname, p->line, p->col);
return p->tree;
}
Index: main.c
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/main.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -Lmain.c -Lmain.c -u -p -r1.2 -r1.3
--- main.c
+++ main.c
@@ -86,8 +86,8 @@ main(int argc, char *argv[])
rc = 0;
ptree_print(tree);
if (tree->flags & TREE_FAIL)
- fputs("\nThe output is incomplete, see "
- "the parse error reported above.\n\n",
+ fputs("\nThe output may be incomplete, see"
+ " the parse error reported above.\n\n",
stderr);
pnode_unlink(tree->root);
tree->root = NULL;
Index: node.h
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/node.h,v
retrieving revision 1.4
retrieving revision 1.5
diff -Lnode.h -Lnode.h -u -p -r1.4 -r1.5
--- node.h
+++ node.h
@@ -212,6 +212,7 @@ struct ptree {
int flags;
#define TREE_FAIL (1 << 0) /* A fatal parse error occurred. */
#define TREE_EQN (1 << 1) /* The document needs inline eqn(7). */
+#define TREE_CLOSED (1 << 2) /* The document element was closed. */
};
Index: index.xml
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/index.xml,v
retrieving revision 1.12
retrieving revision 1.13
diff -Lindex.xml -Lindex.xml -u -p -r1.12 -r1.13
--- index.xml
+++ index.xml
@@ -3,7 +3,7 @@
<head>
<meta name="viewport" content="initial-scale=1.0, user-scalable=no" />
<meta charset='utf-8' />
- <title>docbook2mdoc | Convert DocBook refentry to mdoc</title>
+ <title>docbook2mdoc | Convert DocBook to mdoc</title>
<style>
html, body { margin: 0; padding: 0; }
header { margin-top: 1em; }
@@ -25,7 +25,7 @@
<header>
<span class="nm">docbook2mdoc</span>
—
- <span class="nd">Convert DocBook refentry to mdoc</span>
+ <span class="nd">Convert DocBook to mdoc</span>
<nav>
<span>version <span>@VERSION@</span></span>
<span><a href="snapshots/docbook2mdoc.tgz">Sources</a></span>
@@ -40,13 +40,11 @@
Unlike most DocBook utilities, it's a standalone <a rel="license"
href="https://www.isc.org/downloads/software-support-policy/isc-license/">ISC</a>-licensed ISO C utility
that should compile on any modern UNIX system.
- The only requirement is <a href="http://expat.sourceforge.net/">libexpat</a> (for parsing XML), which is
- installed by default on most systems.
</p>
<p>
<span class="nm">docbook2mdoc</span> is experimental: it still has many missing elements.
However, it works with the DocBook reference examples and documents found in the wild.
- Contact <a href="http://kristaps.bsd.lv">Kristaps</a> with questions or missing nodes; or better yet, download
+ Contact <a href="https://mandoc.bsd.lv/contact.html">the developers</a> with questions or missing nodes; or better yet, download
the source and add new elements yourself as described in the <a href="README.txt">README</a>.
</p>
<h2>
--
To unsubscribe send an email to source+unsubscribe@mandoc.bsd.lv
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2019-03-28 12:21 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-28 12:21 docbook2mdoc: The expat library aborts parsing as soon as it encounters schwarze
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).