* docbook2mdoc: Split parse_file() into parse_file() to fill the parse buffer and parse_string() to handle it
@ 2019-04-05 14:38 schwarze
0 siblings, 0 replies; only message in thread
From: schwarze @ 2019-04-05 14:38 UTC (permalink / raw)
To: source
Log Message:
-----------
Split parse_file() into two functions: parse_file() fills the parse
buffer, and parse_string() processes its contents.
This allows parsing from alternative sources in a subsequent step.
Only advance line and column number when parsing from the main input file.
Represent parsing state as an enum rather than with multiple flags.
Modified Files:
--------------
docbook2mdoc:
parse.c
Revision Data
-------------
Index: parse.c
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/parse.c,v
retrieving revision 1.13
retrieving revision 1.14
diff -Lparse.c -Lparse.c -u -p -r1.13 -r1.14
--- parse.c
+++ parse.c
@@ -30,6 +30,14 @@
* The implementation of the DocBook parser.
*/
+enum pstate {
+ PARSE_ELEM,
+ PARSE_TAG,
+ PARSE_ARG,
+ PARSE_SQ,
+ PARSE_DQ
+};
+
/*
* Global parse state.
* Keep this as simple and small as possible.
@@ -582,6 +590,19 @@ parse_free(struct parse *p)
free(p);
}
+static void
+increment(struct parse *p, char *b, size_t *pend, int refill)
+{
+ if (refill) {
+ if (b[*pend] == '\n') {
+ p->nline++;
+ p->ncol = 1;
+ } else
+ p->ncol++;
+ }
+ ++*pend;
+}
+
/*
* Advance the pend pointer to the next character in the charset.
* If the charset starts with a space, it stands for any whitespace.
@@ -592,7 +613,7 @@ parse_free(struct parse *p)
*/
static int
advance(struct parse *p, char *b, size_t rlen, size_t *pend,
- const char *charset)
+ const char *charset, int refill)
{
int space;
@@ -602,50 +623,199 @@ advance(struct parse *p, char *b, size_t
} else
space = 0;
- p->nline = p->line;
- p->ncol = p->col;
+ if (refill) {
+ p->nline = p->line;
+ p->ncol = p->col;
+ }
while (*pend < rlen) {
- if (b[*pend] == '\n') {
- p->nline++;
- p->ncol = 1;
- } else
- p->ncol++;
if (space && isspace((unsigned char)b[*pend]))
break;
if (strchr(charset, b[*pend]) != NULL)
break;
- ++*pend;
+ increment(p, b, pend, refill);
}
if (*pend == rlen) {
b[rlen] = '\0';
- return 1;
+ return refill;
} else
return 0;
}
+size_t
+parse_string(struct parse *p, char *b, size_t rlen,
+ enum pstate *pstate, int refill)
+{
+ char *cp;
+ size_t poff; /* Parse offset in b[]. */
+ size_t pend; /* Offset of the end of the current word. */
+ int elem_end;
+
+ pend = 0;
+ for (;;) {
+
+ /* Proceed to the next token, skipping whitespace. */
+
+ if (refill) {
+ p->line = p->nline;
+ p->col = p->ncol;
+ }
+ if ((poff = pend) == rlen)
+ break;
+ if (isspace((unsigned char)b[pend])) {
+ increment(p, b, &pend, refill);
+ continue;
+ }
+
+ /*
+ * The following four cases (ARG, TAG, and starting an
+ * entity or a tag) all parse a word or quoted string.
+ * If that extends beyond the read buffer and the last
+ * read(2) still got data, they all break out of the
+ * token loop to request more data from the read loop.
+ *
+ * Also, three of them detect self-closing tags, those
+ * ending with "/>", setting the flag elem_end and
+ * calling xml_elem_end() at the very end, after
+ * handling the attribute value, attribute name, or
+ * tag name, respectively.
+ */
+
+ /* Parse an attribute value. */
+
+ if (*pstate >= PARSE_ARG) {
+ if (*pstate == PARSE_ARG &&
+ (b[pend] == '\'' || b[pend] == '"')) {
+ *pstate = b[pend] == '"' ?
+ PARSE_DQ : PARSE_SQ;
+ increment(p, b, &pend, refill);
+ continue;
+ }
+ if (advance(p, b, rlen, &pend,
+ *pstate == PARSE_DQ ? "\"" :
+ *pstate == PARSE_SQ ? "'" : " >", refill))
+ break;
+ *pstate = PARSE_TAG;
+ elem_end = 0;
+ if (b[pend] == '>') {
+ *pstate = PARSE_ELEM;
+ if (pend > 0 && b[pend - 1] == '/') {
+ b[pend - 1] = '\0';
+ elem_end = 1;
+ }
+ }
+ b[pend] = '\0';
+ if (pend < rlen)
+ increment(p, b, &pend, refill);
+ xml_attrval(p, b + poff);
+ if (elem_end)
+ xml_elem_end(p, NULL);
+
+ /* Look for an attribute name. */
+
+ } else if (*pstate == PARSE_TAG) {
+ if (advance(p, b, rlen, &pend, " =>", refill))
+ break;
+ elem_end = 0;
+ switch (b[pend]) {
+ case '>':
+ *pstate = PARSE_ELEM;
+ if (pend > 0 && b[pend - 1] == '/') {
+ b[pend - 1] = '\0';
+ elem_end = 1;
+ }
+ break;
+ case '=':
+ *pstate = PARSE_ARG;
+ break;
+ default:
+ break;
+ }
+ b[pend] = '\0';
+ if (pend < rlen)
+ increment(p, b, &pend, refill);
+ xml_attrkey(p, b + poff);
+ if (elem_end)
+ xml_elem_end(p, NULL);
+
+ /* Begin an opening or closing tag. */
+
+ } else if (b[poff] == '<') {
+ if (advance(p, b, rlen, &pend, " >", refill))
+ break;
+ if (pend > poff + 3 &&
+ strncmp(b + poff, "<!--", 4) == 0) {
+
+ /* Skip a comment. */
+
+ cp = strstr(b + pend - 2, "-->");
+ if (cp == NULL) {
+ if (refill)
+ break;
+ cp = b + rlen;
+ } else
+ cp += 3;
+ while (b + pend < cp)
+ increment(p, b, &pend, refill);
+ continue;
+ }
+ elem_end = 0;
+ if (b[pend] != '>')
+ *pstate = PARSE_TAG;
+ else if (pend > 0 && b[pend - 1] == '/') {
+ b[pend - 1] = '\0';
+ elem_end = 1;
+ }
+ b[pend] = '\0';
+ if (pend < rlen)
+ increment(p, b, &pend, refill);
+ if (b[++poff] == '/') {
+ elem_end = 1;
+ poff++;
+ } else
+ xml_elem_start(p, b + poff);
+ if (elem_end)
+ xml_elem_end(p, b + poff);
+
+ /* Process an entity. */
+
+ } else if (b[poff] == '&') {
+ if (advance(p, b, rlen, &pend, ";", refill))
+ break;
+ b[pend] = '\0';
+ if (pend < rlen)
+ increment(p, b, &pend, refill);
+ xml_entity(p, b + poff + 1);
+
+ /* Process text up to the next tag, entity, or EOL. */
+
+ } else {
+ advance(p, b, rlen, &pend, "<&", refill);
+ xml_char(p, b + poff, pend - poff);
+ }
+ }
+ return poff;
+}
+
struct ptree *
parse_file(struct parse *p, int fd, const char *fname)
{
char b[4096];
- char *cp;
ssize_t rsz; /* Return value from read(2). */
- size_t rlen; /* Number of bytes in b[]. */
+ size_t rlen; /* Number of bytes in b[]. */
size_t poff; /* Parse offset in b[]. */
- size_t pend; /* Offset of the end of the current word. */
- int in_tag, in_arg, in_quotes, elem_end;
+ enum pstate pstate;
p->fname = fname;
p->nline = 1;
p->ncol = 1;
+ pstate = PARSE_ELEM;
rlen = 0;
- in_tag = in_arg = in_quotes = 0;
/*
* Read loop.
*
- * We have to enter the read loop once more even on EOF
- * because the previous token may have been incomplete,
- * such that it asked for more input.
+ * If the previous token was incomplete and asked for more
+ * input, we have to enter the read loop once more even on EOF.
* Once rsz is 0, incomplete tokens will no longer ask
* for more input but instead use whatever there is,
* and then exit the read loop.
@@ -653,174 +823,13 @@ parse_file(struct parse *p, int fd, cons
* such that advance() can set b[rlen] to NUL when needed.
*/
- while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
- if ((rlen += rsz) == 0)
- break;
-
- /* Token loop. */
-
- pend = 0;
- for (;;) {
-
- /* Proceed to the next token, skipping whitespace. */
-
- p->line = p->nline;
- p->col = p->ncol;
- if ((poff = pend) == rlen)
- break;
- if (isspace((unsigned char)b[pend])) {
- if (b[pend++] == '\n') {
- p->nline++;
- p->ncol = 1;
- } else
- p->ncol++;
- continue;
- }
-
- /*
- * The following four cases (in_arg, in_tag, and
- * starting an entity or a tag) all parse a word
- * or quoted string. If that extends beyond the
- * read buffer and the last read(2) still got
- * data, they all break out of the token loop
- * to request more data from the read loop.
- *
- * Also, three of them detect self-closing tags,
- * those ending with "/>", setting the flag
- * elem_end and calling xml_elem_end() at the
- * very end, after handling the attribute value,
- * attribute name, or tag name, respectively.
- */
-
- /* Parse an attribute value. */
-
- if (in_arg) {
- if (in_quotes == 0 &&
- (b[pend] == '\'' || b[pend] == '"')) {
- in_quotes = b[pend] == '"' ? 2 : 1;
- p->ncol++;
- pend++;
- continue;
- }
- if (advance(p, b, rlen, &pend,
- in_quotes == 2 ? "\"" :
- in_quotes == 1 ? "'" : " >") && rsz > 0)
- break;
- in_arg = in_quotes = elem_end = 0;
- if (b[pend] == '>') {
- in_tag = 0;
- if (pend > 0 && b[pend - 1] == '/') {
- b[pend - 1] = '\0';
- elem_end = 1;
- }
- }
- b[pend] = '\0';
- if (pend < rlen)
- pend++;
- xml_attrval(p, b + poff);
- if (elem_end)
- xml_elem_end(p, NULL);
-
- /* Look for an attribute name. */
-
- } else if (in_tag) {
- if (advance(p, b, rlen, &pend, " =>") &&
- rsz > 0)
- break;
- elem_end = 0;
- switch (b[pend]) {
- case '>':
- in_tag = 0;
- if (pend > 0 && b[pend - 1] == '/') {
- b[pend - 1] = '\0';
- elem_end = 1;
- }
- break;
- case '=':
- in_arg = 1;
- break;
- default:
- break;
- }
- b[pend] = '\0';
- if (pend < rlen)
- pend++;
- xml_attrkey(p, b + poff);
- if (elem_end)
- xml_elem_end(p, NULL);
-
- /* Begin an opening or closing tag. */
-
- } else if (b[poff] == '<') {
- if (advance(p, b, rlen, &pend, " >") &&
- rsz > 0)
- break;
- if (pend > poff + 3 &&
- strncmp(b + poff, "<!--", 4) == 0) {
-
- /* Skip a comment. */
-
- cp = strstr(b + pend - 2, "-->");
- if (cp == NULL) {
- if (rsz > 0) {
- pend = rlen;
- break;
- }
- cp = b + rlen;
- } else
- cp += 3;
- while (b + pend < cp) {
- if (b[++pend] == '\n') {
- p->nline++;
- p->ncol = 1;
- } else
- p->ncol++;
- }
- continue;
- }
- elem_end = 0;
- if (b[pend] != '>')
- in_tag = 1;
- else if (pend > 0 && b[pend - 1] == '/') {
- b[pend - 1] = '\0';
- elem_end = 1;
- }
- b[pend] = '\0';
- if (pend < rlen)
- pend++;
- if (b[++poff] == '/') {
- elem_end = 1;
- poff++;
- } else
- xml_elem_start(p, b + poff);
- if (elem_end)
- xml_elem_end(p, b + poff);
-
- /* Process an entity. */
-
- } else if (b[poff] == '&') {
- if (advance(p, b, rlen, &pend, ";") &&
- rsz > 0)
- break;
- b[pend] = '\0';
- if (pend < rlen)
- pend++;
- xml_entity(p, b + poff + 1);
-
- /* Process text up to the next tag or entity. */
-
- } else {
- if (advance(p, b, rlen, &pend, "<&") == 0)
- p->ncol--;
- xml_char(p, b + poff, pend - poff);
- }
- }
-
+ while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
+ (rlen += rsz) > 0) {
+ poff = parse_string(p, b, rlen, &pstate, rsz > 0);
/* Buffer exhausted; shift left and re-fill. */
-
assert(poff > 0);
- memmove(b, b + poff, rlen - poff);
rlen -= poff;
+ memmove(b, b + poff, rlen);
}
if (rsz < 0) {
perror(fname);
--
To unsubscribe send an email to source+unsubscribe@mandoc.bsd.lv
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2019-04-05 14:38 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-04-05 14:38 docbook2mdoc: Split parse_file() into parse_file() to fill the parse buffer and parse_string() to handle it schwarze
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).