From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from localhost (fantadrom.bsd.lv [local]) by fantadrom.bsd.lv (OpenSMTPD) with ESMTPA id 7c41fdfa for ; Fri, 5 Apr 2019 09:38:07 -0500 (EST) Date: Fri, 5 Apr 2019 09:38:07 -0500 (EST) X-Mailinglist: mandoc-source Reply-To: source@mandoc.bsd.lv MIME-Version: 1.0 From: schwarze@mandoc.bsd.lv To: source@mandoc.bsd.lv Subject: docbook2mdoc: Split parse_file() into parse_file() to fill the parse X-Mailer: activitymail 1.26, http://search.cpan.org/dist/activitymail/ Content-Type: text/plain; charset=utf-8 Message-ID: Log Message: ----------- Split parse_file() into parse_file() to fill the parse buffer and parse_string() to handle it, to allow parsing from alternative sources in a subsequent step. Only advance line and column number when parsing from the main input file. Represent parsing state as an enum rather than with multiple flags. Modified Files: -------------- docbook2mdoc: parse.c Revision Data ------------- Index: parse.c =================================================================== RCS file: /home/cvs/mdocml/docbook2mdoc/parse.c,v retrieving revision 1.13 retrieving revision 1.14 diff -Lparse.c -Lparse.c -u -p -r1.13 -r1.14 --- parse.c +++ parse.c @@ -30,6 +30,14 @@ * The implementation of the DocBook parser. */ +enum pstate { + PARSE_ELEM, + PARSE_TAG, + PARSE_ARG, + PARSE_SQ, + PARSE_DQ +}; + /* * Global parse state. * Keep this as simple and small as possible. @@ -582,6 +590,19 @@ parse_free(struct parse *p) free(p); } +static void +increment(struct parse *p, char *b, size_t *pend, int refill) +{ + if (refill) { + if (b[*pend] == '\n') { + p->nline++; + p->ncol = 1; + } else + p->ncol++; + } + ++*pend; +} + /* * Advance the pend pointer to the next character in the charset. * If the charset starts with a space, it stands for any whitespace. 
@@ -592,7 +613,7 @@ parse_free(struct parse *p) */ static int advance(struct parse *p, char *b, size_t rlen, size_t *pend, - const char *charset) + const char *charset, int refill) { int space; @@ -602,50 +623,199 @@ advance(struct parse *p, char *b, size_t } else space = 0; - p->nline = p->line; - p->ncol = p->col; + if (refill) { + p->nline = p->line; + p->ncol = p->col; + } while (*pend < rlen) { - if (b[*pend] == '\n') { - p->nline++; - p->ncol = 1; - } else - p->ncol++; if (space && isspace((unsigned char)b[*pend])) break; if (strchr(charset, b[*pend]) != NULL) break; - ++*pend; + increment(p, b, pend, refill); } if (*pend == rlen) { b[rlen] = '\0'; - return 1; + return refill; } else return 0; } +size_t +parse_string(struct parse *p, char *b, size_t rlen, + enum pstate *pstate, int refill) +{ + char *cp; + size_t poff; /* Parse offset in b[]. */ + size_t pend; /* Offset of the end of the current word. */ + int elem_end; + + pend = 0; + for (;;) { + + /* Proceed to the next token, skipping whitespace. */ + + if (refill) { + p->line = p->nline; + p->col = p->ncol; + } + if ((poff = pend) == rlen) + break; + if (isspace((unsigned char)b[pend])) { + increment(p, b, &pend, refill); + continue; + } + + /* + * The following four cases (ARG, TAG, and starting an + * entity or a tag) all parse a word or quoted string. + * If that extends beyond the read buffer and the last + * read(2) still got data, they all break out of the + * token loop to request more data from the read loop. + * + * Also, three of them detect self-closing tags, those + * ending with "/>", setting the flag elem_end and + * calling xml_elem_end() at the very end, after + * handling the attribute value, attribute name, or + * tag name, respectively. + */ + + /* Parse an attribute value. */ + + if (*pstate >= PARSE_ARG) { + if (*pstate == PARSE_ARG && + (b[pend] == '\'' || b[pend] == '"')) { + *pstate = b[pend] == '"' ? 
+ PARSE_DQ : PARSE_SQ; + increment(p, b, &pend, refill); + continue; + } + if (advance(p, b, rlen, &pend, + *pstate == PARSE_DQ ? "\"" : + *pstate == PARSE_SQ ? "'" : " >", refill)) + break; + *pstate = PARSE_TAG; + elem_end = 0; + if (b[pend] == '>') { + *pstate = PARSE_ELEM; + if (pend > 0 && b[pend - 1] == '/') { + b[pend - 1] = '\0'; + elem_end = 1; + } + } + b[pend] = '\0'; + if (pend < rlen) + increment(p, b, &pend, refill); + xml_attrval(p, b + poff); + if (elem_end) + xml_elem_end(p, NULL); + + /* Look for an attribute name. */ + + } else if (*pstate == PARSE_TAG) { + if (advance(p, b, rlen, &pend, " =>", refill)) + break; + elem_end = 0; + switch (b[pend]) { + case '>': + *pstate = PARSE_ELEM; + if (pend > 0 && b[pend - 1] == '/') { + b[pend - 1] = '\0'; + elem_end = 1; + } + break; + case '=': + *pstate = PARSE_ARG; + break; + default: + break; + } + b[pend] = '\0'; + if (pend < rlen) + increment(p, b, &pend, refill); + xml_attrkey(p, b + poff); + if (elem_end) + xml_elem_end(p, NULL); + + /* Begin an opening or closing tag. */ + + } else if (b[poff] == '<') { + if (advance(p, b, rlen, &pend, " >", refill)) + break; + if (pend > poff + 3 && + strncmp(b + poff, "<!--", 4) == 0) { + + /* Skip a comment. */ + + cp = strstr(b + pend - 2, "-->"); + if (cp == NULL) { + if (refill) + break; + cp = b + rlen; + } else + cp += 3; + while (b + pend < cp) + increment(p, b, &pend, refill); + continue; + } + elem_end = 0; + if (b[pend] != '>') + *pstate = PARSE_TAG; + else if (pend > 0 && b[pend - 1] == '/') { + b[pend - 1] = '\0'; + elem_end = 1; + } + b[pend] = '\0'; + if (pend < rlen) + increment(p, b, &pend, refill); + if (b[++poff] == '/') { + elem_end = 1; + poff++; + } else + xml_elem_start(p, b + poff); + if (elem_end) + xml_elem_end(p, b + poff); + + /* Process an entity. */ + + } else if (b[poff] == '&') { + if (advance(p, b, rlen, &pend, ";", refill)) + break; + b[pend] = '\0'; + if (pend < rlen) + increment(p, b, &pend, refill); + xml_entity(p, b + poff + 1); + + /* Process text up to the next tag, entity, or EOL. 
*/ + + } else { + advance(p, b, rlen, &pend, "<&", refill); + xml_char(p, b + poff, pend - poff); + } + } + return poff; +} + struct ptree * parse_file(struct parse *p, int fd, const char *fname) { char b[4096]; - char *cp; ssize_t rsz; /* Return value from read(2). */ - size_t rlen; /* Number of bytes in b[]. */ + size_t rlen; /* Number of bytes in b[]. */ size_t poff; /* Parse offset in b[]. */ - size_t pend; /* Offset of the end of the current word. */ - int in_tag, in_arg, in_quotes, elem_end; + enum pstate pstate; p->fname = fname; p->nline = 1; p->ncol = 1; + pstate = PARSE_ELEM; rlen = 0; - in_tag = in_arg = in_quotes = 0; /* * Read loop. * - * We have to enter the read loop once more even on EOF - * because the previous token may have been incomplete, - * such that it asked for more input. + * If the previous token was incomplete and asked for more + * input, we have to enter the read loop once more even on EOF. * Once rsz is 0, incomplete tokens will no longer ask * for more input but instead use whatever there is, * and then exit the read loop. @@ -653,174 +823,13 @@ parse_file(struct parse *p, int fd, cons * such that advance() can set b[rlen] to NUL when needed. */ - while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) { - if ((rlen += rsz) == 0) - break; - - /* Token loop. */ - - pend = 0; - for (;;) { - - /* Proceed to the next token, skipping whitespace. */ - - p->line = p->nline; - p->col = p->ncol; - if ((poff = pend) == rlen) - break; - if (isspace((unsigned char)b[pend])) { - if (b[pend++] == '\n') { - p->nline++; - p->ncol = 1; - } else - p->ncol++; - continue; - } - - /* - * The following four cases (in_arg, in_tag, and - * starting an entity or a tag) all parse a word - * or quoted string. If that extends beyond the - * read buffer and the last read(2) still got - * data, they all break out of the token loop - * to request more data from the read loop. 
- * - * Also, three of them detect self-closing tags, - * those ending with "/>", setting the flag - * elem_end and calling xml_elem_end() at the - * very end, after handling the attribute value, - * attribute name, or tag name, respectively. - */ - - /* Parse an attribute value. */ - - if (in_arg) { - if (in_quotes == 0 && - (b[pend] == '\'' || b[pend] == '"')) { - in_quotes = b[pend] == '"' ? 2 : 1; - p->ncol++; - pend++; - continue; - } - if (advance(p, b, rlen, &pend, - in_quotes == 2 ? "\"" : - in_quotes == 1 ? "'" : " >") && rsz > 0) - break; - in_arg = in_quotes = elem_end = 0; - if (b[pend] == '>') { - in_tag = 0; - if (pend > 0 && b[pend - 1] == '/') { - b[pend - 1] = '\0'; - elem_end = 1; - } - } - b[pend] = '\0'; - if (pend < rlen) - pend++; - xml_attrval(p, b + poff); - if (elem_end) - xml_elem_end(p, NULL); - - /* Look for an attribute name. */ - - } else if (in_tag) { - if (advance(p, b, rlen, &pend, " =>") && - rsz > 0) - break; - elem_end = 0; - switch (b[pend]) { - case '>': - in_tag = 0; - if (pend > 0 && b[pend - 1] == '/') { - b[pend - 1] = '\0'; - elem_end = 1; - } - break; - case '=': - in_arg = 1; - break; - default: - break; - } - b[pend] = '\0'; - if (pend < rlen) - pend++; - xml_attrkey(p, b + poff); - if (elem_end) - xml_elem_end(p, NULL); - - /* Begin an opening or closing tag. 
*/ - - } else if (b[poff] == '<') { - if (advance(p, b, rlen, &pend, " >") && - rsz > 0) - break; - if (pend > poff + 3 && - strncmp(b + poff, "<!--", 4) == 0) { - - /* Skip a comment. */ - - cp = strstr(b + pend - 2, "-->"); - if (cp == NULL) { - if (rsz > 0) { - pend = rlen; - break; - } - cp = b + rlen; - } else - cp += 3; - while (b + pend < cp) { - if (b[++pend] == '\n') { - p->nline++; - p->ncol = 1; - } else - p->ncol++; - } - continue; - } - elem_end = 0; - if (b[pend] != '>') - in_tag = 1; - else if (pend > 0 && b[pend - 1] == '/') { - b[pend - 1] = '\0'; - elem_end = 1; - } - b[pend] = '\0'; - if (pend < rlen) - pend++; - if (b[++poff] == '/') { - elem_end = 1; - poff++; - } else - xml_elem_start(p, b + poff); - if (elem_end) - xml_elem_end(p, b + poff); - - /* Process an entity. */ - - } else if (b[poff] == '&') { - if (advance(p, b, rlen, &pend, ";") && - rsz > 0) - break; - b[pend] = '\0'; - if (pend < rlen) - pend++; - xml_entity(p, b + poff + 1); - - /* Process text up to the next tag or entity. */ - - } else { - if (advance(p, b, rlen, &pend, "<&") == 0) - p->ncol--; - xml_char(p, b + poff, pend - poff); - } - } - + while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 && + (rlen += rsz) > 0) { + poff = parse_string(p, b, rlen, &pstate, rsz > 0); /* Buffer exhausted; shift left and re-fill. */ - assert(poff > 0); - memmove(b, b + poff, rlen - poff); rlen -= poff; + memmove(b, b + poff, rlen); } if (rsz < 0) { perror(fname); -- To unsubscribe send an email to source+unsubscribe@mandoc.bsd.lv