source@mandoc.bsd.lv
 help / color / mirror / Atom feed
* docbook2mdoc: Handle DOCTYPE declarations containing ENTITY definitions.
@ 2019-04-08 14:38 schwarze
  0 siblings, 0 replies; only message in thread
From: schwarze @ 2019-04-08 14:38 UTC (permalink / raw)
  To: source

Log Message:
-----------
Handle DOCTYPE declarations containing ENTITY definitions.
Also make <sbr> self-closing even without a trailing slash.

Modified Files:
--------------
    docbook2mdoc:
        node.c
        node.h
        parse.c

Revision Data
-------------
Index: node.c
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/node.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -Lnode.c -Lnode.c -u -p -r1.4 -r1.5
--- node.c
+++ node.c
@@ -29,11 +29,15 @@ static	const char *const attrkeys[ATTRKE
 	"class",
 	"close",
 	"cols",
+	"DEFINITION",
 	"endterm",
 	"id",
 	"linkend",
+	"NAME",
 	"open",
+	"PUBLIC",
 	"rep",
+	"SYSTEM",
 	"url",
 	"xlink:href"
 };
Index: node.h
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/node.h,v
retrieving revision 1.12
retrieving revision 1.13
diff -Lnode.h -Lnode.h -u -p -r1.12 -r1.13
--- node.h
+++ node.h
@@ -48,9 +48,11 @@ enum	nodeid {
 	NODE_CONTRIB,
 	NODE_COPYRIGHT,
 	NODE_DATE,
+	NODE_DOCTYPE,
 	NODE_EDITOR,
 	NODE_EMAIL,
 	NODE_EMPHASIS,
+	NODE_ENTITY,
 	NODE_ENTRY,
 	NODE_ENVAR,
 	NODE_ERRORNAME,
@@ -155,11 +157,15 @@ enum	attrkey {
 	ATTRKEY_CLASS,
 	ATTRKEY_CLOSE,
 	ATTRKEY_COLS,
+	ATTRKEY_DEFINITION,
 	ATTRKEY_ENDTERM,
 	ATTRKEY_ID,
 	ATTRKEY_LINKEND,
+	ATTRKEY_NAME,
 	ATTRKEY_OPEN,
+	ATTRKEY_PUBLIC,
 	ATTRKEY_REP,
+	ATTRKEY_SYSTEM,
 	ATTRKEY_URL,
 	ATTRKEY_XLINK_HREF,
 	ATTRKEY__MAX
Index: parse.c
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/parse.c,v
retrieving revision 1.22
retrieving revision 1.23
diff -Lparse.c -Lparse.c -u -p -r1.22 -r1.23
--- parse.c
+++ parse.c
@@ -45,6 +45,7 @@ enum	pstate {
 struct	parse {
 	const char	*fname;  /* Name of the input file. */
 	struct ptree	*tree;   /* Complete parse result. */
+	struct pnode	*doctype;
 	struct pnode	*cur;	 /* Current node in the tree. */
 	enum nodeid	 ncur;   /* Type of the current node. */
 	int		 line;   /* Line number in the input file. */
@@ -52,9 +53,11 @@ struct	parse {
 	int		 nline;  /* Line number of next token. */
 	int		 ncol;   /* Column number of next token. */
 	int		 del;    /* Levels of nested nodes being deleted. */
-	int		 spc;	 /* Whitespace before the next element. */
-	int		 attr;   /* The most recent attribute is valid. */
-	int		 warn;
+	int		 flags;
+#define	PFLAG_WARN	 (1 << 0)  /* Print warning messages. */
+#define	PFLAG_SPC	 (1 << 1)  /* Whitespace before the next element. */
+#define	PFLAG_ATTR	 (1 << 2)  /* The most recent attribute is valid. */
+#define	PFLAG_EEND	 (1 << 3)  /* This element is self-closing. */
 };
 
 struct	element {
@@ -87,9 +90,12 @@ static	const struct element elements[] =
 	{ "contrib",		NODE_CONTRIB },
 	{ "copyright",		NODE_COPYRIGHT },
 	{ "date",		NODE_DATE },
+	{ "!doctype",		NODE_DOCTYPE },
+	{ "!DOCTYPE",		NODE_DOCTYPE },
 	{ "editor",		NODE_EDITOR },
 	{ "email",		NODE_EMAIL },
 	{ "emphasis",		NODE_EMPHASIS },
+	{ "!ENTITY",		NODE_ENTITY },
 	{ "entry",		NODE_ENTRY },
 	{ "envar",		NODE_ENVAR },
 	{ "errorname",		NODE_ERRORNAME },
@@ -265,6 +271,10 @@ static	const struct entity entities[] = 
 	{ NULL,		NULL }
 };
 
+static size_t	 parse_string(struct parse *, char *, size_t,
+			 enum pstate *, int);
+
+
 static void
 error_msg(struct parse *p, const char *fmt, ...)
 {
@@ -283,7 +293,7 @@ warn_msg(struct parse *p, const char *fm
 {
 	va_list		 ap;
 
-	if (p->warn == 0)
+	if ((p->flags & PFLAG_WARN) == 0)
 		return;
 
 	fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col);
@@ -318,7 +328,7 @@ xml_char(struct parse *ps, const char *p
 			exit(1);
 		}
 		dat->node = NODE_TEXT;
-		dat->spc = ps->spc;
+		dat->spc = (ps->flags & PFLAG_SPC) != 0;
 		dat->parent = ps->cur;
 		TAILQ_INIT(&dat->childq);
 		TAILQ_INIT(&dat->attrq);
@@ -333,18 +343,18 @@ xml_char(struct parse *ps, const char *p
 	/* Append to the current text node. */
 
 	assert(sz >= 0);
-	newsz = ps->cur->bsz + (ps->cur->bsz && ps->spc) + sz;
+	newsz = ps->cur->bsz + (ps->cur->bsz && (ps->flags & PFLAG_SPC)) + sz;
 	ps->cur->b = realloc(ps->cur->b, newsz + 1);
 	if (ps->cur->b == NULL) {
 		perror(NULL);
 		exit(1);
 	}
-	if (ps->cur->bsz && ps->spc)
+	if (ps->cur->bsz && (ps->flags & PFLAG_SPC))
 		ps->cur->b[ps->cur->bsz++] = ' ';
 	memcpy(ps->cur->b + ps->cur->bsz, p, sz);
 	ps->cur->b[ps->cur->bsz = newsz] = '\0';
 	ps->cur->real = ps->cur->b;
-	ps->spc = 0;
+	ps->flags &= ~PFLAG_SPC;
 }
 
 /*
@@ -360,7 +370,7 @@ pnode_closetext(struct parse *p)
 	p->cur = n->parent;
 	while (n->bsz > 0 && isspace((unsigned char)n->b[n->bsz - 1])) {
 		n->b[--n->bsz] = '\0';
-		p->spc = 1;
+		p->flags |= PFLAG_SPC;
 	}
 }
 
@@ -369,6 +379,9 @@ xml_entity(struct parse *p, const char *
 {
 	const struct entity	*entity;
 	struct pnode		*dat;
+	const char		*ccp;
+	char			*cp;
+	enum pstate		 pstate;
 
 	if (p->del > 0)
 		return;
@@ -388,6 +401,25 @@ xml_entity(struct parse *p, const char *
 			break;
 
 	if (entity->roff == NULL) {
+		if (p->doctype != NULL) {
+			TAILQ_FOREACH(dat, &p->doctype->childq, child) {
+				if ((ccp = pnode_getattr_raw(dat,
+				     ATTRKEY_NAME, NULL)) == NULL ||
+				    strcmp(ccp, name) != 0 ||
+				    (ccp = pnode_getattr_raw(dat,
+				     ATTRKEY_DEFINITION, NULL)) == NULL)
+					continue;
+				if ((cp = strdup(ccp)) == NULL) {
+					perror(NULL);
+					exit(1);
+				}
+				pstate = PARSE_ELEM;
+				parse_string(p, cp, strlen(cp), &pstate, 0);
+				p->flags &= ~PFLAG_SPC;
+				free(cp);
+				return;
+			}
+		}
 		error_msg(p, "unknown entity &%s;", name);
 		return;
 	}
@@ -400,12 +432,12 @@ xml_entity(struct parse *p, const char *
 	}
 	dat->node = NODE_ESCAPE;
 	dat->bsz = strlen(dat->b);
-	dat->spc = p->spc;
+	dat->spc = (p->flags & PFLAG_SPC) != 0;
 	dat->parent = p->cur;
 	TAILQ_INIT(&dat->childq);
 	TAILQ_INIT(&dat->attrq);
 	TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
-	p->spc = 0;
+	p->flags &= ~PFLAG_SPC;
 }
 
 /*
@@ -417,15 +449,13 @@ xml_elem_start(struct parse *ps, const c
 	const struct element	*elem;
 	struct pnode		*dat;
 
-	if (*name == '!' || *name == '?')
-		return;
-
 	/*
 	 * An ancestor is excluded from the tree;
 	 * keep track of the number of levels excluded.
 	 */
 	if (ps->del > 0) {
-		ps->del++;
+		if (*name != '!' && *name != '?')
+			ps->del++;
 		return;
 	}
 
@@ -435,8 +465,11 @@ xml_elem_start(struct parse *ps, const c
 		if (strcmp(elem->name, name) == 0)
 			break;
 
-	if (elem->name == NULL)
+	if (elem->name == NULL) {
+		if (*name == '!' || *name == '?')
+			return;
 		error_msg(ps, "unknown element <%s>", name);
+	}
 
 	ps->ncur = elem->node;
 
@@ -470,6 +503,11 @@ xml_elem_start(struct parse *ps, const c
 	 */
 
 	switch (dat->node = elem->node) {
+	case NODE_DOCTYPE:
+	case NODE_ENTITY:
+	case NODE_SBR:
+		ps->flags |= PFLAG_EEND;
+		/* FALLTHROUGH */
 	case NODE_APPENDIX:
 	case NODE_AUTHORGROUP:
 	case NODE_BLOCKQUOTE:
@@ -493,7 +531,6 @@ xml_elem_start(struct parse *ps, const c
 	case NODE_REFNAMEDIV:
 	case NODE_REFSYNOPSISDIV:
 	case NODE_ROW:
-	case NODE_SBR:
 	case NODE_SCREEN:
 	case NODE_SECTION:
 	case NODE_SYNOPSIS:
@@ -506,7 +543,7 @@ xml_elem_start(struct parse *ps, const c
 		dat->spc = 1;
 		break;
 	default:
-		dat->spc = ps->spc;
+		dat->spc = (ps->flags & PFLAG_SPC) != 0;
 		break;
 	}
 	dat->parent = ps->cur;
@@ -517,7 +554,12 @@ xml_elem_start(struct parse *ps, const c
 		TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
 
 	ps->cur = dat;
-	if (ps->tree->root == NULL)
+	if (dat->node == NODE_DOCTYPE) {
+		if (ps->doctype == NULL)
+			ps->doctype = dat;
+		else
+			error_msg(ps, "duplicate doctype");
+	} else if (dat->parent == NULL && ps->tree->root == NULL)
 		ps->tree->root = dat;
 }
 
@@ -525,12 +567,21 @@ static void
 xml_attrkey(struct parse *ps, const char *name)
 {
 	struct pattr	*attr;
+	const char	*value;
 	enum attrkey	 key;
 
 	if (ps->del > 0 || ps->ncur == NODE_IGNORE || *name == '\0')
 		return;
+
+	if ((ps->ncur == NODE_DOCTYPE || ps->ncur == NODE_ENTITY) &&
+	    TAILQ_FIRST(&ps->cur->attrq) == NULL) {
+		value = name;
+		name = "NAME";
+	} else
+		value = NULL;
+
 	if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
-		ps->attr = 0;
+		ps->flags &= ~PFLAG_ATTR;
 		return;
 	}
 	if ((attr = calloc(1, sizeof(*attr))) == NULL) {
@@ -539,9 +590,19 @@ xml_attrkey(struct parse *ps, const char
 	}
 	attr->key = key;
 	attr->val = ATTRVAL__MAX;
-	attr->rawval = NULL;
+	if (value == NULL) {
+		attr->rawval = NULL;
+		ps->flags |= PFLAG_ATTR;
+	} else {
+		if ((attr->rawval = strdup(value)) == NULL) {
+			perror(NULL);
+			exit(1);
+		}
+		ps->flags &= ~PFLAG_ATTR;
+	}
 	TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
-	ps->attr = 1;
+	if (ps->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
+		xml_attrkey(ps, "DEFINITION");
 }
 
 static void
@@ -549,7 +610,8 @@ xml_attrval(struct parse *ps, const char
 {
 	struct pattr	*attr;
 
-	if (ps->del > 0 || ps->ncur == NODE_IGNORE || ps->attr == 0)
+	if (ps->del > 0 || ps->ncur == NODE_IGNORE ||
+	    (ps->flags & PFLAG_ATTR) == 0)
 		return;
 	if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
 		return;
@@ -598,6 +660,9 @@ xml_elem_end(struct parse *ps, const cha
 		break;
 	case NODE_IGNORE:
 		break;
+	case NODE_DOCTYPE:
+		ps->flags &= ~PFLAG_EEND;
+		/* FALLTHROUGH */
 	default:
 		if (ps->cur == NULL || node != ps->cur->node) {
 			warn_msg(ps, "element not open: </%s>", name);
@@ -611,11 +676,13 @@ xml_elem_end(struct parse *ps, const cha
 		 * obviously better than discarding it or crashing.
 		 */
 
-		if (ps->cur->parent == NULL)
-			ps->tree->flags |= TREE_CLOSED;
-		else
+		if (ps->cur->parent != NULL || node == NODE_DOCTYPE) {
 			ps->cur = ps->cur->parent;
-		ps->spc = 0;
+			if (ps->cur != NULL)
+				ps->ncur = ps->cur->node;
+		} else
+			ps->tree->flags |= TREE_CLOSED;
+		ps->flags &= ~PFLAG_SPC;
 		break;
 	}
 	assert(ps->del == 0);
@@ -633,7 +700,10 @@ parse_alloc(int warn)
 		free(p);
 		return NULL;
 	}
-	p->warn = warn;
+	if (warn)
+		p->flags |= PFLAG_WARN;
+	else
+		p->flags &= ~PFLAG_WARN;
 	return p;
 }
 
@@ -709,7 +779,6 @@ parse_string(struct parse *p, char *b, s
 	size_t		 pend;  /* Offset of the end of the current word. */
 	int		 elem_end;
 
-	p->spc = 0;
 	pend = 0;
 	for (;;) {
 
@@ -722,7 +791,7 @@ parse_string(struct parse *p, char *b, s
 		if ((poff = pend) == rlen)
 			break;
 		if (isspace((unsigned char)b[pend])) {
-			p->spc = 1;
+			p->flags |= PFLAG_SPC;
 			increment(p, b, &pend, refill);
 			continue;
 		}
@@ -763,6 +832,8 @@ parse_string(struct parse *p, char *b, s
 					b[pend - 1] = '\0';
 					elem_end = 1;
 				}
+				if (p->flags & PFLAG_EEND)
+					elem_end = 1;
 			}
 			b[pend] = '\0';
 			if (pend < rlen)
@@ -774,6 +845,23 @@ parse_string(struct parse *p, char *b, s
 		/* Look for an attribute name. */
 
 		} else if (*pstate == PARSE_TAG) {
+			switch (p->ncur) {
+			case NODE_DOCTYPE:
+				if (b[pend] == '[') {
+					*pstate = PARSE_ELEM;
+					increment(p, b, &pend, refill);
+					continue;
+				}
+				/* FALLTHROUGH */
+			case NODE_ENTITY:
+				if (b[pend] == '"' || b[pend] == '\'') {
+					*pstate = PARSE_ARG;
+					continue;
+				}
+				break;
+			default:
+				break;
+			}
 			if (advance(p, b, rlen, &pend, " =>", refill))
 				break;
 			elem_end = 0;
@@ -784,6 +872,8 @@ parse_string(struct parse *p, char *b, s
 					b[pend - 1] = '\0';
 					elem_end = 1;
 				}
+				if (p->flags & PFLAG_EEND)
+					elem_end = 1;
 				break;
 			case '=':
 				*pstate = PARSE_ARG;
@@ -832,11 +922,21 @@ parse_string(struct parse *p, char *b, s
 			if (b[++poff] == '/') {
 				elem_end = 1;
 				poff++;
-			} else
+			} else {
 				xml_elem_start(p, b + poff);
+				if (*pstate == PARSE_ELEM &&
+				    p->flags & PFLAG_EEND)
+					elem_end = 1;
+			}
 			if (elem_end)
 				xml_elem_end(p, b + poff);
 
+		/* Close a doctype. */
+
+		} else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
+			*pstate = PARSE_TAG;
+			increment(p, b, &pend, refill);
+
 		/* Process an entity. */
 
 		} else if (b[poff] == '&') {
@@ -899,5 +999,6 @@ parse_file(struct parse *p, int fd, cons
 	pnode_closetext(p);
 	if ((p->tree->flags & TREE_CLOSED) == 0)
 		warn_msg(p, "document not closed");
+	pnode_unlink(p->doctype);
 	return p->tree;
 }
--
 To unsubscribe send an email to source+unsubscribe@mandoc.bsd.lv

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2019-04-08 14:38 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-04-08 14:38 docbook2mdoc: Handle DOCTYPE declarations containing ENTITY definitions schwarze

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).