docbook2mdoc: Add a utility for docbook2mdoc developers to collect

source@mandoc.bsd.lv
 help / color / mirror / Atom feed

* docbook2mdoc: Add a utility for docbook2mdoc developers to collect
@ 2019-03-29 15:55 schwarze
  0 siblings, 0 replies; only message in thread
From: schwarze @ 2019-03-29 15:55 UTC (permalink / raw)
  To: source

Log Message:
-----------
Add a utility for docbook2mdoc developers
to collect element usage and parenting statistics,
to help decide which nodes should be most urgently worked on.

Modified Files:
--------------
    docbook2mdoc:
        Makefile

Added Files:
-----------
    docbook2mdoc:
        statistics.c

Revision Data
-------------
--- /dev/null
+++ statistics.c
@@ -0,0 +1,288 @@
+/* $Id: statistics.c,v 1.1 2019/03/29 15:55:28 schwarze Exp $ */
+/*
+ * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <assert.h>
+#include <ctype.h>
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/*
+ * Count parent-child element relations in a corpus of DocBook documents.
+ *
+ * Read absolute or relative input file names from standard input,
+ * one per line.  Failure to open any of them is fatal.
+ * For each parent-child relation, print the total number of occurrences,
+ * the parent name, and the child name, separated by tab characters
+ * and followed by a newline character.
+ *
+ * Typical usage:
+ * statistics < filenames.txt | sort -n
+ * statistics < filenames.txt | grep '\<listitem\>' | sort -n
+ */
+
+struct entry {
+	char	*parent;	/* Name of the enclosing element, or "". */
+	char	*child;		/* Name of the contained element, or "TEXT". */
+	int	 count;		/* Number of times this pair was seen. */
+};
+
+static struct entry	 *table;	/* Array of all relations seen so far. */
+static size_t		  tablesz;	/* Number of entries allocated. */
+static size_t		  tablei;	/* Number of entries in use. */
+
+static char		**stack;	/* Names of the currently open elements. */
+static size_t		  stacksz;	/* Number of slots allocated. */
+static size_t		  stacki;	/* Number of slots in use, i.e. depth. */
+
+
+/*
+ * Count one instance of a parent-child relation in the global table.
+ */
+static void
+table_add(const char *parent, const char *child)
+{
+	size_t	 i;
+
+	/* If the pair is already known (linear search), just count it. */
+
+	for (i = 0; i < tablei; i++) {
+		if (strcmp(parent, table[i].parent) == 0 &&
+		    strcmp(child, table[i].child) == 0) {
+			table[i].count++;
+			return;
+		}
+	}
+
+	/* If the table is full, make room for 64 more entries. */
+
+	if (tablei == tablesz) {
+		tablesz += 64;
+		table = reallocarray(table, tablesz, sizeof(*table));
+		if (table == NULL)
+			err(1, NULL);	/* Out of memory is fatal. */
+	}
+
+	/* Add a new entry holding private copies of both names. */
+
+	if ((table[tablei].parent = strdup(parent)) == NULL)
+		err(1, NULL);
+	if ((table[tablei].child = strdup(child)) == NULL)
+		err(1, NULL);
+	table[tablei++].count = 1;	/* First occurrence. */
+}
+
+/*
+ * Enter an element: push a private copy of its name onto the stack.
+ */
+static void
+stack_push(const char *name)
+{
+	if (stacki == stacksz) {
+		stacksz += 8;	/* Grow in steps of eight slots. */
+		stack = reallocarray(stack, stacksz, sizeof(*stack));
+		if (stack == NULL)
+			err(1, NULL);	/* Out of memory is fatal. */
+	}
+	if ((stack[stacki++] = strdup(name)) == NULL)
+		err(1, NULL);
+}
+
+/*
+ * Exit an element: pop the top name if name is NULL or equal to it.
+ */
+static void
+stack_pop(const char *name)
+{
+	if (stacki > 0 && (name == NULL ||
+	    strcmp(name, stack[stacki - 1]) == 0))
+		free(stack[--stacki]);	/* Else leave the stack alone. */
+}
+
+/*
+ * Simplified version from parse.c: move *pend to the next delimiter.
+ */
+static int
+advance(char *b, size_t rlen, size_t *pend, const char *charset)
+{
+	int		 space;
+
+	if (*charset == ' ') {	/* Leading blank: any whitespace ends it. */
+		space = 1;
+		charset++;
+	} else
+		space = 0;
+
+	while (*pend < rlen) {
+		if (space && isspace((unsigned char)b[*pend]))
+			break;
+		if (strchr(charset, b[*pend]) != NULL)	/* Also matches NUL. */
+			break;
+		++*pend;
+	}
+	if (*pend == rlen) {	/* Ran off the end of the data. */
+		b[rlen] = '\0';
+		return 1;	/* No delimiter found. */
+	} else
+		return 0;	/* b[*pend] is a delimiter. */
+}
+
+/*
+ * Simplified version from parse.c: count the relations in one file.
+ */
+static void
+parse_file(int fd, char *fname)
+{
+	char		 b[4096];  /* Input; last byte is kept for a NUL. */
+	ssize_t		 rsz;	/* Return value from read(2). */
+	size_t		 rlen;  /* Number of bytes in b[]. */
+	size_t		 poff;  /* Parse offset in b[]. */
+	size_t		 pend;  /* Offset of the end of the current word. */
+	int		 in_tag, in_arg, in_quotes, elem_end;
+
+	rlen = 0;
+	in_tag = in_arg = in_quotes = 0;
+	while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
+		if ((rlen += rsz) == 0)	/* End of file, nothing pending. */
+			break;
+		pend = 0;
+		for (;;) {
+			if ((poff = pend) == rlen)	/* Buffer exhausted. */
+				break;
+			if (isspace((unsigned char)b[pend])) {
+				pend++;		/* Skip whitespace. */
+				continue;
+			}
+			if (in_arg) {	/* After '=' inside a tag. */
+				if (in_quotes == 0 && b[pend] == '"') {
+					in_quotes = 1;
+					pend++;
+					continue;
+				}
+				if (advance(b, rlen, &pend,
+				    in_quotes ? "\"" : " >") && rsz > 0)
+					break;	/* Incomplete: read more. */
+				in_arg = in_quotes = elem_end = 0;
+				if (b[pend] == '>') {
+					in_tag = 0;
+					if (pend > 0 && b[pend - 1] == '/') {
+						b[pend - 1] = '\0';
+						elem_end = 1;	/* "/>" */
+					}
+				}
+				b[pend] = '\0';
+				if (pend < rlen)
+					pend++;
+				if (elem_end)
+					stack_pop(NULL);  /* Pop the top. */
+			} else if (in_tag) {	/* Inside a tag, after the name. */
+				if (advance(b, rlen, &pend, " =>") && rsz > 0)
+					break;	/* Incomplete: read more. */
+				elem_end = 0;
+				switch (b[pend]) {
+				case '>':
+					in_tag = 0;
+					if (pend > 0 && b[pend - 1] == '/') {
+						b[pend - 1] = '\0';
+						elem_end = 1;	/* "/>" */
+					}
+					break;
+				case '=':
+					in_arg = 1;
+					break;
+				default:
+					break;
+				}
+				b[pend] = '\0';
+				if (pend < rlen)
+					pend++;
+				if (elem_end)
+					stack_pop(NULL);  /* Pop the top. */
+			} else if (b[poff] == '<') {	/* Start of a tag. */
+				if (advance(b, rlen, &pend, " >") && rsz > 0)
+					break;	/* Incomplete: read more. */
+				elem_end = 0;
+				if (b[pend] != '>')
+					in_tag = 1;	/* More may follow. */
+				else if (pend > 0 && b[pend - 1] == '/') {
+					b[pend - 1] = '\0';
+					elem_end = 1;	/* "/>" */
+				}
+				b[pend] = '\0';
+				if (pend < rlen)
+					pend++;
+				if (b[++poff] == '/') {	/* Closing tag. */
+					elem_end = 1;
+					poff++;
+				} else if (b[poff] != '!' && b[poff] != '?') {
+					table_add(stacki > 0 ?
+					    stack[stacki - 1] : "",
+					    b + poff);
+					stack_push(b + poff);
+				}
+				if (elem_end)
+					stack_pop(b + poff);  /* If it matches. */
+			} else {	/* Text up to the next tag. */
+				advance(b, rlen, &pend, "<");
+				if (stacki > 0)	/* Ignore text outside elements. */
+					table_add(stack[stacki - 1], "TEXT");
+			}
+		}
+		assert(poff > 0);	/* Something must have been consumed. */
+		memmove(b, b + poff, rlen - poff);	/* Keep unparsed tail. */
+		rlen -= poff;
+	}
+	if (rsz < 0)	/* read(2) failed. */
+		perror(fname);
+}
+
+int
+main(int argc, char *argv[])
+{
+	char		*fname;
+	size_t		 fsz, i;
+	ssize_t		 rsz;
+	int		 fd;
+
+	fd = -1;	/* Stays -1 if no file name is ever read. */
+	fname = NULL;
+
+	/* Loop over input file names, one per line of standard input. */
+	while ((rsz = getline(&fname, &fsz, stdin)) != -1) {
+		if (fname[rsz - 1] == '\n')
+			fname[--rsz] = '\0';	/* Chop the newline. */
+		if ((fd = open(fname, O_RDONLY, 0)) == -1)
+			err(1, "%s", fname);	/* Unopenable file is fatal. */
+		parse_file(fd, fname);
+		close(fd);
+	}
+
+	/* Cleanup and error handling. */
+	free(fname);
+	if (ferror(stdin))
+		err(1, "standard input");
+	if (fd == -1)	/* The loop body never ran. */
+		errx(1, "No input file names found on standard input");
+
+	/* Dump results: count, parent, and child, separated by tabs. */
+	for (i = 0; i < tablei; i++)
+		printf("%d\t%s\t%s\n", table[i].count,
+		    table[i].parent, table[i].child);
+	return 0;
+}
Index: Makefile
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/Makefile,v
retrieving revision 1.21
retrieving revision 1.22
diff -LMakefile -LMakefile -u -p -r1.21 -r1.22
--- Makefile
+++ Makefile
@@ -12,6 +12,9 @@ all: docbook2mdoc
 docbook2mdoc: $(OBJS)
 	$(CC) -o $@ $(OBJS)
 
+statistics: statistics.o	# developer tool, not part of "all"
+	$(CC) -o $@ statistics.o
+
 www: index.html docbook2mdoc.1.html docbook2mdoc-$(VERSION).tgz README.txt
 
 install: all
@@ -52,6 +55,7 @@ README.txt: README
 
 clean:
 	rm -f docbook2mdoc $(OBJS) docbook2mdoc.core
+	rm -f statistics statistics.o
 	rm -rf docbook2mdoc.dSYM
 	rm -f index.html docbook2mdoc.1.html README.txt
 	rm -f docbook2mdoc-$(VERSION).tgz
--
 To unsubscribe send an email to source+unsubscribe@mandoc.bsd.lv

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2019-03-29 15:55 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-29 15:55 docbook2mdoc: Add a utility for docbook2mdoc developers to collect schwarze

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).