* docbook2mdoc: Add a utility for docbook2mdoc developers to collect
@ 2019-03-29 15:55 schwarze
0 siblings, 0 replies; only message in thread
From: schwarze @ 2019-03-29 15:55 UTC (permalink / raw)
To: source
Log Message:
-----------
Add a utility for docbook2mdoc developers
to collect element usage and parenting statistics,
to help decide which nodes should be most urgently worked on.
Modified Files:
--------------
docbook2mdoc:
Makefile
Added Files:
-----------
docbook2mdoc:
statistics.c
Revision Data
-------------
--- /dev/null
+++ statistics.c
@@ -0,0 +1,288 @@
+/* $Id: statistics.c,v 1.1 2019/03/29 15:55:28 schwarze Exp $ */
+/*
+ * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <assert.h>
+#include <ctype.h>
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/*
+ * Count parent-child element relations in a corpus of DocBook documents.
+ *
+ * Read absolute or relative input file names from standard input,
+ * one per line.
+ * For each parent-child relation, print the total number of occurrences,
+ * the parent name, and the child name, separated by tab characters
+ * and followed by a newline character.
+ *
+ * Typical usage:
+ * statistics < filenames.txt | sort -n
+ * statistics < filenames.txt | grep '\<listitem\>' | sort -n
+ */
+
+/* One counted parent-child relation. */
+struct entry {
+ char *parent; /* Name of the enclosing element, "" at top level. */
+ char *child; /* Name of the child element, or "TEXT". */
+ int count; /* Number of occurrences seen so far. */
+};
+
+/* All relations seen so far; grown and filled by table_add(). */
+static struct entry *table;
+static size_t tablesz; /* Number of entries allocated. */
+static size_t tablei; /* Number of entries in use. */
+
+/* Names of the currently open elements, innermost last. */
+static char **stack;
+static size_t stacksz; /* Number of slots allocated. */
+static size_t stacki; /* Number of slots in use. */
+
+
+/*
+ * Record one occurrence of the relation "child appears below parent".
+ * If the pair is already in the table, only its counter is bumped.
+ * Otherwise a new entry holding private copies of both names is
+ * appended, growing the table by 64 entries whenever it is full.
+ * Running out of memory is fatal.
+ */
+static void
+table_add(const char *parent, const char *child)
+{
+	struct entry	*ep;
+	size_t		 n;
+
+	/* Look for an existing entry for this pair. */
+
+	for (n = 0; n < tablei; n++) {
+		ep = &table[n];
+		if (strcmp(parent, ep->parent) != 0 ||
+		    strcmp(child, ep->child) != 0)
+			continue;
+		ep->count++;
+		return;
+	}
+
+	/* Not found: make sure there is a free slot. */
+
+	if (tablei == tablesz) {
+		tablesz += 64;
+		if ((table = reallocarray(table, tablesz,
+		    sizeof(*table))) == NULL)
+			err(1, NULL);
+	}
+
+	/* Fill in the free slot and mark it as used. */
+
+	ep = &table[tablei];
+	if ((ep->parent = strdup(parent)) == NULL)
+		err(1, NULL);
+	if ((ep->child = strdup(child)) == NULL)
+		err(1, NULL);
+	ep->count = 1;
+	tablei++;
+}
+
+/*
+ * Enter an element: put a private copy of its name on top of the
+ * stack of open elements, growing the stack by 8 slots when it is
+ * full.  Running out of memory is fatal.
+ */
+static void
+stack_push(const char *name)
+{
+	char	*copy;
+
+	if (stacki == stacksz) {
+		stacksz += 8;
+		if ((stack = reallocarray(stack, stacksz,
+		    sizeof(*stack))) == NULL)
+			err(1, NULL);
+	}
+	copy = strdup(name);
+	stack[stacki] = copy;
+	stacki++;
+	if (copy == NULL)
+		err(1, NULL);
+}
+
+/*
+ * Exit an element: drop the innermost open element from the stack.
+ * With a NULL name, the top of the stack is dropped unconditionally.
+ * Otherwise it is only dropped if its name equals the given name.
+ * An empty stack is left alone.
+ */
+static void
+stack_pop(const char *name)
+{
+	if (stacki == 0)
+		return;
+	if (name != NULL && strcmp(name, stack[stacki - 1]) != 0)
+		return;
+	stacki--;
+	free(stack[stacki]);
+}
+
+/*
+ * Move *pend forward in b[] until it points at a delimiter or at the
+ * end of the data.  Simplified version from parse.c.
+ *
+ * b       - input buffer; it needs room for at least rlen + 1 bytes
+ *           because b[rlen] is overwritten when the data runs out
+ * rlen    - number of valid bytes in b[]
+ * pend    - in: offset to start scanning at;
+ *           out: offset of the delimiter found, or rlen
+ * charset - bytes to stop at; a leading ' ' additionally requests
+ *           stopping at any isspace(3) character
+ *
+ * Returns 0 if a delimiter was found, or 1 if the end of the data
+ * was reached first, in which case b[rlen] is set to '\0'.
+ */
+static int
+advance(char *b, size_t rlen, size_t *pend, const char *charset)
+{
+	int	 space;
+
+	if (*charset == ' ') {
+		space = 1;
+		charset++;
+	} else
+		space = 0;
+
+	while (*pend < rlen) {
+		if (space && isspace((unsigned char)b[*pend]))
+			break;
+		/*
+		 * strchr(3) also matches the terminating NUL of charset,
+		 * so exclude NUL explicitly.  Otherwise a NUL byte in
+		 * the input would count as a delimiter, and the caller
+		 * scanning text up to '<' would stop on it without
+		 * making progress, looping forever.
+		 */
+		if (b[*pend] != '\0' &&
+		    strchr(charset, b[*pend]) != NULL)
+			break;
+		++*pend;
+	}
+	if (*pend == rlen) {
+		b[rlen] = '\0';
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Scan one DocBook file and update the parent-child statistics.
+ * Simplified version from parse.c.
+ *
+ * The file is read from fd in chunks into a fixed buffer and split
+ * into words: element names, attribute names, attribute values,
+ * and runs of text.  Each opening tag is counted with table_add()
+ * below the innermost open element and then pushed onto the stack;
+ * closing and self-closing tags pop the stack again.  Tags starting
+ * with '!' or '?' are ignored.  A run of text inside an element is
+ * counted as a child called "TEXT".
+ * A tag word cut off by the end of the buffer is moved to the front
+ * of the buffer and completed by the next read(2).
+ *
+ * fname is only used to label the message printed if read(2) fails.
+ */
+static void
+parse_file(int fd, char *fname)
+{
+ char b[4096];
+ ssize_t rsz; /* Return value from read(2). */
+ size_t rlen; /* Number of bytes in b[]. */
+ size_t poff; /* Parse offset in b[]. */
+ size_t pend; /* Offset of the end of the current word. */
+ int in_tag, in_arg, in_quotes, elem_end;
+
+ rlen = 0;
+ in_tag = in_arg = in_quotes = 0;
+ /* One byte of b[] stays free for the NUL that advance() may store at b[rlen]. */
+ while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
+ /* Nothing left over and nothing new was read: done. */
+ if ((rlen += rsz) == 0)
+ break;
+ pend = 0;
+ for (;;) {
+ /* All buffered data has been consumed. */
+ if ((poff = pend) == rlen)
+ break;
+ /* Skip whitespace in front of the next word. */
+ if (isspace((unsigned char)b[pend])) {
+ pend++;
+ continue;
+ }
+ if (in_arg) {
+ /* After '=': parse an attribute value. */
+ if (in_quotes == 0 && b[pend] == '"') {
+ in_quotes = 1;
+ pend++;
+ continue;
+ }
+ /*
+ * If the word is cut off by the end of the buffer and
+ * the last read(2) still returned data, fetch more
+ * before parsing it.
+ */
+ if (advance(b, rlen, &pend,
+ in_quotes ? "\"" : " >") && rsz > 0)
+ break;
+ in_arg = in_quotes = elem_end = 0;
+ if (b[pend] == '>') {
+ in_tag = 0;
+ /* "/>" closes the element right away. */
+ if (pend > 0 && b[pend - 1] == '/') {
+ b[pend - 1] = '\0';
+ elem_end = 1;
+ }
+ }
+ /* Terminate the word and step over the delimiter. */
+ b[pend] = '\0';
+ if (pend < rlen)
+ pend++;
+ if (elem_end)
+ stack_pop(NULL);
+ } else if (in_tag) {
+ /* Inside a tag, after the element name: attribute names. */
+ if (advance(b, rlen, &pend, " =>") && rsz > 0)
+ break;
+ elem_end = 0;
+ switch (b[pend]) {
+ case '>':
+ in_tag = 0;
+ if (pend > 0 && b[pend - 1] == '/') {
+ b[pend - 1] = '\0';
+ elem_end = 1;
+ }
+ break;
+ case '=':
+ /* The next word is the value of this attribute. */
+ in_arg = 1;
+ break;
+ default:
+ break;
+ }
+ b[pend] = '\0';
+ if (pend < rlen)
+ pend++;
+ if (elem_end)
+ stack_pop(NULL);
+ } else if (b[poff] == '<') {
+ /* Start of a tag: isolate the element name. */
+ if (advance(b, rlen, &pend, " >") && rsz > 0)
+ break;
+ elem_end = 0;
+ /* Anything but '>' means that the tag continues after the name. */
+ if (b[pend] != '>')
+ in_tag = 1;
+ else if (pend > 0 && b[pend - 1] == '/') {
+ b[pend - 1] = '\0';
+ elem_end = 1;
+ }
+ b[pend] = '\0';
+ if (pend < rlen)
+ pend++;
+ /* Step over '<'; a following '/' marks a closing tag. */
+ if (b[++poff] == '/') {
+ elem_end = 1;
+ poff++;
+ } else if (b[poff] != '!' && b[poff] != '?') {
+ /*
+ * Opening tag: count it below the innermost open
+ * element, or below "" at top level, then enter it.
+ */
+ table_add(stacki > 0 ?
+ stack[stacki - 1] : "",
+ b + poff);
+ stack_push(b + poff);
+ }
+ if (elem_end)
+ stack_pop(b + poff);
+ } else {
+ /*
+ * Text up to the next '<' or the end of the buffer.
+ * A run of text continuing in the next chunk is
+ * counted again for that chunk.
+ */
+ advance(b, rlen, &pend, "<");
+ if (stacki > 0)
+ table_add(stack[stacki - 1], "TEXT");
+ }
+ }
+ /*
+ * poff is the start of the unfinished word, if any.
+ * It must not be at offset 0, so a single tag word
+ * that does not fit into b[] trips this assertion.
+ */
+ assert(poff > 0);
+ /* Keep the unparsed remainder at the front for the next read(2). */
+ memmove(b, b + poff, rlen - poff);
+ rlen -= poff;
+ }
+ if (rsz < 0)
+ perror(fname);
+}
+
+/*
+ * Read file names from standard input, one per line, parse each
+ * file, and finally print one line per parent-child relation:
+ * count, parent name, and child name, separated by tabs.
+ * A file that cannot be opened, a read error on standard input,
+ * and an input without any file names are all fatal.
+ */
+int
+main(int argc, char *argv[])
+{
+	char	*line;
+	size_t	 linesz, n;
+	ssize_t	 len;
+	int	 fd;
+
+	line = NULL;
+	fd = -1;
+
+	/* Handle the input files one after the other. */
+	for (;;) {
+		len = getline(&line, &linesz, stdin);
+		if (len == -1)
+			break;
+		/* Strip the trailing newline, if there is one. */
+		if (line[len - 1] == '\n')
+			line[len - 1] = '\0';
+		fd = open(line, O_RDONLY, 0);
+		if (fd == -1)
+			err(1, "%s", line);
+		parse_file(fd, line);
+		close(fd);
+	}
+
+	/* Cleanup and error handling. */
+	free(line);
+	if (ferror(stdin))
+		err(1, "standard input");
+	/* fd still has its initial value if no line was ever read. */
+	if (fd == -1)
+		errx(1, "No input file names found on standard input");
+
+	/* Print the collected statistics. */
+	n = 0;
+	while (n < tablei) {
+		printf("%d\t%s\t%s\n", table[n].count,
+		    table[n].parent, table[n].child);
+		n++;
+	}
+	return 0;
+}
Index: Makefile
===================================================================
RCS file: /home/cvs/mdocml/docbook2mdoc/Makefile,v
retrieving revision 1.21
retrieving revision 1.22
diff -LMakefile -LMakefile -u -p -r1.21 -r1.22
--- Makefile
+++ Makefile
@@ -12,6 +12,9 @@ all: docbook2mdoc
docbook2mdoc: $(OBJS)
$(CC) -o $@ $(OBJS)
+# Developer utility: link it from the object file listed as the
+# prerequisite, instead of compiling statistics.c a second time.
+statistics: statistics.o
+	$(CC) -o $@ statistics.o
+
www: index.html docbook2mdoc.1.html docbook2mdoc-$(VERSION).tgz README.txt
install: all
@@ -52,6 +55,7 @@ README.txt: README
clean:
rm -f docbook2mdoc $(OBJS) docbook2mdoc.core
+ rm -f statistics statistics.o
rm -rf docbook2mdoc.dSYM
rm -f index.html docbook2mdoc.1.html README.txt
rm -f docbook2mdoc-$(VERSION).tgz
--
To unsubscribe send an email to source+unsubscribe@mandoc.bsd.lv
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2019-03-29 15:55 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-29 15:55 docbook2mdoc: Add a utility for docbook2mdoc developers to collect schwarze
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).