From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from localhost (fantadrom.bsd.lv [local]) by fantadrom.bsd.lv (OpenSMTPD) with ESMTPA id 778eadda for ; Fri, 29 Mar 2019 10:55:59 -0500 (EST) Date: Fri, 29 Mar 2019 10:55:59 -0500 (EST) X-Mailinglist: mandoc-source Reply-To: source@mandoc.bsd.lv MIME-Version: 1.0 From: schwarze@mandoc.bsd.lv To: source@mandoc.bsd.lv Subject: docbook2mdoc: Add a utility for docbook2mdoc developers to collect X-Mailer: activitymail 1.26, http://search.cpan.org/dist/activitymail/ Content-Type: text/plain; charset=utf-8 Message-ID: Log Message: ----------- Add a utility for docbook2mdoc developers to collect element usage and parenting statistics, to help decide which nodes should be most urgently worked on. Modified Files: -------------- docbook2mdoc: Makefile Added Files: ----------- docbook2mdoc: statistics.c Revision Data ------------- --- /dev/null +++ statistics.c @@ -0,0 +1,288 @@ +/* $Id: statistics.c,v 1.1 2019/03/29 15:55:28 schwarze Exp $ */ +/* + * Copyright (c) 2019 Ingo Schwarze + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include <assert.h> +#include <ctype.h> +#include <err.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +/* + * Count parent-child element relations in a corpus of DocBook documents. 
/*
 * Read absolute or relative input file names from standard input,
 * one per line.
 * For each parent-child relation, print the total number of occurrences,
 * the parent name, and the child name, separated by tab characters
 * and followed by a newline character.
 *
 * Typical usage:
 *   statistics < filenames.txt | sort -n
 *   statistics < filenames.txt | grep '\' | sort -n
 */

/*
 * One parent-child relation and the number of times it was seen.
 * Both names are heap-allocated copies owned by the table.
 */
struct entry {
	char	*parent;	/* Name of the enclosing element. */
	char	*child;		/* Name of the contained element. */
	int	 count;		/* Number of occurrences so far. */
};

static struct entry	 *table;	/* All relations seen so far. */
static size_t		  tablesz;	/* Number of slots allocated. */
static size_t		  tablei;	/* Number of slots in use. */

static char		**stack;	/* Names of the open elements. */
static size_t		  stacksz;	/* Number of slots allocated. */
static size_t		  stacki;	/* Number of slots in use. */


/*
 * Return a heap-allocated copy of the string s.
 * Only ISO C library functions are used, so this also builds
 * in environments that do not declare strdup(3).
 * Exits the program if memory is exhausted.
 */
static char *
name_dup(const char *s)
{
	char	*cp;
	size_t	 sz;

	sz = strlen(s) + 1;
	if ((cp = malloc(sz)) == NULL)
		err(1, NULL);
	memcpy(cp, s, sz);
	return cp;
}

/*
 * Resize the array ptr to hold nmemb members of size bytes each
 * and return the new location.
 * This replaces reallocarray(3), which is not available everywhere,
 * and keeps its protection against multiplication overflow.
 * Exits the program on overflow or if memory is exhausted.
 */
static void *
array_grow(void *ptr, size_t nmemb, size_t size)
{
	void	*np;

	if (size > 0 && nmemb > (size_t)-1 / size)
		errx(1, "allocation size overflow");
	if ((np = realloc(ptr, nmemb * size)) == NULL)
		err(1, NULL);
	return np;
}

/*
 * Count one instance of a parent-child relation.
 * The strings are copied, so the caller keeps ownership of both
 * arguments and may reuse its buffers afterwards.
 */
static void
table_add(const char *parent, const char *child)
{
	size_t	 i;

	/* If the table entry already exists, increment its count. */

	for (i = 0; i < tablei; i++) {
		if (strcmp(parent, table[i].parent) == 0 &&
		    strcmp(child, table[i].child) == 0) {
			table[i].count++;
			return;
		}
	}

	/* If the table is full, make room. */

	if (tablei == tablesz) {
		table = array_grow(table, tablesz + 64, sizeof(*table));
		tablesz += 64;
	}

	/* Add a new entry to the table. */

	table[tablei].parent = name_dup(parent);
	table[tablei].child = name_dup(child);
	table[tablei++].count = 1;
}

/*
 * Enter an element: push a copy of its name onto the stack.
 */
static void
stack_push(const char *name)
{
	if (stacki == stacksz) {
		stack = array_grow(stack, stacksz + 8, sizeof(*stack));
		stacksz += 8;
	}
	stack[stacki++] = name_dup(name);
}

/*
 * Exit an element: pop the innermost open element.
 * If name is NULL, pop unconditionally.
 * Otherwise, pop only if name matches the innermost open element;
 * a mismatching name leaves the stack unchanged.
 * Popping from an empty stack does nothing.
 */
static void
stack_pop(const char *name)
{
	if (stacki == 0)
		return;
	if (name != NULL && strcmp(name, stack[stacki - 1]) != 0)
		return;
	free(stack[--stacki]);
	stack[stacki] = NULL;	/* Do not keep a dangling pointer around. */
}

/*
 * Simplified version from parse.c.
 */
+ */ +static int +advance(char *b, size_t rlen, size_t *pend, const char *charset) +{ + int space; + + if (*charset == ' ') { + space = 1; + charset++; + } else + space = 0; + + while (*pend < rlen) { + if (space && isspace((unsigned char)b[*pend])) + break; + if (strchr(charset, b[*pend]) != NULL) + break; + ++*pend; + } + if (*pend == rlen) { + b[rlen] = '\0'; + return 1; + } else + return 0; +} + +/* + * Simplified version from parse.c. + */ +static void +parse_file(int fd, char *fname) +{ + char b[4096]; + ssize_t rsz; /* Return value from read(2). */ + size_t rlen; /* Number of bytes in b[]. */ + size_t poff; /* Parse offset in b[]. */ + size_t pend; /* Offset of the end of the current word. */ + int in_tag, in_arg, in_quotes, elem_end; + + rlen = 0; + in_tag = in_arg = in_quotes = 0; + while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) { + if ((rlen += rsz) == 0) + break; + pend = 0; + for (;;) { + if ((poff = pend) == rlen) + break; + if (isspace((unsigned char)b[pend])) { + pend++; + continue; + } + if (in_arg) { + if (in_quotes == 0 && b[pend] == '"') { + in_quotes = 1; + pend++; + continue; + } + if (advance(b, rlen, &pend, + in_quotes ? 
"\"" : " >") && rsz > 0) + break; + in_arg = in_quotes = elem_end = 0; + if (b[pend] == '>') { + in_tag = 0; + if (pend > 0 && b[pend - 1] == '/') { + b[pend - 1] = '\0'; + elem_end = 1; + } + } + b[pend] = '\0'; + if (pend < rlen) + pend++; + if (elem_end) + stack_pop(NULL); + } else if (in_tag) { + if (advance(b, rlen, &pend, " =>") && rsz > 0) + break; + elem_end = 0; + switch (b[pend]) { + case '>': + in_tag = 0; + if (pend > 0 && b[pend - 1] == '/') { + b[pend - 1] = '\0'; + elem_end = 1; + } + break; + case '=': + in_arg = 1; + break; + default: + break; + } + b[pend] = '\0'; + if (pend < rlen) + pend++; + if (elem_end) + stack_pop(NULL); + } else if (b[poff] == '<') { + if (advance(b, rlen, &pend, " >") && rsz > 0) + break; + elem_end = 0; + if (b[pend] != '>') + in_tag = 1; + else if (pend > 0 && b[pend - 1] == '/') { + b[pend - 1] = '\0'; + elem_end = 1; + } + b[pend] = '\0'; + if (pend < rlen) + pend++; + if (b[++poff] == '/') { + elem_end = 1; + poff++; + } else if (b[poff] != '!' && b[poff] != '?') { + table_add(stacki > 0 ? + stack[stacki - 1] : "", + b + poff); + stack_push(b + poff); + } + if (elem_end) + stack_pop(b + poff); + } else { + advance(b, rlen, &pend, "<"); + if (stacki > 0) + table_add(stack[stacki - 1], "TEXT"); + } + } + assert(poff > 0); + memmove(b, b + poff, rlen - poff); + rlen -= poff; + } + if (rsz < 0) + perror(fname); +} + +int +main(int argc, char *argv[]) +{ + char *fname; + size_t fsz, i; + ssize_t rsz; + int fd; + + fd = -1; + fname = NULL; + + /* Loop over input files. */ + while ((rsz = getline(&fname, &fsz, stdin)) != -1) { + if (fname[rsz - 1] == '\n') + fname[--rsz] = '\0'; + if ((fd = open(fname, O_RDONLY, 0)) == -1) + err(1, "%s", fname); + parse_file(fd, fname); + close(fd); + } + + /* Cleanup and error handling. */ + free(fname); + if (ferror(stdin)) + err(1, "standard input"); + if (fd == -1) + errx(1, "No input file names found on standard input"); + + /* Dump results. 
*/ + for (i = 0; i < tablei; i++) + printf("%d\t%s\t%s\n", table[i].count, + table[i].parent, table[i].child); + return 0; +} Index: Makefile =================================================================== RCS file: /home/cvs/mdocml/docbook2mdoc/Makefile,v retrieving revision 1.21 retrieving revision 1.22 diff -LMakefile -LMakefile -u -p -r1.21 -r1.22 --- Makefile +++ Makefile @@ -12,6 +12,9 @@ all: docbook2mdoc docbook2mdoc: $(OBJS) $(CC) -o $@ $(OBJS) +statistics: statistics.o + $(CC) -o $@ statistics.c + www: index.html docbook2mdoc.1.html docbook2mdoc-$(VERSION).tgz README.txt install: all @@ -52,6 +55,7 @@ README.txt: README clean: rm -f docbook2mdoc $(OBJS) docbook2mdoc.core + rm -f statistics statistics.o rm -rf docbook2mdoc.dSYM rm -f index.html docbook2mdoc.1.html README.txt rm -f docbook2mdoc-$(VERSION).tgz -- To unsubscribe send an email to source+unsubscribe@mandoc.bsd.lv