tech@mandoc.bsd.lv
 help / color / mirror / Atom feed
* Improve mandocdb catpage/man heuristics.
@ 2011-12-27 23:57 Kristaps Dzonsons
  2011-12-28  1:27 ` Ingo Schwarze
  0 siblings, 1 reply; 2+ messages in thread
From: Kristaps Dzonsons @ 2011-12-27 23:57 UTC (permalink / raw)
  To: tech

[-- Attachment #1: Type: text/plain, Size: 583 bytes --]

Hi,

This improves the mandocdb(8) catpage heuristic to, well, more or less 
as good as it's going to get.  It now reads multiple lines into a 
buffer, joining the lines with a space.

While here, I removed the 70-character limit.  I recoded this into 
apropos.c's and whatis.c's printf(3) statements.  We should really 
consider a better way: if not 70 characters, then the COLUMNS limit?

Lastly, I added an extra man(7) heuristic for separating names and 
descriptions, namely the \-\- I observed in some POD manuals.  This 
cleans up "apropos -s 3p ~.*" quite a lot.

Thoughts?

Kristaps

[-- Attachment #2: patch.txt --]
[-- Type: text/plain, Size: 4147 bytes --]

Index: apropos.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/apropos.c,v
retrieving revision 1.24
diff -u -r1.24 apropos.c
--- apropos.c	12 Dec 2011 02:00:49 -0000	1.24
+++ apropos.c	27 Dec 2011 23:50:39 -0000
@@ -127,11 +127,11 @@
 	qsort(res, sz, sizeof(struct res), cmp);
 
 	for (i = 0; i < (int)sz; i++)
-		printf("%s(%s%s%s) - %s\n", res[i].title,
+		printf("%s(%s%s%s) - %.*s\n", res[i].title,
 				res[i].cat,
 				*res[i].arch ? "/" : "",
 				*res[i].arch ? res[i].arch : "",
-				res[i].desc);
+				70, res[i].desc);
 }
 
 static int
Index: mandocdb.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandocdb.c,v
retrieving revision 1.41
diff -u -r1.41 mandocdb.c
--- mandocdb.c	25 Dec 2011 19:31:25 -0000	1.41
+++ mandocdb.c	27 Dec 2011 23:50:39 -0000
@@ -23,6 +23,7 @@
 #include <sys/types.h>
 
 #include <assert.h>
+#include <ctype.h>
 #include <dirent.h>
 #include <fcntl.h>
 #include <getopt.h>
@@ -129,8 +130,8 @@
 static	void		  ofile_dirbuild(const char *, const char *,
 				const char *, int, struct of **);
 static	void		  ofile_free(struct of *);
-static	void		  pformatted(DB *, struct buf *, struct buf *,
-				const struct of *);
+static	void		  pformatted(DB *, struct buf *, 
+				struct buf *, const struct of *);
 static	int		  pman_node(MAN_ARGS);
 static	void		  pmdoc_node(MDOC_ARGS);
 static	int		  pmdoc_head(MDOC_ARGS);
@@ -1319,6 +1320,8 @@
 
 			if (0 == strncmp(start, "-", 1))
 				start += 1;
+			else if (0 == strncmp(start, "\\-\\-", 4))
+				start += 4;
 			else if (0 == strncmp(start, "\\-", 2))
 				start += 2;
 			else if (0 == strncmp(start, "\\(en", 4))
@@ -1349,12 +1352,12 @@
  * By necessity, this involves rather crude guesswork.
  */
 static void
-pformatted(DB *hash, struct buf *buf, struct buf *dbuf,
-		 const struct of *of)
+pformatted(DB *hash, struct buf *buf, 
+		struct buf *dbuf, const struct of *of)
 {
 	FILE		*stream;
-	char		*line, *p;
-	size_t		 len, plen;
+	char		*line, *p, *title;
+	size_t		 len, plen, titlesz;
 
 	if (NULL == (stream = fopen(of->fname, "r"))) {
 		if (warnings)
@@ -1387,6 +1390,32 @@
 	while (NULL != (line = fgetln(stream, &len)))
 		if ('\n' != *line && ' ' != *line)
 			break;
+	
+	/*
+	 * Read up until the next section into a buffer.
+	 * Strip leading whitespace from each line read and replace its
+	 * trailing newline with a single space.
+	 * Ignore empty (whitespace-only) lines.
+	 */
+
+	titlesz = 0;
+	title = NULL;
+
+	while (NULL != (line = fgetln(stream, &len))) {
+		if (' ' != *line || '\n' != line[(int)len - 1])
+			break;
+		while (len > 0 && isspace((unsigned char)*line)) {
+			line++;
+			len--;
+		}
+		if (1 == len)
+			continue;
+		title = mandoc_realloc(title, titlesz + len);
+		memcpy(title + titlesz, line, len);
+		titlesz += len;
+		title[(int)titlesz - 1] = ' ';
+	}
+
 
 	/*
 	 * If no page content can be found, or the input line
@@ -1395,18 +1424,19 @@
 	 * description.
 	 */
 
-	line = fgetln(stream, &len);
-	if (NULL == line || ' ' != *line || '\n' != line[(int)len - 1]) {
+	if (NULL == title || '\0' == *title) {
 		if (warnings)
 			fprintf(stderr, "%s: cannot find NAME section\n",
 					of->fname);
 		buf_appendb(dbuf, buf->cp, buf->size);
 		hash_put(hash, buf, TYPE_Nd);
 		fclose(stream);
+		free(title);
 		return;
 	}
 
-	line[(int)--len] = '\0';
+	title = mandoc_realloc(title, titlesz + 1);
+	title[(int)titlesz] = '\0';
 
 	/*
 	 * Skip to the first dash.
@@ -1414,20 +1444,17 @@
 	 * bytes).
 	 */
 
-	if (NULL != (p = strstr(line, "- "))) {
+	if (NULL != (p = strstr(title, "- "))) {
 		for (p += 2; ' ' == *p || '\b' == *p; p++)
 			/* Skip to next word. */ ;
 	} else {
 		if (warnings)
 			fprintf(stderr, "%s: no dash in title line\n",
 					of->fname);
-		p = line;
+		p = title;
 	}
 
-	if ((plen = strlen(p)) > 70) {
-		plen = 70;
-		p[plen] = '\0';
-	}
+	plen = strlen(p);
 
 	/* Strip backspace-encoding from line. */
 
@@ -1446,6 +1473,7 @@
 	buf_appendb(buf, p, plen + 1);
 	hash_put(hash, buf, TYPE_Nd);
 	fclose(stream);
+	free(title);
 }
 
 static void

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2011-12-28  1:27 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-12-27 23:57 Improve mandocdb catpage/man heuristics Kristaps Dzonsons
2011-12-28  1:27 ` Ingo Schwarze

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).