tech@mandoc.bsd.lv
 help / color / mirror / Atom feed
* Improve mandocdb catpage/man heuristics.
@ 2011-12-27 23:57 Kristaps Dzonsons
  2011-12-28  1:27 ` Ingo Schwarze
  0 siblings, 1 reply; 2+ messages in thread
From: Kristaps Dzonsons @ 2011-12-27 23:57 UTC (permalink / raw)
  To: tech

[-- Attachment #1: Type: text/plain, Size: 583 bytes --]

Hi,

This improves the mandocdb(8) catpage heuristic to, well, more or less 
as good as it's going to get.  It now reads multiple lines into a 
buffer, joining the lines with a space.

While here, I removed the 70-character limit.  I recoded this into 
apropos.c's and whatis.c's printf(3) statements.  We should really 
consider a better way: if not 70-char, to the COLUMN limit?

Lastly, I added an extra man(7) heuristic for separating names and 
descriptions, namely the \-\- I observed in some POD manuals.  This 
cleans up "apropos -s 3p ~.*" quite a lot.

Thoughts?

Kristaps

[-- Attachment #2: patch.txt --]
[-- Type: text/plain, Size: 4147 bytes --]

Index: apropos.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/apropos.c,v
retrieving revision 1.24
diff -u -r1.24 apropos.c
--- apropos.c	12 Dec 2011 02:00:49 -0000	1.24
+++ apropos.c	27 Dec 2011 23:50:39 -0000
@@ -127,11 +127,11 @@
 	qsort(res, sz, sizeof(struct res), cmp);
 
 	for (i = 0; i < (int)sz; i++)
-		printf("%s(%s%s%s) - %s\n", res[i].title,
+		printf("%s(%s%s%s) - %.*s\n", res[i].title,
 				res[i].cat,
 				*res[i].arch ? "/" : "",
 				*res[i].arch ? res[i].arch : "",
-				res[i].desc);
+				70, res[i].desc);
 }
 
 static int
Index: mandocdb.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandocdb.c,v
retrieving revision 1.41
diff -u -r1.41 mandocdb.c
--- mandocdb.c	25 Dec 2011 19:31:25 -0000	1.41
+++ mandocdb.c	27 Dec 2011 23:50:39 -0000
@@ -23,6 +23,7 @@
 #include <sys/types.h>
 
 #include <assert.h>
+#include <ctype.h>
 #include <dirent.h>
 #include <fcntl.h>
 #include <getopt.h>
@@ -129,8 +130,8 @@
 static	void		  ofile_dirbuild(const char *, const char *,
 				const char *, int, struct of **);
 static	void		  ofile_free(struct of *);
-static	void		  pformatted(DB *, struct buf *, struct buf *,
-				const struct of *);
+static	void		  pformatted(DB *, struct buf *, 
+				struct buf *, const struct of *);
 static	int		  pman_node(MAN_ARGS);
 static	void		  pmdoc_node(MDOC_ARGS);
 static	int		  pmdoc_head(MDOC_ARGS);
@@ -1319,6 +1320,8 @@
 
 			if (0 == strncmp(start, "-", 1))
 				start += 1;
+			else if (0 == strncmp(start, "\\-\\-", 4))
+				start += 4;
 			else if (0 == strncmp(start, "\\-", 2))
 				start += 2;
 			else if (0 == strncmp(start, "\\(en", 4))
@@ -1349,12 +1352,12 @@
  * By necessity, this involves rather crude guesswork.
  */
 static void
-pformatted(DB *hash, struct buf *buf, struct buf *dbuf,
-		 const struct of *of)
+pformatted(DB *hash, struct buf *buf, 
+		struct buf *dbuf, const struct of *of)
 {
 	FILE		*stream;
-	char		*line, *p;
-	size_t		 len, plen;
+	char		*line, *p, *title;
+	size_t		 len, plen, titlesz;
 
 	if (NULL == (stream = fopen(of->fname, "r"))) {
 		if (warnings)
@@ -1387,6 +1390,32 @@
 	while (NULL != (line = fgetln(stream, &len)))
 		if ('\n' != *line && ' ' != *line)
 			break;
+	
+	/*
+	 * Read up until the next section into a buffer.
+	 * Strip the leading and trailing newline from each read line,
+	 * appending a trailing space.
+	 * Ignore empty (whitespace-only) lines.
+	 */
+
+	titlesz = 0;
+	title = NULL;
+
+	while (NULL != (line = fgetln(stream, &len))) {
+		if (' ' != *line || '\n' != line[(int)len - 1])
+			break;
+		while (len > 0 && isspace((unsigned char)*line)) {
+			line++;
+			len--;
+		}
+		if (1 == len)
+			continue;
+		title = mandoc_realloc(title, titlesz + len);
+		memcpy(title + titlesz, line, len);
+		titlesz += len;
+		title[(int)titlesz - 1] = ' ';
+	}
+
 
 	/*
 	 * If no page content can be found, or the input line
@@ -1395,18 +1424,19 @@
 	 * description.
 	 */
 
-	line = fgetln(stream, &len);
-	if (NULL == line || ' ' != *line || '\n' != line[(int)len - 1]) {
+	if (NULL == title || '\0' == *title) {
 		if (warnings)
 			fprintf(stderr, "%s: cannot find NAME section\n",
 					of->fname);
 		buf_appendb(dbuf, buf->cp, buf->size);
 		hash_put(hash, buf, TYPE_Nd);
 		fclose(stream);
+		free(title);
 		return;
 	}
 
-	line[(int)--len] = '\0';
+	title = mandoc_realloc(title, titlesz + 1);
+	title[(int)titlesz] = '\0';
 
 	/*
 	 * Skip to the first dash.
@@ -1414,20 +1444,17 @@
 	 * bytes).
 	 */
 
-	if (NULL != (p = strstr(line, "- "))) {
+	if (NULL != (p = strstr(title, "- "))) {
 		for (p += 2; ' ' == *p || '\b' == *p; p++)
 			/* Skip to next word. */ ;
 	} else {
 		if (warnings)
 			fprintf(stderr, "%s: no dash in title line\n",
 					of->fname);
-		p = line;
+		p = title;
 	}
 
-	if ((plen = strlen(p)) > 70) {
-		plen = 70;
-		p[plen] = '\0';
-	}
+	plen = strlen(p);
 
 	/* Strip backspace-encoding from line. */
 
@@ -1446,6 +1473,7 @@
 	buf_appendb(buf, p, plen + 1);
 	hash_put(hash, buf, TYPE_Nd);
 	fclose(stream);
+	free(title);
 }
 
 static void

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: Improve mandocdb catpage/man heuristics.
  2011-12-27 23:57 Improve mandocdb catpage/man heuristics Kristaps Dzonsons
@ 2011-12-28  1:27 ` Ingo Schwarze
  0 siblings, 0 replies; 2+ messages in thread
From: Ingo Schwarze @ 2011-12-28  1:27 UTC (permalink / raw)
  To: tech

Hi Kristaps,

these are clear improvements, so i have committed them to OpenBSD.

Kristaps Dzonsons wrote on Wed, Dec 28, 2011 at 01:57:44AM +0200:

> This improves the mandocdb(8) catpage heuristic to, well, more or
> less as good as it's going to get.  It now reads multiple lines into
> a buffer, joining the lines with a space.

Actually, some man(7) pages have a similar problem, see for
example curs_extend(3) which contains

  .SH NAME
  \fBcurses_version\fP,
  \fBuse_extended_names\fP \- miscellaneous curses extensions

These need to take multiple lines into account, too, but maybe
a simpler algorithm than in pformatted is sufficient:

  After .SH NAME, skip all lines until you find "- ",
  then use everything until EOL as .Nd.

> While here, I removed the 70-character limit.

Yes, i'm not married to that.  Probably it's wrong to have this
at all.  I only put it in when first writing the code to avoid
getting distracted from the main tasks by badly formatted corner
case pages.  Now that the basic infrastructure is in place, we
can figure out whether such corner cases really exist, and how
many of them, and what to do about them.  Truncating is a very
naive - well, i hardly dare say "solution".

> I recoded this into apropos.c's and whatis.c's printf(3)
> statements.

Already an improvement.

> We should really
> consider a better way: if not 70-char, to the COLUMN limit?

Not sure yet, open for suggestions.
I'm not even sure how big the problem is...

> Lastly, I added an extra man(7) heuristic for separating names and
> descriptions, namely the \-\- I observed in some POD manuals.  This
> cleans up "apropos -s 3p ~.*" quite a lot.

Maybe man(7) can use similar heuristics as cat, just assuming
that "- " starts the description?

Thanks,
  Ingo
--
 To unsubscribe send an email to tech+unsubscribe@mdocml.bsd.lv

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2011-12-28  1:27 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-12-27 23:57 Improve mandocdb catpage/man heuristics Kristaps Dzonsons
2011-12-28  1:27 ` Ingo Schwarze

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).