tech@mandoc.bsd.lv
 help / color / mirror / Atom feed
* mandocdb: handle formatted manuals
@ 2011-11-19  0:56 Ingo Schwarze
  2011-11-24 10:25 ` Kristaps Dzonsons
  0 siblings, 1 reply; 5+ messages in thread
From: Ingo Schwarze @ 2011-11-19  0:56 UTC (permalink / raw)
  To: tech

Hi,

right, extracting information from formatted manuals is a rather
dirty business and never going to be that reliable, but there is
no choice:  Sometimes, nothing else is available, and we have to
deal with it.  Of course, on OpenBSD, we could leave that dirty
work to espie@'s OpenBSD::Makewhatis perl modules, but i'd rather
have a portable solution, and i'd rather not have makewhatis(8)
split into two pieces.  I still hope that mandocdb(8) can replace
makewhatis(8) completely (except for the pkg_add(1)/pkg_delete(1)/
pkg_create(1)-integration of course, which is not going to be
portable given how different pkg_add and pkgsrc are).

So here is what i did on my train ride from the p2k11 ports hackathon
in Budapest back to Karlsruhe (including the one hour lockup in
Hegyeshalom when the locomotive stopped working, grrr):

 * Even without -a, walk the cat* dirs in addition to man*.
 * Only use those cats where men^Wmans are not available
   because mans are just greater than cats.

There is still a lot of room for improvement; several features of
OpenBSD::Makewhatis are not yet implemented.  However, this is
already working in most respects, and i'd like to put it in for
in-tree polishing.

Yours,
  Ingo

P.S.
I have seen Joerg's patch flurry.
Thanks for doing that work!
I'll look at all of them and do the integration
as soon as i find time.


Index: mandocdb.c
===================================================================
RCS file: /cvs/src/usr.bin/mandoc/mandocdb.c,v
retrieving revision 1.9
diff -u -p -r1.9 mandocdb.c
--- mandocdb.c	17 Nov 2011 15:38:27 -0000	1.9
+++ mandocdb.c	19 Nov 2011 00:28:00 -0000
@@ -1,6 +1,7 @@
 /*	$Id: mandocdb.c,v 1.9 2011/11/17 15:38:27 schwarze Exp $ */
 /*
  * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
+ * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -15,6 +16,8 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 #include <sys/param.h>
+#include <sys/types.h>
+#include <sys/stat.h>
 
 #include <assert.h>
 #include <dirent.h>
@@ -36,6 +39,9 @@
 #define	MANDOC_BUFSZ	  BUFSIZ
 #define	MANDOC_SLOP	  1024
 
+#define	MANDOC_SRC	  0x1
+#define	MANDOC_FORM	  0x2
+
 /* Tiny list for files.  No need to bring in QUEUE. */
 
 struct	of {
@@ -43,6 +49,7 @@ struct	of {
 	char		 *sec;
 	char		 *arch;
 	char		 *title;
+	int		  src_form;
 	struct of	 *next; /* NULL for last one */
 	struct of	 *first; /* first in list */
 };
@@ -92,8 +99,11 @@ static	void		  index_prune(const struct 
 static	void		  ofile_argbuild(char *[], int, int, int,
 				struct of **);
 static	int		  ofile_dirbuild(const char *, const char *,
-				const char *, int, int, struct of **);
+				const char *, int, int, int,
+				struct of **);
 static	void		  ofile_free(struct of *);
+static	void		  pformatted(DB *, struct buf *, struct buf *,
+				const struct of *);
 static	int		  pman_node(MAN_ARGS);
 static	void		  pmdoc_node(MDOC_ARGS);
 static	void		  pmdoc_An(MDOC_ARGS);
@@ -424,7 +434,7 @@ mandocdb(int argc, char *argv[])
 		of = NULL;
 
 		if ( ! ofile_dirbuild(argv[i], NULL, NULL,
-				use_all, verb, &of)) 
+				0, use_all, verb, &of)) 
 			exit((int)MANDOCLEVEL_SYSERR);
 
 		if (NULL == of)
@@ -477,6 +487,11 @@ index_merge(const struct of *of, struct 
 
 	for (rec = 0; of; of = of->next) {
 		fn = of->fname;
+
+		/*
+		 * Reclaim an empty index record, if available.
+		 */
+
 		if (reccur > 0) {
 			--reccur;
 			rec = recs[(int)reccur];
@@ -488,15 +503,33 @@ index_merge(const struct of *of, struct 
 
 		mparse_reset(mp);
 		hash_reset(&hash);
+		mdoc = NULL;
+		man = NULL;
 
-		if (mparse_readfd(mp, -1, fn) >= MANDOCLEVEL_FATAL) {
-			fprintf(stderr, "%s: Parse failure\n", fn);
-			continue;
-		}
+		/*
+		 * Try interpreting the file as mdoc(7) or man(7)
+		 * source code, unless it is already known to be
+		 * formatted.  Fall back to formatted mode.
+		 */
 
-		mparse_result(mp, &mdoc, &man);
-		if (NULL == mdoc && NULL == man)
-			continue;
+		if ((MANDOC_SRC & of->src_form ||
+		    ! (MANDOC_FORM & of->src_form)) &&
+		    MANDOCLEVEL_FATAL > mparse_readfd(mp, -1, fn))
+			mparse_result(mp, &mdoc, &man);
+
+		if (NULL != mdoc) {
+			msec = mdoc_meta(mdoc)->msec;
+			arch = mdoc_meta(mdoc)->arch;
+			mtitle = mdoc_meta(mdoc)->title;
+		} else if (NULL != man) {
+			msec = man_meta(man)->msec;
+			arch = NULL;
+			mtitle = man_meta(man)->title;
+		} else {
+			msec = of->sec;
+			arch = of->arch;
+			mtitle = of->title;
+		}
 
 		/*
 		 * By default, skip a file if the manual section
@@ -504,11 +537,6 @@ index_merge(const struct of *of, struct 
 		 * with the directory where the file is located.
 		 */
 
-		msec = NULL != mdoc ? 
-			mdoc_meta(mdoc)->msec : man_meta(man)->msec;
-		arch = NULL != mdoc ? 
-			mdoc_meta(mdoc)->arch : NULL;
-
 		if (0 == use_all) {
 			assert(of->sec);
 			assert(msec);
@@ -533,9 +561,6 @@ index_merge(const struct of *of, struct 
 		 * because the one in the file usually is all caps.
 		 */
 
-		mtitle = NULL != mdoc ? 
-			mdoc_meta(mdoc)->title : man_meta(man)->title;
-
 		assert(of->title);
 		assert(mtitle);
 
@@ -565,8 +590,10 @@ index_merge(const struct of *of, struct 
 		if (mdoc)
 			pmdoc_node(hash, buf, dbuf,
 				mdoc_node(mdoc), mdoc_meta(mdoc));
-		else 
+		else if (man)
 			pman_node(hash, buf, dbuf, man_node(man));
+		else
+			pformatted(hash, buf, dbuf, of);
 
 		/*
 		 * Copy from the in-memory hashtable of pending keywords
@@ -1217,13 +1244,89 @@ pman_node(MAN_ARGS)
 	return(0);
 }
 
+/*
+ * Parse a formatted manual page.
+ * By necessity, this involves rather crude guesswork.
+ */
+static void
+pformatted(DB *hash, struct buf *buf, struct buf *dbuf,
+		 const struct of *of)
+{
+	FILE		*stream;
+	char		*line, *p;
+	size_t		 len, plen;
+
+	if (NULL == (stream = fopen(of->fname, "r"))) {
+		perror(of->fname);
+		return;
+	}
+
+	/*
+	 * Always use the title derived from the filename up front,
+	 * do not even try to find it in the file.  This also makes
+	 * sure we don't end up with an orphan index record, even if
+	 * the file content turns out to be completely unintelligible.
+	 */
+
+	buf->len = 0;
+	buf_append(buf, of->title);
+	hash_put(hash, buf, TYPE_Nm);
+
+	while (NULL != (line = fgetln(stream, &len)) && '\n' != *line)
+		/* Skip to first blank line. */ ;
+
+	while (NULL != (line = fgetln(stream, &len)) &&
+			('\n' == *line || ' ' == *line))
+		/* Skip to first section header. */ ;
+
+	/*
+	 * If no page content can be found,
+	 * reuse the page title as the page description.
+	 */
+
+	if (NULL == (line = fgetln(stream, &len))) {
+		buf_appendb(dbuf, buf->cp, buf->size);
+		hash_put(hash, buf, TYPE_Nd);
+		fclose(stream);
+		return;
+	}
+
+	/*
+	 * If there is a dash, skip to the text following it.
+	 */
+
+	for (p = line, plen = len; plen; p++, plen--)
+		if ('-' == *p)
+			break;
+	for ( ; plen; p++, plen--)
+		if ('-' != *p && ' ' != *p && 8 != *p)
+			break;
+	if (0 == plen) {
+		p = line;
+		plen = len;
+	}
+
+	/*
+	 * Copy the rest of the line, but no more than 70 bytes.
+	 */
+
+	if (70 < plen)
+		plen = 70;
+	p[plen-1] = '\0';
+	buf_appendb(dbuf, p, plen);
+	buf->len = 0;
+	buf_appendb(buf, p, plen);
+	hash_put(hash, buf, TYPE_Nd);
+	fclose(stream);	/* not earlier: line lives in stream memory */
+}
+
 static void
 ofile_argbuild(char *argv[], int argc, int use_all, int verb,
 		struct of **of)
 {
 	char		 buf[MAXPATHLEN];
 	char		*sec, *arch, *title, *p;
-	int		 i;
+	int		 i, src_form;
 	struct of	*nof;
 
 	for (i = 0; i < argc; i++) {
@@ -1231,7 +1334,8 @@ ofile_argbuild(char *argv[], int argc, i
 		/*
 		 * Try to infer the manual section, architecture and
 		 * page title from the path, assuming it looks like
-		 *   man*[/<arch>]/<title>.<section>
+		 *   man*[/<arch>]/<title>.<section>   or
+		 *   cat<section>[/<arch>]/<title>.0
 		 */
 
 		if (strlcpy(buf, argv[i], sizeof(buf)) >= sizeof(buf)) {
@@ -1239,11 +1343,16 @@ ofile_argbuild(char *argv[], int argc, i
 			continue;
 		}
 		sec = arch = title = NULL;
+		src_form = 0;
 		p = strrchr(buf, '\0');
 		while (p-- > buf) {
 			if (NULL == sec && '.' == *p) {
 				sec = p + 1;
 				*p = '\0';
+				if ('0' == *sec)
+					src_form |= MANDOC_FORM;
+				else if ('1' <= *sec && '9' >= *sec)
+					src_form |= MANDOC_SRC;
 				continue;
 			}
 			if ('/' != *p)
@@ -1253,8 +1362,13 @@ ofile_argbuild(char *argv[], int argc, i
 				*p = '\0';
 				continue;
 			}
-			if (strncmp("man", p + 1, 3))
+			if (strncmp("man", p + 1, 3)) {
+				src_form |= MANDOC_SRC;
+				arch = p + 1;
+			} else if (strncmp("cat", p + 1, 3)) {
+				src_form |= MANDOC_FORM;
 				arch = p + 1;
+			}
 			break;
 		}
 		if (NULL == title)
@@ -1271,6 +1385,7 @@ ofile_argbuild(char *argv[], int argc, i
 		if (NULL != arch)
 			nof->arch = mandoc_strdup(arch);
 		nof->title = mandoc_strdup(title);
+		nof->src_form = src_form;
 
 		/*
 		 * Add the structure to the list.
@@ -1299,15 +1414,17 @@ ofile_argbuild(char *argv[], int argc, i
  */
 static int
 ofile_dirbuild(const char *dir, const char* psec, const char *parch,
-		int use_all, int verb, struct of **of)
+		int p_src_form, int use_all, int verb, struct of **of)
 {
 	char		 buf[MAXPATHLEN];
+	struct stat	 sb;
 	size_t		 sz;
 	DIR		*d;
 	const char	*fn, *sec, *arch;
-	char		*suffix;
+	char		*p, *q, *suffix;
 	struct of	*nof;
 	struct dirent	*dp;
+	int		 src_form;
 
 	if (NULL == (d = opendir(dir))) {
 		perror(dir);
@@ -1320,19 +1437,26 @@ ofile_dirbuild(const char *dir, const ch
 		if ('.' == *fn)
 			continue;
 
+		src_form = p_src_form;
+
 		if (DT_DIR == dp->d_type) {
 			sec = psec;
 			arch = parch;
 
 			/*
 			 * By default, only use directories called:
-			 *   man<section>/[<arch>/]
+			 *   man<section>/[<arch>/]   or
+			 *   cat<section>/[<arch>/]
 			 */
 
 			if (NULL == sec) {
-				if(0 == strncmp("man", fn, 3))
+				if(0 == strncmp("man", fn, 3)) {
+					src_form |= MANDOC_SRC;
 					sec = fn + 3;
-				else if (use_all)
+				} else if (0 == strncmp("cat", fn, 3)) {
+					src_form |= MANDOC_FORM;
+					sec = fn + 3;
+				} else if (use_all)
 					sec = fn;
 				else
 					continue;
@@ -1356,7 +1480,7 @@ ofile_dirbuild(const char *dir, const ch
 				printf("%s: Scanning\n", buf);
 
 			if ( ! ofile_dirbuild(buf, sec, arch,
-					use_all, verb, of))
+					src_form, use_all, verb, of))
 				return(0);
 		}
 		if (DT_REG != dp->d_type ||
@@ -1375,8 +1499,56 @@ ofile_dirbuild(const char *dir, const ch
 		if (0 == use_all) {
 			if (NULL == suffix)
 				continue;
-			if (strcmp(suffix + 1, psec))
+			if ((MANDOC_SRC & src_form &&
+					 strcmp(suffix + 1, psec)) ||
+			    (MANDOC_FORM & src_form &&
+					 strcmp(suffix + 1, "0")))
+					continue;
+		}
+		if (NULL != suffix) {
+			if ('0' == suffix[1])
+				src_form |= MANDOC_FORM;
+			else if ('1' <= suffix[1] && '9' >= suffix[1])
+				src_form |= MANDOC_SRC;
+		}
+
+
+		/*
+		 * Skip formatted manuals if a source version is
+		 * available.  Ignore the age: it is very unlikely
+		 * that people install newer formatted base manuals
+		 * when they used to have source manuals before,
+		 * and in ports, old manuals get removed on update.
+		 */
+		if (0 == use_all && MANDOC_FORM & src_form &&
+				NULL != psec) {
+			buf[0] = '\0';
+			strlcat(buf, dir, MAXPATHLEN);
+			p = strrchr(buf, '/');
+			if (NULL == p)
+				p = buf;
+			else
+				p++;
+			if (0 == strncmp("cat", p, 3))
+				memcpy(p, "man", 3);
+			strlcat(buf, "/", MAXPATHLEN);
+			sz = strlcat(buf, fn, MAXPATHLEN);
+			if (sz >= MAXPATHLEN) {
+				fprintf(stderr, "%s: Path too long\n", buf);
 				continue;
+			}
+			q = strrchr(buf, '.');
+			if (NULL != q && p < q++) {
+				*q = '\0';
+				sz = strlcat(buf, psec, MAXPATHLEN);
+				if (sz >= MAXPATHLEN) {
+					fprintf(stderr,
+					    "%s: Path too long\n", buf);
+					continue;
+				}
+				if (0 == stat(buf, &sb))
+					continue;
+			}
 		}
 
 		buf[0] = '\0';
@@ -1385,7 +1557,7 @@ ofile_dirbuild(const char *dir, const ch
 		sz = strlcat(buf, fn, MAXPATHLEN);
 		if (sz >= MAXPATHLEN) {
 			fprintf(stderr, "%s: Path too long\n", dir);
-			return(0);
+			continue;
 		}
 
 		nof = mandoc_calloc(1, sizeof(struct of));
@@ -1394,6 +1566,7 @@ ofile_dirbuild(const char *dir, const ch
 			nof->sec = mandoc_strdup(psec);
 		if (NULL != parch)
 			nof->arch = mandoc_strdup(parch);
+		nof->src_form = src_form;
 
 		/*
 		 * Remember the file name without the extension,
@@ -1404,9 +1577,12 @@ ofile_dirbuild(const char *dir, const ch
 			*suffix = '\0';
 		nof->title = mandoc_strdup(fn);
 
+		/*
+		 * Add the structure to the list.
+		 */
+
 		if (verb > 2)
 			printf("%s: Scheduling\n", buf);
-
 		if (NULL == *of) {
 			*of = nof;
 			(*of)->first = nof;
--
 To unsubscribe send an email to tech+unsubscribe@mdocml.bsd.lv

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2011-11-26 12:41 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-11-19  0:56 mandocdb: handle formatted manuals Ingo Schwarze
2011-11-24 10:25 ` Kristaps Dzonsons
2011-11-26 11:54   ` Ingo Schwarze
2011-11-26 12:01     ` Kristaps Dzonsons
2011-11-26 12:41       ` Ingo Schwarze

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).