tech@mandoc.bsd.lv
 help / color / mirror / Atom feed
From: Ingo Schwarze <schwarze@usta.de>
To: tech@mdocml.bsd.lv
Subject: Re: mandocdb: full set of search types
Date: Wed, 16 Nov 2011 17:59:35 +0100	[thread overview]
Message-ID: <20111116165935.GO31182@iris.usta.de> (raw)
In-Reply-To: <4EC3093E.3030504@bsd.lv>

Hi Kristaps,

Kristaps Dzonsons wrote on Wed, Nov 16, 2011 at 01:52:14AM +0100:
> On 16/11/2011 01:39, Ingo Schwarze wrote:

[...]
>> Before enabling mandocdb in pkg_add(8), i'd like to get the database
>> format complete, such that we don't force people to rebuild the
>> databases after upgrading to new snapshots.
>>
>> Here is a patch defining TYPE_ flags for all macros that i can
>> imagine might be worth searching for (maybe even a few more, but i'd
>> rather have too many than too few).  I have left out all obsolete
>> macros and most physical formatting macros.
>>
>> This requires switching the mask to 64 bits.  During the switch,
>> i have replaced the very error-prone handling of the key database
>> values by a new struct db_val; some memcpy() calls by normal
>> assignments; and some magical constants by sizeof() constructs.

> Just a word or two before I sleep.  This approach is sound and I have
> no issues after a quick look over the patch, but I will wait until
> tomorrow to review it in earnest.  But first, before mandocdb goes into
> production, the database should be checked for endian-neutrality.

That's still TODO.

However, that doesn't prevent getting the TYPE_* flags right first,
so here is an updated patch, to be applied on top of the man.conf
patch I sent just before this one.

OK?
  Ingo


--- apropos_db.c.orig
+++ apropos_db.c
@@ -19,6 +19,7 @@
 #include <fcntl.h>
 #include <regex.h>
 #include <stdarg.h>
+#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
@@ -41,7 +42,7 @@ struct	rectree {
 struct	expr {
 	int		 regex;
 	int		 index;
-	int	 	 mask;
+	uint64_t 	 mask;
 	int		 and;
 	char		*v;
 	regex_t	 	 re;
@@ -49,22 +50,47 @@ struct	expr {
 };
 
 struct	type {
-	int		 mask;
+	uint64_t	 mask;
 	const char	*name;
 };
 
 static	const struct type types[] = {
 	{ TYPE_An, "An" },
+	{ TYPE_Ar, "Ar" },
+	{ TYPE_At, "At" },
+	{ TYPE_Bsx, "Bsx" },
+	{ TYPE_Bx, "Bx" },
 	{ TYPE_Cd, "Cd" },
+	{ TYPE_Cm, "Cm" },
+	{ TYPE_Dv, "Dv" },
+	{ TYPE_Dx, "Dx" },
+	{ TYPE_Em, "Em" },
 	{ TYPE_Er, "Er" },
 	{ TYPE_Ev, "Ev" },
+	{ TYPE_Fa, "Fa" },
+	{ TYPE_Fl, "Fl" },
 	{ TYPE_Fn, "Fn" },
 	{ TYPE_Fn, "Fo" },
+	{ TYPE_Ft, "Ft" },
+	{ TYPE_Fx, "Fx" },
+	{ TYPE_Ic, "Ic" },
 	{ TYPE_In, "In" },
+	{ TYPE_Lb, "Lb" },
+	{ TYPE_Li, "Li" },
+	{ TYPE_Lk, "Lk" },
+	{ TYPE_Ms, "Ms" },
+	{ TYPE_Mt, "Mt" },
 	{ TYPE_Nd, "Nd" },
 	{ TYPE_Nm, "Nm" },
+	{ TYPE_Nx, "Nx" },
+	{ TYPE_Ox, "Ox" },
 	{ TYPE_Pa, "Pa" },
+	{ TYPE_Rs, "Rs" },
+	{ TYPE_Sh, "Sh" },
+	{ TYPE_Ss, "Ss" },
 	{ TYPE_St, "St" },
+	{ TYPE_Sy, "Sy" },
+	{ TYPE_Tn, "Tn" },
 	{ TYPE_Va, "Va" },
 	{ TYPE_Va, "Vt" },
 	{ TYPE_Xr, "Xr" },
@@ -74,9 +100,9 @@ static	const struct type types[] = {
 
 static	DB	*btree_open(void);
 static	int	 btree_read(const DBT *, const struct mchars *, char **);
-static	int	 exprexecpre(const struct expr *, const char *, int);
+static	int	 exprexecpre(const struct expr *, const char *, uint64_t);
 static	void	 exprexecpost(const struct expr *, 
-			const char *, int, int *, size_t);
+			const char *, uint64_t, int *, size_t);
 static	struct expr *exprterm(char *, int, int);
 static	DB	*index_open(void);
 static	int	 index_read(const DBT *, const DBT *, 
@@ -381,7 +407,7 @@ single_search(struct rectree *tree, const struct opts *opts,
 		const struct expr *expr, size_t terms,
 		struct mchars *mc)
 {
-	int		 root, leaf, mask;
+	int		 root, leaf;
 	DBT		 key, val;
 	DB		*btree, *idx;
 	int		 ch;
@@ -389,6 +415,7 @@ single_search(struct rectree *tree, const struct opts *opts,
 	recno_t		 rec;
 	struct rec	*recs;
 	struct rec	 srec;
+	struct db_val	*vbuf;
 
 	root	= -1;
 	leaf	= -1;
@@ -412,21 +439,19 @@ single_search(struct rectree *tree, const struct opts *opts,
 		 * The key must have something in it, and the value must
 		 * have the correct tags/recno mix.
 		 */
-		if (key.size < 2 || 8 != val.size) 
+		if (key.size < 2 || sizeof(struct db_val) != val.size) 
 			break;
 		if ( ! btree_read(&key, mc, &buf))
 			break;
 
-		mask = *(int *)val.data;
-
 		/*
 		 * See if this keyword record matches any of the
 		 * expressions we have stored.
 		 */
-		if ( ! exprexecpre(expr, buf, mask))
+		vbuf = val.data;
+		if ( ! exprexecpre(expr, buf, vbuf->mask))
 			continue;
-
-		memcpy(&rec, val.data + 4, sizeof(recno_t));
+		rec = vbuf->rec;
 
 		/*
 		 * O(log n) scan for prior records.  Since a record
@@ -445,7 +470,7 @@ single_search(struct rectree *tree, const struct opts *opts,
 		if (leaf >= 0 && recs[leaf].rec == rec) {
 			if (0 == recs[leaf].matches[0])
 				exprexecpost
-					(expr, buf, mask, 
+					(expr, buf, vbuf->mask, 
 					 recs[leaf].matches, terms);
 			continue;
 		}
@@ -478,7 +503,7 @@ single_search(struct rectree *tree, const struct opts *opts,
 			mandoc_calloc(terms + 1, sizeof(int));
 
 		exprexecpost
-			(expr, buf, mask, 
+			(expr, buf, vbuf->mask, 
 			 recs[tree->len].matches, terms);
 
 		/* Append to our tree. */
@@ -642,7 +667,7 @@ exprfree(struct expr *p)
  * Return 1 if any expression evaluates to true, else 0.
  */
 static int
-exprexecpre(const struct expr *p, const char *cp, int mask)
+exprexecpre(const struct expr *p, const char *cp, uint64_t mask)
 {
 
 	for ( ; NULL != p; p = p->next) {
@@ -666,7 +691,7 @@ exprexecpre(const struct expr *p, const char *cp, int mask)
  */
 static void
 exprexecpost(const struct expr *e, const char *cp, 
-		int mask, int *matches, size_t matchsz)
+		uint64_t mask, int *matches, size_t matchsz)
 {
 	const struct expr *p;
 	int		   match;
--- mandocdb.8.orig
+++ mandocdb.8
@@ -48,9 +48,13 @@ The arguments are as follows:
 .It Fl a
 Use all directories and files found below
 .Ar dir ... .
-By default, directories and files
-.Xr man 1
-cannot find will be silently skipped.
+By default, only files matching
+.Sm off
+.Sy man Ar section Li /
+.Op Ar arch Li /
+.Ar title . section
+.Sm on
+will be used.
 .It Fl d Ar dir
 Merge (remove and re-add)
 .Ar
--- mandocdb.c.orig
+++ mandocdb.c
@@ -79,7 +79,7 @@ static	void		  buf_append(struct buf *, const char *);
 static	void		  buf_appendb(struct buf *, 
 				const void *, size_t);
 static	void		  dbt_put(DB *, const char *, DBT *, DBT *);
-static	void		  hash_put(DB *, const struct buf *, int);
+static	void		  hash_put(DB *, const struct buf *, uint64_t);
 static	void		  hash_reset(DB **);
 static	void		  index_merge(const struct of *, struct mparse *,
 				struct buf *, struct buf *,
@@ -257,11 +257,11 @@ mandocdb(int argc, char *argv[])
 			*db, /* keyword database */
 			*hash; /* temporary keyword hashtable */
 	BTREEINFO	 info; /* btree configuration */
-	recno_t		 maxrec; /* supremum of all records */
-	recno_t		*recs; /* buffer of empty records */
+	recno_t		 maxrec; /* last record number in the index */
+	recno_t		*recs; /* the numbers of all empty records */
 	size_t		 sz1, sz2,
-			 recsz, /* buffer size of recs */
-			 reccur; /* valid number of recs */
+			 recsz, /* number of allocated slots in recs */
+			 reccur; /* current number of empty records */
 	struct buf	 buf, /* keyword buffer */
 			 dbuf; /* description buffer */
 	struct of	*of; /* list of files for processing */
@@ -348,7 +348,7 @@ mandocdb(int argc, char *argv[])
 		if (NULL == db) {
 			perror(fbuf);
 			exit((int)MANDOCLEVEL_SYSERR);
-		} else if (NULL == db) {
+		} else if (NULL == idx) {
 			perror(ibuf);
 			exit((int)MANDOCLEVEL_SYSERR);
 		}
@@ -410,7 +410,7 @@ mandocdb(int argc, char *argv[])
 		if (NULL == db) {
 			perror(fbuf);
 			exit((int)MANDOCLEVEL_SYSERR);
-		} else if (NULL == db) {
+		} else if (NULL == idx) {
 			perror(ibuf);
 			exit((int)MANDOCLEVEL_SYSERR);
 		}
@@ -473,7 +473,7 @@ index_merge(const struct of *of, struct mparse *mp,
 	const char	*fn, *msec, *mtitle, *arch;
 	size_t		 sv;
 	unsigned	 seq;
-	char		 vbuf[8];
+	struct db_val	 vbuf;
 
 	for (rec = 0; of; of = of->next) {
 		fn = of->fname;
@@ -499,9 +499,9 @@ index_merge(const struct of *of, struct mparse *mp,
 			continue;
 
 		/*
-		 * Make sure the manual section and architecture
-		 * agree with the directory where the file is located
-		 * or man(1) will not be able to find it.
+		 * By default, skip a file if the manual section
+		 * and architecture given in the file disagree
+		 * with the directory where the file is located.
 		 */
 
 		msec = NULL != mdoc ? 
@@ -527,9 +527,10 @@ index_merge(const struct of *of, struct mparse *mp,
 			arch = "";
 
 		/* 
-		 * Case is relevant for man(1), so use the file name
-		 * instead of the (usually) all caps page title,
-		 * if the two agree.
+		 * By default, skip a file if the title given
+		 * in the file disagrees with the file name.
+		 * If both agree, use the file name as the title,
+		 * because the one in the file usually is all caps.
 		 */
 
 		mtitle = NULL != mdoc ? 
@@ -571,17 +572,15 @@ index_merge(const struct of *of, struct mparse *mp,
 		 * Copy from the in-memory hashtable of pending keywords
 		 * into the database.
 		 */
-		
-		memset(vbuf, 0, sizeof(uint32_t));
-		memcpy(vbuf + 4, &rec, sizeof(uint32_t));
 
+		vbuf.rec = rec;
 		seq = R_FIRST;
 		while (0 == (ch = (*hash->seq)(hash, &key, &val, seq))) {
 			seq = R_NEXT;
 
-			memcpy(vbuf, val.data, sizeof(uint32_t));
-			val.size = sizeof(vbuf);
-			val.data = vbuf;
+			vbuf.mask = *(uint64_t *)val.data;
+			val.size = sizeof(struct db_val);
+			val.data = &vbuf;
 
 			if (verb > 1)
 				printf("%s: Added keyword: %s\n", 
@@ -626,6 +625,7 @@ index_prune(const struct of *ofile, DB *db, const char *dbf,
 {
 	const struct of	*of;
 	const char	*fn;
+	struct db_val	*vbuf;
 	unsigned	 seq, sseq;
 	DBT		 key, val;
 	size_t		 reccur;
@@ -658,8 +658,9 @@ index_prune(const struct of *ofile, DB *db, const char *dbf,
 		sseq = R_FIRST;
 		while (0 == (ch = (*db->seq)(db, &key, &val, sseq))) {
 			sseq = R_NEXT;
-			assert(8 == val.size);
-			if (*maxrec != *(recno_t *)(val.data + 4))
+			assert(sizeof(struct db_val) == val.size);
+			vbuf = val.data;
+			if (*maxrec != vbuf->rec)
 				continue;
 			if (verb)
 				printf("%s: Deleted keyword: %s\n", 
@@ -1040,7 +1041,7 @@ pmdoc_Nm(MDOC_ARGS)
 }
 
 static void
-hash_put(DB *db, const struct buf *buf, int mask)
+hash_put(DB *db, const struct buf *buf, uint64_t mask)
 {
 	DBT		 key, val;
 	int		 rc;
@@ -1055,10 +1056,10 @@ hash_put(DB *db, const struct buf *buf, int mask)
 		perror("hash");
 		exit((int)MANDOCLEVEL_SYSERR);
 	} else if (0 == rc)
-		mask |= *(int *)val.data;
+		mask |= *(uint64_t *)val.data;
 
 	val.data = &mask;
-	val.size = sizeof(int); 
+	val.size = sizeof(uint64_t); 
 
 	if ((rc = (*db->put)(db, &key, &val, 0)) < 0) {
 		perror("hash");
@@ -1228,7 +1229,9 @@ ofile_argbuild(char *argv[], int argc, int use_all, int verb,
 	for (i = 0; i < argc; i++) {
 
 		/*
-		 * Analyze the path.
+		 * Try to infer the manual section, architecture and
+		 * page title from the path, assuming it looks like
+		 *   man*[/<arch>]/<title>.<section>
 		 */
 
 		if (strlcpy(buf, argv[i], sizeof(buf)) >= sizeof(buf)) {
@@ -1322,8 +1325,8 @@ ofile_dirbuild(const char *dir, const char* psec, const char *parch,
 			arch = parch;
 
 			/*
-	 		 * Don't bother parsing directories
-			 * that man(1) won't find.
+			 * By default, only use directories called:
+			 *   man<section>/[<arch>/]
 			 */
 
 			if (NULL == sec) {
@@ -1363,7 +1366,9 @@ ofile_dirbuild(const char *dir, const char* psec, const char *parch,
 			continue;
 
 		/*
-		 * Don't bother parsing files that man(1) won't find.
+		 * By default, skip files where the file name suffix
+		 * does not agree with the section directory
+		 * they are located in.
 		 */
 
 		suffix = strrchr(fn, '.');
@@ -1389,6 +1394,12 @@ ofile_dirbuild(const char *dir, const char* psec, const char *parch,
 			nof->sec = mandoc_strdup(psec);
 		if (NULL != parch)
 			nof->arch = mandoc_strdup(parch);
+
+		/*
+		 * Remember the file name without the extension,
+		 * to be used as the page title in the database.
+		 */
+
 		if (NULL != suffix)
 			*suffix = '\0';
 		nof->title = mandoc_strdup(fn);
--- mandocdb.h.orig
+++ mandocdb.h
@@ -15,18 +15,49 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+struct db_val {
+	uint64_t	mask;
+	uint32_t	rec;
+};
+
 #define	MANDOC_DB	"mandoc.db"
 #define	MANDOC_IDX	"mandoc.index"
 
-#define	TYPE_An		0x01
-#define	TYPE_Cd		0x02
-#define	TYPE_Er		0x04
-#define	TYPE_Ev		0x08
-#define	TYPE_Fn		0x10
-#define	TYPE_In		0x20
-#define	TYPE_Nd		0x40
-#define	TYPE_Nm		0x100
-#define	TYPE_Pa		0x200
-#define	TYPE_St		0x400
-#define	TYPE_Va		0x1000
-#define	TYPE_Xr		0x2000
+#define	TYPE_An		0x0000000000000001ULL
+#define	TYPE_Ar		0x0000000000000002ULL
+#define	TYPE_At		0x0000000000000004ULL
+#define	TYPE_Bsx	0x0000000000000008ULL
+#define	TYPE_Bx         0x0000000000000010ULL
+#define	TYPE_Cd		0x0000000000000020ULL
+#define	TYPE_Cm		0x0000000000000040ULL
+#define	TYPE_Dv		0x0000000000000080ULL
+#define	TYPE_Dx		0x0000000000000100ULL
+#define	TYPE_Em		0x0000000000000200ULL
+#define	TYPE_Er		0x0000000000000400ULL
+#define	TYPE_Ev		0x0000000000000800ULL
+#define	TYPE_Fa		0x0000000000001000ULL
+#define	TYPE_Fl		0x0000000000002000ULL
+#define	TYPE_Fn		0x0000000000004000ULL
+#define	TYPE_Ft		0x0000000000008000ULL
+#define	TYPE_Fx		0x0000000000010000ULL
+#define	TYPE_Ic		0x0000000000020000ULL
+#define	TYPE_In		0x0000000000040000ULL
+#define	TYPE_Lb		0x0000000000080000ULL
+#define	TYPE_Li		0x0000000000100000ULL
+#define	TYPE_Lk		0x0000000000200000ULL
+#define	TYPE_Ms		0x0000000000400000ULL
+#define	TYPE_Mt		0x0000000000800000ULL
+#define	TYPE_Nd		0x0000000001000000ULL
+#define	TYPE_Nm		0x0000000002000000ULL
+#define	TYPE_Nx		0x0000000004000000ULL
+#define	TYPE_Ox		0x0000000008000000ULL
+#define	TYPE_Pa		0x0000000010000000ULL
+#define	TYPE_Rs		0x0000000020000000ULL
+#define	TYPE_Sh		0x0000000040000000ULL
+#define	TYPE_Ss		0x0000000080000000ULL
+#define	TYPE_St		0x0000000100000000ULL
+#define	TYPE_Sy		0x0000000200000000ULL
+#define	TYPE_Tn		0x0000000400000000ULL
+#define	TYPE_Va		0x0000000800000000ULL
+#define	TYPE_Vt		0x0000001000000000ULL
+#define	TYPE_Xr		0x0000002000000000ULL
--
 To unsubscribe send an email to tech+unsubscribe@mdocml.bsd.lv

      parent reply	other threads:[~2011-11-16 16:59 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-11-16  0:39 Ingo Schwarze
2011-11-16  0:52 ` Kristaps Dzonsons
2011-11-16  1:50   ` Ingo Schwarze
2011-11-16 16:59   ` Ingo Schwarze [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20111116165935.GO31182@iris.usta.de \
    --to=schwarze@usta.de \
    --cc=tech@mdocml.bsd.lv \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).