tech@mandoc.bsd.lv
 help / color / mirror / Atom feed
* mandocdb: full set of search types
@ 2011-11-16  0:39 Ingo Schwarze
  2011-11-16  0:52 ` Kristaps Dzonsons
  0 siblings, 1 reply; 4+ messages in thread
From: Ingo Schwarze @ 2011-11-16  0:39 UTC (permalink / raw)
  To: tech; +Cc: jmc

Hi,

we are close to going into production.  I have talked to espie@,
and he is d'accord with putting small pieces into
  /usr/src/libexec/makewhatis/Makewhatis.pm
to call out to mandocdb(8) for automated updates of the mandoc.db
files alongside the updates of the whatis.db files when installing
and removing packages, such that people running -current can easily
start testing the new apropos.

We will keep the old makewhatis/apropos-combo in place until the new
system is reasonably feature-complete and clearly better than the
old one, then install the new apropos in place of the old one and
finally remove the old components.

Before enabling mandocdb in pkg_add(8), i'd like to get the database
format complete, such that we don't force people to rebuild the
databases after upgrading to new snapshots.

Here is a patch defining TYPE_ flags for all macros that i can
imagine might be worth searching for (maybe even a few more, but i'd
rather have too many than too few).  I have left out all obsolete
macros and most physical formatting macros.

This requires switching the mask to 64 bits.  During the switch,
i have replaced the very error-prone handling of the key database
values by a new struct db_val; some memcpy() calls by normal
assignments; and some magical constants by sizeof() constructs.

OK to put this in, or do you see anything that is missing?


Right now, i will start to implement the new search types.

Yours,
  Ingo


--- apropos_db.c.orig
+++ apropos_db.c
@@ -19,6 +19,7 @@
 #include <fcntl.h>
 #include <regex.h>
 #include <stdarg.h>
+#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -34,28 +35,53 @@
 
 struct	expr {
 	int		 regex;
-	int	 	 mask;
+	uint64_t 	 mask;
 	char		*v;
 	regex_t	 	 re;
 };
 
 struct	type {
-	int		 mask;
+	uint64_t	 mask;
 	const char	*name;
 };
 
 static	const struct type types[] = {
 	{ TYPE_An, "An" },
+	{ TYPE_Ar, "Ar" },
+	{ TYPE_At, "At" },
+	{ TYPE_Bsx, "Bsx" },
+	{ TYPE_Bx, "Bx" },
 	{ TYPE_Cd, "Cd" },
+	{ TYPE_Cm, "Cm" },
+	{ TYPE_Dv, "Dv" },
+	{ TYPE_Dx, "Dx" },
+	{ TYPE_Em, "Em" },
 	{ TYPE_Er, "Er" },
 	{ TYPE_Ev, "Ev" },
+	{ TYPE_Fa, "Fa" },
+	{ TYPE_Fl, "Fl" },
 	{ TYPE_Fn, "Fn" },
 	{ TYPE_Fn, "Fo" },
+	{ TYPE_Ft, "Ft" },
+	{ TYPE_Fx, "Fx" },
+	{ TYPE_Ic, "Ic" },
 	{ TYPE_In, "In" },
+	{ TYPE_Lb, "Lb" },
+	{ TYPE_Li, "Li" },
+	{ TYPE_Lk, "Lk" },
+	{ TYPE_Ms, "Ms" },
+	{ TYPE_Mt, "Mt" },
 	{ TYPE_Nd, "Nd" },
 	{ TYPE_Nm, "Nm" },
+	{ TYPE_Nx, "Nx" },
+	{ TYPE_Ox, "Ox" },
 	{ TYPE_Pa, "Pa" },
+	{ TYPE_Rs, "Rs" },
+	{ TYPE_Sh, "Sh" },
+	{ TYPE_Ss, "Ss" },
 	{ TYPE_St, "St" },
+	{ TYPE_Sy, "Sy" },
+	{ TYPE_Tn, "Tn" },
 	{ TYPE_Va, "Va" },
 	{ TYPE_Va, "Vt" },
 	{ TYPE_Xr, "Xr" },
@@ -65,7 +91,7 @@ static	const struct type types[] = {
 
 static	DB	*btree_open(void);
 static	int	 btree_read(const DBT *, const struct mchars *, char **);
-static	int	 exprexec(const struct expr *, char *, int);
+static	int	 exprexec(const struct expr *, char *, uint64_t);
 static	DB	*index_open(void);
 static	int	 index_read(const DBT *, const DBT *, 
 			const struct mchars *, struct rec *);
@@ -328,6 +354,7 @@ apropos_search(const struct opts *opts, const struct expr *expr,
 	recno_t		 rec;
 	struct rec	*recs;
 	struct rec	 srec;
+	struct db_val	*vbuf;
 
 	root	= -1;
 	leaf	= -1;
@@ -357,15 +384,15 @@ apropos_search(const struct opts *opts, const struct expr *expr,
 		 * The key must have something in it, and the value must
 		 * have the correct tags/recno mix.
 		 */
-		if (key.size < 2 || 8 != val.size) 
+		if (key.size < 2 || sizeof(struct db_val) != val.size) 
 			break;
 		if ( ! btree_read(&key, mc, &buf))
 			break;
 
-		if ( ! exprexec(expr, buf, *(int *)val.data))
+		vbuf = val.data;
+		if ( ! exprexec(expr, buf, vbuf->mask))
 			continue;
-
-		memcpy(&rec, val.data + 4, sizeof(recno_t));
+		rec = vbuf->rec;
 
 		/*
 		 * O(log n) scan for prior records.  Since a record
@@ -524,7 +551,7 @@ exprfree(struct expr *p)
 }
 
 static int
-exprexec(const struct expr *p, char *cp, int mask)
+exprexec(const struct expr *p, char *cp, uint64_t mask)
 {
 
 	if ( ! (mask & p->mask))
--- mandocdb.c.orig
+++ mandocdb.c
@@ -77,7 +77,7 @@ static	void		  buf_append(struct buf *, const char *);
 static	void		  buf_appendb(struct buf *, 
 				const void *, size_t);
 static	void		  dbt_put(DB *, const char *, DBT *, DBT *);
-static	void		  hash_put(DB *, const struct buf *, int);
+static	void		  hash_put(DB *, const struct buf *, uint64_t);
 static	void		  hash_reset(DB **);
 static	void		  index_merge(const struct of *, struct mparse *,
 				struct buf *, struct buf *,
@@ -453,7 +453,7 @@ index_merge(const struct of *of, struct mparse *mp,
 	const char	*fn, *msec, *mtitle, *arch;
 	size_t		 sv;
 	unsigned	 seq;
-	char		 vbuf[8];
+	struct db_val	 vbuf;
 
 	for (rec = 0; of; of = of->next) {
 		fn = of->fname;
@@ -552,17 +552,15 @@ index_merge(const struct of *of, struct mparse *mp,
 		 * Copy from the in-memory hashtable of pending keywords
 		 * into the database.
 		 */
-		
-		memset(vbuf, 0, sizeof(uint32_t));
-		memcpy(vbuf + 4, &rec, sizeof(uint32_t));
 
+		vbuf.rec = rec;
 		seq = R_FIRST;
 		while (0 == (ch = (*hash->seq)(hash, &key, &val, seq))) {
 			seq = R_NEXT;
 
-			memcpy(vbuf, val.data, sizeof(uint32_t));
-			val.size = sizeof(vbuf);
-			val.data = vbuf;
+			vbuf.mask = *(uint64_t *)val.data;
+			val.size = sizeof(struct db_val);
+			val.data = &vbuf;
 
 			if (verb > 1)
 				printf("%s: Added keyword: %s\n", 
@@ -607,6 +605,7 @@ index_prune(const struct of *ofile, DB *db, const char *dbf,
 {
 	const struct of	*of;
 	const char	*fn;
+	struct db_val	*vbuf;
 	unsigned	 seq, sseq;
 	DBT		 key, val;
 	size_t		 reccur;
@@ -639,8 +638,9 @@ index_prune(const struct of *ofile, DB *db, const char *dbf,
 		sseq = R_FIRST;
 		while (0 == (ch = (*db->seq)(db, &key, &val, sseq))) {
 			sseq = R_NEXT;
-			assert(8 == val.size);
-			if (*maxrec != *(recno_t *)(val.data + 4))
+			assert(sizeof(struct db_val) == val.size);
+			vbuf = val.data;
+			if (*maxrec != vbuf->rec)
 				continue;
 			if (verb)
 				printf("%s: Deleted keyword: %s\n", 
@@ -1021,7 +1021,7 @@ pmdoc_Nm(MDOC_ARGS)
 }
 
 static void
-hash_put(DB *db, const struct buf *buf, int mask)
+hash_put(DB *db, const struct buf *buf, uint64_t mask)
 {
 	DBT		 key, val;
 	int		 rc;
@@ -1036,10 +1036,10 @@ hash_put(DB *db, const struct buf *buf, int mask)
 		perror("hash");
 		exit((int)MANDOCLEVEL_SYSERR);
 	} else if (0 == rc)
-		mask |= *(int *)val.data;
+		mask |= *(uint64_t *)val.data;
 
 	val.data = &mask;
-	val.size = sizeof(int); 
+	val.size = sizeof(uint64_t); 
 
 	if ((rc = (*db->put)(db, &key, &val, 0)) < 0) {
 		perror("hash");
--- mandocdb.h.orig
+++ mandocdb.h
@@ -15,18 +15,49 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+struct db_val {
+	uint64_t	mask;
+	uint32_t	rec;
+};
+
 #define	MANDOC_DB	"mandoc.db"
 #define	MANDOC_IDX	"mandoc.index"
 
-#define	TYPE_An		0x01
-#define	TYPE_Cd		0x02
-#define	TYPE_Er		0x04
-#define	TYPE_Ev		0x08
-#define	TYPE_Fn		0x10
-#define	TYPE_In		0x20
-#define	TYPE_Nd		0x40
-#define	TYPE_Nm		0x100
-#define	TYPE_Pa		0x200
-#define	TYPE_St		0x400
-#define	TYPE_Va		0x1000
-#define	TYPE_Xr		0x2000
+#define	TYPE_An		0x0000000000000001ULL
+#define	TYPE_Ar		0x0000000000000002ULL
+#define	TYPE_At		0x0000000000000004ULL
+#define	TYPE_Bsx	0x0000000000000008ULL
+#define	TYPE_Bx         0x0000000000000010ULL
+#define	TYPE_Cd		0x0000000000000020ULL
+#define	TYPE_Cm		0x0000000000000040ULL
+#define	TYPE_Dv		0x0000000000000080ULL
+#define	TYPE_Dx		0x0000000000000100ULL
+#define	TYPE_Em		0x0000000000000200ULL
+#define	TYPE_Er		0x0000000000000400ULL
+#define	TYPE_Ev		0x0000000000000800ULL
+#define	TYPE_Fa		0x0000000000001000ULL
+#define	TYPE_Fl		0x0000000000002000ULL
+#define	TYPE_Fn		0x0000000000004000ULL
+#define	TYPE_Ft		0x0000000000008000ULL
+#define	TYPE_Fx		0x0000000000010000ULL
+#define	TYPE_Ic		0x0000000000020000ULL
+#define	TYPE_In		0x0000000000040000ULL
+#define	TYPE_Lb		0x0000000000080000ULL
+#define	TYPE_Li		0x0000000000100000ULL
+#define	TYPE_Lk		0x0000000000200000ULL
+#define	TYPE_Ms		0x0000000000400000ULL
+#define	TYPE_Mt		0x0000000000800000ULL
+#define	TYPE_Nd		0x0000000001000000ULL
+#define	TYPE_Nm		0x0000000002000000ULL
+#define	TYPE_Nx		0x0000000004000000ULL
+#define	TYPE_Ox		0x0000000008000000ULL
+#define	TYPE_Pa		0x0000000010000000ULL
+#define	TYPE_Rs		0x0000000020000000ULL
+#define	TYPE_Sh		0x0000000040000000ULL
+#define	TYPE_Ss		0x0000000080000000ULL
+#define	TYPE_St		0x0000000100000000ULL
+#define	TYPE_Sy		0x0000000200000000ULL
+#define	TYPE_Tn		0x0000000400000000ULL
+#define	TYPE_Va		0x0000000800000000ULL
+#define	TYPE_Vt		0x0000001000000000ULL
+#define	TYPE_Xr		0x0000002000000000ULL
--
 To unsubscribe send an email to tech+unsubscribe@mdocml.bsd.lv

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: mandocdb: full set of search types
  2011-11-16  0:39 mandocdb: full set of search types Ingo Schwarze
@ 2011-11-16  0:52 ` Kristaps Dzonsons
  2011-11-16  1:50   ` Ingo Schwarze
  2011-11-16 16:59   ` Ingo Schwarze
  0 siblings, 2 replies; 4+ messages in thread
From: Kristaps Dzonsons @ 2011-11-16  0:52 UTC (permalink / raw)
  To: tech

On 16/11/2011 01:39, Ingo Schwarze wrote:
> Hi,
>
> we are close to going into production.  I have talked to espie@,
> and he is d'accord with putting small pieces into
>    /usr/src/libexec/makewhatis/Makewhatis.pm
> to call out to mandocdb(8) for automated updates of the mandoc.db
> files alongside the updates of the whatis.db files when installing
> and removing packages, such that people running -current can easily
> start testing the new apropos.
>
> We will keep the old makewhatis/apropos-combo in place until the new
> system is reasonably feature-complete and clearly better than the
> old one, then install the new apropos in place of the old one and
> finally remove the old components.
>
> Before enabling mandocdb in pkg_add(8), i'd like to get the database
> format complete, such that we don't force people to rebuild the
> databases after upgrading to new snapshots.
>
> Here is a patch defining TYPE_ flags for all macros that i can
> imagine might be worth searching for (maybe even a few more, but i'd
> rather have too many than too few).  I have left out all obsolete
> macros and most physical formatting macros.
>
> This requires switching the mask to 64 bits.  During the switch,
> i have replaced the very error-prone handling of the key database
> values by a new struct db_val; some memcpy() calls by normal
> assignments; and some magical constants by sizeof() constructs.
>
> OK to put this in, or do you see anything that is missing?
>
>
> Right now, i will start to implement the new search types.

Hi Ingo,

Just a word or two before I sleep.  This approach is sound and I've no 
issues with a quick look over the patch, but will wait til tomorrow to 
do so in earnest.  But first, before mandocdb gets production, the 
database should be checked for endian-neutrality.

Second, we'd long ago mentioned splitting SYNOPSIS-invoked macros (Fl, 
Fn, etc.) for querying on their SYNOPSIS or non-SYNOPSIS usage.  I think 
an elegant method is to encode the section within the keyword database, 
which allows for

  apropos Fn~mdoc -a -s SYNOPSIS

or whatever `-s' replacement operator.  How does that sound?  This 
sounds a lot more reasonable than encoding separate Fn, Nm, etc. macros 
for SYNOPSIS and non-SYNOPSIS invocation.

Thoughts?

Kristaps
--
 To unsubscribe send an email to tech+unsubscribe@mdocml.bsd.lv

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: mandocdb: full set of search types
  2011-11-16  0:52 ` Kristaps Dzonsons
@ 2011-11-16  1:50   ` Ingo Schwarze
  2011-11-16 16:59   ` Ingo Schwarze
  1 sibling, 0 replies; 4+ messages in thread
From: Ingo Schwarze @ 2011-11-16  1:50 UTC (permalink / raw)
  To: tech

Hi Kristaps,

Kristaps Dzonsons wrote on Wed, Nov 16, 2011 at 01:52:14AM +0100:

> Just a word or two before I sleep.  This approach is sound and I've
> no issues with a quick look over the patch, but will wait til
> tomorrow to do so in earnest.

About the same for your logical operations:  The approach looks
nice, and i will merge it tomorrow and read it in detail.

> But first, before mandocdb gets production, the database should be
> checked for endian-neutrality.

Hm, i never thought about that.

> Second, we'd long ago mentioned splitting SYNOPSIS-invoked macros
> (Fl, Fn, etc.) for querying on their SYNOPSIS or non-SYNOPSIS usage.
> I think an elegant method is to encode the section within the
> keyword database, which allows for
> 
>  apropos Fn~mdoc -a -s SYNOPSIS

Not -s, since -s is the other section (grrr).

The database field would have to be a bitmask, or we would
multiply the size of the database.

The syntax is not completely logical, as the -s SYNOPSIS
is a qualifier for the mdoc query string, not a stand-alone
query phrase.

  apropos SYNOPSIS:Fn=mdoc

would be more logical.  If you really want -o, you have to say:

  apropos SYNOPSIS:any=mdoc -o Fn=mdoc

Your proposal causes ambiguities:

  apropos Fn=mdoc -a -s SYNOPSIS -a Nm=man

Is that:

  apropos SYNOPSIS:Fn=mdoc -a Nm=man
  apropos Fn=mdoc -a SYNOPSIS:Nm=man
  apropos SYNOPSIS:Fn=mdoc -a SYNOPSIS:Nm=man

And even worse, what the heck is:

  apropos Fn=mdoc -o -s SYNOPSIS

My proposal also has a quirk.  Consider:

  .Sh SYNOPSIS
  .Nm foo
  .Ar mdoc
  .Sh DESCRIPTION
  .Nm mdoc

That would match SYNOPSIS:Nm=mdoc.
But that's unfixable, unless we drop the whole bitfield approach,
which will make the database size explode.
Or we could use a bitfield of the size 20 (sections) times
40 (macros) = 800 bits = 100 bytes, which is also very big.

> or whatever `-s' replacement operator.  How does that sound?  This
> sounds a lot more reasonable than encoding separate Fn, Nm, etc.
> macros for SYNOPSIS and non-SYNOPSIS invocation.

Yes, in particular since you will be looking for other macros
in other sections:  SEE ALSO:Xr  FILES:Pa  AUTHORS:An
STANDARDS:St HISTORY:Bx DIAGNOSTICS:Er.  And atypical queries
may occasionally make sense, like STANDARDS:Fl.
Hand-picking combinations seems like unreasonable implementation
effort and not at all user-friendly.

Maybe we don't need section restrictions at all.
The only use for section restrictions would be controlling
noise in searches - or do you see other uses?
But seriously, how much noise do you expect from Nm outside
SYNOPSIS, Xr outside SEE ALSO, Pa outside FILES, and so on?
I expect little, because macros are rare outside their
typical sections.  On top of that, some of these atypical
occurrences will contribute to the signal, so i'd recommend
that people not use section restrictions by default, but
only switch them on when drowning in noise - and then,
honestly, it's not even likely to help much.

So i'd probably suggest to not implement section restrictions
right now, but reconsider this in a year or two, when we have
a better feeling how the new apropos will actually be used.
The database format is not set in stone for eternity, i just
don't want to announce public availability and then gratuitously
break the format the very next week.

Yours,
  Ingo
--
 To unsubscribe send an email to tech+unsubscribe@mdocml.bsd.lv

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: mandocdb: full set of search types
  2011-11-16  0:52 ` Kristaps Dzonsons
  2011-11-16  1:50   ` Ingo Schwarze
@ 2011-11-16 16:59   ` Ingo Schwarze
  1 sibling, 0 replies; 4+ messages in thread
From: Ingo Schwarze @ 2011-11-16 16:59 UTC (permalink / raw)
  To: tech

Hi Kristaps,

Kristaps Dzonsons wrote on Wed, Nov 16, 2011 at 01:52:14AM +0100:
> On 16/11/2011 01:39, Ingo Schwarze wrote:

[...]
>> Before enabling mandocdb in pkg_add(8), i'd like to get the database
>> format complete, such that we don't force people to rebuild the
>> databases after upgrading to new snapshots.
>>
>> Here is a patch defining TYPE_ flags for all macros that i can
>> imagine might be worth searching for (maybe even a few more, but i'd
>> rather have too many than too few).  I have left out all obsolete
>> macros and most physical formatting macros.
>>
>> This requires switching the mask to 64 bits.  During the switch,
>> i have replaced the very error-prone handling of the key database
>> values by a new struct db_val; some memcpy() calls by normal
>> assignments; and some magical constants by sizeof() constructs.

> Just a word or two before I sleep.  This approach is sound and I've
> no issues with a quick look over the patch, but will wait til
> tomorrow to do so in earnest.  But first, before mandocdb gets
> production, the database should be checked for endian-neutrality.

That's still TODO.

However, that doesn't prevent getting the TYPE_* flags right first,
so here is an updated patch, to be applied on top of my man.conf
stuff sent right before.

OK?
  Ingo


--- apropos_db.c.orig
+++ apropos_db.c
@@ -19,6 +19,7 @@
 #include <fcntl.h>
 #include <regex.h>
 #include <stdarg.h>
+#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
@@ -41,7 +42,7 @@ struct	rectree {
 struct	expr {
 	int		 regex;
 	int		 index;
-	int	 	 mask;
+	uint64_t 	 mask;
 	int		 and;
 	char		*v;
 	regex_t	 	 re;
@@ -49,22 +50,47 @@ struct	expr {
 };
 
 struct	type {
-	int		 mask;
+	uint64_t	 mask;
 	const char	*name;
 };
 
 static	const struct type types[] = {
 	{ TYPE_An, "An" },
+	{ TYPE_Ar, "Ar" },
+	{ TYPE_At, "At" },
+	{ TYPE_Bsx, "Bsx" },
+	{ TYPE_Bx, "Bx" },
 	{ TYPE_Cd, "Cd" },
+	{ TYPE_Cm, "Cm" },
+	{ TYPE_Dv, "Dv" },
+	{ TYPE_Dx, "Dx" },
+	{ TYPE_Em, "Em" },
 	{ TYPE_Er, "Er" },
 	{ TYPE_Ev, "Ev" },
+	{ TYPE_Fa, "Fa" },
+	{ TYPE_Fl, "Fl" },
 	{ TYPE_Fn, "Fn" },
 	{ TYPE_Fn, "Fo" },
+	{ TYPE_Ft, "Ft" },
+	{ TYPE_Fx, "Fx" },
+	{ TYPE_Ic, "Ic" },
 	{ TYPE_In, "In" },
+	{ TYPE_Lb, "Lb" },
+	{ TYPE_Li, "Li" },
+	{ TYPE_Lk, "Lk" },
+	{ TYPE_Ms, "Ms" },
+	{ TYPE_Mt, "Mt" },
 	{ TYPE_Nd, "Nd" },
 	{ TYPE_Nm, "Nm" },
+	{ TYPE_Nx, "Nx" },
+	{ TYPE_Ox, "Ox" },
 	{ TYPE_Pa, "Pa" },
+	{ TYPE_Rs, "Rs" },
+	{ TYPE_Sh, "Sh" },
+	{ TYPE_Ss, "Ss" },
 	{ TYPE_St, "St" },
+	{ TYPE_Sy, "Sy" },
+	{ TYPE_Tn, "Tn" },
 	{ TYPE_Va, "Va" },
 	{ TYPE_Va, "Vt" },
 	{ TYPE_Xr, "Xr" },
@@ -74,9 +100,9 @@ static	const struct type types[] = {
 
 static	DB	*btree_open(void);
 static	int	 btree_read(const DBT *, const struct mchars *, char **);
-static	int	 exprexecpre(const struct expr *, const char *, int);
+static	int	 exprexecpre(const struct expr *, const char *, uint64_t);
 static	void	 exprexecpost(const struct expr *, 
-			const char *, int, int *, size_t);
+			const char *, uint64_t, int *, size_t);
 static	struct expr *exprterm(char *, int, int);
 static	DB	*index_open(void);
 static	int	 index_read(const DBT *, const DBT *, 
@@ -381,7 +407,7 @@ single_search(struct rectree *tree, const struct opts *opts,
 		const struct expr *expr, size_t terms,
 		struct mchars *mc)
 {
-	int		 root, leaf, mask;
+	int		 root, leaf;
 	DBT		 key, val;
 	DB		*btree, *idx;
 	int		 ch;
@@ -389,6 +415,7 @@ single_search(struct rectree *tree, const struct opts *opts,
 	recno_t		 rec;
 	struct rec	*recs;
 	struct rec	 srec;
+	struct db_val	*vbuf;
 
 	root	= -1;
 	leaf	= -1;
@@ -412,21 +439,19 @@ single_search(struct rectree *tree, const struct opts *opts,
 		 * The key must have something in it, and the value must
 		 * have the correct tags/recno mix.
 		 */
-		if (key.size < 2 || 8 != val.size) 
+		if (key.size < 2 || sizeof(struct db_val) != val.size) 
 			break;
 		if ( ! btree_read(&key, mc, &buf))
 			break;
 
-		mask = *(int *)val.data;
-
 		/*
 		 * See if this keyword record matches any of the
 		 * expressions we have stored.
 		 */
-		if ( ! exprexecpre(expr, buf, mask))
+		vbuf = val.data;
+		if ( ! exprexecpre(expr, buf, vbuf->mask))
 			continue;
-
-		memcpy(&rec, val.data + 4, sizeof(recno_t));
+		rec = vbuf->rec;
 
 		/*
 		 * O(log n) scan for prior records.  Since a record
@@ -445,7 +470,7 @@ single_search(struct rectree *tree, const struct opts *opts,
 		if (leaf >= 0 && recs[leaf].rec == rec) {
 			if (0 == recs[leaf].matches[0])
 				exprexecpost
-					(expr, buf, mask, 
+					(expr, buf, vbuf->mask, 
 					 recs[leaf].matches, terms);
 			continue;
 		}
@@ -478,7 +503,7 @@ single_search(struct rectree *tree, const struct opts *opts,
 			mandoc_calloc(terms + 1, sizeof(int));
 
 		exprexecpost
-			(expr, buf, mask, 
+			(expr, buf, vbuf->mask, 
 			 recs[tree->len].matches, terms);
 
 		/* Append to our tree. */
@@ -642,7 +667,7 @@ exprfree(struct expr *p)
  * Return 1 if any expression evaluates to true, else 0.
  */
 static int
-exprexecpre(const struct expr *p, const char *cp, int mask)
+exprexecpre(const struct expr *p, const char *cp, uint64_t mask)
 {
 
 	for ( ; NULL != p; p = p->next) {
@@ -666,7 +691,7 @@ exprexecpre(const struct expr *p, const char *cp, int mask)
  */
 static void
 exprexecpost(const struct expr *e, const char *cp, 
-		int mask, int *matches, size_t matchsz)
+		uint64_t mask, int *matches, size_t matchsz)
 {
 	const struct expr *p;
 	int		   match;
--- mandocdb.8.orig
+++ mandocdb.8
@@ -48,9 +48,13 @@ The arguments are as follows:
 .It Fl a
 Use all directories and files found below
 .Ar dir ... .
-By default, directories and files
-.Xr man 1
-cannot find will be silently skipped.
+By default, only files matching
+.Sm off
+.Sy man Ar section Li /
+.Op Ar arch Li /
+.Ar title . section
+.Sm on
+will be used.
 .It Fl d Ar dir
 Merge (remove and re-add)
 .Ar
--- mandocdb.c.orig
+++ mandocdb.c
@@ -79,7 +79,7 @@ static	void		  buf_append(struct buf *, const char *);
 static	void		  buf_appendb(struct buf *, 
 				const void *, size_t);
 static	void		  dbt_put(DB *, const char *, DBT *, DBT *);
-static	void		  hash_put(DB *, const struct buf *, int);
+static	void		  hash_put(DB *, const struct buf *, uint64_t);
 static	void		  hash_reset(DB **);
 static	void		  index_merge(const struct of *, struct mparse *,
 				struct buf *, struct buf *,
@@ -257,11 +257,11 @@ mandocdb(int argc, char *argv[])
 			*db, /* keyword database */
 			*hash; /* temporary keyword hashtable */
 	BTREEINFO	 info; /* btree configuration */
-	recno_t		 maxrec; /* supremum of all records */
-	recno_t		*recs; /* buffer of empty records */
+	recno_t		 maxrec; /* last record number in the index */
+	recno_t		*recs; /* the numbers of all empty records */
 	size_t		 sz1, sz2,
-			 recsz, /* buffer size of recs */
-			 reccur; /* valid number of recs */
+			 recsz, /* number of allocated slots in recs */
+			 reccur; /* current number of empty records */
 	struct buf	 buf, /* keyword buffer */
 			 dbuf; /* description buffer */
 	struct of	*of; /* list of files for processing */
@@ -348,7 +348,7 @@ mandocdb(int argc, char *argv[])
 		if (NULL == db) {
 			perror(fbuf);
 			exit((int)MANDOCLEVEL_SYSERR);
-		} else if (NULL == db) {
+		} else if (NULL == idx) {
 			perror(ibuf);
 			exit((int)MANDOCLEVEL_SYSERR);
 		}
@@ -410,7 +410,7 @@ mandocdb(int argc, char *argv[])
 		if (NULL == db) {
 			perror(fbuf);
 			exit((int)MANDOCLEVEL_SYSERR);
-		} else if (NULL == db) {
+		} else if (NULL == idx) {
 			perror(ibuf);
 			exit((int)MANDOCLEVEL_SYSERR);
 		}
@@ -473,7 +473,7 @@ index_merge(const struct of *of, struct mparse *mp,
 	const char	*fn, *msec, *mtitle, *arch;
 	size_t		 sv;
 	unsigned	 seq;
-	char		 vbuf[8];
+	struct db_val	 vbuf;
 
 	for (rec = 0; of; of = of->next) {
 		fn = of->fname;
@@ -499,9 +499,9 @@ index_merge(const struct of *of, struct mparse *mp,
 			continue;
 
 		/*
-		 * Make sure the manual section and architecture
-		 * agree with the directory where the file is located
-		 * or man(1) will not be able to find it.
+		 * By default, skip a file if the manual section
+		 * and architecture given in the file disagree
+		 * with the directory where the file is located.
 		 */
 
 		msec = NULL != mdoc ? 
@@ -527,9 +527,10 @@ index_merge(const struct of *of, struct mparse *mp,
 			arch = "";
 
 		/* 
-		 * Case is relevant for man(1), so use the file name
-		 * instead of the (usually) all caps page title,
-		 * if the two agree.
+		 * By default, skip a file if the title given
+		 * in the file disagrees with the file name.
+		 * If both agree, use the file name as the title,
+		 * because the one in the file usually is all caps.
 		 */
 
 		mtitle = NULL != mdoc ? 
@@ -571,17 +572,15 @@ index_merge(const struct of *of, struct mparse *mp,
 		 * Copy from the in-memory hashtable of pending keywords
 		 * into the database.
 		 */
-		
-		memset(vbuf, 0, sizeof(uint32_t));
-		memcpy(vbuf + 4, &rec, sizeof(uint32_t));
 
+		vbuf.rec = rec;
 		seq = R_FIRST;
 		while (0 == (ch = (*hash->seq)(hash, &key, &val, seq))) {
 			seq = R_NEXT;
 
-			memcpy(vbuf, val.data, sizeof(uint32_t));
-			val.size = sizeof(vbuf);
-			val.data = vbuf;
+			vbuf.mask = *(uint64_t *)val.data;
+			val.size = sizeof(struct db_val);
+			val.data = &vbuf;
 
 			if (verb > 1)
 				printf("%s: Added keyword: %s\n", 
@@ -626,6 +625,7 @@ index_prune(const struct of *ofile, DB *db, const char *dbf,
 {
 	const struct of	*of;
 	const char	*fn;
+	struct db_val	*vbuf;
 	unsigned	 seq, sseq;
 	DBT		 key, val;
 	size_t		 reccur;
@@ -658,8 +658,9 @@ index_prune(const struct of *ofile, DB *db, const char *dbf,
 		sseq = R_FIRST;
 		while (0 == (ch = (*db->seq)(db, &key, &val, sseq))) {
 			sseq = R_NEXT;
-			assert(8 == val.size);
-			if (*maxrec != *(recno_t *)(val.data + 4))
+			assert(sizeof(struct db_val) == val.size);
+			vbuf = val.data;
+			if (*maxrec != vbuf->rec)
 				continue;
 			if (verb)
 				printf("%s: Deleted keyword: %s\n", 
@@ -1040,7 +1041,7 @@ pmdoc_Nm(MDOC_ARGS)
 }
 
 static void
-hash_put(DB *db, const struct buf *buf, int mask)
+hash_put(DB *db, const struct buf *buf, uint64_t mask)
 {
 	DBT		 key, val;
 	int		 rc;
@@ -1055,10 +1056,10 @@ hash_put(DB *db, const struct buf *buf, int mask)
 		perror("hash");
 		exit((int)MANDOCLEVEL_SYSERR);
 	} else if (0 == rc)
-		mask |= *(int *)val.data;
+		mask |= *(uint64_t *)val.data;
 
 	val.data = &mask;
-	val.size = sizeof(int); 
+	val.size = sizeof(uint64_t); 
 
 	if ((rc = (*db->put)(db, &key, &val, 0)) < 0) {
 		perror("hash");
@@ -1228,7 +1229,9 @@ ofile_argbuild(char *argv[], int argc, int use_all, int verb,
 	for (i = 0; i < argc; i++) {
 
 		/*
-		 * Analyze the path.
+		 * Try to infer the manual section, architecture and
+		 * page title from the path, assuming it looks like
+		 *   man*[/<arch>]/<title>.<section>
 		 */
 
 		if (strlcpy(buf, argv[i], sizeof(buf)) >= sizeof(buf)) {
@@ -1322,8 +1325,8 @@ ofile_dirbuild(const char *dir, const char* psec, const char *parch,
 			arch = parch;
 
 			/*
-	 		 * Don't bother parsing directories
-			 * that man(1) won't find.
+			 * By default, only use directories called:
+			 *   man<section>/[<arch>/]
 			 */
 
 			if (NULL == sec) {
@@ -1363,7 +1366,9 @@ ofile_dirbuild(const char *dir, const char* psec, const char *parch,
 			continue;
 
 		/*
-		 * Don't bother parsing files that man(1) won't find.
+		 * By default, skip files where the file name suffix
+		 * does not agree with the section directory
+		 * they are located in.
 		 */
 
 		suffix = strrchr(fn, '.');
@@ -1389,6 +1394,12 @@ ofile_dirbuild(const char *dir, const char* psec, const char *parch,
 			nof->sec = mandoc_strdup(psec);
 		if (NULL != parch)
 			nof->arch = mandoc_strdup(parch);
+
+		/*
+		 * Remember the file name without the extension,
+		 * to be used as the page title in the database.
+		 */
+
 		if (NULL != suffix)
 			*suffix = '\0';
 		nof->title = mandoc_strdup(fn);
--- mandocdb.h.orig
+++ mandocdb.h
@@ -15,18 +15,49 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+struct db_val {
+	uint64_t	mask;
+	uint32_t	rec;
+};
+
 #define	MANDOC_DB	"mandoc.db"
 #define	MANDOC_IDX	"mandoc.index"
 
-#define	TYPE_An		0x01
-#define	TYPE_Cd		0x02
-#define	TYPE_Er		0x04
-#define	TYPE_Ev		0x08
-#define	TYPE_Fn		0x10
-#define	TYPE_In		0x20
-#define	TYPE_Nd		0x40
-#define	TYPE_Nm		0x100
-#define	TYPE_Pa		0x200
-#define	TYPE_St		0x400
-#define	TYPE_Va		0x1000
-#define	TYPE_Xr		0x2000
+#define	TYPE_An		0x0000000000000001ULL
+#define	TYPE_Ar		0x0000000000000002ULL
+#define	TYPE_At		0x0000000000000004ULL
+#define	TYPE_Bsx	0x0000000000000008ULL
+#define	TYPE_Bx         0x0000000000000010ULL
+#define	TYPE_Cd		0x0000000000000020ULL
+#define	TYPE_Cm		0x0000000000000040ULL
+#define	TYPE_Dv		0x0000000000000080ULL
+#define	TYPE_Dx		0x0000000000000100ULL
+#define	TYPE_Em		0x0000000000000200ULL
+#define	TYPE_Er		0x0000000000000400ULL
+#define	TYPE_Ev		0x0000000000000800ULL
+#define	TYPE_Fa		0x0000000000001000ULL
+#define	TYPE_Fl		0x0000000000002000ULL
+#define	TYPE_Fn		0x0000000000004000ULL
+#define	TYPE_Ft		0x0000000000008000ULL
+#define	TYPE_Fx		0x0000000000010000ULL
+#define	TYPE_Ic		0x0000000000020000ULL
+#define	TYPE_In		0x0000000000040000ULL
+#define	TYPE_Lb		0x0000000000080000ULL
+#define	TYPE_Li		0x0000000000100000ULL
+#define	TYPE_Lk		0x0000000000200000ULL
+#define	TYPE_Ms		0x0000000000400000ULL
+#define	TYPE_Mt		0x0000000000800000ULL
+#define	TYPE_Nd		0x0000000001000000ULL
+#define	TYPE_Nm		0x0000000002000000ULL
+#define	TYPE_Nx		0x0000000004000000ULL
+#define	TYPE_Ox		0x0000000008000000ULL
+#define	TYPE_Pa		0x0000000010000000ULL
+#define	TYPE_Rs		0x0000000020000000ULL
+#define	TYPE_Sh		0x0000000040000000ULL
+#define	TYPE_Ss		0x0000000080000000ULL
+#define	TYPE_St		0x0000000100000000ULL
+#define	TYPE_Sy		0x0000000200000000ULL
+#define	TYPE_Tn		0x0000000400000000ULL
+#define	TYPE_Va		0x0000000800000000ULL
+#define	TYPE_Vt		0x0000001000000000ULL
+#define	TYPE_Xr		0x0000002000000000ULL
--
 To unsubscribe send an email to tech+unsubscribe@mdocml.bsd.lv

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2011-11-16 16:59 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-11-16  0:39 mandocdb: full set of search types Ingo Schwarze
2011-11-16  0:52 ` Kristaps Dzonsons
2011-11-16  1:50   ` Ingo Schwarze
2011-11-16 16:59   ` Ingo Schwarze

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).