tech@mandoc.bsd.lv
 help / color / mirror / Atom feed
* Perfect hashes.
@ 2011-05-25 20:43 Kristaps Dzonsons
  2011-06-19 13:24 ` Kristaps Dzonsons
  2011-06-21  0:11 ` Ingo Schwarze
  0 siblings, 2 replies; 3+ messages in thread
From: Kristaps Dzonsons @ 2011-05-25 20:43 UTC (permalink / raw)
  To: tech

[-- Attachment #1: Type: text/plain, Size: 1654 bytes --]

Hi,

A while ago, Joerg brought up the idea of using perfect hashes for some 
of the keywords in mandoc.  I'm reviving this conversation because I'd 
like a uniform interface for the roff, mdoc, man, chars, and mdoc-args 
tables.  For the time being, each of these has its own method, which is 
error-prone, ugly, and bloated.

Perfect hashes make sense because the above data is fixed. Furthermore, 
they have no cost of allocation and have little overhead in terms of 
unused table-space.

Enclosed is a patch that implements this just for the mdoc macro names. 
   I used gperf as I don't have a new NetBSD box handy for nbperf.  (It 
doesn't matter which is used---I messed around with the generated code 
as it is.)

The speed-up is a bit more than 1% when repeatedly running mandoc over a 
set of manuals.  When running it with a large number of manuals on the 
command-line, the results are more or less the same.  Either way, the 
generated binary is slightly smaller.

I'm leaning toward implementing similar steps for all existing hash 
files.  Does anybody have any objections?  My intent in this isn't so 
much performance as uniform interfacing, code readability, correctness, 
and bloat.  All of these are covered by perfect hashes, at the cost of 
needing an external utility whenever the symbols change.

(I had to remove mdoc_macronames[] because there wasn't an immediate way 
to map back... I'll figure this out later, if nobody objects to the 
notion in the first place.)

Thoughts?

Kristaps

PS, a good hash for the roff symbols (ds/de) is also under 
investigation.  Any suggestions---not dbopen---on a hash table for all 
Unices?

[-- Attachment #2: patch.txt --]
[-- Type: text/plain, Size: 14748 bytes --]

? config.h
? config.log
? foo
? man.txt
? mandoc
? mandoc.cmp
? mandoc.older
? manuals.txt
? mdoc.perf
? patch.int.txt
? patch.mb.txt
? patch.strtol.txt
? patch.txt
? patch.unicode.txt
Index: libmdoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/libmdoc.h,v
retrieving revision 1.74
diff -u -r1.74 libmdoc.h
--- libmdoc.h	19 Apr 2011 16:38:48 -0000	1.74
+++ libmdoc.h	25 May 2011 12:58:05 -0000
@@ -117,8 +117,7 @@
 			enum mdoct tok, struct mdoc_node *body,
 			enum mdoc_endbody end);
 void		  mdoc_node_delete(struct mdoc *, struct mdoc_node *);
-void		  mdoc_hash_init(void);
-enum mdoct	  mdoc_hash_find(const char *);
+enum mdoct	  mdoc_hash_find(const char *, size_t);
 const char	 *mdoc_a2att(const char *);
 const char	 *mdoc_a2lib(const char *);
 const char	 *mdoc_a2st(const char *);
Index: mdoc.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mdoc.c,v
retrieving revision 1.188
diff -u -r1.188 mdoc.c
--- mdoc.c	28 Mar 2011 23:52:13 -0000	1.188
+++ mdoc.c	25 May 2011 12:58:05 -0000
@@ -33,46 +33,6 @@
 #include "libmdoc.h"
 #include "libmandoc.h"
 
-const	char *const __mdoc_macronames[MDOC_MAX] = {		 
-	"Ap",		"Dd",		"Dt",		"Os",
-	"Sh",		"Ss",		"Pp",		"D1",
-	"Dl",		"Bd",		"Ed",		"Bl",
-	"El",		"It",		"Ad",		"An",
-	"Ar",		"Cd",		"Cm",		"Dv",
-	"Er",		"Ev",		"Ex",		"Fa",
-	"Fd",		"Fl",		"Fn",		"Ft",
-	"Ic",		"In",		"Li",		"Nd",
-	"Nm",		"Op",		"Ot",		"Pa",
-	"Rv",		"St",		"Va",		"Vt",
-	/* LINTED */
-	"Xr",		"%A",		"%B",		"%D",
-	/* LINTED */
-	"%I",		"%J",		"%N",		"%O",
-	/* LINTED */
-	"%P",		"%R",		"%T",		"%V",
-	"Ac",		"Ao",		"Aq",		"At",
-	"Bc",		"Bf",		"Bo",		"Bq",
-	"Bsx",		"Bx",		"Db",		"Dc",
-	"Do",		"Dq",		"Ec",		"Ef",
-	"Em",		"Eo",		"Fx",		"Ms",
-	"No",		"Ns",		"Nx",		"Ox",
-	"Pc",		"Pf",		"Po",		"Pq",
-	"Qc",		"Ql",		"Qo",		"Qq",
-	"Re",		"Rs",		"Sc",		"So",
-	"Sq",		"Sm",		"Sx",		"Sy",
-	"Tn",		"Ux",		"Xc",		"Xo",
-	"Fo",		"Fc",		"Oo",		"Oc",
-	"Bk",		"Ek",		"Bt",		"Hf",
-	"Fr",		"Ud",		"Lb",		"Lp",
-	"Lk",		"Mt",		"Brq",		"Bro",
-	/* LINTED */
-	"Brc",		"%C",		"Es",		"En",
-	/* LINTED */
-	"Dx",		"%Q",		"br",		"sp",
-	/* LINTED */
-	"%U",		"Ta"
-	};
-
 const	char *const __mdoc_argnames[MDOC_ARG_MAX] = {		 
 	"split",		"nosplit",		"ragged",
 	"unfilled",		"literal",		"file",		 
@@ -85,7 +45,6 @@
 	"symbolic",		"nested",		"centered"
 	};
 
-const	char * const *mdoc_macronames = __mdoc_macronames;
 const	char * const *mdoc_argnames = __mdoc_argnames;
 
 static	void		  mdoc_node_free(struct mdoc_node *);
@@ -202,7 +161,6 @@
 	p->parse = parse;
 	p->regs = regs;
 
-	mdoc_hash_init();
 	mdoc_alloc1(p);
 	return(p);
 }
@@ -813,7 +771,7 @@
 
 	mac[i] = '\0';
 
-	tok = (i > 1 || i < 4) ? mdoc_hash_find(mac) : MDOC_MAX;
+	tok = (i > 1 || i < 4) ? mdoc_hash_find(mac, i) : MDOC_MAX;
 
 	if (MDOC_MAX == tok) {
 		mandoc_vmsg(MANDOCERR_MACRO, m->parse, 
Index: mdoc.h
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mdoc.h,v
retrieving revision 1.122
diff -u -r1.122 mdoc.h
--- mdoc.h	22 Mar 2011 14:05:45 -0000	1.122
+++ mdoc.h	25 May 2011 12:58:05 -0000
@@ -18,7 +18,7 @@
 #define MDOC_H
 
 enum	mdoct {
-	MDOC_Ap = 0,
+	MDOC_Ap,
 	MDOC_Dd,
 	MDOC_Dt,
 	MDOC_Os,
@@ -373,9 +373,6 @@
 	const struct eqn *eqn; /* EQN */
 	enum mdoc_endbody end; /* BODY */
 };
-
-/* Names of macros.  Index is enum mdoct. */
-extern	const char *const *mdoc_macronames;
 
 /* Names of macro args.  Index is enum mdocargt. */
 extern	const char *const *mdoc_argnames;
Index: mdoc_hash.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mdoc_hash.c,v
retrieving revision 1.17
diff -u -r1.17 mdoc_hash.c
--- mdoc_hash.c	22 Mar 2011 14:33:05 -0000	1.17
+++ mdoc_hash.c	25 May 2011 12:58:05 -0000
@@ -1,6 +1,6 @@
 /*	$Id: mdoc_hash.c,v 1.17 2011/03/22 14:33:05 kristaps Exp $ */
 /*
- * Copyright (c) 2008, 2009 Kristaps Dzonsons <kristaps@bsd.lv>
+ * Copyright (c) 2008, 2009, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -31,64 +31,270 @@
 #include "mandoc.h"
 #include "libmdoc.h"
 
-static	u_char		 table[27 * 12];
+struct opt { 
+	const char *name; 
+	enum mdoct macro;
+};
+
+#define TOTAL_KEYWORDS 122
+#define MIN_WORD_LENGTH 2
+#define MAX_WORD_LENGTH 3
+#define MIN_HASH_VALUE 2
+#define MAX_HASH_VALUE 247
+/* maximum key range = 246, duplicates = 0 */
 
-/*
- * XXX - this hash has global scope, so if intended for use as a library
- * with multiple callers, it will need re-invocation protection.
- */
-void
-mdoc_hash_init(void)
+static unsigned int
+hash (register const char *str, register size_t len)
 {
-	int		 i, j, major;
-	const char	*p;
-
-	memset(table, UCHAR_MAX, sizeof(table));
-
-	for (i = 0; i < (int)MDOC_MAX; i++) {
-		p = mdoc_macronames[i];
-
-		if (isalpha((u_char)p[1]))
-			major = 12 * (tolower((u_char)p[1]) - 97);
-		else
-			major = 12 * 26;
-
-		for (j = 0; j < 12; j++)
-			if (UCHAR_MAX == table[major + j]) {
-				table[major + j] = (u_char)i;
-				break;
-			}
-
-		assert(j < 12);
-	}
+  static const unsigned char asso_values[] =
+    {
+      248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
+      248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
+      248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
+      248, 248, 248, 248, 248, 248, 248, 120, 248, 248,
+      248, 248, 248, 248, 248, 248, 248, 248, 248,  19,
+      248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
+      248, 248, 248, 248, 248,  70,  20, 115,   0,  17,
+        2, 248, 125,  92,  60, 248,  87,   4,  77, 105,
+       67,  42, 112,  40, 125,  85,  97, 248,   4, 248,
+      248, 248, 248, 248, 248, 248, 248,  12,  29,   5,
+      125, 110, 102, 248,  50,  90, 248,  24,  32, 115,
+       90,   0,  80,  15, 100,  95,  60, 248,  37, 248,
+       10,  25, 248, 248, 248, 248, 248, 248
+    };
+  return len + asso_values[(unsigned char)str[len - 1]] + asso_values[(unsigned char)str[0]];
 }
 
 enum mdoct
-mdoc_hash_find(const char *p)
+mdoc_hash_find (register const char *str, register size_t len)
 {
-	int		  major, i, j;
-
-	if (0 == p[0])
-		return(MDOC_MAX);
-	if ( ! isalpha((u_char)p[0]) && '%' != p[0])
-		return(MDOC_MAX);
-
-	if (isalpha((u_char)p[1]))
-		major = 12 * (tolower((u_char)p[1]) - 97);
-	else if ('1' == p[1])
-		major = 12 * 26;
-	else 
-		return(MDOC_MAX);
-
-	if (p[2] && p[3])
-		return(MDOC_MAX);
-
-	for (j = 0; j < 12; j++) {
-		if (UCHAR_MAX == (i = table[major + j]))
-			break;
-		if (0 == strcmp(p, mdoc_macronames[i]))
-			return((enum mdoct)i);
-	}
-
-	return(MDOC_MAX);
+  static const struct opt wordlist[] =
+    {
+      {""}, {""},
+      {"Do", MDOC_Do},
+      {""},
+      {"Fo", MDOC_Fo},
+      {""},
+      {"Xo", MDOC_Xo},
+      {"Dc", MDOC_Dc},
+      {""},
+      {"Fc", MDOC_Fc},
+      {""},
+      {"Xc", MDOC_Xc},
+      {"Dx", MDOC_Dx},
+      {""},
+      {"Fx", MDOC_Fx},
+      {""},
+      {"Fa", MDOC_Fa},
+      {"Dq", MDOC_Dq},
+      {""},
+      {"Eo", MDOC_Eo},
+      {""},
+      {"D1", MDOC_D1},
+      {"Bo", MDOC_Bo},
+      {"Bro", MDOC_Bro},
+      {"Ec", MDOC_Ec},
+      {""}, {""},
+      {"Bc", MDOC_Bc},
+      {"Brc", MDOC_Brc},
+      {"Ex", MDOC_Ex},
+      {""},
+      {"Db", MDOC_Db},
+      {"Bx", MDOC_Bx},
+      {"Bsx", MDOC_Bsx},
+      {"Dl", MDOC_Dl},
+      {""},
+      {"Fl", MDOC_Fl},
+      {"Bq", MDOC_Bq},
+      {"Brq", MDOC_Brq},
+      {"Dv", MDOC_Dv},
+      {""}, {""},
+      {"So", MDOC_So},
+      {"Ek", MDOC_Ek},
+      {"Qo", MDOC_Qo},
+      {""},
+      {"Bk", MDOC_Bk},
+      {"Sc", MDOC_Sc},
+      {""},
+      {"Qc", MDOC_Qc},
+      {""},
+      {"El", MDOC_El},
+      {"Sx", MDOC_Sx},
+      {""},
+      {"Bl", MDOC_Bl},
+      {""},
+      {"Ev", MDOC_Ev},
+      {"Sq", MDOC_Sq},
+      {""},
+      {"Qq", MDOC_Qq},
+      {""}, {""},
+      {"Dt", MDOC_Dt},
+      {""},
+      {"Ft", MDOC_Ft},
+      {""},
+      {"Mt", MDOC_Mt},
+      {"Sy", MDOC_Sy},
+      {""},
+      {"Po", MDOC_Po},
+      {""}, {""},
+      {"Ao", MDOC_Ao},
+      {""},
+      {"Pc", MDOC_Pc},
+      {""},
+      {"Ql", MDOC_Ql},
+      {"Ac", MDOC_Ac},
+      {""},
+      {"No", MDOC_No},
+      {""},
+      {"Pa", MDOC_Pa},
+      {"Bt", MDOC_Bt},
+      {""},
+      {"Pq", MDOC_Pq},
+      {""}, {""},
+      {"Aq", MDOC_Aq},
+      {""},
+      {"Nx", MDOC_Nx},
+      {""}, {""},
+      {"Sh", MDOC_Sh},
+      {""},
+      {"Fn", MDOC_Fn},
+      {""}, {""},
+      {"Ux", MDOC_Ux},
+      {""},
+      {"Ic", MDOC_Ic},
+      {""},
+      {"Ms", MDOC_Ms},
+      {"St", MDOC_St},
+      {""},
+      {"Fr", MDOC_Fr},
+      {""},
+      {"Xr", MDOC_Xr},
+      {"Oo", MDOC_Oo},
+      {""},
+      {"En", MDOC_En},
+      {""},
+      {"Va", MDOC_Va},
+      {"Oc", MDOC_Oc},
+      {"Lk", MDOC_Lk},
+      {"Es", MDOC_Es},
+      {""}, {""},
+      {"Ox", MDOC_Ox},
+      {"Lb", MDOC_Lb},
+      {"Er", MDOC_Er},
+      {""},
+      {"Ef", MDOC_Ef},
+      {"%D", MDOC__D},
+      {""},
+      {"Bf", MDOC_Bf},
+      {""}, {""},
+      {"Dd", MDOC_Dd},
+      {""},
+      {"Fd", MDOC_Fd},
+      {""},
+      {"br", MDOC_br},
+      {"At", MDOC_At},
+      {""},
+      {"Em", MDOC_Em},
+      {""}, {""},
+      {"Ss", MDOC_Ss},
+      {""},
+      {"Ta", MDOC_Ta},
+      {""}, {""},
+      {"%B", MDOC__B},
+      {""},
+      {"Ed", MDOC_Ed},
+      {""}, {""},
+      {"Bd", MDOC_Bd},
+      {""},
+      {"Pp", MDOC_Pp},
+      {""},
+      {"Rv", MDOC_Rv},
+      {"Ap", MDOC_Ap},
+      {""},
+      {"It", MDOC_It},
+      {""}, {""},
+      {"Sm", MDOC_Sm},
+      {""},
+      {"Vt", MDOC_Vt},
+      {""}, {""},
+      {"An", MDOC_An},
+      {""},
+      {"%Q", MDOC__Q},
+      {""}, {""},
+      {"Ot", MDOC_Ot},
+      {""},
+      {"Lp", MDOC_Lp},
+      {""},
+      {"Pf", MDOC_Pf},
+      {"Ar", MDOC_Ar},
+      {""},
+      {"Ns", MDOC_Ns},
+      {""}, {""},
+      {"sp", MDOC_sp},
+      {""},
+      {"Li", MDOC_Li},
+      {""}, {""},
+      {"%J", MDOC__J},
+      {""},
+      {"In", MDOC_In},
+      {""}, {""},
+      {"Op", MDOC_Op},
+      {""},
+      {"%P", MDOC__P},
+      {""}, {""},
+      {"%A", MDOC__A},
+      {""},
+      {"Nm", MDOC_Nm},
+      {""}, {""},
+      {"Ad", MDOC_Ad},
+      {""},
+      {"%N", MDOC__N},
+      {""}, {""},
+      {"Os", MDOC_Os},
+      {""},
+      {"Nd", MDOC_Nd},
+      {""}, {""},
+      {"%U", MDOC__U},
+      {""},
+      {"Rs", MDOC_Rs},
+      {""}, {""},
+      {"Ud", MDOC_Ud},
+      {""},
+      {"%I", MDOC__I},
+      {""}, {""},
+      {"Tn", MDOC_Tn},
+      {""},
+      {"%V", MDOC__V},
+      {""}, {""}, {""}, {""},
+      {"Re", MDOC_Re},
+      {""}, {""},
+      {"%O", MDOC__O},
+      {""},
+      {"Hf", MDOC_Hf},
+      {""}, {""},
+      {"Cm", MDOC_Cm},
+      {""},
+      {"%R", MDOC__R},
+      {""}, {""},
+      {"%C", MDOC__C},
+      {""}, {""}, {""}, {""},
+      {"Cd", MDOC_Cd},
+      {""}, {""}, {""}, {""},
+      {"%T", MDOC__T}
+    };
+
+  if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
+    {
+      register int key = hash (str, len);
+
+      if (key <= MAX_HASH_VALUE && key >= 0)
+        {
+          register const char *s = wordlist[key].name;
+
+          if (*str == *s && !strcmp (str + 1, s + 1))
+            return wordlist[key].macro;
+        }
+    }
+  return MDOC_MAX;
 }
+
Index: mdoc_macro.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mdoc_macro.c,v
retrieving revision 1.109
diff -u -r1.109 mdoc_macro.c
--- mdoc_macro.c	30 Apr 2011 10:18:24 -0000	1.109
+++ mdoc_macro.c	25 May 2011 12:58:06 -0000
@@ -244,7 +244,7 @@
 {
 	enum mdoct	 res;
 
-	if (MDOC_MAX == (res = mdoc_hash_find(p)))
+	if (MDOC_MAX == (res = mdoc_hash_find(p, strlen(p))))
 		return(MDOC_MAX);
 	if (MDOC_CALLABLE & mdoc_macros[res].flags)
 		return(res);
@@ -517,8 +517,8 @@
 		}
 		broken->pending = breaker;
 		mandoc_vmsg(MANDOCERR_SCOPENEST, m->parse, line, ppos,
-				"%s breaks %s", mdoc_macronames[tok],
-				mdoc_macronames[broken->tok]);
+				"%s breaks %s", "" /*mdoc_macronames[tok]*/,
+				"" /*mdoc_macronames[broken->tok]*/);
 		return(1);
 	}
 
@@ -546,8 +546,8 @@
 		case (REWIND_FORCE):
 			mandoc_vmsg(MANDOCERR_SCOPEBROKEN, m->parse, 
 					line, ppos, "%s breaks %s", 
-					mdoc_macronames[tok],
-					mdoc_macronames[n->tok]);
+					"" /*mdoc_macronames[tok]*/,
+					"" /*mdoc_macronames[n->tok]*/);
 			/* FALLTHROUGH */
 		case (REWIND_MORE):
 			n = n->parent;
@@ -1336,7 +1336,7 @@
 	 */
 	if (n != body)
 		mandoc_vmsg(MANDOCERR_SCOPENEST, m->parse, line, ppos, 
-				"%s broken", mdoc_macronames[tok]);
+				"%s broken", "" /*mdoc_macronames[tok]*/);
 
 	if (n && ! rew_sub(MDOC_BODY, m, tok, line, ppos))
 		return(0);
Index: mdoc_validate.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mdoc_validate.c,v
retrieving revision 1.169
diff -u -r1.169 mdoc_validate.c
--- mdoc_validate.c	30 Apr 2011 10:18:24 -0000	1.169
+++ mdoc_validate.c	25 May 2011 12:58:06 -0000
@@ -592,7 +592,7 @@
 
 	mandoc_vmsg(MANDOCERR_SYNTCHILD, mdoc->parse, n->line, 
 			n->pos, "want parent %s", MDOC_ROOT == t ? 
-			"<root>" : mdoc_macronames[tok]);
+			"<root>" : "" /*mdoc_macronames[tok]*/);
 	return(0);
 }
 
@@ -1402,7 +1402,7 @@
 
 	if (0 == strcmp(n->norm->Bl.width, "Ds"))
 		width = 6;
-	else if (MDOC_MAX == (tok = mdoc_hash_find(n->norm->Bl.width)))
+	else if (MDOC_MAX == (tok = mdoc_hash_find(n->norm->Bl.width, strlen(n->norm->Bl.width))))
 		return(1);
 	else if (0 == (width = macro2len(tok)))  {
 		mdoc_nmsg(mdoc, n, MANDOCERR_BADWIDTH);
Index: tree.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/tree.c,v
retrieving revision 1.37
diff -u -r1.37 tree.c
--- tree.c	23 Mar 2011 12:33:01 -0000	1.37
+++ tree.c	25 May 2011 12:58:06 -0000
@@ -107,23 +107,18 @@
 		p = n->string;
 		break;
 	case (MDOC_BODY):
-		p = mdoc_macronames[n->tok];
-		break;
+		/* FALLTHROUGH */
 	case (MDOC_HEAD):
-		p = mdoc_macronames[n->tok];
-		break;
+		/* FALLTHROUGH */
 	case (MDOC_TAIL):
-		p = mdoc_macronames[n->tok];
+		p = "";
+		/*p = mdoc_macronames[n->tok];*/
 		break;
 	case (MDOC_ELEM):
-		p = mdoc_macronames[n->tok];
-		if (n->args) {
-			argv = n->args->argv;
-			argc = n->args->argc;
-		}
-		break;
+		/* FALLTHROUGH */
 	case (MDOC_BLOCK):
-		p = mdoc_macronames[n->tok];
+		p = "";
+		/*p = mdoc_macronames[n->tok];*/
 		if (n->args) {
 			argv = n->args->argv;
 			argc = n->args->argc;


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2011-06-21  0:11 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-05-25 20:43 Perfect hashes Kristaps Dzonsons
2011-06-19 13:24 ` Kristaps Dzonsons
2011-06-21  0:11 ` Ingo Schwarze

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).