source@mandoc.bsd.lv
 help / color / mirror / Atom feed
* mdocml: When looking up macro values while the macro tables are being
@ 2017-01-15 15:29 schwarze
  0 siblings, 0 replies; only message in thread
From: schwarze @ 2017-01-15 15:29 UTC (permalink / raw)
  To: source

Log Message:
-----------
When looking up macro values while the macro tables are being built
in makewhatis(8), use ohash rather than linear searches.

This was identified as the main makewhatis(8) performance bottleneck
by Baptiste Daroussin <bapt at FreeBSD>, who also suggested part 
of the improved algorithm.

This reduces the run time of "makewhatis /usr/share/man" from eleven
to five seconds on my notebook.  Note that the changed code is not
used in apropos(1), so don't expect speedups there.

While here, sort macro values asciibetically, to improve reproducibility - 
which still isn't perfect, but getting better.

Modified Files:
--------------
    mdocml:
        Makefile.depend
        dba.c

Revision Data
-------------
Index: Makefile.depend
===================================================================
RCS file: /home/cvs/mdocml/mdocml/Makefile.depend,v
retrieving revision 1.25
retrieving revision 1.26
diff -LMakefile.depend -LMakefile.depend -u -p -r1.25 -r1.26
--- Makefile.depend
+++ Makefile.depend
@@ -17,7 +17,7 @@ compat_strlcpy.o: compat_strlcpy.c confi
 compat_strsep.o: compat_strsep.c config.h
 compat_strtonum.o: compat_strtonum.c config.h
 compat_vasprintf.o: compat_vasprintf.c config.h
-dba.o: dba.c config.h mandoc_aux.h mansearch.h dba_write.h dba_array.h dba.h
+dba.o: dba.c config.h mandoc_aux.h mandoc_ohash.h compat_ohash.h mansearch.h dba_write.h dba_array.h dba.h
 dba_array.o: dba_array.c mandoc_aux.h dba_write.h dba_array.h
 dba_read.o: dba_read.c mandoc_aux.h mansearch.h dba_array.h dba.h dbm.h
 dba_write.o: dba_write.c config.h dba_write.h
Index: dba.c
===================================================================
RCS file: /home/cvs/mdocml/mdocml/dba.c,v
retrieving revision 1.8
retrieving revision 1.9
diff -Ldba.c -Ldba.c -u -p -r1.8 -r1.9
--- dba.c
+++ dba.c
@@ -1,6 +1,6 @@
 /*	$Id$ */
 /*
- * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2016, 2017 Ingo Schwarze <schwarze@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -28,23 +28,34 @@
 #include <arpa/inet.h>
 #endif
 #include <errno.h>
+#include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
 #include "mandoc_aux.h"
+#include "mandoc_ohash.h"
 #include "mansearch.h"
 #include "dba_write.h"
 #include "dba_array.h"
 #include "dba.h"
 
+struct macro_entry {
+	struct dba_array	*pages;
+	char			 value[];
+};
+
 static void	*prepend(const char *, char);
 static void	 dba_pages_write(struct dba_array *);
 static int	 compare_names(const void *, const void *);
 static int	 compare_strings(const void *, const void *);
+
+static struct macro_entry
+		*get_macro_entry(struct ohash *, const char *, int32_t);
 static void	 dba_macros_write(struct dba_array *);
-static void	 dba_macro_write(struct dba_array *);
+static void	 dba_macro_write(struct ohash *);
+static int	 compare_entries(const void *, const void *);
 
 
 /*** top-level functions **********************************************/
@@ -53,29 +64,37 @@ struct dba *
 dba_new(int32_t npages)
 {
 	struct dba	*dba;
+	struct ohash	*macro;
 	int32_t		 im;
 
 	dba = mandoc_malloc(sizeof(*dba));
 	dba->pages = dba_array_new(npages, DBA_GROW);
 	dba->macros = dba_array_new(MACRO_MAX, 0);
-	for (im = 0; im < MACRO_MAX; im++)
-		dba_array_set(dba->macros, im, dba_array_new(128, DBA_GROW));
+	for (im = 0; im < MACRO_MAX; im++) {
+		macro = mandoc_malloc(sizeof(*macro));
+		mandoc_ohash_init(macro, 4,
+		    offsetof(struct macro_entry, value));
+		dba_array_set(dba->macros, im, macro);
+	}
 	return dba;
 }
 
 void
 dba_free(struct dba *dba)
 {
-	struct dba_array	*page, *macro, *entry;
+	struct dba_array	*page;
+	struct ohash		*macro;
+	struct macro_entry	*entry;
+	unsigned int		 slot;
 
 	dba_array_FOREACH(dba->macros, macro) {
-		dba_array_undel(macro);
-		dba_array_FOREACH(macro, entry) {
-			free(dba_array_get(entry, 0));
-			dba_array_free(dba_array_get(entry, 1));
-			dba_array_free(entry);
+		for (entry = ohash_first(macro, &slot); entry != NULL;
+		     entry = ohash_next(macro, &slot)) {
+			dba_array_free(entry->pages);
+			free(entry);
 		}
-		dba_array_free(macro);
+		ohash_delete(macro);
+		free(macro);
 	}
 	dba_array_free(dba->macros);
 
@@ -315,57 +334,64 @@ compare_strings(const void *vp1, const v
 /*** functions for handling macros ************************************/
 
 /*
- * Create a new macro entry and append it to one of the macro tables.
+ * In the hash table for a single macro, look up an entry by
+ * the macro value or add an empty one if it doesn't exist yet.
+ */
+static struct macro_entry *
+get_macro_entry(struct ohash *macro, const char *value, int32_t np)
+{
+	struct macro_entry	*entry;
+	size_t			 len;
+	unsigned int		 slot;
+
+	slot = ohash_qlookup(macro, value);
+	if ((entry = ohash_find(macro, slot)) == NULL) {
+		len = strlen(value) + 1;
+		entry = mandoc_malloc(sizeof(*entry) + len);
+		memcpy(&entry->value, value, len);
+		entry->pages = dba_array_new(np, DBA_GROW);
+		ohash_insert(macro, slot, entry);
+	}
+	return entry;
+}
+
+/*
+ * In addition to get_macro_entry(), add multiple page references,
+ * converting them from the on-disk format (byte offsets in the file)
+ * to page pointers in memory.
  */
 void
 dba_macro_new(struct dba *dba, int32_t im, const char *value,
     const int32_t *pp)
 {
-	struct dba_array	*entry, *pages;
+	struct macro_entry	*entry;
 	const int32_t		*ip;
 	int32_t			 np;
 
 	np = 0;
 	for (ip = pp; *ip; ip++)
 		np++;
-	pages = dba_array_new(np, DBA_GROW);
+
+	entry = get_macro_entry(dba_array_get(dba->macros, im), value, np);
 	for (ip = pp; *ip; ip++)
-		dba_array_add(pages, dba_array_get(dba->pages,
+		dba_array_add(entry->pages, dba_array_get(dba->pages,
 		    be32toh(*ip) / 5 / sizeof(*ip) - 1));
-
-	entry = dba_array_new(2, 0);
-	dba_array_add(entry, mandoc_strdup(value));
-	dba_array_add(entry, pages);
-
-	dba_array_add(dba_array_get(dba->macros, im), entry);
 }
 
 /*
- * Look up a macro entry by value and add a reference to a new page to it.
- * If the value does not yet exist, create a new macro entry
- * and add it to the macro table in question.
+ * In addition to get_macro_entry(), add one page reference,
+ * directly taking the in-memory page pointer as an argument.
  */
 void
 dba_macro_add(struct dba_array *macros, int32_t im, const char *value,
     struct dba_array *page)
 {
-	struct dba_array	*macro, *entry, *pages;
+	struct macro_entry	*entry;
 
 	if (*value == '\0')
 		return;
-	macro = dba_array_get(macros, im);
-	dba_array_FOREACH(macro, entry)
-		if (strcmp(value, dba_array_get(entry, 0)) == 0)
-			break;
-	if (entry == NULL) {
-		entry = dba_array_new(2, 0);
-		dba_array_add(entry, mandoc_strdup(value));
-		pages = dba_array_new(1, DBA_GROW);
-		dba_array_add(entry, pages);
-		dba_array_add(macro, entry);
-	} else
-		pages = dba_array_get(entry, 1);
-	dba_array_add(pages, page);
+	entry = get_macro_entry(dba_array_get(macros, im), value, 1);
+	dba_array_add(entry->pages, page);
 }
 
 /*
@@ -377,7 +403,7 @@ dba_macro_add(struct dba_array *macros, 
 static void
 dba_macros_write(struct dba_array *macros)
 {
-	struct dba_array	*macro;
+	struct ohash		*macro;
 	int32_t			 im, pos_macros, pos_end;
 
 	pos_macros = dba_array_writelen(macros, 1);
@@ -403,38 +429,80 @@ dba_macros_write(struct dba_array *macro
  * - A list of pointers to pages, each list ending in a 0 integer.
  */
 static void
-dba_macro_write(struct dba_array *macro)
+dba_macro_write(struct ohash *macro)
 {
-	struct dba_array	*entry, *pages, *page;
-	int			 empty;
-	int32_t			 addr, pos_macro, pos_end;
-
-	dba_array_FOREACH(macro, entry) {
-		pages = dba_array_get(entry, 1);
-		empty = 1;
-		dba_array_FOREACH(pages, page)
+	struct macro_entry	**entries, *entry;
+	struct dba_array	 *page;
+	int32_t			 *kpos, *dpos;
+	unsigned int		  ie, ne, slot;
+	int			  use;
+	int32_t			  addr, pos_macro, pos_end;
+
+	/* Temporary storage for filtering and sorting. */
+
+	ne = ohash_entries(macro);
+	entries = mandoc_reallocarray(NULL, ne, sizeof(*entries));
+	kpos = mandoc_reallocarray(NULL, ne, sizeof(*kpos));
+	dpos = mandoc_reallocarray(NULL, ne, sizeof(*dpos));
+
+	/* Build a list of non-empty entries and sort it. */
+
+	ne = 0;
+	for (entry = ohash_first(macro, &slot); entry != NULL;
+	     entry = ohash_next(macro, &slot)) {
+		use = 0;
+		dba_array_FOREACH(entry->pages, page)
 			if (dba_array_getpos(page))
-				empty = 0;
-		if (empty)
-			dba_array_del(macro);
-	}
-	pos_macro = dba_array_writelen(macro, 2);
-	dba_array_FOREACH(macro, entry) {
-		dba_array_setpos(entry, 0, dba_tell());
-		dba_str_write(dba_array_get(entry, 0));
+				use = 1;
+		if (use)
+			entries[ne++] = entry;
+	}
+	qsort(entries, ne, sizeof(*entries), compare_entries);
+
+	/* Number of entries, and space for the pointer pairs. */
+
+	dba_int_write(ne);
+	pos_macro = dba_skip(2, ne);
+
+	/* String table. */
+
+	for (ie = 0; ie < ne; ie++) {
+		kpos[ie] = dba_tell();
+		dba_str_write(entries[ie]->value);
 	}
 	dba_align();
-	dba_array_FOREACH(macro, entry) {
-		dba_array_setpos(entry, 1, dba_tell());
-		pages = dba_array_get(entry, 1);
-		dba_array_FOREACH(pages, page)
+
+	/* Pages table. */
+
+	for (ie = 0; ie < ne; ie++) {
+		dpos[ie] = dba_tell();
+		dba_array_FOREACH(entries[ie]->pages, page)
 			if ((addr = dba_array_getpos(page)))
 				dba_int_write(addr);
 		dba_int_write(0);
 	}
 	pos_end = dba_tell();
+
+	/* Fill in the pointer pairs. */
+
 	dba_seek(pos_macro);
-	dba_array_FOREACH(macro, entry)
-		dba_array_writepos(entry);
+	for (ie = 0; ie < ne; ie++) {
+		dba_int_write(kpos[ie]);
+		dba_int_write(dpos[ie]);
+	}
 	dba_seek(pos_end);
+
+	free(entries);
+	free(kpos);
+	free(dpos);
+}
+
+static int
+compare_entries(const void *vp1, const void *vp2)
+{
+	const struct macro_entry *ep1, *ep2;
+
+	ep1 = *(struct macro_entry **)vp1;
+	ep2 = *(struct macro_entry **)vp2;
+	return strcmp(ep1->value, ep2->value);
 }
--
 To unsubscribe send an email to source+unsubscribe@mdocml.bsd.lv

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2017-01-15 15:29 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-01-15 15:29 mdocml: When looking up macro values while the macro tables are being schwarze

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).