source@mandoc.bsd.lv
 help / color / mirror / Atom feed
* mdocml: The string hash table is only needed to combine multiple
@ 2013-06-07  1:01 schwarze
  0 siblings, 0 replies; only message in thread
From: schwarze @ 2013-06-07  1:01 UTC (permalink / raw)
  To: source

Log Message:
-----------
The string hash table is only needed to combine multiple occurrences
of the same string within the same manual, so initialize and purge
it for each manual in ofmerge() instead of one single time in main().

There is no point in saving manual names and descriptions in that 
table because each of them occurs only once, or very few times.
There is no point in saving section numbers there because they are
so much shorter than the descriptions.

Testing with the complete tree /usr/share/man/ on my notebook shows 
that this change slightly reduces memory consumption by about 20%
while there is no measurable difference in execution time.

As a bonus, this allows deleting the functions stradd() and stradds(),
the "next" member from struct str, and the global struct str *words.

While adapting the places in the code using stradd(), I noticed that
parsing of the mdoc(7) .Nd macro was completely broken and that for
formatted manual pages with an unusable NAME section, the description
was never set in the struct of.  This commit fixes both bugs as well.

Modified Files:
--------------
    mdocml:
        mandocdb.c

Revision Data
-------------
Index: mandocdb.c
===================================================================
RCS file: /usr/vhosts/mdocml.bsd.lv/cvs/mdocml/mandocdb.c,v
retrieving revision 1.65
retrieving revision 1.66
diff -Lmandocdb.c -Lmandocdb.c -u -p -r1.65 -r1.66
--- mandocdb.c
+++ mandocdb.c
@@ -84,7 +84,6 @@ enum	form {
 struct	str {
 	char		*utf8; /* key in UTF-8 form */
 	const struct of *of; /* if set, the owning parse */
-	struct str	*next; /* next in owning parse sequence */
 	uint64_t	 mask; /* bitmask in sequence */
 	char		 key[]; /* the string itself */
 };
@@ -100,11 +99,11 @@ struct	of {
 	enum form	 dform; /* path-cued form */
 	enum form	 sform; /* suffix-cued form */
 	char		 file[PATH_MAX]; /* filename rel. to manpath */
-	const char	*desc; /* parsed description */
-	const char	*sec; /* suffix-cued section (or empty) */
-	const char	*dsec; /* path-cued section (or empty) */
-	const char	*arch; /* path-cued arch. (or empty) */
-	const char	*name; /* name (from filename) (not empty) */
+	char		*desc; /* parsed description */
+	char		*name; /* name (from filename) (not empty) */
+	char		*sec; /* suffix-cued section (or empty) */
+	char		*dsec; /* path-cued section (or empty) */
+	char		*arch; /* path-cued arch. (or empty) */
 };
 
 enum	stmt {
@@ -138,7 +137,8 @@ static	int	 inocheck(const struct stat *
 static	void	 ofadd(int, const char *, const char *, const char *,
 			const char *, const char *, const struct stat *);
 static	void	 offree(void);
-static	void	 ofmerge(struct mchars *, struct mparse *);
+static	void	 ofmerge(struct mchars *, struct mparse *,
+			struct ohash_info*);
 static	void	 parse_catpage(struct of *);
 static	void	 parse_man(struct of *, const struct man_node *);
 static	void	 parse_mdoc(struct of *, const struct mdoc_node *);
@@ -160,8 +160,6 @@ static	void	 putkeys(const struct of *, 
 static	void	 putmdockey(const struct of *,
 			const struct mdoc_node *, uint64_t);
 static	void	 say(const char *, const char *, ...);
-static	char 	*stradd(const char *);
-static	char 	*stradds(const char *, size_t);
 static	int	 treescan(void);
 static	size_t	 utf8(unsigned int, char [7]);
 static	void	 utf8key(struct mchars *, struct str *);
@@ -178,7 +176,6 @@ static	struct ohash	 inos; /* table of i
 static	struct ohash	 filenames; /* table of filenames */
 static	struct ohash	 strings; /* table of all strings */
 static	struct of	*ofs = NULL; /* vector of files to parse */
-static	struct str	*words = NULL; /* word list in current parse */
 static	sqlite3		*db = NULL; /* current database */
 static	sqlite3_stmt	*stmts[STMT__MAX]; /* current statements */
 
@@ -311,10 +308,8 @@ int
 main(int argc, char *argv[])
 {
 	int		  ch, i;
-	unsigned int	  slot;
 	size_t		  j, sz;
 	const char	 *path_arg;
-	struct str	 *s;
 	struct mchars	 *mc;
 	struct manpaths	  dirs;
 	struct mparse	 *mp;
@@ -406,7 +401,6 @@ main(int argc, char *argv[])
 		MANDOCLEVEL_FATAL, NULL, NULL, NULL);
 	mc = mchars_alloc();
 
-	ohash_init(&strings, 6, &str_info);
 	ohash_init(&inos, 6, &ino_info);
 	ohash_init(&filenames, 6, &filename_info);
 
@@ -430,7 +424,7 @@ main(int argc, char *argv[])
 		if (OP_TEST != op)
 			dbprune();
 		if (OP_DELETE != op)
-			ofmerge(mc, mp);
+			ofmerge(mc, mp, &str_info);
 		dbclose(1);
 	} else {
 		/*
@@ -460,6 +454,12 @@ main(int argc, char *argv[])
 				dirs.paths[j][--sz] = '\0';
 			if (0 == sz)
 				continue;
+
+			if (j) {
+				ohash_init(&inos, 6, &ino_info);
+				ohash_init(&filenames, 6, &filename_info);
+			}
+
 			if (0 == set_basedir(dirs.paths[j]))
 				goto out;
 			if (0 == treescan())
@@ -478,13 +478,14 @@ main(int argc, char *argv[])
 			SQL_EXEC("PRAGMA synchronous = OFF");
 #endif
 
-			ofmerge(mc, mp);
+			ofmerge(mc, mp, &str_info);
 			dbclose(0);
-			offree();
-			ohash_delete(&inos);
-			ohash_init(&inos, 6, &ino_info);
-			ohash_delete(&filenames);
-			ohash_init(&filenames, 6, &filename_info);
+
+			if (j + 1 < dirs.sz) {
+				ohash_delete(&inos);
+				ohash_delete(&filenames);
+				offree();
+			}
 		}
 	}
 out:
@@ -492,13 +493,6 @@ out:
 	manpath_free(&dirs);
 	mchars_free(mc);
 	mparse_free(mp);
-	for (s = ohash_first(&strings, &slot); NULL != s;
-	     s = ohash_next(&strings, &slot)) {
-		if (s->utf8 != s->key)
-			free(s->utf8);
-		free(s);
-	}
-	ohash_delete(&strings);
 	ohash_delete(&inos);
 	ohash_delete(&filenames);
 	offree();
@@ -536,7 +530,7 @@ treescan(void)
 	FTSENT		*ff;
 	int		 dform;
 	char		*sec;
-	const char	*dsec, *arch, *cp, *name, *path;
+	const char	*dsec, *arch, *cp, *path;
 	const char	*argv[2];
 
 	argv[0] = ".";
@@ -604,13 +598,10 @@ treescan(void)
 				if (warnings)
 					say(path, "Wrong filename suffix");
 				continue;
-			} else {
+			} else
 				sec[-1] = '\0';
-				sec = stradd(sec);
-			}
-			name = stradd(ff->fts_name);
-			ofadd(dform, path, 
-				name, dsec, sec, arch, ff->fts_statp);
+			ofadd(dform, path, ff->fts_name, dsec, sec,
+					arch, ff->fts_statp);
 			continue;
 		} else if (FTS_D != ff->fts_info && 
 				FTS_DP != ff->fts_info) {
@@ -637,10 +628,10 @@ treescan(void)
 
 			if (0 == strncmp(cp, "man", 3)) {
 				dform = FORM_SRC;
-				dsec = stradd(cp + 3);
+				dsec = cp + 3;
 			} else if (0 == strncmp(cp, "cat", 3)) {
 				dform = FORM_CAT;
-				dsec = stradd(cp + 3);
+				dsec = cp + 3;
 			}
 
 			if (NULL != dsec || use_all) 
@@ -657,7 +648,7 @@ treescan(void)
 			 */
 			arch = NULL;
 			if (FTS_DP != ff->fts_info && NULL != dsec)
-				arch = stradd(ff->fts_name);
+				arch = ff->fts_name;
 			break;
 		default:
 			if (FTS_DP == ff->fts_info || use_all)
@@ -867,10 +858,10 @@ ofadd(int dform, const char *file, const
 
 	of = mandoc_calloc(1, sizeof(struct of));
 	strlcpy(of->file, file, PATH_MAX);
-	of->name = name;
-	of->sec = sec;
-	of->dsec = dsec;
-	of->arch = arch;
+	of->name = mandoc_strdup(name);
+	of->sec = mandoc_strdup(sec);
+	of->dsec = mandoc_strdup(dsec);
+	of->arch = mandoc_strdup(arch);
 	of->sform = sform;
 	of->dform = dform;
 	of->next = ofs;
@@ -892,6 +883,10 @@ offree(void)
 
 	while (NULL != (of = ofs)) {
 		ofs = of->next;
+		free(of->name);
+		free(of->sec);
+		free(of->dsec);
+		free(of->arch);
 		free(of);
 	}
 }
@@ -904,7 +899,8 @@ offree(void)
  * and filename to determine whether the file is parsable or not.
  */
 static void
-ofmerge(struct mchars *mc, struct mparse *mp)
+ofmerge(struct mchars *mc, struct mparse *mp,
+		struct ohash_info *infop)
 {
 	int		 form;
 	size_t		 sz;
@@ -945,7 +941,7 @@ ofmerge(struct mchars *mc, struct mparse
 			}
 		}
 
-		words = NULL;
+		ohash_init(&strings, 6, infop);
 		mparse_reset(mp);
 		mdoc = NULL;
 		man = NULL;
@@ -1023,13 +1019,17 @@ ofmerge(struct mchars *mc, struct mparse
 		if (NULL != mdoc) {
 			if (NULL != (cp = mdoc_meta(mdoc)->name))
 				putkey(of, cp, TYPE_Nm);
+			assert(NULL == of->desc);
 			parse_mdoc(of, mdoc_node(mdoc));
+			putkey(of, NULL != of->desc ?
+				of->desc : of->name, TYPE_Nd);
 		} else if (NULL != man)
 			parse_man(of, man_node(man));
 		else
 			parse_catpage(of);
 
 		dbindex(mc, form, of);
+		ohash_delete(&strings);
 	}
 }
 
@@ -1096,6 +1096,8 @@ parse_catpage(struct of *of)
 	if (NULL == title || '\0' == *title) {
 		if (warnings)
 			say(of->file, "Cannot find NAME section");
+		assert(NULL == of->desc);
+		of->desc = mandoc_strdup(of->name);
 		putkey(of, of->name, TYPE_Nd);
 		fclose(stream);
 		free(title);
@@ -1134,8 +1136,9 @@ parse_catpage(struct of *of)
 		plen -= 2;
 	}
 
-	of->desc = stradd(p);
-	putkey(of, p, TYPE_Nd);
+	assert(NULL == of->desc);
+	of->desc = mandoc_strdup(p);
+	putkey(of, of->desc, TYPE_Nd);
 	fclose(stream);
 	free(title);
 }
@@ -1299,8 +1302,8 @@ parse_man(struct of *of, const struct ma
 				start++;
 
 			assert(NULL == of->desc);
-			of->desc = stradd(start);
-			putkey(of, start, TYPE_Nd);
+			of->desc = mandoc_strdup(start);
+			putkey(of, of->desc, TYPE_Nd);
 			free(title);
 			return;
 		}
@@ -1455,7 +1458,6 @@ static int
 parse_mdoc_Nd(struct of *of, const struct mdoc_node *n)
 {
 	size_t		 sz;
-	char		*sv, *desc;
 
 	if (MDOC_BODY != n->type)
 		return(0);
@@ -1465,25 +1467,20 @@ parse_mdoc_Nd(struct of *of, const struc
 	 * into the document table.
 	 */
 
-	desc = NULL;
 	for (n = n->child; NULL != n; n = n->next) {
 		if (MDOC_TEXT == n->type) {
-			sz = strlen(n->string) + 1;
-			if (NULL != (sv = desc))
-				sz += strlen(desc) + 1;
-			desc = mandoc_realloc(desc, sz);
-			if (NULL != sv)
-				strlcat(desc, " ", sz);
-			else
-				*desc = '\0';
-			strlcat(desc, n->string, sz);
+			if (NULL != of->desc) {
+				sz = strlen(of->desc) +
+				     strlen(n->string) + 2;
+				of->desc = mandoc_realloc(of->desc, sz);
+				strlcat(of->desc, " ", sz);
+				strlcat(of->desc, n->string, sz);
+			} else
+				of->desc = mandoc_strdup(n->string);
 		}
 		if (NULL != n->child)
 			parse_mdoc_Nd(of, n);
 	}
-
-	of->desc = NULL != desc ? stradd(desc) : NULL;
-	free(desc);
 	return(1);
 }
 
@@ -1521,50 +1518,9 @@ parse_mdoc_body(struct of *of, const str
 }
 
 /*
- * See stradds().
- */
-static char *
-stradd(const char *cp)
-{
-
-	return(stradds(cp, strlen(cp)));
-}
-
-/*
- * This looks up or adds a string to the string table.
- * The string table is a table of all strings encountered during parse
- * or file scan.
- * In using it, we avoid having thousands of (e.g.) "cat1" string
- * allocations for the "of" table.
- * We also have a layer atop the string table for keeping track of words
- * in a parse sequence (see putkeys()).
- */
-static char *
-stradds(const char *cp, size_t sz)
-{
-	struct str	*s;
-	unsigned int	 slot;
-	const char	*end;
-
-	end = cp + sz;
-	slot = ohash_qlookupi(&strings, cp, &end);
-	if (NULL != (s = ohash_find(&strings, slot)))
-		return(s->key);
-
-	s = mandoc_calloc(sizeof(struct str) + sz + 1, 1);
-	memcpy(s->key, cp, sz);
-	ohash_insert(&strings, slot, s);
-	return(s->key);
-}
-
-/*
- * Add a word to the current parse sequence.
- * Within the hashtable of strings, we maintain a list of strings that
- * are currently indexed.
- * Each of these ("words") has a bitmask modified within the parse.
- * When we finish a parse, we'll dump the list, then remove the head
- * entry -- since the next parse will have a new "of", it can keep track
- * of its entries without conflict.
+ * Add a string to the hash table for the current manual.
+ * Each string has a bitmask telling which macros it belongs to.
+ * When we finish the manual, we'll dump the table.
  */
 static void
 putkeys(const struct of *of, const char *cp, size_t sz, uint64_t v)
@@ -1588,11 +1544,8 @@ putkeys(const struct of *of, const char 
 		memcpy(s->key, cp, sz);
 		ohash_insert(&strings, slot, s);
 	}
-
-	s->next = words;
 	s->of = of;
 	s->mask = v;
-	words = s;
 }
 
 /*
@@ -1764,6 +1717,7 @@ dbindex(struct mchars *mc, int form, con
 	const char	*desc;
 	int64_t		 recno;
 	size_t		 i;
+	unsigned int	 slot;
 
 	if (verb)
 		say(of->file, "Adding to index");
@@ -1793,7 +1747,8 @@ dbindex(struct mchars *mc, int form, con
 	recno = sqlite3_last_insert_rowid(db);
 	sqlite3_reset(stmts[STMT_INSERT_DOC]);
 
-	for (key = words; NULL != key; key = key->next) {
+	for (key = ohash_first(&strings, &slot); NULL != key;
+	     key = ohash_next(&strings, &slot)) {
 		assert(key->of == of);
 		if (NULL == key->utf8)
 			utf8key(mc, key);
@@ -1803,6 +1758,9 @@ dbindex(struct mchars *mc, int form, con
 		SQL_BIND_INT64(stmts[STMT_INSERT_KEY], i, recno);
 		SQL_STEP(stmts[STMT_INSERT_KEY]);
 		sqlite3_reset(stmts[STMT_INSERT_KEY]);
+		if (key->utf8 != key->key)
+			free(key->utf8);
+		free(key);
 	}
 
 	SQL_EXEC("END TRANSACTION");
--
 To unsubscribe send an email to source+unsubscribe@mdocml.bsd.lv

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2013-06-07  1:01 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-06-07  1:01 mdocml: The string hash table is only needed to combine multiple schwarze

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).