source@mandoc.bsd.lv
 help / color / mirror / Atom feed
* mandoc: Catch typos in .Sh names; suggested by jmc@.
@ 2017-06-25 17:44 schwarze
  0 siblings, 0 replies; only message in thread
From: schwarze @ 2017-06-25 17:44 UTC (permalink / raw)
  To: source

Log Message:
-----------
Catch typos in .Sh names; suggested by jmc@.

I'm using a very simple, linear time / zero space fuzzy string
matching heuristic rather than a full Levenshtein metric, to keep
the code both simple and fast.

Modified Files:
--------------
    mandoc:
        mandoc.1
        mandoc.h
        mdoc_validate.c
        read.c

Revision Data
-------------
Index: mdoc_validate.c
===================================================================
RCS file: /home/cvs/mandoc/mandoc/mdoc_validate.c,v
retrieving revision 1.342
retrieving revision 1.343
diff -Lmdoc_validate.c -Lmdoc_validate.c -u -p -r1.342 -r1.343
--- mdoc_validate.c
+++ mdoc_validate.c
@@ -60,6 +60,7 @@ static	void	 check_toptext(struct roff_m
 static	int	 child_an(const struct roff_node *);
 static	size_t		macro2len(enum roff_tok);
 static	void	 rewrite_macro2len(struct roff_man *, char **);
+static	int	 similar(const char *, const char *);
 
 static	void	 post_an(POST_ARGS);
 static	void	 post_an_norm(POST_ARGS);
@@ -2148,11 +2149,54 @@ post_sh_authors(POST_ARGS)
 		    mdoc->last->line, mdoc->last->pos, NULL);
 }
 
+/*
+ * Return an upper bound for the string distance between s1 and s2
+ * (allowing transpositions): 0 to maxdist, or INT_MAX if larger.
+ * Not a full Levenshtein implementation because Levenshtein is
+ * quadratic in the string length and this function is called for
+ * every standard name, so checking each custom name would be cubic.
+ * The following crude heuristic is linear, resulting
+ * in quadratic behaviour for checking one custom name,
+ * which does not cause measurable slowdown.
+ */
+static int
+similar(const char *s1, const char *s2)
+{
+	const int	maxdist = 3;	/* largest distance reported */
+	int		dist = 0;	/* mismatches counted so far */
+
+	while (s1[0] != '\0' && s2[0] != '\0') {
+		if (s1[0] == s2[0]) {	/* match: no cost */
+			s1++;
+			s2++;
+			continue;
+		}
+		if (++dist > maxdist)	/* every mismatch costs one */
+			return INT_MAX;
+		if (s1[1] == s2[1]) {  /* replacement */
+			s1++;
+			s2++;
+		} else if (s1[0] == s2[1] && s1[1] == s2[0]) {
+			s1 += 2;	/* transposition */
+			s2 += 2;
+		} else if (s1[0] == s2[1])  /* insertion */
+			s2++;
+		else if (s1[1] == s2[0])  /* deletion */
+			s1++;
+		else	/* none of the single-step cases applies */
+			return INT_MAX;
+	}
+	dist += strlen(s1) + strlen(s2);  /* count unmatched tails */
+	return dist > maxdist ? INT_MAX : dist;
+}
+
 static void
 post_sh_head(POST_ARGS)
 {
 	struct roff_node	*nch;
 	const char		*goodsec;
+	const char *const	*testsec;
+	int			 dist, mindist;
 	enum roff_sec		 sec;
 
 	/*
@@ -2190,8 +2234,25 @@ post_sh_head(POST_ARGS)
 
 	/* We don't care about custom sections after this. */
 
-	if (sec == SEC_CUSTOM)
+	if (sec == SEC_CUSTOM) {
+		if ((nch = mdoc->last->child) == NULL ||
+		    nch->type != ROFFT_TEXT || nch->next != NULL)
+			return;
+		goodsec = NULL;
+		mindist = INT_MAX;
+		for (testsec = secnames + 1; *testsec != NULL; testsec++) {
+			dist = similar(nch->string, *testsec);
+			if (dist < mindist) {
+				goodsec = *testsec;
+				mindist = dist;
+			}
+		}
+		if (goodsec != NULL)
+			mandoc_vmsg(MANDOCERR_SEC_TYPO, mdoc->parse,
+			    nch->line, nch->pos, "Sh %s instead of %s",
+			    nch->string, goodsec);
 		return;
+	}
 
 	/*
 	 * Check whether our non-custom section is being repeated or is
Index: read.c
===================================================================
RCS file: /home/cvs/mandoc/mandoc/read.c,v
retrieving revision 1.181
retrieving revision 1.182
diff -Lread.c -Lread.c -u -p -r1.181 -r1.182
--- read.c
+++ read.c
@@ -98,6 +98,7 @@ static	const char * const	mandocerrs[MAN
 
 	"legacy man(7) date format",
 	"duplicate RCS id",
+	"typo in section name",
 	"useless macro",
 	"consider using OS macro",
 	"errnos out of order",
Index: mandoc.1
===================================================================
RCS file: /home/cvs/mandoc/mandoc/mandoc.1,v
retrieving revision 1.206
retrieving revision 1.207
diff -Lmandoc.1 -Lmandoc.1 -u -p -r1.206 -r1.207
--- mandoc.1
+++ mandoc.1
@@ -857,6 +857,11 @@ A single manual page contains two copies
 the same operating system.
 Consider deleting the later instance and moving the first one up
 to the top of the page.
+.It Sy "typo in section name"
+.Pq mdoc
+Fuzzy string matching revealed that the argument of an
+.Ic \&Sh
+macro is similar, but not identical, to a standard section name.
 .It Sy "useless macro"
 .Pq mdoc
 A
Index: mandoc.h
===================================================================
RCS file: /home/cvs/mandoc/mandoc/mandoc.h,v
retrieving revision 1.235
retrieving revision 1.236
diff -Lmandoc.h -Lmandoc.h -u -p -r1.235 -r1.236
--- mandoc.h
+++ mandoc.h
@@ -56,6 +56,7 @@ enum	mandocerr {
 
 	MANDOCERR_DATE_LEGACY, /* legacy man(7) date format: Dd ... */
 	MANDOCERR_RCS_REP, /* duplicate RCS id: ... */
+	MANDOCERR_SEC_TYPO,  /* typo in section name: Sh ... */
 	MANDOCERR_MACRO_USELESS, /* useless macro: macro */
 	MANDOCERR_BX, /* consider using OS macro: macro */
 	MANDOCERR_ER_ORDER, /* errnos out of order: Er ... */
--
 To unsubscribe send an email to source+unsubscribe@mandoc.bsd.lv

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2017-06-25 17:44 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-25 17:44 mandoc: Catch typos in .Sh names; suggested by jmc@ schwarze

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).