* mandoc: Catch typos in .Sh names; suggested by jmc@.
@ 2017-06-25 17:44 schwarze
0 siblings, 0 replies; only message in thread
From: schwarze @ 2017-06-25 17:44 UTC (permalink / raw)
To: source
Log Message:
-----------
Catch typos in .Sh names; suggested by jmc@.
I'm using a very simple, linear time / zero space fuzzy string
matching heuristic rather than a full Levenshtein metric, to keep
the code both simple and fast.
Modified Files:
--------------
mandoc:
mandoc.1
mandoc.h
mdoc_validate.c
read.c
Revision Data
-------------
Index: mdoc_validate.c
===================================================================
RCS file: /home/cvs/mandoc/mandoc/mdoc_validate.c,v
retrieving revision 1.342
retrieving revision 1.343
diff -Lmdoc_validate.c -Lmdoc_validate.c -u -p -r1.342 -r1.343
--- mdoc_validate.c
+++ mdoc_validate.c
@@ -60,6 +60,7 @@ static void check_toptext(struct roff_m
static int child_an(const struct roff_node *);
static size_t macro2len(enum roff_tok);
static void rewrite_macro2len(struct roff_man *, char **);
+static int similar(const char *, const char *);
static void post_an(POST_ARGS);
static void post_an_norm(POST_ARGS);
@@ -2148,11 +2149,54 @@ post_sh_authors(POST_ARGS)
mdoc->last->line, mdoc->last->pos, NULL);
}
+/*
+ * Return an upper bound for the string distance (allowing
+ * transpositions). Not a full Levenshtein implementation
+ * because Levenshtein is quadratic in the string length
+ * and this function is called for every standard name,
+ * so the check for each custom name would be cubic.
+ * The following crude heuristics is linear, resulting
+ * in quadratic behaviour for checking one custom name,
+ * which does not cause measurable slowdown.
+ */
+static int
+similar(const char *s1, const char *s2)
+{
+ const int maxdist = 3;
+ int dist = 0;
+
+ while (s1[0] != '\0' && s2[0] != '\0') {
+ if (s1[0] == s2[0]) {
+ s1++;
+ s2++;
+ continue;
+ }
+ if (++dist > maxdist)
+ return INT_MAX;
+ if (s1[1] == s2[1]) { /* replacement */
+ s1++;
+ s2++;
+ } else if (s1[0] == s2[1] && s1[1] == s2[0]) {
+ s1 += 2; /* transposition */
+ s2 += 2;
+ } else if (s1[0] == s2[1]) /* insertion */
+ s2++;
+ else if (s1[1] == s2[0]) /* deletion */
+ s1++;
+ else
+ return INT_MAX;
+ }
+ dist += strlen(s1) + strlen(s2);
+ return dist > maxdist ? INT_MAX : dist;
+}
+
static void
post_sh_head(POST_ARGS)
{
struct roff_node *nch;
const char *goodsec;
+ const char *const *testsec;
+ int dist, mindist;
enum roff_sec sec;
/*
@@ -2190,8 +2234,25 @@ post_sh_head(POST_ARGS)
/* We don't care about custom sections after this. */
- if (sec == SEC_CUSTOM)
+ if (sec == SEC_CUSTOM) {
+ if ((nch = mdoc->last->child) == NULL ||
+ nch->type != ROFFT_TEXT || nch->next != NULL)
+ return;
+ goodsec = NULL;
+ mindist = INT_MAX;
+ for (testsec = secnames + 1; *testsec != NULL; testsec++) {
+ dist = similar(nch->string, *testsec);
+ if (dist < mindist) {
+ goodsec = *testsec;
+ mindist = dist;
+ }
+ }
+ if (goodsec != NULL)
+ mandoc_vmsg(MANDOCERR_SEC_TYPO, mdoc->parse,
+ nch->line, nch->pos, "Sh %s instead of %s",
+ nch->string, goodsec);
return;
+ }
/*
* Check whether our non-custom section is being repeated or is
Index: read.c
===================================================================
RCS file: /home/cvs/mandoc/mandoc/read.c,v
retrieving revision 1.181
retrieving revision 1.182
diff -Lread.c -Lread.c -u -p -r1.181 -r1.182
--- read.c
+++ read.c
@@ -98,6 +98,7 @@ static const char * const mandocerrs[MAN
"legacy man(7) date format",
"duplicate RCS id",
+ "typo in section name",
"useless macro",
"consider using OS macro",
"errnos out of order",
Index: mandoc.1
===================================================================
RCS file: /home/cvs/mandoc/mandoc/mandoc.1,v
retrieving revision 1.206
retrieving revision 1.207
diff -Lmandoc.1 -Lmandoc.1 -u -p -r1.206 -r1.207
--- mandoc.1
+++ mandoc.1
@@ -857,6 +857,11 @@ A single manual page contains two copies
the same operating system.
Consider deleting the later instance and moving the first one up
to the top of the page.
+.It Sy "typo in section name"
+.Pq mdoc
+Fuzzy string matching revealed that the argument of an
+.Ic \&Sh
+macro is similar, but not identical to a standard section name.
.It Sy "useless macro"
.Pq mdoc
A
Index: mandoc.h
===================================================================
RCS file: /home/cvs/mandoc/mandoc/mandoc.h,v
retrieving revision 1.235
retrieving revision 1.236
diff -Lmandoc.h -Lmandoc.h -u -p -r1.235 -r1.236
--- mandoc.h
+++ mandoc.h
@@ -56,6 +56,7 @@ enum mandocerr {
MANDOCERR_DATE_LEGACY, /* legacy man(7) date format: Dd ... */
MANDOCERR_RCS_REP, /* duplicate RCS id: ... */
+ MANDOCERR_SEC_TYPO, /* typo in section name: Sh ... */
MANDOCERR_MACRO_USELESS, /* useless macro: macro */
MANDOCERR_BX, /* consider using OS macro: macro */
MANDOCERR_ER_ORDER, /* errnos out of order: Er ... */
--
To unsubscribe send an email to source+unsubscribe@mandoc.bsd.lv
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2017-06-25 17:44 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-25 17:44 mandoc: Catch typos in .Sh names; suggested by jmc@ schwarze
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).