diff options
author | Ingo Schwarze <schwarze@openbsd.org> | 2017-06-25 17:43:45 +0000 |
---|---|---|
committer | Ingo Schwarze <schwarze@openbsd.org> | 2017-06-25 17:43:45 +0000 |
commit | 1fdb4db7a387f66dbf9fc6b0869d19dfe765a5aa (patch) | |
tree | 78c84c03a6f5831ed6c6f758e4b370367dba22af /mdoc_validate.c | |
parent | 968237f527c0925ee4300f535b2efb1d2a9e783f (diff) | |
download | mandoc-1fdb4db7a387f66dbf9fc6b0869d19dfe765a5aa.tar.gz mandoc-1fdb4db7a387f66dbf9fc6b0869d19dfe765a5aa.tar.zst mandoc-1fdb4db7a387f66dbf9fc6b0869d19dfe765a5aa.zip |
Catch typos in .Sh names; suggested by jmc@.
I'm using a very simple, linear time / zero space fuzzy string
matching heuristic rather than a full Levenshtein metric, to keep
the code both simple and fast.
Diffstat (limited to 'mdoc_validate.c')
-rw-r--r-- | mdoc_validate.c | 65 |
1 files changed, 63 insertions, 2 deletions
diff --git a/mdoc_validate.c b/mdoc_validate.c index da9be6d1..08f23583 100644 --- a/mdoc_validate.c +++ b/mdoc_validate.c @@ -1,4 +1,4 @@ -/* $Id: mdoc_validate.c,v 1.342 2017/06/24 18:58:33 schwarze Exp $ */ +/* $Id: mdoc_validate.c,v 1.343 2017/06/25 17:43:45 schwarze Exp $ */ /* * Copyright (c) 2008-2012 Kristaps Dzonsons <kristaps@bsd.lv> * Copyright (c) 2010-2017 Ingo Schwarze <schwarze@openbsd.org> @@ -60,6 +60,7 @@ static void check_toptext(struct roff_man *, int, int, const char *); static int child_an(const struct roff_node *); static size_t macro2len(enum roff_tok); static void rewrite_macro2len(struct roff_man *, char **); +static int similar(const char *, const char *); static void post_an(POST_ARGS); static void post_an_norm(POST_ARGS); @@ -2148,11 +2149,54 @@ post_sh_authors(POST_ARGS) mdoc->last->line, mdoc->last->pos, NULL); } +/* + * Return an upper bound for the string distance (allowing + * transpositions). Not a full Levenshtein implementation + * because Levenshtein is quadratic in the string length + * and this function is called for every standard name, + * so the check for each custom name would be cubic. + * The following crude heuristics is linear, resulting + * in quadratic behaviour for checking one custom name, + * which does not cause measurable slowdown. + */ +static int +similar(const char *s1, const char *s2) +{ + const int maxdist = 3; + int dist = 0; + + while (s1[0] != '\0' && s2[0] != '\0') { + if (s1[0] == s2[0]) { + s1++; + s2++; + continue; + } + if (++dist > maxdist) + return INT_MAX; + if (s1[1] == s2[1]) { /* replacement */ + s1++; + s2++; + } else if (s1[0] == s2[1] && s1[1] == s2[0]) { + s1 += 2; /* transposition */ + s2 += 2; + } else if (s1[0] == s2[1]) /* insertion */ + s2++; + else if (s1[1] == s2[0]) /* deletion */ + s1++; + else + return INT_MAX; + } + dist += strlen(s1) + strlen(s2); + return dist > maxdist ? INT_MAX : dist; +} + static void post_sh_head(POST_ARGS) { struct roff_node *nch; const char *goodsec; + const char *const *testsec; + int dist, mindist; enum roff_sec sec; /* @@ -2190,8 +2234,25 @@ post_sh_head(POST_ARGS) /* We don't care about custom sections after this. */ - if (sec == SEC_CUSTOM) + if (sec == SEC_CUSTOM) { + if ((nch = mdoc->last->child) == NULL || + nch->type != ROFFT_TEXT || nch->next != NULL) + return; + goodsec = NULL; + mindist = INT_MAX; + for (testsec = secnames + 1; *testsec != NULL; testsec++) { + dist = similar(nch->string, *testsec); + if (dist < mindist) { + goodsec = *testsec; + mindist = dist; + } + } + if (goodsec != NULL) + mandoc_vmsg(MANDOCERR_SEC_TYPO, mdoc->parse, + nch->line, nch->pos, "Sh %s instead of %s", + nch->string, goodsec); return; + } /* * Check whether our non-custom section is being repeated or is |