aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/mandoc.c
diff options
context:
space:
mode:
authorKristaps Dzonsons <kristaps@bsd.lv>2011-04-09 15:29:40 +0000
committerKristaps Dzonsons <kristaps@bsd.lv>2011-04-09 15:29:40 +0000
commitdc6820cf87c36506f7d235493e75c1520231e9d9 (patch)
treee5c60975fd48bf07a47693f981a0ff9991db03b9 /mandoc.c
parent0383763dbd62d52405be4dc65ae9aaff035c8f79 (diff)
downloadmandoc-dc6820cf87c36506f7d235493e75c1520231e9d9.tar.gz
mandoc-dc6820cf87c36506f7d235493e75c1520231e9d9.tar.zst
mandoc-dc6820cf87c36506f7d235493e75c1520231e9d9.zip
Remove a2roffdeco() and mandoc_special() functions and replace them with
a public (mandoc.h) function mandoc_escape(), which merges the functionality of both prior functions. Reason: code duplication. The a2roffdeco() and mandoc_special() functions were pretty much the same thing and both quite complex. This allows one function to receive improvements in (e.g.) subexpression handling and performance, instead of having to replicate functionality. As such, the mandoc_escape() function already handles a superset of the escapes handled in previous versions and has improvements in performance (using strcspn(), for example) and reliable handling of subexpressions. This code Works For Me, but may need work to catch any regressions. Since the benefits are great (leaner code, simpler API), I'd rather have it in-tree than floating as a patch.
Diffstat (limited to 'mandoc.c')
-rw-r--r--mandoc.c446
1 files changed, 305 insertions, 141 deletions
diff --git a/mandoc.c b/mandoc.c
index da4a1606..16912060 100644
--- a/mandoc.c
+++ b/mandoc.c
@@ -1,4 +1,4 @@
-/* $Id: mandoc.c,v 1.44 2011/03/28 23:52:13 kristaps Exp $ */
+/* $Id: mandoc.c,v 1.45 2011/04/09 15:29:40 kristaps Exp $ */
/*
* Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
@@ -35,199 +35,363 @@
static int a2time(time_t *, const char *, const char *);
static char *time2a(time_t);
+static int numescape(const char *);
-int
-mandoc_special(char *p)
+/*
+ * Pass over recursive numerical expressions. This context of this
+ * function is important: it's only called within character-terminating
+ * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
+ * recursion: we don't care about what's in these blocks.
+ * This returns the number of characters skipped or -1 if an error
+ * occurs (the caller should bail).
+ */
+static int
+numescape(const char *start)
+{
+ int i;
+ size_t sz;
+ const char *cp;
+
+ i = 0;
+
+ /* The expression consists of a subexpression. */
+
+ if ('\\' == start[i]) {
+ cp = &start[++i];
+ /*
+ * Read past the end of the subexpression.
+ * Bail immediately on errors.
+ */
+ if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+ return(-1);
+ return(i + cp - &start[i]);
+ }
+
+ if ('(' != start[i++])
+ return(0);
+
+ /*
+ * A parenthesised subexpression. Read until the closing
+ * parenthesis, making sure to handle any nested subexpressions
+ * that might ruin our parse.
+ */
+
+ while (')' != start[i]) {
+ sz = strcspn(&start[i], ")\\");
+ i += (int)sz;
+
+ if ('\0' == start[i])
+ return(-1);
+ else if ('\\' != start[i])
+ continue;
+
+ cp = &start[++i];
+ if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
+ return(-1);
+ i += cp - &start[i];
+ }
+
+ /* Read past the terminating ')'. */
+ return(++i);
+}
+
+/*
+ * Handle an escaped sequeence. This should be called with any
+ * string subsequent a `\'. Pass a pointer to this substring as "end";
+ * it will be set to the supremum of the parsed escape sequence. If
+ * this returns ESCAPE_ERROR, the string is bogus and should be thrown
+ * away. If not ESCAPE_ERROR or ESCAPE_IGNORE, "start" is set to the
+ * first relevant character of the substring (font, glyph, whatever) of
+ * length sz. Both "start" and "sz" may be NULL.
+ */
+enum mandoc_esc
+mandoc_escape(const char **end, const char **start, int *sz)
{
- int len, i;
- char term;
- char *sv;
-
- len = 0;
+ char c, term, numeric;
+ int i, lim, ssz, rlim;
+ const char *cp, *rstart;
+ enum mandoc_esc gly;
+
+ cp = *end;
+ rstart = cp;
+ if (start)
+ *start = rstart;
+ i = 0;
+ gly = ESCAPE_ERROR;
term = '\0';
- sv = p;
+ numeric = 0;
- assert('\\' == *p);
- p++;
+ switch ((c = cp[i++])) {
+ /*
+ * First the glyphs. There are several different forms of
+ * these, but each eventually returns a substring of the glyph
+ * name.
+ */
+ case ('('):
+ gly = ESCAPE_SPECIAL;
+ lim = 2;
+ break;
+ case ('['):
+ gly = ESCAPE_SPECIAL;
+ term = ']';
+ break;
+ case ('C'):
+ if ('\'' != cp[i])
+ return(ESCAPE_ERROR);
+ gly = ESCAPE_SPECIAL;
+ term = '\'';
+ break;
- switch (*p++) {
-#if 0
- case ('Z'):
+ /*
+ * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
+ * 'X' is the trigger. These have opaque sub-strings.
+ */
+ case ('F'):
/* FALLTHROUGH */
- case ('X'):
+ case ('g'):
/* FALLTHROUGH */
- case ('x'):
+ case ('k'):
/* FALLTHROUGH */
- case ('S'):
+ case ('M'):
/* FALLTHROUGH */
- case ('R'):
+ case ('m'):
/* FALLTHROUGH */
- case ('N'):
+ case ('n'):
/* FALLTHROUGH */
- case ('l'):
+ case ('V'):
/* FALLTHROUGH */
- case ('L'):
+ case ('Y'):
+ if (ESCAPE_ERROR == gly)
+ gly = ESCAPE_IGNORE;
/* FALLTHROUGH */
- case ('H'):
+ case ('*'):
+ if (ESCAPE_ERROR == gly)
+ gly = ESCAPE_PREDEF;
/* FALLTHROUGH */
- case ('h'):
+ case ('f'):
+ if (ESCAPE_ERROR == gly)
+ gly = ESCAPE_FONT;
+
+ rstart= &cp[i];
+ if (start)
+ *start = rstart;
+
+ switch (cp[i++]) {
+ case ('('):
+ lim = 2;
+ break;
+ case ('['):
+ term = ']';
+ break;
+ default:
+ lim = 1;
+ i--;
+ break;
+ }
+ break;
+
+ /*
+ * These escapes are of the form \X'Y', where 'X' is the trigger
+ * and 'Y' is any string. These have opaque sub-strings.
+ */
+ case ('A'):
/* FALLTHROUGH */
- case ('D'):
+ case ('b'):
/* FALLTHROUGH */
- case ('C'):
+ case ('D'):
/* FALLTHROUGH */
- case ('b'):
+ case ('o'):
/* FALLTHROUGH */
- case ('B'):
+ case ('R'):
/* FALLTHROUGH */
- case ('a'):
+ case ('X'):
/* FALLTHROUGH */
- case ('A'):
- if (*p++ != '\'')
- return(0);
+ case ('Z'):
+ if ('\'' != cp[i++])
+ return(ESCAPE_ERROR);
+ gly = ESCAPE_IGNORE;
term = '\'';
break;
-#endif
+
+ /*
+ * These escapes are of the form \X'N', where 'X' is the trigger
+ * and 'N' resolves to a numerical expression.
+ */
+ case ('B'):
+ /* FALLTHROUGH */
case ('h'):
/* FALLTHROUGH */
+ case ('H'):
+ /* FALLTHROUGH */
+ case ('L'):
+ /* FALLTHROUGH */
+ case ('l'):
+ /* FALLTHROUGH */
+ case ('N'):
+ if (ESCAPE_ERROR == gly)
+ gly = ESCAPE_NUMBERED;
+ /* FALLTHROUGH */
+ case ('S'):
+ /* FALLTHROUGH */
case ('v'):
/* FALLTHROUGH */
+ case ('w'):
+ /* FALLTHROUGH */
+ case ('x'):
+ if (ESCAPE_ERROR == gly)
+ gly = ESCAPE_IGNORE;
+ if ('\'' != cp[i++])
+ return(ESCAPE_ERROR);
+ term = numeric = '\'';
+ break;
+
+ /*
+ * Sizes get a special category of their own.
+ */
case ('s'):
- if (ASCII_HYPH == *p)
- *p = '-';
+ gly = ESCAPE_IGNORE;
- i = 0;
- if ('+' == *p || '-' == *p) {
- p++;
- i = 1;
- }
+ rstart = &cp[i];
+ if (start)
+ *start = rstart;
+
+ /* See +/- counts as a sign. */
+ c = cp[i];
+ if ('+' == c || '-' == c || ASCII_HYPH == c)
+ ++i;
- switch (*p++) {
+ switch (cp[i++]) {
case ('('):
- len = 2;
+ lim = 2;
break;
case ('['):
- term = ']';
+ term = numeric = ']';
break;
case ('\''):
- term = '\'';
+ term = numeric = '\'';
break;
- case ('0'):
- i = 1;
- /* FALLTHROUGH */
default:
- len = 1;
- p--;
+ lim = 1;
+ i--;
break;
}
- if (ASCII_HYPH == *p)
- *p = '-';
- if ('+' == *p || '-' == *p) {
- if (i)
- return(0);
- p++;
- }
-
- /* Handle embedded numerical subexp or escape. */
-
- if ('(' == *p) {
- while (*p && ')' != *p)
- if ('\\' == *p++) {
- i = mandoc_special(--p);
- if (0 == i)
- return(0);
- p += i;
- }
-
- if (')' == *p++)
- break;
+ /* See +/- counts as a sign. */
+ c = cp[i];
+ if ('+' == c || '-' == c || ASCII_HYPH == c)
+ ++i;
- return(0);
- } else if ('\\' == *p) {
- if (0 == (i = mandoc_special(p)))
- return(0);
- p += i;
- }
+ break;
+ /*
+ * Anything else is assumed to be a glyph.
+ */
+ default:
+ gly = ESCAPE_SPECIAL;
+ lim = 1;
+ i--;
break;
-#if 0
- case ('Y'):
- /* FALLTHROUGH */
- case ('V'):
- /* FALLTHROUGH */
- case ('$'):
- /* FALLTHROUGH */
- case ('n'):
- /* FALLTHROUGH */
-#endif
- case ('k'):
- /* FALLTHROUGH */
- case ('M'):
- /* FALLTHROUGH */
- case ('m'):
- /* FALLTHROUGH */
- case ('f'):
- /* FALLTHROUGH */
- case ('F'):
- /* FALLTHROUGH */
- case ('*'):
- switch (*p++) {
- case ('('):
- len = 2;
+ }
+
+ assert(ESCAPE_ERROR != gly);
+
+ rstart = &cp[i];
+ if (start)
+ *start = rstart;
+
+ /*
+ * If a terminating block has been specified, we need to
+ * handle the case of recursion, which could have their
+ * own terminating blocks that mess up our parse. This, by the
+ * way, means that the "start" and "size" values will be
+ * effectively meaningless.
+ */
+
+ ssz = 0;
+ if (numeric && -1 == (ssz = numescape(&cp[i])))
+ return(ESCAPE_ERROR);
+
+ i += ssz;
+ rlim = -1;
+
+ /*
+ * We have a character terminator. Try to read up to that
+ * character. If we can't (i.e., we hit the nil), then return
+ * an error; if we can, calculate our length, read past the
+ * terminating character, and exit.
+ */
+
+ if ('\0' != term) {
+ *end = strchr(&cp[i], term);
+ if ('\0' == *end)
+ return(ESCAPE_ERROR);
+
+ rlim = *end - &cp[i];
+ if (sz)
+ *sz = rlim;
+ (*end)++;
+ goto out;
+ }
+
+ assert(lim > 0);
+
+ /*
+ * We have a numeric limit. If the string is shorter than that,
+ * stop and return an error. Else adjust our endpoint, length,
+ * and return the current glyph.
+ */
+
+ if ((size_t)lim > strlen(&cp[i]))
+ return(ESCAPE_ERROR);
+
+ rlim = lim;
+ if (sz)
+ *sz = rlim;
+
+ *end = &cp[i] + lim;
+
+out:
+ assert(rlim >= 0 && rstart);
+
+ /* Run post-processors. */
+
+ switch (gly) {
+ case (ESCAPE_FONT):
+ if (1 != rlim)
break;
- case ('['):
- term = ']';
+ switch (*rstart) {
+ case ('3'):
+ /* FALLTHROUGH */
+ case ('B'):
+ gly = ESCAPE_FONTBOLD;
break;
- default:
- len = 1;
- p--;
+ case ('2'):
+ /* FALLTHROUGH */
+ case ('I'):
+ gly = ESCAPE_FONTITALIC;
break;
- }
- break;
- case ('('):
- len = 2;
- break;
- case ('['):
- term = ']';
- break;
- case ('z'):
- len = 1;
- if ('\\' == *p) {
- if (0 == (i = mandoc_special(p)))
- return(0);
- p += i;
- return(*p ? (int)(p - sv) : 0);
- }
- break;
- case ('o'):
- /* FALLTHROUGH */
- case ('w'):
- if ('\'' == *p++) {
- term = '\'';
+ case ('P'):
+ gly = ESCAPE_FONTPREV;
+ break;
+ case ('1'):
+ /* FALLTHROUGH */
+ case ('R'):
+ gly = ESCAPE_FONTROMAN;
break;
}
- /* FALLTHROUGH */
+ case (ESCAPE_SPECIAL):
+ if (1 != rlim)
+ break;
+ if ('c' == *rstart)
+ gly = ESCAPE_NOSPACE;
+ break;
default:
- len = 1;
- p--;
break;
}
- if (term) {
- for ( ; *p && term != *p; p++)
- if (ASCII_HYPH == *p)
- *p = '-';
- return(*p ? (int)(p - sv) : 0);
- }
-
- for (i = 0; *p && i < len; i++, p++)
- if (ASCII_HYPH == *p)
- *p = '-';
- return(i == len ? (int)(p - sv) : 0);
+ return(gly);
}
-
void *
mandoc_calloc(size_t num, size_t size)
{