1 files changed, 9 insertions, 385 deletions
diff --git a/mandoc.c b/mandoc.c
index 92d18665..ad00a01a 100644
--- a/mandoc.c
+++ b/mandoc.c
@@ -1,7 +1,8 @@
-/* $Id: mandoc.c,v 1.120 2022/04/13 13:19:34 schwarze Exp $ */
+/* $Id: mandoc.c,v 1.121 2022/05/19 15:37:47 schwarze Exp $ */
 /*
- * Copyright (c) 2011-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org>
- * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
+ * Copyright (c) 2010, 2011, 2015, 2017, 2018, 2019, 2020, 2021
+ *               Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -14,6 +15,11 @@
  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Utility functions to handle end of sentence punctuation
+ * and dates and times, for use by mdoc(7) and man(7) parsers.
+ * Utility functions to handle fonts and numbers,
+ * for use by mandoc(1) parsers and formatters.
  */
 #include "config.h"
 
@@ -91,388 +97,6 @@ mandoc_font(const char *cp, int sz)
 	}
 }
 
-enum mandoc_esc
-mandoc_escape(const char **end, const char **start, int *sz)
-{
-	const char	*local_start;
-	int		 local_sz, c, i;
-	char		 term;
-	enum mandoc_esc	 gly;
-
-	/*
-	 * When the caller doesn't provide return storage,
-	 * use local storage.
-	 */
-
-	if (NULL == start)
-		start = &local_start;
-	if (NULL == sz)
-		sz = &local_sz;
-
-	/*
-	 * Treat "\E" just like "\";
-	 * it only makes a difference in copy mode.
-	 */
-
-	while (**end == 'E')
-		++*end;
-
-	/*
-	 * Beyond the backslash, at least one input character
-	 * is part of the escape sequence.  With one exception
-	 * (see below), that character won't be returned.
-	 */
-
-	gly = ESCAPE_ERROR;
-	*start = ++*end;
-	*sz = 0;
-	term = '\0';
-
-	switch ((*start)[-1]) {
-	/*
-	 * First the glyphs.  There are several different forms of
-	 * these, but each eventually returns a substring of the glyph
-	 * name.
-	 */
-	case '(':
-		gly = ESCAPE_SPECIAL;
-		*sz = 2;
-		break;
-	case '[':
-		if (**start == ' ') {
-			++*end;
-			return ESCAPE_ERROR;
-		}
-		gly = ESCAPE_SPECIAL;
-		term = ']';
-		break;
-	case 'C':
-		if ('\'' != **start)
-			return ESCAPE_ERROR;
-		*start = ++*end;
-		gly = ESCAPE_SPECIAL;
-		term = '\'';
-		break;
-
-	/*
-	 * Escapes taking no arguments at all.
-	 */
-	case '!':
-	case '?':
-		return ESCAPE_UNSUPP;
-	case '%':
-	case '&':
-	case ')':
-	case ',':
-	case '/':
-	case '^':
-	case 'a':
-	case 'd':
-	case 'r':
-	case 't':
-	case 'u':
-	case '{':
-	case '|':
-	case '}':
-		return ESCAPE_IGNORE;
-	case 'c':
-		return ESCAPE_NOSPACE;
-	case 'p':
-		return ESCAPE_BREAK;
-
-	/*
-	 * The \z escape is supposed to output the following
-	 * character without advancing the cursor position.
-	 * Since we are mostly dealing with terminal mode,
-	 * let us just skip the next character.
-	 */
-	case 'z':
-		return ESCAPE_SKIPCHAR;
-
-	/*
-	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
-	 * 'X' is the trigger.  These have opaque sub-strings.
-	 */
-	case 'F':
-	case 'f':
-	case 'g':
-	case 'k':
-	case 'M':
-	case 'm':
-	case 'n':
-	case 'O':
-	case 'V':
-	case 'Y':
-	case '*':
-		switch ((*start)[-1]) {
-		case 'f':
-			gly = ESCAPE_FONT;
-			break;
-		case '*':
-			gly = ESCAPE_DEVICE;
-			break;
-		default:
-			gly = ESCAPE_IGNORE;
-			break;
-		}
-		switch (**start) {
-		case '(':
-			if ((*start)[-1] == 'O')
-				gly = ESCAPE_ERROR;
-			*start = ++*end;
-			*sz = 2;
-			break;
-		case '[':
-			if ((*start)[-1] == 'O')
-				gly = (*start)[1] == '5' ?
-				    ESCAPE_UNSUPP : ESCAPE_ERROR;
-			*start = ++*end;
-			term = ']';
-			break;
-		default:
-			if ((*start)[-1] == 'O') {
-				switch (**start) {
-				case '0':
-					gly = ESCAPE_UNSUPP;
-					break;
-				case '1':
-				case '2':
-				case '3':
-				case '4':
-					break;
-				default:
-					gly = ESCAPE_ERROR;
-					break;
-				}
-			}
-			*sz = 1;
-			break;
-		}
-		break;
-
-	/*
-	 * These escapes are of the form \X'Y', where 'X' is the trigger
-	 * and 'Y' is any string.  These have opaque sub-strings.
-	 * The \B and \w escapes are handled in roff.c, roff_res().
-	 */
-	case 'A':
-	case 'b':
-	case 'D':
-	case 'R':
-	case 'X':
-	case 'Z':
-		gly = ESCAPE_IGNORE;
-		/* FALLTHROUGH */
-	case 'o':
-		if (**start == '\0')
-			return ESCAPE_ERROR;
-		if (gly == ESCAPE_ERROR)
-			gly = ESCAPE_OVERSTRIKE;
-		term = **start;
-		*start = ++*end;
-		break;
-
-	/*
-	 * These escapes are of the form \X'N', where 'X' is the trigger
-	 * and 'N' resolves to a numerical expression.
-	 */
-	case 'h':
-	case 'H':
-	case 'L':
-	case 'l':
-	case 'S':
-	case 'v':
-	case 'x':
-		if (strchr(" %&()*+-./0123456789:<=>", **start)) {
-			if ('\0' != **start)
-				++*end;
-			return ESCAPE_ERROR;
-		}
-		switch ((*start)[-1]) {
-		case 'h':
-			gly = ESCAPE_HORIZ;
-			break;
-		case 'l':
-			gly = ESCAPE_HLINE;
-			break;
-		default:
-			gly = ESCAPE_IGNORE;
-			break;
-		}
-		term = **start;
-		*start = ++*end;
-		break;
-
-	/*
-	 * Special handling for the numbered character escape.
-	 * XXX Do any other escapes need similar handling?
-	 */
-	case 'N':
-		if ('\0' == **start)
-			return ESCAPE_ERROR;
-		(*end)++;
-		if (isdigit((unsigned char)**start)) {
-			*sz = 1;
-			return ESCAPE_IGNORE;
-		}
-		(*start)++;
-		while (isdigit((unsigned char)**end))
-			(*end)++;
-		*sz = *end - *start;
-		if ('\0' != **end)
-			(*end)++;
-		return ESCAPE_NUMBERED;
-
-	/*
-	 * Sizes get a special category of their own.
-	 */
-	case 's':
-		gly = ESCAPE_IGNORE;
-
-		/* See +/- counts as a sign. */
-		if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
-			*start = ++*end;
-
-		switch (**end) {
-		case '(':
-			*start = ++*end;
-			*sz = 2;
-			break;
-		case '[':
-			*start = ++*end;
-			term = ']';
-			break;
-		case '\'':
-			*start = ++*end;
-			term = '\'';
-			break;
-		case '3':
-		case '2':
-		case '1':
-			*sz = (*end)[-1] == 's' &&
-			    isdigit((unsigned char)(*end)[1]) ? 2 : 1;
-			break;
-		default:
-			*sz = 1;
-			break;
-		}
-
-		break;
-
-	/*
-	 * Several special characters can be encoded as
-	 * one-byte escape sequences without using \[].
-	 */
-	case ' ':
-	case '\'':
-	case '-':
-	case '.':
-	case '0':
-	case ':':
-	case '_':
-	case '`':
-	case 'e':
-	case '~':
-		gly = ESCAPE_SPECIAL;
-		/* FALLTHROUGH */
-	default:
-		if (gly == ESCAPE_ERROR)
-			gly = ESCAPE_UNDEF;
-		*start = --*end;
-		*sz = 1;
-		break;
-	}
-
-	/*
-	 * Read up to the terminating character,
-	 * paying attention to nested escapes.
-	 */
-
-	if ('\0' != term) {
-		while (**end != term) {
-			switch (**end) {
-			case '\0':
-				return ESCAPE_ERROR;
-			case '\\':
-				(*end)++;
-				if (ESCAPE_ERROR ==
-				    mandoc_escape(end, NULL, NULL))
-					return ESCAPE_ERROR;
-				break;
-			default:
-				(*end)++;
-				break;
-			}
-		}
-		*sz = (*end)++ - *start;
-
-		/*
-		 * The file chars.c only provides one common list
-		 * of character names, but \[-] == \- is the only
-		 * one of the characters with one-byte names that
-		 * allows enclosing the name in brackets.
-		 */
-		if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
-			return ESCAPE_ERROR;
-	} else {
-		assert(*sz > 0);
-		if ((size_t)*sz > strlen(*start))
-			return ESCAPE_ERROR;
-		*end += *sz;
-	}
-
-	/* Run post-processors. */
-
-	switch (gly) {
-	case ESCAPE_FONT:
-		gly = mandoc_font(*start, *sz);
-		break;
-	case ESCAPE_SPECIAL:
-		if (**start == 'c') {
-			if (*sz < 6 || *sz > 7 ||
-			    strncmp(*start, "char", 4) != 0 ||
-			    (int)strspn(*start + 4, "0123456789") + 4 < *sz)
-				break;
-			c = 0;
-			for (i = 4; i < *sz; i++)
-				c = 10 * c + ((*start)[i] - '0');
-			if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
-				break;
-			*start += 4;
-			*sz -= 4;
-			gly = ESCAPE_NUMBERED;
-			break;
-		}
-
-		/*
-		 * Unicode escapes are defined in groff as \[u0000]
-		 * to \[u10FFFF], where the contained value must be
-		 * a valid Unicode codepoint.  Here, however, only
-		 * check the length and range.
-		 */
-		if (**start != 'u' || *sz < 5 || *sz > 7)
-			break;
-		if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
-			break;
-		if (*sz == 6 && (*start)[1] == '0')
-			break;
-		if (*sz == 5 && (*start)[1] == 'D' &&
-		    strchr("89ABCDEF", (*start)[2]) != NULL)
-			break;
-		if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
-		    + 1 == *sz)
-			gly = ESCAPE_UNICODE;
-		break;
-	case ESCAPE_DEVICE:
-		assert(*sz == 2 && (*start)[0] == '.' && (*start)[1] == 'T');
-		break;
-	default:
-		break;
-	}
-
-	return gly;
-}
-
 static int
 a2time(time_t *t, const char *fmt, const char *p)
 {