-/* $Id: mandoc.c,v 1.120 2022/04/13 13:19:34 schwarze Exp $ */
+/* $Id: mandoc.c,v 1.121 2022/05/19 15:37:47 schwarze Exp $ */
/*
- * Copyright (c) 2011-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org>
- * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
+ * Copyright (c) 2010, 2011, 2015, 2017, 2018, 2019, 2020, 2021
+ * Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Utility functions to handle end of sentence punctuation
+ * and dates and times, for use by mdoc(7) and man(7) parsers.
+ * Utility functions to handle fonts and numbers,
+ * for use by mandoc(1) parsers and formatters.
*/
#include "config.h"
}
}
-enum mandoc_esc
-mandoc_escape(const char **end, const char **start, int *sz)
-{
- const char *local_start;
- int local_sz, c, i;
- char term;
- enum mandoc_esc gly;
-
- /*
- * Parse the escape sequence that follows a backslash.
- * On entry, *end points just past the backslash, and it
- * is advanced while parsing. On success, *start and *sz
- * describe the argument substring of the escape sequence,
- * and the return value classifies the escape sequence.
- *
- * When the caller doesn't provide return storage,
- * use local storage.
- */
-
- if (NULL == start)
- start = &local_start;
- if (NULL == sz)
- sz = &local_sz;
-
- /*
- * Treat "\E" just like "\";
- * it only makes a difference in copy mode.
- */
-
- while (**end == 'E')
- ++*end;
-
- /*
- * Beyond the backslash, at least one input character
- * is part of the escape sequence. With one exception
- * (see below), that character won't be returned.
- */
-
- gly = ESCAPE_ERROR;
- *start = ++*end; /* (*start)[-1] is now the escape name character */
- *sz = 0;
- term = '\0'; /* '\0' means the argument length is given by *sz */
-
- switch ((*start)[-1]) {
- /*
- * First the glyphs. There are several different forms of
- * these, but each eventually returns a substring of the glyph
- * name.
- */
- case '(':
- gly = ESCAPE_SPECIAL;
- *sz = 2;
- break;
- case '[':
- if (**start == ' ') {
- ++*end;
- return ESCAPE_ERROR;
- }
- gly = ESCAPE_SPECIAL;
- term = ']';
- break;
- case 'C':
- if ('\'' != **start)
- return ESCAPE_ERROR;
- *start = ++*end;
- gly = ESCAPE_SPECIAL;
- term = '\'';
- break;
-
- /*
- * Escapes taking no arguments at all.
- */
- case '!':
- case '?':
- return ESCAPE_UNSUPP;
- case '%':
- case '&':
- case ')':
- case ',':
- case '/':
- case '^':
- case 'a':
- case 'd':
- case 'r':
- case 't':
- case 'u':
- case '{':
- case '|':
- case '}':
- return ESCAPE_IGNORE;
- case 'c':
- return ESCAPE_NOSPACE;
- case 'p':
- return ESCAPE_BREAK;
-
- /*
- * The \z escape is supposed to output the following
- * character without advancing the cursor position.
- * Since we are mostly dealing with terminal mode,
- * let us just skip the next character.
- */
- case 'z':
- return ESCAPE_SKIPCHAR;
-
- /*
- * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
- * 'X' is the trigger. These have opaque sub-strings.
- * The \O escape is special: only \O1 to \O4 are ignored,
- * \O0 and \O[5...] are reported as unsupported, and all
- * other forms of \O are errors.
- */
- case 'F':
- case 'f':
- case 'g':
- case 'k':
- case 'M':
- case 'm':
- case 'n':
- case 'O':
- case 'V':
- case 'Y':
- case '*':
- switch ((*start)[-1]) {
- case 'f':
- gly = ESCAPE_FONT;
- break;
- case '*':
- gly = ESCAPE_DEVICE;
- break;
- default:
- gly = ESCAPE_IGNORE;
- break;
- }
- switch (**start) {
- case '(':
- if ((*start)[-1] == 'O')
- gly = ESCAPE_ERROR;
- *start = ++*end;
- *sz = 2;
- break;
- case '[':
- if ((*start)[-1] == 'O')
- gly = (*start)[1] == '5' ?
- ESCAPE_UNSUPP : ESCAPE_ERROR;
- *start = ++*end;
- term = ']';
- break;
- default:
- if ((*start)[-1] == 'O') {
- switch (**start) {
- case '0':
- gly = ESCAPE_UNSUPP;
- break;
- case '1':
- case '2':
- case '3':
- case '4':
- break;
- default:
- gly = ESCAPE_ERROR;
- break;
- }
- }
- *sz = 1;
- break;
- }
- break;
-
- /*
- * These escapes are of the form \X'Y', where 'X' is the trigger
- * and 'Y' is any string. These have opaque sub-strings.
- * The \B and \w escapes are handled in roff.c, roff_res().
- * Whatever character follows the trigger is used as the
- * delimiter; only the end of the input is rejected.
- */
- case 'A':
- case 'b':
- case 'D':
- case 'R':
- case 'X':
- case 'Z':
- gly = ESCAPE_IGNORE;
- /* FALLTHROUGH */
- case 'o':
- if (**start == '\0')
- return ESCAPE_ERROR;
- if (gly == ESCAPE_ERROR) /* still unset: reached via case 'o' */
- gly = ESCAPE_OVERSTRIKE;
- term = **start;
- *start = ++*end;
- break;
-
- /*
- * These escapes are of the form \X'N', where 'X' is the trigger
- * and 'N' resolves to a numerical expression.
- * A character that could itself start a numerical expression
- * is not accepted as the delimiter. Since strchr() also
- * matches the terminating NUL, the end of the input is
- * rejected here, too, but without advancing *end.
- */
- case 'h':
- case 'H':
- case 'L':
- case 'l':
- case 'S':
- case 'v':
- case 'x':
- if (strchr(" %&()*+-./0123456789:<=>", **start)) {
- if ('\0' != **start)
- ++*end;
- return ESCAPE_ERROR;
- }
- switch ((*start)[-1]) {
- case 'h':
- gly = ESCAPE_HORIZ;
- break;
- case 'l':
- gly = ESCAPE_HLINE;
- break;
- default:
- gly = ESCAPE_IGNORE;
- break;
- }
- term = **start;
- *start = ++*end;
- break;
-
- /*
- * Special handling for the numbered character escape.
- * XXX Do any other escapes need similar handling?
- * If a digit directly follows \N, that single digit is
- * consumed and the escape is ignored. Otherwise, the
- * character after \N is skipped as the opening delimiter,
- * the digits that follow form the argument, and one more
- * character, if any, is consumed as the closing delimiter
- * without checking that it matches the opening one.
- */
- case 'N':
- if ('\0' == **start)
- return ESCAPE_ERROR;
- (*end)++;
- if (isdigit((unsigned char)**start)) {
- *sz = 1;
- return ESCAPE_IGNORE;
- }
- (*start)++;
- while (isdigit((unsigned char)**end))
- (*end)++;
- *sz = *end - *start;
- if ('\0' != **end)
- (*end)++;
- return ESCAPE_NUMBERED;
-
- /*
- * Sizes get a special category of their own.
- * The forms \s(NN, \s[N], \s'N', and \sN are recognized,
- * each with an optional sign before the argument.
- * Without a sign, a leading digit 1, 2, or 3 takes
- * a directly following digit as part of the argument.
- */
- case 's':
- gly = ESCAPE_IGNORE;
-
- /* See +/- counts as a sign. */
- if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
- *start = ++*end;
-
- switch (**end) {
- case '(':
- *start = ++*end;
- *sz = 2;
- break;
- case '[':
- *start = ++*end;
- term = ']';
- break;
- case '\'':
- *start = ++*end;
- term = '\'';
- break;
- case '3':
- case '2':
- case '1':
- *sz = (*end)[-1] == 's' &&
- isdigit((unsigned char)(*end)[1]) ? 2 : 1;
- break;
- default:
- *sz = 1;
- break;
- }
-
- break;
-
- /*
- * Several special characters can be encoded as
- * one-byte escape sequences without using \[].
- * All other characters not handled above end up
- * in the default case and yield ESCAPE_UNDEF.
- */
- case ' ':
- case '\'':
- case '-':
- case '.':
- case '0':
- case ':':
- case '_':
- case '`':
- case 'e':
- case '~':
- gly = ESCAPE_SPECIAL;
- /* FALLTHROUGH */
- default:
- if (gly == ESCAPE_ERROR)
- gly = ESCAPE_UNDEF;
- *start = --*end; /* the name character itself is the argument */
- *sz = 1;
- break;
- }
-
- /*
- * Read up to the terminating character,
- * paying attention to nested escapes.
- * Without a terminating character, the argument
- * has the fixed length *sz that was set above.
- */
-
- if ('\0' != term) {
- while (**end != term) {
- switch (**end) {
- case '\0':
- return ESCAPE_ERROR;
- case '\\':
- (*end)++;
- if (ESCAPE_ERROR ==
- mandoc_escape(end, NULL, NULL))
- return ESCAPE_ERROR;
- break;
- default:
- (*end)++;
- break;
- }
- }
- *sz = (*end)++ - *start; /* exclude, but skip, the terminator */
-
- /*
- * The file chars.c only provides one common list
- * of character names, but \[-] == \- is the only
- * one of the characters with one-byte names that
- * allows enclosing the name in brackets.
- */
- if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
- return ESCAPE_ERROR;
- } else {
- assert(*sz > 0);
- if ((size_t)*sz > strlen(*start)) /* input ends too early */
- return ESCAPE_ERROR;
- *end += *sz;
- }
-
- /* Run post-processors. */
-
- switch (gly) {
- case ESCAPE_FONT:
- gly = mandoc_font(*start, *sz);
- break;
- case ESCAPE_SPECIAL:
- if (**start == 'c') { /* look for charNN or charNNN, decimal */
- if (*sz < 6 || *sz > 7 ||
- strncmp(*start, "char", 4) != 0 ||
- (int)strspn(*start + 4, "0123456789") + 4 < *sz)
- break;
- c = 0;
- for (i = 4; i < *sz; i++)
- c = 10 * c + ((*start)[i] - '0');
- if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
- break;
- *start += 4; /* return the digits only */
- *sz -= 4;
- gly = ESCAPE_NUMBERED;
- break;
- }
-
- /*
- * Unicode escapes are defined in groff as \[u0000]
- * to \[u10FFFF], where the contained value must be
- * a valid Unicode codepoint. Here, however, only
- * check the length and range.
- * Names with six hexadecimal digits have to start
- * with "10", names with five must not start with "0",
- * and four-digit names from uD800 to uDFFF are rejected,
- * though only if their first two digits are spelled
- * in upper case. Rejected names remain ESCAPE_SPECIAL.
- */
- if (**start != 'u' || *sz < 5 || *sz > 7)
- break;
- if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
- break;
- if (*sz == 6 && (*start)[1] == '0')
- break;
- if (*sz == 5 && (*start)[1] == 'D' &&
- strchr("89ABCDEF", (*start)[2]) != NULL)
- break;
- if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
- + 1 == *sz)
- gly = ESCAPE_UNICODE;
- break;
- case ESCAPE_DEVICE:
- assert(*sz == 2 && (*start)[0] == '.' && (*start)[1] == 'T');
- break;
- default:
- break;
- }
-
- return gly;
-}
-
static int
a2time(time_t *t, const char *fmt, const char *p)
{