about | summary | refs | log | tree | commit | diff | stats | homepage
path: root/roff_escape.c
diff options
context:
space:
mode:
Diffstat (limited to 'roff_escape.c')
-rw-r--r-- roff_escape.c 477
1 files changed, 477 insertions, 0 deletions
diff --git a/roff_escape.c b/roff_escape.c
new file mode 100644
index 00000000..1b5dc640
--- /dev/null
+++ b/roff_escape.c
@@ -0,0 +1,477 @@
+/* $OpenBSD$ */
+/*
+ * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
+ * Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Parser for roff(7) escape sequences.
+ * To be used by all mandoc(1) parsers and formatters.
+ */
+#include <assert.h>
+#include <ctype.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "mandoc.h"
+#include "roff.h"
+#include "roff_int.h"
+
+/*
+ * Traditional escape sequence interpreter for general use,
+ * including in high-level formatters.  It is a thin wrapper around
+ * roff_escape() that translates between pointers and byte offsets.
+ * Because it passes resc == NULL, no diagnostics are issued for the
+ * sequence itself, and it is not usable for expansion in the roff(7)
+ * parser: a sequence requiring expansion trips the assertion below.
+ * It is documented in the mandoc_escape(3) manual page.
+ *
+ * On entry, *rendarg points to the byte following the escape character;
+ * on return, it points right after the complete escape sequence.
+ * If rarg is not NULL, *rarg is set to the beginning of the argument.
+ * If rargl is not NULL, *rargl is set to the length of the argument.
+ */
+enum mandoc_esc
+mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
+{
+	enum mandoc_esc	 esctype;	/* result of the real parser */
+	int		 arg_off;	/* offset beginning the argument */
+	int		 argend_off;	/* offset right after the argument */
+	int		 seqend_off;	/* offset right after the sequence */
+
+	/* Step back onto the escape character; offsets count from there. */
+	*rendarg -= 1;
+	esctype = roff_escape(*rendarg, 0, 0, NULL,
+	    &arg_off, &argend_off, &seqend_off);
+	assert(esctype != ESCAPE_EXPAND);
+
+	if (rarg != NULL)
+		*rarg = *rendarg + arg_off;
+	if (rargl != NULL)
+		*rargl = argend_off - arg_off;
+	*rendarg += seqend_off;
+	return esctype;
+}
+
+/*
+ * Full-featured escape sequence parser.
+ *
+ * Arguments:
+ *   buf      NUL-terminated input; all positions below are byte
+ *            indices into this buffer.
+ *   ln       passed through to mandoc_msg() when reporting problems.
+ *   aesc     index of the escape character starting the sequence.
+ *   resc     if not NULL, receives the index of the escape character
+ *            of the sequence described by the other results.
+ *   rarg     if not NULL, receives the index beginning the argument.
+ *   rendarg  if not NULL, receives the index right after the argument.
+ *   rend     if not NULL, receives the index right after the sequence.
+ *
+ * If it encounters a nested escape sequence that requires expansion
+ * by the parser and re-parsing, ESCAPE_EXPAND is returned and the
+ * positions of that inner escape sequence are returned in
+ * *resc ... *rend.  Otherwise, *resc is set to aesc and the positions
+ * of the escape sequence starting at aesc are returned.
+ *
+ * Diagnostic messages for the sequence starting at aesc are generated
+ * if and only if resc != NULL, which is meant to be the case when
+ * called by roff_expand().
+ * NOTE(review): nested sequences are always parsed with a non-NULL
+ * resc because their start index is needed, so they may produce
+ * diagnostics even if the outer call passed resc == NULL.
+ */
+enum mandoc_esc
+roff_escape(const char *buf, const int ln, const int aesc,
+    int *resc, int *rarg, int *rendarg, int *rend)
+{
+	int		 iesc;		/* index of leading escape char */
+	int		 iarg;		/* index beginning the argument */
+	int		 iendarg;	/* index right after the argument */
+	int		 iend;		/* index right after the sequence */
+	int		 sesc, sarg, sendarg, send; /* for sub-escape */
+	int		 maxl;		/* expected length of the argument */
+	int		 argl;		/* actual length of the argument */
+	int		 c, i;		/* for \[char...] parsing */
+	enum mandoc_esc	 rval;		/* return value */
+	enum mandocerr	 err;		/* diagnostic code */
+	char		 esc_name;	/* byte selecting the escape */
+	char		 term;		/* byte terminating the argument */
+
+	/*
+	 * Treat "\E" just like "\";
+	 * it only makes a difference in copy mode.
+	 */
+
+	iesc = iarg = aesc;
+	do {
+		iarg++;
+	} while (buf[iarg] == 'E');
+
+	/*
+	 * Sort the following cases first by syntax category,
+	 * then by escape sequence type, and finally by ASCII code.
+	 *
+	 * In this switch, term == '\b' is a placeholder meaning
+	 * "quoted argument, delimiter not yet read", and
+	 * maxl == INT_MAX means "no fixed argument length".
+	 */
+
+	esc_name = buf[iarg];
+	iendarg = iend = ++iarg;
+	maxl = INT_MAX;
+	term = '\0';
+	switch (esc_name) {
+
+	/* Escape sequences taking no arguments at all. */
+
+	case '!':
+	case '?':
+		rval = ESCAPE_UNSUPP;
+		goto out;
+
+	case '%':
+	case '&':
+	case ')':
+	case ',':
+	case '/':
+	case '^':
+	case 'a':
+	case 'd':
+	case 'r':
+	case 't':
+	case 'u':
+	case '{':
+	case '|':
+	case '}':
+		rval = ESCAPE_IGNORE;
+		goto out;
+
+	case '\0':
+		/*
+		 * Lone escape character at the end of the input:
+		 * the increment above already stepped over the
+		 * terminating NUL, so move the end positions back
+		 * onto it to keep callers inside the buffer.
+		 */
+		iendarg = --iend;
+		/* FALLTHROUGH */
+	case '\\':
+	default:
+		iarg--;
+		rval = ESCAPE_UNDEF;
+		goto out;
+
+	case ' ':
+	case '\'':
+	case '-':
+	case '.':
+	case '0':
+	case ':':
+	case '_':
+	case '`':
+	case 'e':
+	case '~':
+		/* The escape name itself is the one-byte argument. */
+		iarg--;
+		argl = 1;
+		rval = ESCAPE_SPECIAL;
+		goto out;
+	case 'p':
+		rval = ESCAPE_BREAK;
+		goto out;
+	case 'c':
+		rval = ESCAPE_NOSPACE;
+		goto out;
+	case 'z':
+		rval = ESCAPE_SKIPCHAR;
+		goto out;
+
+	/* Standard argument format. */
+
+	case '$':
+	case '*':
+	case 'n':
+		rval = ESCAPE_EXPAND;
+		break;
+	case 'F':
+	case 'M':
+	case 'O':
+	case 'V':
+	case 'Y':
+	case 'g':
+	case 'k':
+	case 'm':
+		rval = ESCAPE_IGNORE;
+		break;
+	case '(':
+	case '[':
+		/* Back up such that the bracket is parsed as the argument. */
+		rval = ESCAPE_SPECIAL;
+		iendarg = iend = --iarg;
+		break;
+	case 'f':
+		rval = ESCAPE_FONT;
+		break;
+
+	/* Quoted arguments */
+
+	case 'B':
+	case 'w':
+		rval = ESCAPE_EXPAND;
+		term = '\b';
+		break;
+	case 'A':
+	case 'D':
+	case 'H':
+	case 'L':
+	case 'R':
+	case 'S':
+	case 'X':
+	case 'Z':
+	case 'b':
+	case 'v':
+	case 'x':
+		rval = ESCAPE_IGNORE;
+		term = '\b';
+		break;
+	case 'C':
+		if (buf[iarg] != '\'') {
+			rval = ESCAPE_ERROR;
+			goto out;
+		}
+		rval = ESCAPE_SPECIAL;
+		term = '\b';
+		break;
+	case 'N':
+		rval = ESCAPE_NUMBERED;
+		term = '\b';
+		break;
+	case 'h':
+		rval = ESCAPE_HORIZ;
+		term = '\b';
+		break;
+	case 'l':
+		rval = ESCAPE_HLINE;
+		term = '\b';
+		break;
+	case 'o':
+		rval = ESCAPE_OVERSTRIKE;
+		term = '\b';
+		break;
+
+	/* Sizes support both forms, with additional peculiarities. */
+
+	case 's':
+		rval = ESCAPE_IGNORE;
+		if (buf[iarg] == '+' || buf[iarg] == '-'||
+		    buf[iarg] == ASCII_HYPH)
+			iarg++;
+		switch (buf[iarg]) {
+		case '(':
+			maxl = 2;
+			iarg++;
+			break;
+		case '[':
+			term = ']';
+			iarg++;
+			break;
+		case '\'':
+			term = '\'';
+			iarg++;
+			break;
+		case '1':
+		case '2':
+		case '3':
+			/* Two-digit form only directly after the 's'. */
+			if (buf[iarg - 1] == 's' &&
+			    isdigit((unsigned char)buf[iarg + 1])) {
+				maxl = 2;
+				break;
+			}
+			/* FALLTHROUGH */
+		default:
+			maxl = 1;
+			break;
+		}
+		iendarg = iend = iarg;
+	}
+
+	/* Decide how to end the argument. */
+
+	/*
+	 * If the argument syntax is not yet decided and the next byte
+	 * is another escape character, an inner sequence requiring
+	 * expansion has to be handled by the caller first.
+	 */
+
+	if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
+	    buf[iarg] == buf[iesc] && roff_escape(buf, ln, iendarg,
+	    &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
+		goto out_sub;
+
+	if (term == '\b') {
+		if ((esc_name == 'N' && isdigit((unsigned char)buf[iarg])) ||
+		    (esc_name == 'h' && strchr(" %&()*+-./0123456789:<=>",
+		    buf[iarg]) != NULL)) {
+			/*
+			 * Invalid delimiter.  strchr(3) also matches
+			 * the terminating NUL; in that case, do not
+			 * report an end position beyond it.
+			 */
+			iendarg = iend = buf[iarg] == '\0' ? iarg : iarg + 1;
+			rval = ESCAPE_ERROR;
+			goto out;
+		}
+
+		/*
+		 * The next byte is the delimiter.  If the input ends
+		 * here, stay on the NUL rather than stepping over it,
+		 * such that the scan below sees the end of the input.
+		 */
+
+		term = buf[iarg];
+		if (term != '\0')
+			iarg++;
+	} else if (term == '\0' && maxl == INT_MAX) {
+		if (esc_name == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
+			iarg++;
+		switch (buf[iarg]) {
+		case '(':
+			maxl = 2;
+			iarg++;
+			break;
+		case '[':
+			if (buf[++iarg] == ' ') {
+				iendarg = iend = iarg + 1;
+				rval = ESCAPE_ERROR;
+				goto out;
+			}
+			term = ']';
+			break;
+		default:
+			maxl = 1;
+			break;
+		}
+	}
+
+	/* Advance to the end of the argument. */
+
+	iendarg = iarg;
+	while (maxl > 0) {
+		if (buf[iendarg] == '\0') {
+			/* Ignore an incomplete argument except for \w. */
+			if (esc_name != 'w')
+				iendarg = iarg;
+			break;
+		}
+		if (buf[iendarg] == term) {
+			iend = iendarg + 1;
+			break;
+		}
+		if (esc_name == 'N' &&
+		    isdigit((unsigned char)buf[iendarg]) == 0) {
+			iend = iendarg + 1;
+			break;
+		}
+		if (buf[iendarg] == buf[iesc]) {
+			/* Nested escape: expand it first or skip over it. */
+			if (roff_escape(buf, ln, iendarg,
+			    &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
+				goto out_sub;
+			iendarg = iend = send;
+		} else {
+			if (maxl != INT_MAX)
+				maxl--;
+			iend = ++iendarg;
+		}
+	}
+
+	/* Complain about a short or unterminated argument. */
+
+	if (resc != NULL && ((maxl != INT_MAX && maxl != 0) ||
+	    (term != '\0' && buf[iendarg] != term)))
+		mandoc_msg(MANDOCERR_ESC_BAD, ln, iesc, "%s", buf + iesc);
+
+	/* Post-process depending on the content of the argument. */
+
+	argl = iendarg - iarg;
+	switch (esc_name) {
+	case '*':
+		if (resc == NULL && argl == 2 &&
+		    buf[iarg] == '.' && buf[iarg + 1] == 'T')
+			rval = ESCAPE_DEVICE;
+		break;
+	case 'O':
+		switch (buf[iarg]) {
+		case '0':
+			rval = ESCAPE_UNSUPP;
+			break;
+		case '1':
+		case '2':
+		case '3':
+		case '4':
+			rval = argl == 1 ? ESCAPE_IGNORE : ESCAPE_ERROR;
+			break;
+		case '5':
+			rval = buf[iarg - 1] == '[' ? ESCAPE_UNSUPP :
+			    ESCAPE_ERROR;
+			break;
+		default:
+			rval = ESCAPE_ERROR;
+			break;
+		}
+		break;
+	default:
+		break;
+	}
+
+	switch (rval) {
+	case ESCAPE_FONT:
+		rval = mandoc_font(buf + iarg, argl);
+		break;
+
+	case ESCAPE_SPECIAL:
+
+		/*
+		 * The file chars.c only provides one common list of
+		 * character names, but \[-] == \- is the only one of
+		 * the characters with one-byte names that allows
+		 * enclosing the name in brackets.
+		 */
+
+		if (term != '\0' && argl == 1 && buf[iarg] != '-') {
+			rval = ESCAPE_ERROR;
+			break;
+		}
+
+		/* Treat \[char...] as an alias for \N'...'. */
+
+		if (buf[iarg] == 'c') {
+			if (argl < 6 || argl > 7 ||
+			    strncmp(buf + iarg, "char", 4) != 0 ||
+			    (int)strspn(buf + iarg + 4, "0123456789")
+			      + 4 < argl)
+				break;
+
+			/*
+			 * Convert only the digits following "char";
+			 * including the four letters would always
+			 * push the value beyond the accepted range.
+			 */
+
+			c = 0;
+			for (i = iarg + 4; i < iendarg; i++)
+				c = 10 * c + (buf[i] - '0');
+			if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
+				break;
+			iarg += 4;
+			rval = ESCAPE_NUMBERED;
+			break;
+		}
+
+		/*
+		 * Unicode escapes are defined in groff as \[u0000]
+		 * to \[u10FFFF], where the contained value must be
+		 * a valid Unicode codepoint.  Here, however, only
+		 * check the length and range.
+		 */
+
+		if (buf[iarg] != 'u' || argl < 5 || argl > 7)
+			break;
+		if (argl == 7 &&
+		    (buf[iarg + 1] != '1' || buf[iarg + 2] != '0'))
+			break;
+		if (argl == 6 && buf[iarg + 1] == '0')
+			break;
+		if (argl == 5 && buf[iarg + 1] == 'D' &&
+		    strchr("89ABCDEF", buf[iarg + 2]) != NULL)
+			break;
+		if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
+		    + 1 == argl)
+			rval = ESCAPE_UNICODE;
+		break;
+	default:
+		break;
+	}
+	goto out;
+
+out_sub:
+	/* Report the inner sequence that needs expansion instead. */
+	iesc = sesc;
+	iarg = sarg;
+	iendarg = sendarg;
+	iend = send;
+	rval = ESCAPE_EXPAND;
+
+out:
+	if (rarg != NULL)
+		*rarg = iarg;
+	if (rendarg != NULL)
+		*rendarg = iendarg;
+	if (rend != NULL)
+		*rend = iend;
+	if (resc == NULL)
+		return rval;
+
+	/*
+	 * Diagnostic messages are only issued when called
+	 * from the parser, not when called from the formatters.
+	 */
+
+	*resc = iesc;
+	switch (rval) {
+	case ESCAPE_ERROR:
+		err = MANDOCERR_ESC_BAD;
+		break;
+	case ESCAPE_UNSUPP:
+		err = MANDOCERR_ESC_UNSUPP;
+		break;
+	case ESCAPE_UNDEF:
+		if (esc_name == '\\')
+			return rval;
+		err = MANDOCERR_ESC_UNDEF;
+		break;
+	case ESCAPE_SPECIAL:
+		if (mchars_spec2cp(buf + iarg, argl) >= 0)
+			return rval;
+		err = MANDOCERR_ESC_BAD;
+		break;
+	default:
+		return rval;
+	}
+	mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
+	return rval;
+}