aboutsummaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
-rw-r--r--Makefile4
-rw-r--r--mandoc.c394
-rw-r--r--mandoc.h13
-rw-r--r--roff.c439
-rw-r--r--roff_escape.c477
-rw-r--r--roff_int.h4
6 files changed, 681 insertions, 650 deletions
diff --git a/Makefile b/Makefile
index 8acd5db5..e3b3c6c2 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-# $Id: Makefile,v 1.541 2022/04/14 16:43:43 schwarze Exp $
+# $Id: Makefile,v 1.542 2022/05/19 15:37:47 schwarze Exp $
#
# Copyright (c) 2011, 2013-2022 Ingo Schwarze <schwarze@openbsd.org>
# Copyright (c) 2010, 2011, 2012 Kristaps Dzonsons <kristaps@bsd.lv>
@@ -122,6 +122,7 @@ SRCS = arch.c \
preconv.c \
read.c \
roff.c \
+ roff_escape.c \
roff_html.c \
roff_term.c \
roff_validate.c \
@@ -235,6 +236,7 @@ LIBMDOC_OBJS = att.o \
LIBROFF_OBJS = eqn.o \
roff.o \
+ roff_escape.o \
roff_validate.o \
tbl.o \
tbl_data.o \
diff --git a/mandoc.c b/mandoc.c
index 92d18665..ad00a01a 100644
--- a/mandoc.c
+++ b/mandoc.c
@@ -1,7 +1,8 @@
-/* $Id: mandoc.c,v 1.120 2022/04/13 13:19:34 schwarze Exp $ */
+/* $Id: mandoc.c,v 1.121 2022/05/19 15:37:47 schwarze Exp $ */
/*
- * Copyright (c) 2011-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org>
- * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
+ * Copyright (c) 2010, 2011, 2015, 2017, 2018, 2019, 2020, 2021
+ * Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
@@ -14,6 +15,11 @@
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Utility functions to handle end of sentence punctuation
+ * and dates and times, for use by mdoc(7) and man(7) parsers.
+ * Utility functions to handle fonts and numbers,
+ * for use by mandoc(1) parsers and formatters.
*/
#include "config.h"
@@ -91,388 +97,6 @@ mandoc_font(const char *cp, int sz)
}
}
-enum mandoc_esc
-mandoc_escape(const char **end, const char **start, int *sz)
-{
- const char *local_start;
- int local_sz, c, i;
- char term;
- enum mandoc_esc gly;
-
- /*
- * When the caller doesn't provide return storage,
- * use local storage.
- */
-
- if (NULL == start)
- start = &local_start;
- if (NULL == sz)
- sz = &local_sz;
-
- /*
- * Treat "\E" just like "\";
- * it only makes a difference in copy mode.
- */
-
- while (**end == 'E')
- ++*end;
-
- /*
- * Beyond the backslash, at least one input character
- * is part of the escape sequence. With one exception
- * (see below), that character won't be returned.
- */
-
- gly = ESCAPE_ERROR;
- *start = ++*end;
- *sz = 0;
- term = '\0';
-
- switch ((*start)[-1]) {
- /*
- * First the glyphs. There are several different forms of
- * these, but each eventually returns a substring of the glyph
- * name.
- */
- case '(':
- gly = ESCAPE_SPECIAL;
- *sz = 2;
- break;
- case '[':
- if (**start == ' ') {
- ++*end;
- return ESCAPE_ERROR;
- }
- gly = ESCAPE_SPECIAL;
- term = ']';
- break;
- case 'C':
- if ('\'' != **start)
- return ESCAPE_ERROR;
- *start = ++*end;
- gly = ESCAPE_SPECIAL;
- term = '\'';
- break;
-
- /*
- * Escapes taking no arguments at all.
- */
- case '!':
- case '?':
- return ESCAPE_UNSUPP;
- case '%':
- case '&':
- case ')':
- case ',':
- case '/':
- case '^':
- case 'a':
- case 'd':
- case 'r':
- case 't':
- case 'u':
- case '{':
- case '|':
- case '}':
- return ESCAPE_IGNORE;
- case 'c':
- return ESCAPE_NOSPACE;
- case 'p':
- return ESCAPE_BREAK;
-
- /*
- * The \z escape is supposed to output the following
- * character without advancing the cursor position.
- * Since we are mostly dealing with terminal mode,
- * let us just skip the next character.
- */
- case 'z':
- return ESCAPE_SKIPCHAR;
-
- /*
- * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
- * 'X' is the trigger. These have opaque sub-strings.
- */
- case 'F':
- case 'f':
- case 'g':
- case 'k':
- case 'M':
- case 'm':
- case 'n':
- case 'O':
- case 'V':
- case 'Y':
- case '*':
- switch ((*start)[-1]) {
- case 'f':
- gly = ESCAPE_FONT;
- break;
- case '*':
- gly = ESCAPE_DEVICE;
- break;
- default:
- gly = ESCAPE_IGNORE;
- break;
- }
- switch (**start) {
- case '(':
- if ((*start)[-1] == 'O')
- gly = ESCAPE_ERROR;
- *start = ++*end;
- *sz = 2;
- break;
- case '[':
- if ((*start)[-1] == 'O')
- gly = (*start)[1] == '5' ?
- ESCAPE_UNSUPP : ESCAPE_ERROR;
- *start = ++*end;
- term = ']';
- break;
- default:
- if ((*start)[-1] == 'O') {
- switch (**start) {
- case '0':
- gly = ESCAPE_UNSUPP;
- break;
- case '1':
- case '2':
- case '3':
- case '4':
- break;
- default:
- gly = ESCAPE_ERROR;
- break;
- }
- }
- *sz = 1;
- break;
- }
- break;
-
- /*
- * These escapes are of the form \X'Y', where 'X' is the trigger
- * and 'Y' is any string. These have opaque sub-strings.
- * The \B and \w escapes are handled in roff.c, roff_res().
- */
- case 'A':
- case 'b':
- case 'D':
- case 'R':
- case 'X':
- case 'Z':
- gly = ESCAPE_IGNORE;
- /* FALLTHROUGH */
- case 'o':
- if (**start == '\0')
- return ESCAPE_ERROR;
- if (gly == ESCAPE_ERROR)
- gly = ESCAPE_OVERSTRIKE;
- term = **start;
- *start = ++*end;
- break;
-
- /*
- * These escapes are of the form \X'N', where 'X' is the trigger
- * and 'N' resolves to a numerical expression.
- */
- case 'h':
- case 'H':
- case 'L':
- case 'l':
- case 'S':
- case 'v':
- case 'x':
- if (strchr(" %&()*+-./0123456789:<=>", **start)) {
- if ('\0' != **start)
- ++*end;
- return ESCAPE_ERROR;
- }
- switch ((*start)[-1]) {
- case 'h':
- gly = ESCAPE_HORIZ;
- break;
- case 'l':
- gly = ESCAPE_HLINE;
- break;
- default:
- gly = ESCAPE_IGNORE;
- break;
- }
- term = **start;
- *start = ++*end;
- break;
-
- /*
- * Special handling for the numbered character escape.
- * XXX Do any other escapes need similar handling?
- */
- case 'N':
- if ('\0' == **start)
- return ESCAPE_ERROR;
- (*end)++;
- if (isdigit((unsigned char)**start)) {
- *sz = 1;
- return ESCAPE_IGNORE;
- }
- (*start)++;
- while (isdigit((unsigned char)**end))
- (*end)++;
- *sz = *end - *start;
- if ('\0' != **end)
- (*end)++;
- return ESCAPE_NUMBERED;
-
- /*
- * Sizes get a special category of their own.
- */
- case 's':
- gly = ESCAPE_IGNORE;
-
- /* See +/- counts as a sign. */
- if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
- *start = ++*end;
-
- switch (**end) {
- case '(':
- *start = ++*end;
- *sz = 2;
- break;
- case '[':
- *start = ++*end;
- term = ']';
- break;
- case '\'':
- *start = ++*end;
- term = '\'';
- break;
- case '3':
- case '2':
- case '1':
- *sz = (*end)[-1] == 's' &&
- isdigit((unsigned char)(*end)[1]) ? 2 : 1;
- break;
- default:
- *sz = 1;
- break;
- }
-
- break;
-
- /*
- * Several special characters can be encoded as
- * one-byte escape sequences without using \[].
- */
- case ' ':
- case '\'':
- case '-':
- case '.':
- case '0':
- case ':':
- case '_':
- case '`':
- case 'e':
- case '~':
- gly = ESCAPE_SPECIAL;
- /* FALLTHROUGH */
- default:
- if (gly == ESCAPE_ERROR)
- gly = ESCAPE_UNDEF;
- *start = --*end;
- *sz = 1;
- break;
- }
-
- /*
- * Read up to the terminating character,
- * paying attention to nested escapes.
- */
-
- if ('\0' != term) {
- while (**end != term) {
- switch (**end) {
- case '\0':
- return ESCAPE_ERROR;
- case '\\':
- (*end)++;
- if (ESCAPE_ERROR ==
- mandoc_escape(end, NULL, NULL))
- return ESCAPE_ERROR;
- break;
- default:
- (*end)++;
- break;
- }
- }
- *sz = (*end)++ - *start;
-
- /*
- * The file chars.c only provides one common list
- * of character names, but \[-] == \- is the only
- * one of the characters with one-byte names that
- * allows enclosing the name in brackets.
- */
- if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
- return ESCAPE_ERROR;
- } else {
- assert(*sz > 0);
- if ((size_t)*sz > strlen(*start))
- return ESCAPE_ERROR;
- *end += *sz;
- }
-
- /* Run post-processors. */
-
- switch (gly) {
- case ESCAPE_FONT:
- gly = mandoc_font(*start, *sz);
- break;
- case ESCAPE_SPECIAL:
- if (**start == 'c') {
- if (*sz < 6 || *sz > 7 ||
- strncmp(*start, "char", 4) != 0 ||
- (int)strspn(*start + 4, "0123456789") + 4 < *sz)
- break;
- c = 0;
- for (i = 4; i < *sz; i++)
- c = 10 * c + ((*start)[i] - '0');
- if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
- break;
- *start += 4;
- *sz -= 4;
- gly = ESCAPE_NUMBERED;
- break;
- }
-
- /*
- * Unicode escapes are defined in groff as \[u0000]
- * to \[u10FFFF], where the contained value must be
- * a valid Unicode codepoint. Here, however, only
- * check the length and range.
- */
- if (**start != 'u' || *sz < 5 || *sz > 7)
- break;
- if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
- break;
- if (*sz == 6 && (*start)[1] == '0')
- break;
- if (*sz == 5 && (*start)[1] == 'D' &&
- strchr("89ABCDEF", (*start)[2]) != NULL)
- break;
- if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
- + 1 == *sz)
- gly = ESCAPE_UNICODE;
- break;
- case ESCAPE_DEVICE:
- assert(*sz == 2 && (*start)[0] == '.' && (*start)[1] == 'T');
- break;
- default:
- break;
- }
-
- return gly;
-}
-
static int
a2time(time_t *t, const char *fmt, const char *p)
{
diff --git a/mandoc.h b/mandoc.h
index 1ed0d816..03e469dc 100644
--- a/mandoc.h
+++ b/mandoc.h
@@ -1,4 +1,4 @@
-/* $Id: mandoc.h,v 1.276 2022/04/28 16:21:09 schwarze Exp $ */
+/* $Id: mandoc.h,v 1.277 2022/05/19 15:37:47 schwarze Exp $ */
/*
* Copyright (c) 2012-2022 Ingo Schwarze <schwarze@openbsd.org>
* Copyright (c) 2010, 2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
@@ -285,11 +285,12 @@ enum mandocerr {
};
enum mandoc_esc {
- ESCAPE_ERROR = 0, /* bail! unparsable escape */
- ESCAPE_UNSUPP, /* unsupported escape; ignore it */
- ESCAPE_IGNORE, /* escape to be ignored */
- ESCAPE_UNDEF, /* undefined escape; print literal character */
- ESCAPE_SPECIAL, /* a regular special character */
+ ESCAPE_EXPAND = 0, /* interpolation and iterative call needed */
+ ESCAPE_ERROR, /* non-fatal error: unparsable escape */
+ ESCAPE_UNSUPP, /* unsupported escape: warn and ignore */
+ ESCAPE_IGNORE, /* valid escape to be ignored */
+ ESCAPE_UNDEF, /* undefined escape: print literal character */
+ ESCAPE_SPECIAL, /* special character escape */
ESCAPE_FONT, /* a generic font mode */
ESCAPE_FONTBOLD, /* bold font mode */
ESCAPE_FONTITALIC, /* italic font mode */
diff --git a/roff.c b/roff.c
index 567e7b02..aa42e87d 100644
--- a/roff.c
+++ b/roff.c
@@ -1,4 +1,4 @@
-/* $Id: roff.c,v 1.387 2022/05/01 16:22:06 schwarze Exp $ */
+/* $Id: roff.c,v 1.388 2022/05/19 15:37:47 schwarze Exp $ */
/*
* Copyright (c) 2010-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org>
* Copyright (c) 2008-2012, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
@@ -207,6 +207,8 @@ static int roff_evalpar(struct roff *, int,
static int roff_evalstrcond(const char *, int *);
static int roff_expand(struct roff *, struct buf *,
int, int, char);
+static void roff_expand_patch(struct buf *, int,
+ const char *, int);
static void roff_free1(struct roff *);
static void roff_freereg(struct roffreg *);
static void roff_freestr(struct roffkv *);
@@ -1233,9 +1235,15 @@ deroff(char **dest, const struct roff_node *n)
/* --- main functions of the roff parser ---------------------------------- */
+/*
+ * Save comments preceding the title macro, for example in order to
+ * preserve Copyright and license headers in HTML output,
+ * provide diagnostics about RCS ids and trailing whitespace in comments,
+ * then discard comments including preceding whitespace.
+ * This function also handles input line continuation.
+ */
static int
-roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
- char newesc)
+roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos, char ec)
{
struct roff_node *n; /* used for header comments */
const char *start; /* start of the string to process */
@@ -1245,15 +1253,39 @@ roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
int rcsid; /* kind of RCS id seen */
for (start = stesc = buf->buf + pos;; stesc++) {
+ /*
+ * XXX Ugly hack: Remove the newline character that
+ * mparse_buf_r() appended to mark the end of input
+ * if it is not preceded by an escape character.
+ */
+ if (stesc[0] == '\n') {
+ assert(stesc[1] == '\0');
+ stesc[0] = '\0';
+ }
+
/* The line ends without continuation or comment. */
if (stesc[0] == '\0')
return ROFF_CONT;
/* Unescaped byte: skip it. */
- if (stesc[0] != newesc)
+ if (stesc[0] != ec)
continue;
- /* Backslash at end of line requests line continuation. */
+ /*
+ * XXX Ugly hack: Do not attempt to append another line
+ * if the function mparse_buf_r() appended a newline
+ * character to indicate the end of input.
+ */
+ if (stesc[1] == '\n') {
+ assert(stesc[2] == '\0');
+ stesc[0] = '\0';
+ return ROFF_CONT;
+ }
+
+ /*
+ * An escape character at the end of an input line
+ * requests line continuation.
+ */
if (stesc[1] == '\0') {
stesc[0] = '\0';
return ROFF_IGN | ROFF_APPEND;
@@ -1264,7 +1296,7 @@ roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
break;
/* Escaped escape character: skip them both. */
- if (stesc[1] == newesc)
+ if (stesc[1] == ec)
stesc++;
}
@@ -1331,325 +1363,218 @@ roff_parse_comment(struct roff *r, struct buf *buf, int ln, int pos,
* which typically produce output glyphs or change formatter state.
*/
static int
-roff_expand(struct roff *r, struct buf *buf, int ln, int pos, char newesc)
+roff_expand(struct roff *r, struct buf *buf, int ln, int pos, char ec)
{
- struct mctx *ctx; /* current macro call context */
- char ubuf[24]; /* buffer to print the number */
- const char *start; /* start of the string to process */
- char *stesc; /* start of an escape sequence ('\\') */
- const char *esct; /* type of esccape sequence */
- const char *stnam; /* start of the name, after "[(*" */
- const char *cp; /* end of the name, e.g. before ']' */
- const char *res; /* the string to be substituted */
- char *nbuf; /* new buffer to copy buf->buf to */
- size_t maxl; /* expected length of the escape name */
- size_t naml; /* actual length of the escape name */
- size_t asz; /* length of the replacement */
- size_t rsz; /* length of the rest of the string */
- int inaml; /* length returned from mandoc_escape() */
+ char ubuf[24]; /* buffer to print a number */
+ struct mctx *ctx; /* current macro call context */
+ const char *res; /* the string to be pasted */
+ const char *src; /* source for copying */
+ char *dst; /* destination for copying */
+ int iesc; /* index of leading escape char */
+ int inam; /* index of the escape name */
+ int iarg; /* index beginning the argument */
+ int iendarg; /* index right after the argument */
+ int iend; /* index right after the sequence */
+ int deftype; /* type of definition to paste */
+ int argi; /* macro argument index */
+ int quote_args; /* true for \\$@, false for \\$* */
+ int asz; /* length of the replacement */
+ int rsz; /* length of the rest of the string */
+ int npos; /* position in numeric expression */
int expand_count; /* to avoid infinite loops */
- int npos; /* position in numeric expression */
- int arg_complete; /* argument not interrupted by eol */
- int quote_args; /* true for \\$@, false for \\$* */
- int deftype; /* type of definition to paste */
- enum mandocerr err; /* for escape sequence problems */
- char sign; /* increment number register */
- char term; /* character terminating the escape */
-
- start = buf->buf + pos;
- stesc = strchr(start, '\0') - 1;
- if (stesc >= start && *stesc == '\n')
- *stesc-- = '\0';
expand_count = 0;
- while (stesc >= start) {
- if (*stesc != newesc) {
+ while (buf->buf[pos] != '\0') {
- /*
- * If we have a non-standard escape character,
- * escape literal backslashes because all
- * processing in subsequent functions uses
- * the standard escaping rules.
- */
+ /*
+ * Skip plain ASCII characters.
+ * If we have a non-standard escape character,
+ * escape literal backslashes because all processing in
+ * subsequent functions uses the standard escaping rules.
+ */
- if (newesc != ASCII_ESC && *stesc == '\\') {
- *stesc = '\0';
- buf->sz = mandoc_asprintf(&nbuf, "%s\\e%s",
- buf->buf, stesc + 1) + 1;
- start = nbuf + pos;
- stesc = nbuf + (stesc - buf->buf);
- free(buf->buf);
- buf->buf = nbuf;
+ if (buf->buf[pos] != ec) {
+ if (ec != ASCII_ESC && buf->buf[pos] == '\\') {
+ roff_expand_patch(buf, pos, "\\e", pos + 1);
+ pos++;
}
-
- /* Search backwards for the next escape. */
-
- stesc--;
+ pos++;
continue;
}
- /* If it is escaped, skip it. */
-
- for (cp = stesc - 1; cp >= start; cp--)
- if (*cp != r->escape)
- break;
-
- if ((stesc - cp) % 2 == 0) {
- while (stesc > cp)
- *stesc-- = '\\';
- continue;
- } else if (stesc[1] == '\0') {
- *stesc-- = '\0';
- continue;
- } else
- *stesc = '\\';
-
- /* Decide whether to expand or to check only. */
+ /*
+ * Parse escape sequences,
+ * issue diagnostic messages when appropriate,
+ * and skip sequences that do not need expansion.
+ * If we have a non-standard escape character, translate
+ * it to backslashes and translate backslashes to \e.
+ */
- term = '\0';
- cp = stesc + 1;
- while (*cp == 'E')
- cp++;
- esct = cp;
- switch (*esct) {
- case '*':
- case '$':
- res = NULL;
- break;
- case 'B':
- case 'w':
- term = cp[1];
- /* FALLTHROUGH */
- case 'n':
- sign = cp[1];
- if (sign == '+' || sign == '-')
- cp++;
- res = ubuf;
- break;
- default:
- err = MANDOCERR_OK;
- switch(mandoc_escape(&cp, &stnam, &inaml)) {
- case ESCAPE_SPECIAL:
- if (mchars_spec2cp(stnam, inaml) >= 0)
- break;
- /* FALLTHROUGH */
- case ESCAPE_ERROR:
- err = MANDOCERR_ESC_BAD;
- break;
- case ESCAPE_UNDEF:
- err = MANDOCERR_ESC_UNDEF;
- break;
- case ESCAPE_UNSUPP:
- err = MANDOCERR_ESC_UNSUPP;
- break;
- default:
- break;
+ if (roff_escape(buf->buf, ln, pos,
+ &iesc, &iarg, &iendarg, &iend) != ESCAPE_EXPAND) {
+ while (pos < iend) {
+ if (buf->buf[pos] == ec) {
+ buf->buf[pos] = '\\';
+ if (pos + 1 < iend)
+ pos++;
+ } else if (buf->buf[pos] == '\\') {
+ roff_expand_patch(buf,
+ pos, "\\e", pos + 1);
+ pos++;
+ iend++;
+ }
+ pos++;
}
- if (err != MANDOCERR_OK)
- mandoc_msg(err, ln, (int)(stesc - buf->buf),
- "%.*s", (int)(cp - stesc), stesc);
- stesc--;
continue;
}
- if (EXPAND_LIMIT < ++expand_count) {
- mandoc_msg(MANDOCERR_ROFFLOOP,
- ln, (int)(stesc - buf->buf), NULL);
- return ROFF_IGN;
- }
-
/*
- * The third character decides the length
- * of the name of the string or register.
- * Save a pointer to the name.
+ * Treat "\E" just like "\";
+ * it only makes a difference in copy mode.
*/
- if (term == '\0') {
- switch (*++cp) {
- case '\0':
- maxl = 0;
- break;
- case '(':
- cp++;
- maxl = 2;
- break;
- case '[':
- cp++;
- term = ']';
- maxl = 0;
- break;
- default:
- maxl = 1;
- break;
- }
- } else {
- cp += 2;
- maxl = 0;
- }
- stnam = cp;
+ inam = iesc + 1;
+ while (buf->buf[inam] == 'E')
+ inam++;
- /* Advance to the end of the name. */
+ /* Handle expansion. */
- naml = 0;
- arg_complete = 1;
- while (maxl == 0 || naml < maxl) {
- if (*cp == '\0') {
- mandoc_msg(MANDOCERR_ESC_BAD, ln,
- (int)(stesc - buf->buf), "%s", stesc);
- arg_complete = 0;
- break;
- }
- if (maxl == 0 && *cp == term) {
- cp++;
- break;
- }
- if (*cp++ != '\\' || *esct != 'w') {
- naml++;
- continue;
- }
- switch (mandoc_escape(&cp, NULL, NULL)) {
- case ESCAPE_SPECIAL:
- case ESCAPE_UNICODE:
- case ESCAPE_NUMBERED:
- case ESCAPE_UNDEF:
- case ESCAPE_OVERSTRIKE:
- naml++;
+ res = NULL;
+ switch (buf->buf[inam]) {
+ case '*':
+ if (iendarg == iarg)
break;
- default:
+ deftype = ROFFDEF_USER | ROFFDEF_PRE;
+ if ((res = roff_getstrn(r, buf->buf + iarg,
+ iendarg - iarg, &deftype)) != NULL)
break;
- }
- }
- /*
- * Retrieve the replacement string; if it is
- * undefined, resume searching for escapes.
- */
+ /*
+ * If not overridden,
+ * let \*(.T through to the formatters.
+ */
- switch (*esct) {
- case '*':
- if (arg_complete) {
- deftype = ROFFDEF_USER | ROFFDEF_PRE;
- res = roff_getstrn(r, stnam, naml, &deftype);
-
- /*
- * If not overriden, let \*(.T
- * through to the formatters.
- */
-
- if (res == NULL && naml == 2 &&
- stnam[0] == '.' && stnam[1] == 'T') {
- roff_setstrn(&r->strtab,
- ".T", 2, NULL, 0, 0);
- stesc--;
- continue;
- }
+ if (iendarg - iarg == 2 &&
+ buf->buf[iarg] == '.' &&
+ buf->buf[iarg + 1] == 'T') {
+ roff_setstrn(&r->strtab, ".T", 2, NULL, 0, 0);
+ pos = iend;
+ continue;
}
+
+ mandoc_msg(MANDOCERR_STR_UNDEF, ln, iesc,
+ "%.*s", iendarg - iarg, buf->buf + iarg);
break;
+
case '$':
if (r->mstackpos < 0) {
- mandoc_msg(MANDOCERR_ARG_UNDEF, ln,
- (int)(stesc - buf->buf), "%.3s", stesc);
+ mandoc_msg(MANDOCERR_ARG_UNDEF, ln, iesc,
+ "%.*s", iend - iesc, buf->buf + iesc);
break;
}
ctx = r->mstack + r->mstackpos;
- npos = esct[1] - '1';
- if (npos >= 0 && npos <= 8) {
- res = npos < ctx->argc ?
- ctx->argv[npos] : "";
+ argi = buf->buf[iarg] - '1';
+ if (argi >= 0 && argi <= 8) {
+ if (argi < ctx->argc)
+ res = ctx->argv[argi];
break;
}
- if (esct[1] == '*')
+ if (buf->buf[iarg] == '*')
quote_args = 0;
- else if (esct[1] == '@')
+ else if (buf->buf[iarg] == '@')
quote_args = 1;
else {
- mandoc_msg(MANDOCERR_ARG_NONUM, ln,
- (int)(stesc - buf->buf), "%.3s", stesc);
+ mandoc_msg(MANDOCERR_ARG_NONUM, ln, iesc,
+ "%.*s", iend - iesc, buf->buf + iesc);
break;
}
asz = 0;
- for (npos = 0; npos < ctx->argc; npos++) {
- if (npos)
+ for (argi = 0; argi < ctx->argc; argi++) {
+ if (argi)
asz++; /* blank */
if (quote_args)
asz += 2; /* quotes */
- asz += strlen(ctx->argv[npos]);
+ asz += strlen(ctx->argv[argi]);
}
- if (asz != 3) {
- rsz = buf->sz - (stesc - buf->buf) - 3;
- if (asz < 3)
- memmove(stesc + asz, stesc + 3, rsz);
- buf->sz += asz - 3;
- nbuf = mandoc_realloc(buf->buf, buf->sz);
- start = nbuf + pos;
- stesc = nbuf + (stesc - buf->buf);
- buf->buf = nbuf;
- if (asz > 3)
- memmove(stesc + asz, stesc + 3, rsz);
+ if (asz != iend - iesc) {
+ rsz = buf->sz - iend;
+ if (asz < iend - iesc)
+ memmove(buf->buf + iesc + asz,
+ buf->buf + iend, rsz);
+ buf->sz = iesc + asz + rsz;
+ buf->buf = mandoc_realloc(buf->buf, buf->sz);
+ if (asz > iend - iesc)
+ memmove(buf->buf + iesc + asz,
+ buf->buf + iend, rsz);
}
- for (npos = 0; npos < ctx->argc; npos++) {
- if (npos)
- *stesc++ = ' ';
+ dst = buf->buf + iesc;
+ for (argi = 0; argi < ctx->argc; argi++) {
+ if (argi)
+ *dst++ = ' ';
if (quote_args)
- *stesc++ = '"';
- cp = ctx->argv[npos];
- while (*cp != '\0')
- *stesc++ = *cp++;
+ *dst++ = '"';
+ src = ctx->argv[argi];
+ while (*src != '\0')
+ *dst++ = *src++;
if (quote_args)
- *stesc++ = '"';
+ *dst++ = '"';
}
continue;
case 'B':
npos = 0;
- ubuf[0] = arg_complete &&
- roff_evalnum(r, ln, stnam, &npos,
- NULL, ROFFNUM_SCALE) &&
- stnam + npos + 1 == cp ? '1' : '0';
+ ubuf[0] = iendarg > iarg && iend > iendarg &&
+ roff_evalnum(r, ln, buf->buf + iarg, &npos,
+ NULL, ROFFNUM_SCALE) &&
+ npos == iendarg - iarg ? '1' : '0';
ubuf[1] = '\0';
+ res = ubuf;
break;
case 'n':
- if (arg_complete)
+ if (iendarg > iarg)
(void)snprintf(ubuf, sizeof(ubuf), "%d",
- roff_getregn(r, stnam, naml, sign));
+ roff_getregn(r, buf->buf + iarg,
+ iendarg - iarg, buf->buf[inam + 1]));
else
ubuf[0] = '\0';
+ res = ubuf;
break;
case 'w':
- /* use even incomplete args */
- (void)snprintf(ubuf, sizeof(ubuf), "%d",
- 24 * (int)naml);
+ (void)snprintf(ubuf, sizeof(ubuf),
+ "%d", (iendarg - iarg) * 24);
+ res = ubuf;
+ break;
+ default:
break;
}
-
- if (res == NULL) {
- if (*esct == '*')
- mandoc_msg(MANDOCERR_STR_UNDEF,
- ln, (int)(stesc - buf->buf),
- "%.*s", (int)naml, stnam);
+ if (res == NULL)
res = "";
- } else if (buf->sz + strlen(res) > SHRT_MAX) {
- mandoc_msg(MANDOCERR_ROFFLOOP,
- ln, (int)(stesc - buf->buf), NULL);
+ if (++expand_count > EXPAND_LIMIT ||
+ buf->sz + strlen(res) > SHRT_MAX) {
+ mandoc_msg(MANDOCERR_ROFFLOOP, ln, iesc, NULL);
return ROFF_IGN;
}
-
- /* Replace the escape sequence by the string. */
-
- *stesc = '\0';
- buf->sz = mandoc_asprintf(&nbuf, "%s%s%s",
- buf->buf, res, cp) + 1;
-
- /* Prepare for the next replacement. */
-
- start = nbuf + pos;
- stesc = nbuf + (stesc - buf->buf) + strlen(res);
- free(buf->buf);
- buf->buf = nbuf;
+ roff_expand_patch(buf, iesc, res, iend);
}
return ROFF_CONT;
}
/*
+ * Replace the substring of buf->buf from the start index (inclusive)
+ * to the end index (exclusive) with the repl(acement) string,
+ * installing a newly allocated buffer and updating buf->sz to match.
+ */
+static void
+roff_expand_patch(struct buf *buf, int start, const char *repl, int end)
+{
+ char *nbuf;
+
+ buf->buf[start] = '\0';
+ buf->sz = mandoc_asprintf(&nbuf, "%s%s%s", buf->buf, repl,
+ buf->buf + end) + 1;
+ free(buf->buf);
+ buf->buf = nbuf;
+
+/*
* Parse a quoted or unquoted roff-style request or macro argument.
* Return a pointer to the parsed argument, which is either the original
* pointer or advanced by one byte in case the argument is quoted.
diff --git a/roff_escape.c b/roff_escape.c
new file mode 100644
index 00000000..1b5dc640
--- /dev/null
+++ b/roff_escape.c
@@ -0,0 +1,477 @@
+/* $OpenBSD$ */
+/*
+ * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
+ * Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Parser for roff(7) escape sequences.
+ * To be used by all mandoc(1) parsers and formatters.
+ */
+#include <assert.h>
+#include <ctype.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "mandoc.h"
+#include "roff.h"
+#include "roff_int.h"
+
+/*
+ * Traditional escape sequence interpreter for formatters: no diagnostics,
+ * not usable for expansion in the roff(7) parser; see mandoc_escape(3).
+ * On entry, *rendarg points right after the escape character; on return,
+ * right after the sequence. If non-NULL, *rarg and *rargl get the argument.
+ */
+enum mandoc_esc
+mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
+{
+ int iarg, iendarg, iend;
+ enum mandoc_esc rval;
+
+ rval = roff_escape(--*rendarg, 0, 0, NULL, &iarg, &iendarg, &iend);
+ assert(rval != ESCAPE_EXPAND);
+ if (rarg != NULL)
+ *rarg = *rendarg + iarg;
+ if (rargl != NULL)
+ *rargl = iendarg - iarg;
+ *rendarg += iend;
+ return rval;
+}
+
+/*
+ * Full-featured escape sequence parser.
+ * If it encounters a nested escape sequence that requires expansion
+ * by the parser and re-parsing, the positions of that inner escape
+ * sequence are returned in *resc ... *rend.
+ * Otherwise, *resc is set to aesc and the positions of the escape
+ * sequence starting at aesc are returned.
+ * Diagnostic messages are generated if and only if resc != NULL,
+ * that is, if and only if called by roff_expand().
+ */
+enum mandoc_esc
+roff_escape(const char *buf, const int ln, const int aesc,
+ int *resc, int *rarg, int *rendarg, int *rend)
+{
+ int iesc; /* index of leading escape char */
+ int iarg; /* index beginning the argument */
+ int iendarg; /* index right after the argument */
+ int iend; /* index right after the sequence */
+ int sesc, sarg, sendarg, send; /* for sub-escape */
+ int maxl; /* expected length of the argument */
+ int argl; /* actual length of the argument */
+ int c, i; /* for \[char...] parsing */
+ enum mandoc_esc rval; /* return value */
+ enum mandocerr err; /* diagnostic code */
+ char esc_name; /* byte naming the escape sequence type */
+ char term; /* byte terminating the argument */
+
+ /*
+ * Treat "\E" just like "\";
+ * it only makes a difference in copy mode.
+ */
+
+ iesc = iarg = aesc;
+ do {
+ iarg++;
+ } while (buf[iarg] == 'E');
+
+ /*
+ * Sort the following cases first by syntax category,
+ * then by escape sequence type, and finally by ASCII code.
+ */
+
+ esc_name = buf[iarg];
+ iendarg = iend = ++iarg;
+ maxl = INT_MAX;
+ term = '\0';
+ switch (esc_name) {
+
+ /* Escape sequences taking no arguments at all. */
+
+ case '!':
+ case '?':
+ rval = ESCAPE_UNSUPP;
+ goto out;
+
+ case '%':
+ case '&':
+ case ')':
+ case ',':
+ case '/':
+ case '^':
+ case 'a':
+ case 'd':
+ case 'r':
+ case 't':
+ case 'u':
+ case '{':
+ case '|':
+ case '}':
+ rval = ESCAPE_IGNORE;
+ goto out;
+
+ case '\\':
+ default:
+ iarg--;
+ rval = ESCAPE_UNDEF;
+ goto out;
+
+ case ' ':
+ case '\'':
+ case '-':
+ case '.':
+ case '0':
+ case ':':
+ case '_':
+ case '`':
+ case 'e':
+ case '~':
+ iarg--;
+ argl = 1;
+ rval = ESCAPE_SPECIAL;
+ goto out;
+ case 'p':
+ rval = ESCAPE_BREAK;
+ goto out;
+ case 'c':
+ rval = ESCAPE_NOSPACE;
+ goto out;
+ case 'z':
+ rval = ESCAPE_SKIPCHAR;
+ goto out;
+
+ /* Standard argument format. */
+
+ case '$':
+ case '*':
+ case 'n':
+ rval = ESCAPE_EXPAND;
+ break;
+ case 'F':
+ case 'M':
+ case 'O':
+ case 'V':
+ case 'Y':
+ case 'g':
+ case 'k':
+ case 'm':
+ rval = ESCAPE_IGNORE;
+ break;
+ case '(':
+ case '[':
+ rval = ESCAPE_SPECIAL;
+ iendarg = iend = --iarg;
+ break;
+ case 'f':
+ rval = ESCAPE_FONT;
+ break;
+
+ /* Quoted arguments */
+
+ case 'B':
+ case 'w':
+ rval = ESCAPE_EXPAND;
+ term = '\b';
+ break;
+ case 'A':
+ case 'D':
+ case 'H':
+ case 'L':
+ case 'R':
+ case 'S':
+ case 'X':
+ case 'Z':
+ case 'b':
+ case 'v':
+ case 'x':
+ rval = ESCAPE_IGNORE;
+ term = '\b';
+ break;
+ case 'C':
+ if (buf[iarg] != '\'') {
+ rval = ESCAPE_ERROR;
+ goto out;
+ }
+ rval = ESCAPE_SPECIAL;
+ term = '\b';
+ break;
+ case 'N':
+ rval = ESCAPE_NUMBERED;
+ term = '\b';
+ break;
+ case 'h':
+ rval = ESCAPE_HORIZ;
+ term = '\b';
+ break;
+ case 'l':
+ rval = ESCAPE_HLINE;
+ term = '\b';
+ break;
+ case 'o':
+ rval = ESCAPE_OVERSTRIKE;
+ term = '\b';
+ break;
+
+ /* Sizes support both forms, with additional peculiarities. */
+
+ case 's':
+ rval = ESCAPE_IGNORE;
+ if (buf[iarg] == '+' || buf[iarg] == '-'||
+ buf[iarg] == ASCII_HYPH)
+ iarg++;
+ switch (buf[iarg]) {
+ case '(':
+ maxl = 2;
+ iarg++;
+ break;
+ case '[':
+ term = ']';
+ iarg++;
+ break;
+ case '\'':
+ term = '\'';
+ iarg++;
+ break;
+ case '1':
+ case '2':
+ case '3':
+ if (buf[iarg - 1] == 's' &&
+ isdigit((unsigned char)buf[iarg + 1])) {
+ maxl = 2;
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ maxl = 1;
+ break;
+ }
+ iendarg = iend = iarg;
+ }
+
+ /* Decide how to end the argument. */
+
+ if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
+ buf[iarg] == buf[iesc] && roff_escape(buf, ln, iendarg,
+ &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
+ goto out_sub;
+
+ if (term == '\b') {
+ if ((esc_name == 'N' && isdigit((unsigned char)buf[iarg])) ||
+ (esc_name == 'h' && strchr(" %&()*+-./0123456789:<=>",
+ buf[iarg]) != NULL)) {
+ iendarg = iend = iarg + 1;
+ rval = ESCAPE_ERROR;
+ goto out;
+ }
+ term = buf[iarg++];
+ } else if (term == '\0' && maxl == INT_MAX) {
+ if (esc_name == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
+ iarg++;
+ switch (buf[iarg]) {
+ case '(':
+ maxl = 2;
+ iarg++;
+ break;
+ case '[':
+ if (buf[++iarg] == ' ') {
+ iendarg = iend = iarg + 1;
+ rval = ESCAPE_ERROR;
+ goto out;
+ }
+ term = ']';
+ break;
+ default:
+ maxl = 1;
+ break;
+ }
+ }
+
+ /* Advance to the end of the argument. */
+
+ iendarg = iarg;
+ while (maxl > 0) {
+ if (buf[iendarg] == '\0') {
+ /* Ignore an incomplete argument except for \w. */
+ if (esc_name != 'w')
+ iendarg = iarg;
+ break;
+ }
+ if (buf[iendarg] == term) {
+ iend = iendarg + 1;
+ break;
+ }
+ if (esc_name == 'N' &&
+ isdigit((unsigned char)buf[iendarg]) == 0) {
+ iend = iendarg + 1;
+ break;
+ }
+ if (buf[iendarg] == buf[iesc]) {
+ if (roff_escape(buf, ln, iendarg,
+ &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
+ goto out_sub;
+ iendarg = iend = send;
+ } else {
+ if (maxl != INT_MAX)
+ maxl--;
+ iend = ++iendarg;
+ }
+ }
+ if (resc != NULL && ((maxl != INT_MAX && maxl != 0) ||
+ (term != '\0' && buf[iendarg] != term)))
+ mandoc_msg(MANDOCERR_ESC_BAD, ln, iesc, "%s", buf + iesc);
+
+ /* Post-process depending on the content of the argument. */
+
+ argl = iendarg - iarg;
+ switch (esc_name) {
+ case '*':
+ if (resc == NULL && argl == 2 &&
+ buf[iarg] == '.' && buf[iarg + 1] == 'T')
+ rval = ESCAPE_DEVICE;
+ break;
+ case 'O':
+ switch (buf[iarg]) {
+ case '0':
+ rval = ESCAPE_UNSUPP;
+ break;
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ rval = argl == 1 ? ESCAPE_IGNORE : ESCAPE_ERROR;
+ break;
+ case '5':
+ rval = buf[iarg - 1] == '[' ? ESCAPE_UNSUPP :
+ ESCAPE_ERROR;
+ break;
+ default:
+ rval = ESCAPE_ERROR;
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+
+ switch (rval) {
+ case ESCAPE_FONT:
+ rval = mandoc_font(buf + iarg, argl);
+ break;
+
+ case ESCAPE_SPECIAL:
+
+ /*
+ * The file chars.c only provides one common list of
+ * character names, but \[-] == \- is the only one of
+ * the characters with one-byte names that allows
+ * enclosing the name in brackets.
+ */
+
+ if (term != '\0' && argl == 1 && buf[iarg] != '-') {
+ rval = ESCAPE_ERROR;
+ break;
+ }
+
+ /* Treat \[char...] as an alias for \N'...'. */
+
+ if (buf[iarg] == 'c') {
+ if (argl < 6 || argl > 7 ||
+ strncmp(buf + iarg, "char", 4) != 0 ||
+ (int)strspn(buf + iarg + 4, "0123456789")
+ + 4 < argl)
+ break;
+ c = 0;
+ for (i = iarg + 4; i < iendarg; i++) /* skip "char" */
+ c = 10 * c + (buf[i] - '0');
+ if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
+ break;
+ iarg += 4;
+ rval = ESCAPE_NUMBERED;
+ break;
+ }
+
+ /*
+ * Unicode escapes are defined in groff as \[u0000]
+ * to \[u10FFFF], where the contained value must be
+ * a valid Unicode codepoint. Here, however, only
+ * check the length and range.
+ */
+
+ if (buf[iarg] != 'u' || argl < 5 || argl > 7)
+ break;
+ if (argl == 7 &&
+ (buf[iarg + 1] != '1' || buf[iarg + 2] != '0'))
+ break;
+ if (argl == 6 && buf[iarg + 1] == '0')
+ break;
+ if (argl == 5 && buf[iarg + 1] == 'D' &&
+ strchr("89ABCDEF", buf[iarg + 2]) != NULL)
+ break;
+ if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
+ + 1 == argl)
+ rval = ESCAPE_UNICODE;
+ break;
+ default:
+ break;
+ }
+ goto out;
+
+out_sub:
+ iesc = sesc;
+ iarg = sarg;
+ iendarg = sendarg;
+ iend = send;
+ rval = ESCAPE_EXPAND;
+
+out:
+ if (rarg != NULL)
+ *rarg = iarg;
+ if (rendarg != NULL)
+ *rendarg = iendarg;
+ if (rend != NULL)
+ *rend = iend;
+ if (resc == NULL)
+ return rval;
+
+ /*
+ * Diagnostic messages are only issued when called
+ * from the parser, not when called from the formatters.
+ */
+
+ *resc = iesc;
+ switch (rval) {
+ case ESCAPE_ERROR:
+ err = MANDOCERR_ESC_BAD;
+ break;
+ case ESCAPE_UNSUPP:
+ err = MANDOCERR_ESC_UNSUPP;
+ break;
+ case ESCAPE_UNDEF:
+ if (esc_name == '\\')
+ return rval;
+ err = MANDOCERR_ESC_UNDEF;
+ break;
+ case ESCAPE_SPECIAL:
+ if (mchars_spec2cp(buf + iarg, argl) >= 0)
+ return rval;
+ err = MANDOCERR_ESC_BAD;
+ break;
+ default:
+ return rval;
+ }
+ mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
+ return rval;
+}
diff --git a/roff_int.h b/roff_int.h
index f7d688fd..ba7032b0 100644
--- a/roff_int.h
+++ b/roff_int.h
@@ -1,6 +1,6 @@
/* $OpenBSD: roff_int.h,v 1.16 2019/01/05 00:36:46 schwarze Exp $ */
/*
- * Copyright (c) 2013-2015, 2017-2020 Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2013-2015, 2017-2022 Ingo Schwarze <schwarze@openbsd.org>
* Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
*
* Permission to use, copy, modify, and distribute this software for any
@@ -82,6 +82,8 @@ struct ohash *roffhash_alloc(enum roff_tok, enum roff_tok);
enum roff_tok roffhash_find(struct ohash *, const char *, size_t);
void roffhash_free(struct ohash *);
+enum mandoc_esc roff_escape(const char *, const int, const int,
+ int *, int *, int *, int *);
void roff_state_reset(struct roff_man *);
void roff_validate(struct roff_man *);