]>
git.cameronkatri.com Git - mandoc.git/blob - roff_escape.c
3 * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
4 * Ingo Schwarze <schwarze@openbsd.org>
5 * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19 * Parser for roff(7) escape sequences.
20 * To be used by all mandoc(1) parsers and formatters.
33 * Traditional escape sequence interpreter for general use
34 * including in high-level formatters. This function does not issue
35 * diagnostics and is not usable for expansion in the roff(7) parser.
36 * It is documented in the mandoc_escape(3) manual page.
39 mandoc_escape(const char **rendarg
, const char **rarg
, int *rargl
)
41 int iarg
, iendarg
, iend
;
44 rval
= roff_escape(--*rendarg
, 0, 0, NULL
, &iarg
, &iendarg
, &iend
);
45 assert(rval
!= ESCAPE_EXPAND
);
47 *rarg
= *rendarg
+ iarg
;
49 *rargl
= iendarg
- iarg
;
55 * Full-featured escape sequence parser.
56 * If it encounters a nested escape sequence that requires expansion
57 * by the parser and re-parsing, the positions of that inner escape
58 * sequence are returned in *resc ... *rend.
59 * Otherwise, *resc is set to aesc and the positions of the escape
60 * sequence starting at aesc are returned.
61 * Diagnostic messages are generated if and only if resc != NULL,
62 * that is, if and only if called by roff_expand().
65 roff_escape(const char *buf
, const int ln
, const int aesc
,
66 int *resc
, int *rarg
, int *rendarg
, int *rend
)
68 int iesc
; /* index of leading escape char */
69 int iarg
; /* index beginning the argument */
70 int iendarg
; /* index right after the argument */
71 int iend
; /* index right after the sequence */
72 int sesc
, sarg
, sendarg
, send
; /* for sub-escape */
73 int maxl
; /* expected length of the argument */
74 int argl
; /* actual length of the argument */
75 int c
, i
; /* for \[char...] parsing */
76 enum mandoc_esc rval
; /* return value */
77 enum mandocerr err
; /* diagnostic code */
79 char term
; /* byte terminating the argument */
82 * Treat "\E" just like "\";
83 * it only makes a difference in copy mode.
89 } while (buf
[iarg
] == 'E');
92 * Sort the following cases first by syntax category,
93 * then by escape sequence type, and finally by ASCII code.
97 iendarg
= iend
= ++iarg
;
102 /* Escape sequences taking no arguments at all. */
107 rval
= ESCAPE_UNSUPP
;
123 rval
= ESCAPE_IGNORE
;
144 rval
= ESCAPE_SPECIAL
;
150 rval
= ESCAPE_NOSPACE
;
153 rval
= ESCAPE_SKIPCHAR
;
156 /* Standard argument format. */
162 rval
= ESCAPE_EXPAND
;
171 rval
= ESCAPE_IGNORE
;
175 rval
= ESCAPE_SPECIAL
;
176 iendarg
= iend
= --iarg
;
182 /* Quoted arguments */
186 rval
= ESCAPE_EXPAND
;
200 rval
= ESCAPE_IGNORE
;
204 if (buf
[iarg
] != '\'') {
208 rval
= ESCAPE_SPECIAL
;
212 rval
= ESCAPE_NUMBERED
;
224 rval
= ESCAPE_OVERSTRIKE
;
228 /* Sizes support both forms, with additional peculiarities. */
231 rval
= ESCAPE_IGNORE
;
232 if (buf
[iarg
] == '+' || buf
[iarg
] == '-'||
233 buf
[iarg
] == ASCII_HYPH
)
251 if (buf
[iarg
- 1] == 's' &&
252 isdigit((unsigned char)buf
[iarg
+ 1])) {
261 iendarg
= iend
= iarg
;
264 /* Decide how to end the argument. */
266 if ((term
== '\b' || (term
== '\0' && maxl
== INT_MAX
)) &&
267 buf
[iarg
] == buf
[iesc
] && roff_escape(buf
, ln
, iendarg
,
268 &sesc
, &sarg
, &sendarg
, &send
) == ESCAPE_EXPAND
)
272 if ((esc_name
== 'N' && isdigit((unsigned char)buf
[iarg
])) ||
273 (esc_name
== 'h' && strchr(" %&()*+-./0123456789:<=>",
274 buf
[iarg
]) != NULL
)) {
275 iendarg
= iend
= iarg
+ 1;
280 } else if (term
== '\0' && maxl
== INT_MAX
) {
281 if (esc_name
== 'n' && (buf
[iarg
] == '+' || buf
[iarg
] == '-'))
289 if (buf
[++iarg
] == ' ') {
290 iendarg
= iend
= iarg
+ 1;
302 /* Advance to the end of the argument. */
306 if (buf
[iendarg
] == '\0') {
307 /* Ignore an incomplete argument except for \w. */
312 if (buf
[iendarg
] == term
) {
316 if (esc_name
== 'N' &&
317 isdigit((unsigned char)buf
[iendarg
]) == 0) {
321 if (buf
[iendarg
] == buf
[iesc
]) {
322 if (roff_escape(buf
, ln
, iendarg
,
323 &sesc
, &sarg
, &sendarg
, &send
) == ESCAPE_EXPAND
)
325 iendarg
= iend
= send
;
332 if (resc
!= NULL
&& ((maxl
!= INT_MAX
&& maxl
!= 0) ||
333 (term
!= '\0' && buf
[iendarg
] != term
)))
334 mandoc_msg(MANDOCERR_ESC_BAD
, ln
, iesc
, "%s", buf
+ iesc
);
336 /* Post-process depending on the content of the argument. */
338 argl
= iendarg
- iarg
;
341 if (resc
== NULL
&& argl
== 2 &&
342 buf
[iarg
] == '.' && buf
[iarg
+ 1] == 'T')
343 rval
= ESCAPE_DEVICE
;
348 rval
= ESCAPE_UNSUPP
;
354 rval
= argl
== 1 ? ESCAPE_IGNORE
: ESCAPE_ERROR
;
357 rval
= buf
[iarg
- 1] == '[' ? ESCAPE_UNSUPP
:
371 rval
= mandoc_font(buf
+ iarg
, argl
);
377 * The file chars.c only provides one common list of
378 * character names, but \[-] == \- is the only one of
379 * the characters with one-byte names that allows
380 * enclosing the name in brackets.
383 if (term
!= '\0' && argl
== 1 && buf
[iarg
] != '-') {
388 /* Treat \[char...] as an alias for \N'...'. */
390 if (buf
[iarg
] == 'c') {
391 if (argl
< 6 || argl
> 7 ||
392 strncmp(buf
+ iarg
, "char", 4) != 0 ||
393 (int)strspn(buf
+ iarg
+ 4, "0123456789")
397 for (i
= iarg
; i
< iendarg
; i
++)
398 c
= 10 * c
+ (buf
[i
] - '0');
399 if (c
< 0x21 || (c
> 0x7e && c
< 0xa0) || c
> 0xff)
402 rval
= ESCAPE_NUMBERED
;
407 * Unicode escapes are defined in groff as \[u0000]
408 * to \[u10FFFF], where the contained value must be
409 * a valid Unicode codepoint. Here, however, only
410 * check the length and range.
413 if (buf
[iarg
] != 'u' || argl
< 5 || argl
> 7)
416 (buf
[iarg
+ 1] != '1' || buf
[iarg
+ 2] != '0'))
418 if (argl
== 6 && buf
[iarg
+ 1] == '0')
420 if (argl
== 5 && buf
[iarg
+ 1] == 'D' &&
421 strchr("89ABCDEF", buf
[iarg
+ 2]) != NULL
)
423 if ((int)strspn(buf
+ iarg
+ 1, "0123456789ABCDEFabcdef")
425 rval
= ESCAPE_UNICODE
;
437 rval
= ESCAPE_EXPAND
;
450 * Diagnostic messages are only issued when called
451 * from the parser, not when called from the formatters.
457 err
= MANDOCERR_ESC_BAD
;
460 err
= MANDOCERR_ESC_UNSUPP
;
463 if (esc_name
== '\\')
465 err
= MANDOCERR_ESC_UNDEF
;
468 if (mchars_spec2cp(buf
+ iarg
, argl
) >= 0)
470 err
= MANDOCERR_ESC_BAD
;
475 mandoc_msg(err
, ln
, iesc
, "%.*s", iend
- iesc
, buf
+ iesc
);