]> git.cameronkatri.com Git - mandoc.git/blob - roff_escape.c
Avoid the layering violation of re-parsing for \E in roff_expand().
[mandoc.git] / roff_escape.c
1 /* $OpenBSD$ */
2 /*
3 * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
4 * Ingo Schwarze <schwarze@openbsd.org>
5 * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 *
19 * Parser for roff(7) escape sequences.
20 * To be used by all mandoc(1) parsers and formatters.
21 */
22 #include <assert.h>
23 #include <ctype.h>
24 #include <limits.h>
25 #include <stdio.h>
26 #include <string.h>
27
28 #include "mandoc.h"
29 #include "roff.h"
30 #include "roff_int.h"
31
32 /*
33 * Traditional escape sequence interpreter for general use
34 * including in high-level formatters. This function does not issue
35 * diagnostics and is not usable for expansion in the roff(7) parser.
36 * It is documented in the mandoc_escape(3) manual page.
37 */
38 enum mandoc_esc
39 mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
40 {
41 int iarg, iendarg, iend;
42 enum mandoc_esc rval;
43
44 rval = roff_escape(--*rendarg, 0, 0,
45 NULL, NULL, &iarg, &iendarg, &iend);
46 assert(rval != ESCAPE_EXPAND);
47 if (rarg != NULL)
48 *rarg = *rendarg + iarg;
49 if (rargl != NULL)
50 *rargl = iendarg - iarg;
51 *rendarg += iend;
52 return rval;
53 }
54
55 /*
56 * Full-featured escape sequence parser.
57 * If it encounters a nested escape sequence that requires expansion
58 * by the parser and re-parsing, the positions of that inner escape
59 * sequence are returned in *resc ... *rend.
60 * Otherwise, *resc is set to aesc and the positions of the escape
61 * sequence starting at aesc are returned.
62 * Diagnostic messages are generated if and only if resc != NULL,
63 * that is, if and only if called by roff_expand().
64 */
65 enum mandoc_esc
66 roff_escape(const char *buf, const int ln, const int aesc,
67 int *resc, int *rnam, int *rarg, int *rendarg, int *rend)
68 {
69 int iesc; /* index of leading escape char */
70 int inam; /* index of escape name */
71 int iarg; /* index beginning the argument */
72 int iendarg; /* index right after the argument */
73 int iend; /* index right after the sequence */
74 int sesc, snam, sarg, sendarg, send; /* for sub-escape */
75 int maxl; /* expected length of the argument */
76 int argl; /* actual length of the argument */
77 int c, i; /* for \[char...] parsing */
78 int valid_A; /* for \A parsing */
79 enum mandoc_esc rval; /* return value */
80 enum mandocerr err; /* diagnostic code */
81 char term; /* byte terminating the argument */
82
83 /*
84 * Treat "\E" just like "\";
85 * it only makes a difference in copy mode.
86 */
87
88 iesc = inam = aesc;
89 do {
90 inam++;
91 } while (buf[inam] == 'E');
92
93 /*
94 * Sort the following cases first by syntax category,
95 * then by escape sequence type, and finally by ASCII code.
96 */
97
98 iarg = iendarg = iend = inam + 1;
99 maxl = INT_MAX;
100 term = '\0';
101 switch (buf[inam]) {
102
103 /* Escape sequences taking no arguments at all. */
104
105 case '!':
106 case '?':
107 case 'r':
108 rval = ESCAPE_UNSUPP;
109 goto out;
110
111 case '%':
112 case '&':
113 case ')':
114 case ',':
115 case '/':
116 case '^':
117 case 'a':
118 case 'd':
119 case 't':
120 case 'u':
121 case '{':
122 case '|':
123 case '}':
124 rval = ESCAPE_IGNORE;
125 goto out;
126
127 case '\0':
128 iendarg = --iend;
129 /* FALLTHROUGH */
130 case '\\':
131 default:
132 iarg--;
133 rval = ESCAPE_UNDEF;
134 goto out;
135
136 case ' ':
137 case '\'':
138 case '-':
139 case '.':
140 case '0':
141 case ':':
142 case '_':
143 case '`':
144 case 'e':
145 case '~':
146 iarg--;
147 argl = 1;
148 rval = ESCAPE_SPECIAL;
149 goto out;
150 case 'p':
151 rval = ESCAPE_BREAK;
152 goto out;
153 case 'c':
154 rval = ESCAPE_NOSPACE;
155 goto out;
156 case 'z':
157 rval = ESCAPE_SKIPCHAR;
158 goto out;
159
160 /* Standard argument format. */
161
162 case '$':
163 case '*':
164 case 'V':
165 case 'g':
166 case 'n':
167 rval = ESCAPE_EXPAND;
168 break;
169 case 'F':
170 case 'M':
171 case 'O':
172 case 'Y':
173 case 'k':
174 case 'm':
175 rval = ESCAPE_IGNORE;
176 break;
177 case '(':
178 case '[':
179 rval = ESCAPE_SPECIAL;
180 iendarg = iend = --iarg;
181 break;
182 case 'f':
183 rval = ESCAPE_FONT;
184 break;
185
186 /* Quoted arguments */
187
188 case 'A':
189 case 'B':
190 case 'w':
191 rval = ESCAPE_EXPAND;
192 term = '\b';
193 break;
194 case 'D':
195 case 'H':
196 case 'L':
197 case 'R':
198 case 'S':
199 case 'X':
200 case 'Z':
201 case 'b':
202 case 'v':
203 case 'x':
204 rval = ESCAPE_IGNORE;
205 term = '\b';
206 break;
207 case 'C':
208 if (buf[iarg] != '\'') {
209 rval = ESCAPE_ERROR;
210 goto out;
211 }
212 rval = ESCAPE_SPECIAL;
213 term = '\b';
214 break;
215 case 'N':
216 rval = ESCAPE_NUMBERED;
217 term = '\b';
218 break;
219 case 'h':
220 rval = ESCAPE_HORIZ;
221 term = '\b';
222 break;
223 case 'l':
224 rval = ESCAPE_HLINE;
225 term = '\b';
226 break;
227 case 'o':
228 rval = ESCAPE_OVERSTRIKE;
229 term = '\b';
230 break;
231
232 /* Sizes support both forms, with additional peculiarities. */
233
234 case 's':
235 rval = ESCAPE_IGNORE;
236 if (buf[iarg] == '+' || buf[iarg] == '-'||
237 buf[iarg] == ASCII_HYPH)
238 iarg++;
239 switch (buf[iarg]) {
240 case '(':
241 maxl = 2;
242 iarg++;
243 break;
244 case '[':
245 term = ']';
246 iarg++;
247 break;
248 case '\'':
249 term = '\'';
250 iarg++;
251 break;
252 case '1':
253 case '2':
254 case '3':
255 if (buf[iarg - 1] == 's' &&
256 isdigit((unsigned char)buf[iarg + 1])) {
257 maxl = 2;
258 break;
259 }
260 /* FALLTHROUGH */
261 default:
262 maxl = 1;
263 break;
264 }
265 iendarg = iend = iarg;
266 }
267
268 /* Decide how to end the argument. */
269
270 if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
271 buf[iarg] == buf[iesc] && roff_escape(buf, ln, iendarg,
272 &sesc, &snam, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
273 goto out_sub;
274
275 if (term == '\b') {
276 if ((buf[inam] == 'N' && isdigit((unsigned char)buf[iarg])) ||
277 (buf[inam] == 'h' && strchr(" %&()*+-./0123456789:<=>",
278 buf[iarg]) != NULL)) {
279 iendarg = iend = iarg + 1;
280 rval = ESCAPE_ERROR;
281 goto out;
282 }
283 term = buf[iarg++];
284 } else if (term == '\0' && maxl == INT_MAX) {
285 if (buf[inam] == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
286 iarg++;
287 switch (buf[iarg]) {
288 case '(':
289 maxl = 2;
290 iarg++;
291 break;
292 case '[':
293 if (buf[++iarg] == ' ') {
294 iendarg = iend = iarg + 1;
295 rval = ESCAPE_ERROR;
296 goto out;
297 }
298 term = ']';
299 break;
300 default:
301 maxl = 1;
302 break;
303 }
304 }
305
306 /* Advance to the end of the argument. */
307
308 valid_A = 1;
309 iendarg = iarg;
310 while (maxl > 0) {
311 if (buf[iendarg] == '\0') {
312 /* Ignore an incomplete argument except for \w. */
313 if (buf[inam] != 'w')
314 iendarg = iarg;
315 break;
316 }
317 if (buf[iendarg] == term) {
318 iend = iendarg + 1;
319 break;
320 }
321 if (buf[inam] == 'N' &&
322 isdigit((unsigned char)buf[iendarg]) == 0) {
323 iend = iendarg + 1;
324 break;
325 }
326 if (buf[iendarg] == buf[iesc]) {
327 switch (roff_escape(buf, ln, iendarg,
328 &sesc, &snam, &sarg, &sendarg, &send)) {
329 case ESCAPE_EXPAND:
330 goto out_sub;
331 case ESCAPE_UNDEF:
332 break;
333 default:
334 valid_A = 0;
335 break;
336 }
337 iendarg = iend = send;
338 } else {
339 if (buf[iendarg] == ' ' || buf[iendarg] == '\t')
340 valid_A = 0;
341 if (maxl != INT_MAX)
342 maxl--;
343 iend = ++iendarg;
344 }
345 }
346 if (resc != NULL && ((maxl != INT_MAX && maxl != 0) ||
347 (term != '\0' && buf[iendarg] != term)))
348 mandoc_msg(MANDOCERR_ESC_BAD, ln, iesc, "%s", buf + iesc);
349
350 /* Post-process depending on the content of the argument. */
351
352 argl = iendarg - iarg;
353 switch (buf[inam]) {
354 case '*':
355 if (resc == NULL && argl == 2 &&
356 buf[iarg] == '.' && buf[iarg + 1] == 'T')
357 rval = ESCAPE_DEVICE;
358 break;
359 case 'A':
360 if (valid_A == 0)
361 iendarg = iarg;
362 break;
363 case 'O':
364 switch (buf[iarg]) {
365 case '0':
366 rval = ESCAPE_UNSUPP;
367 break;
368 case '1':
369 case '2':
370 case '3':
371 case '4':
372 rval = argl == 1 ? ESCAPE_IGNORE : ESCAPE_ERROR;
373 break;
374 case '5':
375 rval = buf[iarg - 1] == '[' ? ESCAPE_UNSUPP :
376 ESCAPE_ERROR;
377 break;
378 default:
379 rval = ESCAPE_ERROR;
380 break;
381 }
382 break;
383 default:
384 break;
385 }
386
387 switch (rval) {
388 case ESCAPE_FONT:
389 rval = mandoc_font(buf + iarg, argl);
390 break;
391
392 case ESCAPE_SPECIAL:
393
394 /*
395 * The file chars.c only provides one common list of
396 * character names, but \[-] == \- is the only one of
397 * the characters with one-byte names that allows
398 * enclosing the name in brackets.
399 */
400
401 if (term != '\0' && argl == 1 && buf[iarg] != '-') {
402 rval = ESCAPE_ERROR;
403 break;
404 }
405
406 /* Treat \[char...] as an alias for \N'...'. */
407
408 if (buf[iarg] == 'c') {
409 if (argl < 6 || argl > 7 ||
410 strncmp(buf + iarg, "char", 4) != 0 ||
411 (int)strspn(buf + iarg + 4, "0123456789")
412 + 4 < argl)
413 break;
414 c = 0;
415 for (i = iarg; i < iendarg; i++)
416 c = 10 * c + (buf[i] - '0');
417 if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
418 break;
419 iarg += 4;
420 rval = ESCAPE_NUMBERED;
421 break;
422 }
423
424 /*
425 * Unicode escapes are defined in groff as \[u0000]
426 * to \[u10FFFF], where the contained value must be
427 * a valid Unicode codepoint. Here, however, only
428 * check the length and range.
429 */
430
431 if (buf[iarg] != 'u' || argl < 5 || argl > 7)
432 break;
433 if (argl == 7 &&
434 (buf[iarg + 1] != '1' || buf[iarg + 2] != '0'))
435 break;
436 if (argl == 6 && buf[iarg + 1] == '0')
437 break;
438 if (argl == 5 && buf[iarg + 1] == 'D' &&
439 strchr("89ABCDEF", buf[iarg + 2]) != NULL)
440 break;
441 if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
442 + 1 == argl)
443 rval = ESCAPE_UNICODE;
444 break;
445 default:
446 break;
447 }
448 goto out;
449
450 out_sub:
451 iesc = sesc;
452 inam = snam;
453 iarg = sarg;
454 iendarg = sendarg;
455 iend = send;
456 rval = ESCAPE_EXPAND;
457
458 out:
459 if (rnam != NULL)
460 *rnam = inam;
461 if (rarg != NULL)
462 *rarg = iarg;
463 if (rendarg != NULL)
464 *rendarg = iendarg;
465 if (rend != NULL)
466 *rend = iend;
467 if (resc == NULL)
468 return rval;
469
470 /*
471 * Diagnostic messages are only issued when called
472 * from the parser, not when called from the formatters.
473 */
474
475 *resc = iesc;
476 switch (rval) {
477 case ESCAPE_ERROR:
478 err = MANDOCERR_ESC_BAD;
479 break;
480 case ESCAPE_UNSUPP:
481 err = MANDOCERR_ESC_UNSUPP;
482 break;
483 case ESCAPE_UNDEF:
484 if (buf[inam] == '\\')
485 return rval;
486 err = MANDOCERR_ESC_UNDEF;
487 break;
488 case ESCAPE_SPECIAL:
489 if (mchars_spec2cp(buf + iarg, argl) >= 0)
490 return rval;
491 err = MANDOCERR_ESC_BAD;
492 break;
493 default:
494 return rval;
495 }
496 mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
497 return rval;
498 }