]> git.cameronkatri.com Git - mandoc.git/blob - roff_escape.c
3fdcf8d251305701bb89178253a84d9283e687b4
[mandoc.git] / roff_escape.c
1 /* $Id: roff_escape.c,v 1.12 2022/06/06 19:23:13 schwarze Exp $ */
2 /*
3 * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
4 * Ingo Schwarze <schwarze@openbsd.org>
5 * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 *
19 * Parser for roff(7) escape sequences.
20 * To be used by all mandoc(1) parsers and formatters.
21 */
22 #include <assert.h>
23 #include <ctype.h>
24 #include <limits.h>
25 #include <stdio.h>
26 #include <string.h>
27
28 #include "mandoc.h"
29 #include "roff.h"
30 #include "roff_int.h"
31
32 /*
33 * Traditional escape sequence interpreter for general use
34 * including in high-level formatters. This function does not issue
35 * diagnostics and is not usable for expansion in the roff(7) parser.
36 * It is documented in the mandoc_escape(3) manual page.
37 */
38 enum mandoc_esc
39 mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
40 {
41 int iarg, iendarg, iend;
42 enum mandoc_esc rval;
43
44 rval = roff_escape(--*rendarg, 0, 0,
45 NULL, NULL, &iarg, &iendarg, &iend);
46 assert(rval != ESCAPE_EXPAND);
47 if (rarg != NULL)
48 *rarg = *rendarg + iarg;
49 if (rargl != NULL)
50 *rargl = iendarg - iarg;
51 *rendarg += iend;
52 return rval;
53 }
54
55 /*
56 * Full-featured escape sequence parser.
57 * If it encounters a nested escape sequence that requires expansion
58 * by the parser and re-parsing, the positions of that inner escape
59 * sequence are returned in *resc ... *rend.
60 * Otherwise, *resc is set to aesc and the positions of the escape
61 * sequence starting at aesc are returned.
62 * Diagnostic messages are generated if and only if resc != NULL,
63 * that is, if and only if called by roff_expand().
64 */
65 enum mandoc_esc
66 roff_escape(const char *buf, const int ln, const int aesc,
67 int *resc, int *rnam, int *rarg, int *rendarg, int *rend)
68 {
69 int iesc; /* index of leading escape char */
70 int inam; /* index of escape name */
71 int iarg; /* index beginning the argument */
72 int iendarg; /* index right after the argument */
73 int iend; /* index right after the sequence */
74 int sesc, snam, sarg, sendarg, send; /* for sub-escape */
75 int maxl; /* expected length of the argument */
76 int argl; /* actual length of the argument */
77 int c, i; /* for \[char...] parsing */
78 int valid_A; /* for \A parsing */
79 enum mandoc_esc rval; /* return value */
80 enum mandocerr err; /* diagnostic code */
81 char term; /* byte terminating the argument */
82
83 /*
84 * Treat "\E" just like "\";
85 * it only makes a difference in copy mode.
86 */
87
88 iesc = inam = aesc;
89 do {
90 inam++;
91 } while (buf[inam] == 'E');
92
93 /*
94 * Sort the following cases first by syntax category,
95 * then by escape sequence type, and finally by ASCII code.
96 */
97
98 iarg = iendarg = iend = inam + 1;
99 maxl = INT_MAX;
100 term = '\0';
101 err = MANDOCERR_OK;
102 switch (buf[inam]) {
103
104 /* Escape sequences taking no arguments at all. */
105
106 case '!':
107 case '?':
108 case 'r':
109 rval = ESCAPE_UNSUPP;
110 goto out;
111
112 case '%':
113 case '&':
114 case ')':
115 case ',':
116 case '/':
117 case '^':
118 case 'a':
119 case 'd':
120 case 't':
121 case 'u':
122 case '{':
123 case '|':
124 case '}':
125 rval = ESCAPE_IGNORE;
126 goto out;
127
128 case '\0':
129 iendarg = --iend;
130 /* FALLTHROUGH */
131 case '.':
132 case '\\':
133 default:
134 iarg--;
135 rval = ESCAPE_UNDEF;
136 goto out;
137
138 case ' ':
139 case '\'':
140 case '-':
141 case '0':
142 case ':':
143 case '_':
144 case '`':
145 case 'e':
146 case '~':
147 iarg--;
148 argl = 1;
149 rval = ESCAPE_SPECIAL;
150 goto out;
151 case 'p':
152 rval = ESCAPE_BREAK;
153 goto out;
154 case 'c':
155 rval = ESCAPE_NOSPACE;
156 goto out;
157 case 'z':
158 rval = ESCAPE_SKIPCHAR;
159 goto out;
160
161 /* Standard argument format. */
162
163 case '$':
164 case '*':
165 case 'V':
166 case 'g':
167 case 'n':
168 rval = ESCAPE_EXPAND;
169 break;
170 case 'F':
171 case 'M':
172 case 'O':
173 case 'Y':
174 case 'k':
175 case 'm':
176 rval = ESCAPE_IGNORE;
177 break;
178 case '(':
179 case '[':
180 rval = ESCAPE_SPECIAL;
181 iendarg = iend = --iarg;
182 break;
183 case 'f':
184 rval = ESCAPE_FONT;
185 break;
186
187 /* Quoted arguments */
188
189 case 'A':
190 case 'B':
191 case 'w':
192 rval = ESCAPE_EXPAND;
193 term = '\b';
194 break;
195 case 'D':
196 case 'H':
197 case 'L':
198 case 'R':
199 case 'S':
200 case 'X':
201 case 'Z':
202 case 'b':
203 case 'v':
204 case 'x':
205 rval = ESCAPE_IGNORE;
206 term = '\b';
207 break;
208 case 'C':
209 rval = ESCAPE_SPECIAL;
210 term = '\b';
211 break;
212 case 'N':
213 rval = ESCAPE_NUMBERED;
214 term = '\b';
215 break;
216 case 'h':
217 rval = ESCAPE_HORIZ;
218 term = '\b';
219 break;
220 case 'l':
221 rval = ESCAPE_HLINE;
222 term = '\b';
223 break;
224 case 'o':
225 rval = ESCAPE_OVERSTRIKE;
226 term = '\b';
227 break;
228
229 /* Sizes support both forms, with additional peculiarities. */
230
231 case 's':
232 rval = ESCAPE_IGNORE;
233 if (buf[iarg] == '+' || buf[iarg] == '-'||
234 buf[iarg] == ASCII_HYPH)
235 iarg++;
236 switch (buf[iarg]) {
237 case '(':
238 maxl = 2;
239 iarg++;
240 break;
241 case '[':
242 term = ']';
243 iarg++;
244 break;
245 case '\'':
246 term = '\'';
247 iarg++;
248 break;
249 case '1':
250 case '2':
251 case '3':
252 if (buf[iarg - 1] == 's' &&
253 isdigit((unsigned char)buf[iarg + 1])) {
254 maxl = 2;
255 break;
256 }
257 /* FALLTHROUGH */
258 default:
259 maxl = 1;
260 break;
261 }
262 iendarg = iend = iarg;
263 }
264
265 /* Decide how to end the argument. */
266
267 if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
268 buf[iarg] == buf[iesc] && roff_escape(buf, ln, iendarg,
269 &sesc, &snam, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
270 goto out_sub;
271
272 if (term == '\b') {
273 if (strchr("BDHLRSvxNhl", buf[inam]) != NULL &&
274 strchr(" %&()*+-./0123456789:<=>", buf[iarg]) != NULL) {
275 if (rval != ESCAPE_EXPAND)
276 rval = ESCAPE_ERROR;
277 if (buf[inam] != 'D') {
278 iendarg = iend = iarg + 1;
279 goto out;
280 }
281 }
282 term = buf[iarg++];
283 } else if (term == '\0' && maxl == INT_MAX) {
284 if (buf[inam] == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
285 iarg++;
286 switch (buf[iarg]) {
287 case '(':
288 maxl = 2;
289 iarg++;
290 break;
291 case '[':
292 if (buf[++iarg] == ' ') {
293 iendarg = iend = iarg + 1;
294 rval = ESCAPE_ERROR;
295 goto out;
296 }
297 term = ']';
298 break;
299 default:
300 maxl = 1;
301 break;
302 }
303 }
304
305 /* Advance to the end of the argument. */
306
307 valid_A = 1;
308 iendarg = iarg;
309 while (maxl > 0) {
310 if (buf[iendarg] == '\0') {
311 err = MANDOCERR_ESC_INCOMPLETE;
312 if (rval != ESCAPE_EXPAND)
313 rval = ESCAPE_ERROR;
314 /* Ignore an incomplete argument except for \w. */
315 if (buf[inam] != 'w')
316 iendarg = iarg;
317 break;
318 }
319 if (buf[iendarg] == term) {
320 iend = iendarg + 1;
321 break;
322 }
323 if (buf[inam] == 'N' &&
324 isdigit((unsigned char)buf[iendarg]) == 0) {
325 iend = iendarg + 1;
326 break;
327 }
328 if (buf[iendarg] == buf[iesc]) {
329 switch (roff_escape(buf, ln, iendarg,
330 &sesc, &snam, &sarg, &sendarg, &send)) {
331 case ESCAPE_EXPAND:
332 goto out_sub;
333 case ESCAPE_UNDEF:
334 break;
335 default:
336 valid_A = 0;
337 break;
338 }
339 iendarg = iend = send;
340 } else {
341 if (buf[iendarg] == ' ' || buf[iendarg] == '\t')
342 valid_A = 0;
343 if (maxl != INT_MAX)
344 maxl--;
345 iend = ++iendarg;
346 }
347 }
348
349 /* Post-process depending on the content of the argument. */
350
351 argl = iendarg - iarg;
352 switch (buf[inam]) {
353 case '*':
354 if (resc == NULL && argl == 2 &&
355 buf[iarg] == '.' && buf[iarg + 1] == 'T')
356 rval = ESCAPE_DEVICE;
357 break;
358 case 'A':
359 if (valid_A == 0)
360 iendarg = iarg;
361 break;
362 case 'O':
363 switch (buf[iarg]) {
364 case '0':
365 rval = ESCAPE_UNSUPP;
366 break;
367 case '1':
368 case '2':
369 case '3':
370 case '4':
371 rval = argl == 1 ? ESCAPE_IGNORE : ESCAPE_ERROR;
372 break;
373 case '5':
374 rval = buf[iarg - 1] == '[' ? ESCAPE_UNSUPP :
375 ESCAPE_ERROR;
376 break;
377 default:
378 rval = ESCAPE_ERROR;
379 break;
380 }
381 break;
382 default:
383 break;
384 }
385
386 switch (rval) {
387 case ESCAPE_FONT:
388 rval = mandoc_font(buf + iarg, argl);
389 break;
390
391 case ESCAPE_SPECIAL:
392 if (argl == 0) {
393 err = MANDOCERR_ESC_BADCHAR;
394 rval = ESCAPE_ERROR;
395 break;
396 }
397
398 /*
399 * The file chars.c only provides one common list of
400 * character names, but \[-] == \- is the only one of
401 * the characters with one-byte names that allows
402 * enclosing the name in brackets.
403 */
404
405 if (term != '\0' && argl == 1 && buf[iarg] != '-') {
406 err = MANDOCERR_ESC_BADCHAR;
407 rval = ESCAPE_ERROR;
408 break;
409 }
410
411 /* Treat \[char...] as an alias for \N'...'. */
412
413 if (buf[iarg] == 'c') {
414 if (argl < 6 || argl > 7 ||
415 strncmp(buf + iarg, "char", 4) != 0 ||
416 (int)strspn(buf + iarg + 4, "0123456789")
417 + 4 < argl)
418 break;
419 c = 0;
420 for (i = iarg; i < iendarg; i++)
421 c = 10 * c + (buf[i] - '0');
422 if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) {
423 err = MANDOCERR_ESC_BADCHAR;
424 break;
425 }
426 iarg += 4;
427 rval = ESCAPE_NUMBERED;
428 break;
429 }
430
431 /*
432 * Unicode escapes are defined in groff as \[u0000]
433 * to \[u10FFFF], where the contained value must be
434 * a valid Unicode codepoint. Here, however, only
435 * check the length and range.
436 */
437
438 if (buf[iarg] != 'u' || argl < 5 || argl > 7)
439 break;
440 if (argl == 7 &&
441 (buf[iarg + 1] != '1' || buf[iarg + 2] != '0')) {
442 err = MANDOCERR_ESC_BADCHAR;
443 break;
444 }
445 if (argl == 6 && buf[iarg + 1] == '0') {
446 err = MANDOCERR_ESC_BADCHAR;
447 break;
448 }
449 if (argl == 5 && buf[iarg + 1] == 'D' &&
450 strchr("89ABCDEF", buf[iarg + 2]) != NULL) {
451 err = MANDOCERR_ESC_BADCHAR;
452 break;
453 }
454 if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
455 + 1 == argl)
456 rval = ESCAPE_UNICODE;
457 break;
458 default:
459 break;
460 }
461 goto out;
462
463 out_sub:
464 iesc = sesc;
465 inam = snam;
466 iarg = sarg;
467 iendarg = sendarg;
468 iend = send;
469 rval = ESCAPE_EXPAND;
470
471 out:
472 if (rnam != NULL)
473 *rnam = inam;
474 if (rarg != NULL)
475 *rarg = iarg;
476 if (rendarg != NULL)
477 *rendarg = iendarg;
478 if (rend != NULL)
479 *rend = iend;
480 if (resc == NULL)
481 return rval;
482
483 /*
484 * Diagnostic messages are only issued when called
485 * from the parser, not when called from the formatters.
486 */
487
488 *resc = iesc;
489 switch (rval) {
490 case ESCAPE_ERROR:
491 if (err == MANDOCERR_OK)
492 err = MANDOCERR_ESC_BAD;
493 break;
494 case ESCAPE_UNSUPP:
495 err = MANDOCERR_ESC_UNSUPP;
496 break;
497 case ESCAPE_UNDEF:
498 if (buf[inam] != '\\' && buf[inam] != '.')
499 err = MANDOCERR_ESC_UNDEF;
500 break;
501 case ESCAPE_SPECIAL:
502 if (mchars_spec2cp(buf + iarg, argl) >= 0)
503 err = MANDOCERR_OK;
504 else if (err == MANDOCERR_OK)
505 err = MANDOCERR_ESC_UNKCHAR;
506 break;
507 default:
508 break;
509 }
510 if (err != MANDOCERR_OK)
511 mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
512 return rval;
513 }