]> git.cameronkatri.com Git - mandoc.git/blob - roff_escape.c
Re-classify the roff(7) \r (reverse line feed) escape sequence
[mandoc.git] / roff_escape.c
1 /* $OpenBSD$ */
2 /*
3 * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
4 * Ingo Schwarze <schwarze@openbsd.org>
5 * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 *
19 * Parser for roff(7) escape sequences.
20 * To be used by all mandoc(1) parsers and formatters.
21 */
22 #include <assert.h>
23 #include <ctype.h>
24 #include <limits.h>
25 #include <stdio.h>
26 #include <string.h>
27
28 #include "mandoc.h"
29 #include "roff.h"
30 #include "roff_int.h"
31
32 /*
33 * Traditional escape sequence interpreter for general use
34 * including in high-level formatters. This function does not issue
35 * diagnostics and is not usable for expansion in the roff(7) parser.
36 * It is documented in the mandoc_escape(3) manual page.
37 */
38 enum mandoc_esc
39 mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
40 {
41 int iarg, iendarg, iend;
42 enum mandoc_esc rval;
43
44 rval = roff_escape(--*rendarg, 0, 0, NULL, &iarg, &iendarg, &iend);
45 assert(rval != ESCAPE_EXPAND);
46 if (rarg != NULL)
47 *rarg = *rendarg + iarg;
48 if (rargl != NULL)
49 *rargl = iendarg - iarg;
50 *rendarg += iend;
51 return rval;
52 }
53
54 /*
55 * Full-featured escape sequence parser.
56 * If it encounters a nested escape sequence that requires expansion
57 * by the parser and re-parsing, the positions of that inner escape
58 * sequence are returned in *resc ... *rend.
59 * Otherwise, *resc is set to aesc and the positions of the escape
60 * sequence starting at aesc are returned.
61 * Diagnostic messages are generated if and only if resc != NULL,
62 * that is, if and only if called by roff_expand().
63 */
64 enum mandoc_esc
65 roff_escape(const char *buf, const int ln, const int aesc,
66 int *resc, int *rarg, int *rendarg, int *rend)
67 {
68 int iesc; /* index of leading escape char */
69 int iarg; /* index beginning the argument */
70 int iendarg; /* index right after the argument */
71 int iend; /* index right after the sequence */
72 int sesc, sarg, sendarg, send; /* for sub-escape */
73 int maxl; /* expected length of the argument */
74 int argl; /* actual length of the argument */
75 int c, i; /* for \[char...] parsing */
76 enum mandoc_esc rval; /* return value */
77 enum mandocerr err; /* diagnostic code */
78 char esc_name;
79 char term; /* byte terminating the argument */
80
81 /*
82 * Treat "\E" just like "\";
83 * it only makes a difference in copy mode.
84 */
85
86 iesc = iarg = aesc;
87 do {
88 iarg++;
89 } while (buf[iarg] == 'E');
90
91 /*
92 * Sort the following cases first by syntax category,
93 * then by escape sequence type, and finally by ASCII code.
94 */
95
96 esc_name = buf[iarg];
97 iendarg = iend = ++iarg;
98 maxl = INT_MAX;
99 term = '\0';
100 switch (esc_name) {
101
102 /* Escape sequences taking no arguments at all. */
103
104 case '!':
105 case '?':
106 case 'r':
107 rval = ESCAPE_UNSUPP;
108 goto out;
109
110 case '%':
111 case '&':
112 case ')':
113 case ',':
114 case '/':
115 case '^':
116 case 'a':
117 case 'd':
118 case 't':
119 case 'u':
120 case '{':
121 case '|':
122 case '}':
123 rval = ESCAPE_IGNORE;
124 goto out;
125
126 case '\\':
127 default:
128 iarg--;
129 rval = ESCAPE_UNDEF;
130 goto out;
131
132 case ' ':
133 case '\'':
134 case '-':
135 case '.':
136 case '0':
137 case ':':
138 case '_':
139 case '`':
140 case 'e':
141 case '~':
142 iarg--;
143 argl = 1;
144 rval = ESCAPE_SPECIAL;
145 goto out;
146 case 'p':
147 rval = ESCAPE_BREAK;
148 goto out;
149 case 'c':
150 rval = ESCAPE_NOSPACE;
151 goto out;
152 case 'z':
153 rval = ESCAPE_SKIPCHAR;
154 goto out;
155
156 /* Standard argument format. */
157
158 case '$':
159 case '*':
160 case 'n':
161 rval = ESCAPE_EXPAND;
162 break;
163 case 'F':
164 case 'M':
165 case 'O':
166 case 'V':
167 case 'Y':
168 case 'g':
169 case 'k':
170 case 'm':
171 rval = ESCAPE_IGNORE;
172 break;
173 case '(':
174 case '[':
175 rval = ESCAPE_SPECIAL;
176 iendarg = iend = --iarg;
177 break;
178 case 'f':
179 rval = ESCAPE_FONT;
180 break;
181
182 /* Quoted arguments */
183
184 case 'B':
185 case 'w':
186 rval = ESCAPE_EXPAND;
187 term = '\b';
188 break;
189 case 'A':
190 case 'D':
191 case 'H':
192 case 'L':
193 case 'R':
194 case 'S':
195 case 'X':
196 case 'Z':
197 case 'b':
198 case 'v':
199 case 'x':
200 rval = ESCAPE_IGNORE;
201 term = '\b';
202 break;
203 case 'C':
204 if (buf[iarg] != '\'') {
205 rval = ESCAPE_ERROR;
206 goto out;
207 }
208 rval = ESCAPE_SPECIAL;
209 term = '\b';
210 break;
211 case 'N':
212 rval = ESCAPE_NUMBERED;
213 term = '\b';
214 break;
215 case 'h':
216 rval = ESCAPE_HORIZ;
217 term = '\b';
218 break;
219 case 'l':
220 rval = ESCAPE_HLINE;
221 term = '\b';
222 break;
223 case 'o':
224 rval = ESCAPE_OVERSTRIKE;
225 term = '\b';
226 break;
227
228 /* Sizes support both forms, with additional peculiarities. */
229
230 case 's':
231 rval = ESCAPE_IGNORE;
232 if (buf[iarg] == '+' || buf[iarg] == '-'||
233 buf[iarg] == ASCII_HYPH)
234 iarg++;
235 switch (buf[iarg]) {
236 case '(':
237 maxl = 2;
238 iarg++;
239 break;
240 case '[':
241 term = ']';
242 iarg++;
243 break;
244 case '\'':
245 term = '\'';
246 iarg++;
247 break;
248 case '1':
249 case '2':
250 case '3':
251 if (buf[iarg - 1] == 's' &&
252 isdigit((unsigned char)buf[iarg + 1])) {
253 maxl = 2;
254 break;
255 }
256 /* FALLTHROUGH */
257 default:
258 maxl = 1;
259 break;
260 }
261 iendarg = iend = iarg;
262 }
263
264 /* Decide how to end the argument. */
265
266 if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
267 buf[iarg] == buf[iesc] && roff_escape(buf, ln, iendarg,
268 &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
269 goto out_sub;
270
271 if (term == '\b') {
272 if ((esc_name == 'N' && isdigit((unsigned char)buf[iarg])) ||
273 (esc_name == 'h' && strchr(" %&()*+-./0123456789:<=>",
274 buf[iarg]) != NULL)) {
275 iendarg = iend = iarg + 1;
276 rval = ESCAPE_ERROR;
277 goto out;
278 }
279 term = buf[iarg++];
280 } else if (term == '\0' && maxl == INT_MAX) {
281 if (esc_name == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
282 iarg++;
283 switch (buf[iarg]) {
284 case '(':
285 maxl = 2;
286 iarg++;
287 break;
288 case '[':
289 if (buf[++iarg] == ' ') {
290 iendarg = iend = iarg + 1;
291 rval = ESCAPE_ERROR;
292 goto out;
293 }
294 term = ']';
295 break;
296 default:
297 maxl = 1;
298 break;
299 }
300 }
301
302 /* Advance to the end of the argument. */
303
304 iendarg = iarg;
305 while (maxl > 0) {
306 if (buf[iendarg] == '\0') {
307 /* Ignore an incomplete argument except for \w. */
308 if (esc_name != 'w')
309 iendarg = iarg;
310 break;
311 }
312 if (buf[iendarg] == term) {
313 iend = iendarg + 1;
314 break;
315 }
316 if (esc_name == 'N' &&
317 isdigit((unsigned char)buf[iendarg]) == 0) {
318 iend = iendarg + 1;
319 break;
320 }
321 if (buf[iendarg] == buf[iesc]) {
322 if (roff_escape(buf, ln, iendarg,
323 &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
324 goto out_sub;
325 iendarg = iend = send;
326 } else {
327 if (maxl != INT_MAX)
328 maxl--;
329 iend = ++iendarg;
330 }
331 }
332 if (resc != NULL && ((maxl != INT_MAX && maxl != 0) ||
333 (term != '\0' && buf[iendarg] != term)))
334 mandoc_msg(MANDOCERR_ESC_BAD, ln, iesc, "%s", buf + iesc);
335
336 /* Post-process depending on the content of the argument. */
337
338 argl = iendarg - iarg;
339 switch (esc_name) {
340 case '*':
341 if (resc == NULL && argl == 2 &&
342 buf[iarg] == '.' && buf[iarg + 1] == 'T')
343 rval = ESCAPE_DEVICE;
344 break;
345 case 'O':
346 switch (buf[iarg]) {
347 case '0':
348 rval = ESCAPE_UNSUPP;
349 break;
350 case '1':
351 case '2':
352 case '3':
353 case '4':
354 rval = argl == 1 ? ESCAPE_IGNORE : ESCAPE_ERROR;
355 break;
356 case '5':
357 rval = buf[iarg - 1] == '[' ? ESCAPE_UNSUPP :
358 ESCAPE_ERROR;
359 break;
360 default:
361 rval = ESCAPE_ERROR;
362 break;
363 }
364 break;
365 default:
366 break;
367 }
368
369 switch (rval) {
370 case ESCAPE_FONT:
371 rval = mandoc_font(buf + iarg, argl);
372 break;
373
374 case ESCAPE_SPECIAL:
375
376 /*
377 * The file chars.c only provides one common list of
378 * character names, but \[-] == \- is the only one of
379 * the characters with one-byte names that allows
380 * enclosing the name in brackets.
381 */
382
383 if (term != '\0' && argl == 1 && buf[iarg] != '-') {
384 rval = ESCAPE_ERROR;
385 break;
386 }
387
388 /* Treat \[char...] as an alias for \N'...'. */
389
390 if (buf[iarg] == 'c') {
391 if (argl < 6 || argl > 7 ||
392 strncmp(buf + iarg, "char", 4) != 0 ||
393 (int)strspn(buf + iarg + 4, "0123456789")
394 + 4 < argl)
395 break;
396 c = 0;
397 for (i = iarg; i < iendarg; i++)
398 c = 10 * c + (buf[i] - '0');
399 if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
400 break;
401 iarg += 4;
402 rval = ESCAPE_NUMBERED;
403 break;
404 }
405
406 /*
407 * Unicode escapes are defined in groff as \[u0000]
408 * to \[u10FFFF], where the contained value must be
409 * a valid Unicode codepoint. Here, however, only
410 * check the length and range.
411 */
412
413 if (buf[iarg] != 'u' || argl < 5 || argl > 7)
414 break;
415 if (argl == 7 &&
416 (buf[iarg + 1] != '1' || buf[iarg + 2] != '0'))
417 break;
418 if (argl == 6 && buf[iarg + 1] == '0')
419 break;
420 if (argl == 5 && buf[iarg + 1] == 'D' &&
421 strchr("89ABCDEF", buf[iarg + 2]) != NULL)
422 break;
423 if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
424 + 1 == argl)
425 rval = ESCAPE_UNICODE;
426 break;
427 default:
428 break;
429 }
430 goto out;
431
432 out_sub:
433 iesc = sesc;
434 iarg = sarg;
435 iendarg = sendarg;
436 iend = send;
437 rval = ESCAPE_EXPAND;
438
439 out:
440 if (rarg != NULL)
441 *rarg = iarg;
442 if (rendarg != NULL)
443 *rendarg = iendarg;
444 if (rend != NULL)
445 *rend = iend;
446 if (resc == NULL)
447 return rval;
448
449 /*
450 * Diagnostic messages are only issued when called
451 * from the parser, not when called from the formatters.
452 */
453
454 *resc = iesc;
455 switch (rval) {
456 case ESCAPE_ERROR:
457 err = MANDOCERR_ESC_BAD;
458 break;
459 case ESCAPE_UNSUPP:
460 err = MANDOCERR_ESC_UNSUPP;
461 break;
462 case ESCAPE_UNDEF:
463 if (esc_name == '\\')
464 return rval;
465 err = MANDOCERR_ESC_UNDEF;
466 break;
467 case ESCAPE_SPECIAL:
468 if (mchars_spec2cp(buf + iarg, argl) >= 0)
469 return rval;
470 err = MANDOCERR_ESC_BAD;
471 break;
472 default:
473 return rval;
474 }
475 mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
476 return rval;
477 }