]> git.cameronkatri.com Git - mandoc.git/blob - mandoc.c
Several improvements to escape sequence handling.
[mandoc.git] / mandoc.c
1 /* $Id: mandoc.c,v 1.111 2018/12/15 19:30:26 schwarze Exp $ */
2 /*
3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2011-2015, 2017, 2018 Ingo Schwarze <schwarze@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18 #include "config.h"
19
20 #include <sys/types.h>
21
22 #include <assert.h>
23 #include <ctype.h>
24 #include <errno.h>
25 #include <limits.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <time.h>
30
31 #include "mandoc_aux.h"
32 #include "mandoc.h"
33 #include "roff.h"
34 #include "libmandoc.h"
35
36 static int a2time(time_t *, const char *, const char *);
37 static char *time2a(time_t);
38
39
40 enum mandoc_esc
41 mandoc_escape(const char **end, const char **start, int *sz)
42 {
43 const char *local_start;
44 int local_sz, c, i;
45 char term;
46 enum mandoc_esc gly;
47
48 /*
49 * When the caller doesn't provide return storage,
50 * use local storage.
51 */
52
53 if (NULL == start)
54 start = &local_start;
55 if (NULL == sz)
56 sz = &local_sz;
57
58 /*
59 * Treat "\E" just like "\";
60 * it only makes a difference in copy mode.
61 */
62
63 if (**end == 'E')
64 ++*end;
65
66 /*
67 * Beyond the backslash, at least one input character
68 * is part of the escape sequence. With one exception
69 * (see below), that character won't be returned.
70 */
71
72 gly = ESCAPE_ERROR;
73 *start = ++*end;
74 *sz = 0;
75 term = '\0';
76
77 switch ((*start)[-1]) {
78 /*
79 * First the glyphs. There are several different forms of
80 * these, but each eventually returns a substring of the glyph
81 * name.
82 */
83 case '(':
84 gly = ESCAPE_SPECIAL;
85 *sz = 2;
86 break;
87 case '[':
88 if (**start == ' ') {
89 ++*end;
90 return ESCAPE_ERROR;
91 }
92 gly = ESCAPE_SPECIAL;
93 term = ']';
94 break;
95 case 'C':
96 if ('\'' != **start)
97 return ESCAPE_ERROR;
98 *start = ++*end;
99 gly = ESCAPE_SPECIAL;
100 term = '\'';
101 break;
102
103 /*
104 * Escapes taking no arguments at all.
105 */
106 case '!':
107 case '?':
108 return ESCAPE_UNSUPP;
109 case '%':
110 case '&':
111 case ')':
112 case ',':
113 case '/':
114 case '^':
115 case 'a':
116 case 'd':
117 case 'r':
118 case 't':
119 case 'u':
120 case '{':
121 case '|':
122 case '}':
123 return ESCAPE_IGNORE;
124 case 'c':
125 return ESCAPE_NOSPACE;
126 case 'p':
127 return ESCAPE_BREAK;
128
129 /*
130 * The \z escape is supposed to output the following
131 * character without advancing the cursor position.
132 * Since we are mostly dealing with terminal mode,
133 * let us just skip the next character.
134 */
135 case 'z':
136 return ESCAPE_SKIPCHAR;
137
138 /*
139 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
140 * 'X' is the trigger. These have opaque sub-strings.
141 */
142 case 'F':
143 case 'f':
144 case 'g':
145 case 'k':
146 case 'M':
147 case 'm':
148 case 'n':
149 case 'O':
150 case 'V':
151 case 'Y':
152 gly = (*start)[-1] == 'f' ? ESCAPE_FONT : ESCAPE_IGNORE;
153 switch (**start) {
154 case '(':
155 if ((*start)[-1] == 'O')
156 gly = ESCAPE_ERROR;
157 *start = ++*end;
158 *sz = 2;
159 break;
160 case '[':
161 if ((*start)[-1] == 'O')
162 gly = (*start)[1] == '5' ?
163 ESCAPE_UNSUPP : ESCAPE_ERROR;
164 *start = ++*end;
165 term = ']';
166 break;
167 default:
168 if ((*start)[-1] == 'O') {
169 switch (**start) {
170 case '0':
171 gly = ESCAPE_UNSUPP;
172 break;
173 case '1':
174 case '2':
175 case '3':
176 case '4':
177 break;
178 default:
179 gly = ESCAPE_ERROR;
180 break;
181 }
182 }
183 *sz = 1;
184 break;
185 }
186 break;
187 case '*':
188 if (strncmp(*start, "(.T", 3) != 0)
189 abort();
190 gly = ESCAPE_DEVICE;
191 *start = ++*end;
192 *sz = 2;
193 break;
194
195 /*
196 * These escapes are of the form \X'Y', where 'X' is the trigger
197 * and 'Y' is any string. These have opaque sub-strings.
198 * The \B and \w escapes are handled in roff.c, roff_res().
199 */
200 case 'A':
201 case 'b':
202 case 'D':
203 case 'R':
204 case 'X':
205 case 'Z':
206 gly = ESCAPE_IGNORE;
207 /* FALLTHROUGH */
208 case 'o':
209 if (**start == '\0')
210 return ESCAPE_ERROR;
211 if (gly == ESCAPE_ERROR)
212 gly = ESCAPE_OVERSTRIKE;
213 term = **start;
214 *start = ++*end;
215 break;
216
217 /*
218 * These escapes are of the form \X'N', where 'X' is the trigger
219 * and 'N' resolves to a numerical expression.
220 */
221 case 'h':
222 case 'H':
223 case 'L':
224 case 'l':
225 case 'S':
226 case 'v':
227 case 'x':
228 if (strchr(" %&()*+-./0123456789:<=>", **start)) {
229 if ('\0' != **start)
230 ++*end;
231 return ESCAPE_ERROR;
232 }
233 switch ((*start)[-1]) {
234 case 'h':
235 gly = ESCAPE_HORIZ;
236 break;
237 case 'l':
238 gly = ESCAPE_HLINE;
239 break;
240 default:
241 gly = ESCAPE_IGNORE;
242 break;
243 }
244 term = **start;
245 *start = ++*end;
246 break;
247
248 /*
249 * Special handling for the numbered character escape.
250 * XXX Do any other escapes need similar handling?
251 */
252 case 'N':
253 if ('\0' == **start)
254 return ESCAPE_ERROR;
255 (*end)++;
256 if (isdigit((unsigned char)**start)) {
257 *sz = 1;
258 return ESCAPE_IGNORE;
259 }
260 (*start)++;
261 while (isdigit((unsigned char)**end))
262 (*end)++;
263 *sz = *end - *start;
264 if ('\0' != **end)
265 (*end)++;
266 return ESCAPE_NUMBERED;
267
268 /*
269 * Sizes get a special category of their own.
270 */
271 case 's':
272 gly = ESCAPE_IGNORE;
273
274 /* See +/- counts as a sign. */
275 if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
276 *start = ++*end;
277
278 switch (**end) {
279 case '(':
280 *start = ++*end;
281 *sz = 2;
282 break;
283 case '[':
284 *start = ++*end;
285 term = ']';
286 break;
287 case '\'':
288 *start = ++*end;
289 term = '\'';
290 break;
291 case '3':
292 case '2':
293 case '1':
294 *sz = (*end)[-1] == 's' &&
295 isdigit((unsigned char)(*end)[1]) ? 2 : 1;
296 break;
297 default:
298 *sz = 1;
299 break;
300 }
301
302 break;
303
304 /*
305 * Several special characters can be encoded as
306 * one-byte escape sequences without using \[].
307 */
308 case ' ':
309 case '\'':
310 case '-':
311 case '.':
312 case '0':
313 case ':':
314 case '_':
315 case '`':
316 case 'e':
317 case '~':
318 gly = ESCAPE_SPECIAL;
319 /* FALLTHROUGH */
320 default:
321 if (gly == ESCAPE_ERROR)
322 gly = ESCAPE_UNDEF;
323 *start = --*end;
324 *sz = 1;
325 break;
326 }
327
328 /*
329 * Read up to the terminating character,
330 * paying attention to nested escapes.
331 */
332
333 if ('\0' != term) {
334 while (**end != term) {
335 switch (**end) {
336 case '\0':
337 return ESCAPE_ERROR;
338 case '\\':
339 (*end)++;
340 if (ESCAPE_ERROR ==
341 mandoc_escape(end, NULL, NULL))
342 return ESCAPE_ERROR;
343 break;
344 default:
345 (*end)++;
346 break;
347 }
348 }
349 *sz = (*end)++ - *start;
350
351 /*
352 * The file chars.c only provides one common list
353 * of character names, but \[-] == \- is the only
354 * one of the characters with one-byte names that
355 * allows enclosing the name in brackets.
356 */
357 if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
358 return ESCAPE_ERROR;
359 } else {
360 assert(*sz > 0);
361 if ((size_t)*sz > strlen(*start))
362 return ESCAPE_ERROR;
363 *end += *sz;
364 }
365
366 /* Run post-processors. */
367
368 switch (gly) {
369 case ESCAPE_FONT:
370 if (*sz == 2) {
371 if (**start == 'C') {
372 if ((*start)[1] == 'W' ||
373 (*start)[1] == 'R') {
374 gly = ESCAPE_FONTCW;
375 break;
376 }
377 /*
378 * Treat other constant-width font modes
379 * just like regular font modes.
380 */
381 (*start)++;
382 (*sz)--;
383 } else {
384 if ((*start)[0] == 'B' && (*start)[1] == 'I')
385 gly = ESCAPE_FONTBI;
386 break;
387 }
388 } else if (*sz != 1) {
389 if (*sz == 0)
390 gly = ESCAPE_FONTPREV;
391 break;
392 }
393
394 switch (**start) {
395 case '3':
396 case 'B':
397 gly = ESCAPE_FONTBOLD;
398 break;
399 case '2':
400 case 'I':
401 gly = ESCAPE_FONTITALIC;
402 break;
403 case 'P':
404 gly = ESCAPE_FONTPREV;
405 break;
406 case '1':
407 case 'R':
408 gly = ESCAPE_FONTROMAN;
409 break;
410 }
411 break;
412 case ESCAPE_SPECIAL:
413 if (**start == 'c') {
414 if (*sz < 6 || *sz > 7 ||
415 strncmp(*start, "char", 4) != 0 ||
416 (int)strspn(*start + 4, "0123456789") + 4 < *sz)
417 break;
418 c = 0;
419 for (i = 4; i < *sz; i++)
420 c = 10 * c + ((*start)[i] - '0');
421 if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
422 break;
423 *start += 4;
424 *sz -= 4;
425 gly = ESCAPE_NUMBERED;
426 break;
427 }
428
429 /*
430 * Unicode escapes are defined in groff as \[u0000]
431 * to \[u10FFFF], where the contained value must be
432 * a valid Unicode codepoint. Here, however, only
433 * check the length and range.
434 */
435 if (**start != 'u' || *sz < 5 || *sz > 7)
436 break;
437 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
438 break;
439 if (*sz == 6 && (*start)[1] == '0')
440 break;
441 if (*sz == 5 && (*start)[1] == 'D' &&
442 strchr("89ABCDEF", (*start)[2]) != NULL)
443 break;
444 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
445 + 1 == *sz)
446 gly = ESCAPE_UNICODE;
447 break;
448 default:
449 break;
450 }
451
452 return gly;
453 }
454
455 /*
456 * Parse a quoted or unquoted roff-style request or macro argument.
457 * Return a pointer to the parsed argument, which is either the original
458 * pointer or advanced by one byte in case the argument is quoted.
459 * NUL-terminate the argument in place.
460 * Collapse pairs of quotes inside quoted arguments.
461 * Advance the argument pointer to the next argument,
462 * or to the NUL byte terminating the argument line.
463 */
464 char *
465 mandoc_getarg(char **cpp, int ln, int *pos)
466 {
467 char *start, *cp;
468 int quoted, pairs, white;
469
470 /* Quoting can only start with a new word. */
471 start = *cpp;
472 quoted = 0;
473 if ('"' == *start) {
474 quoted = 1;
475 start++;
476 }
477
478 pairs = 0;
479 white = 0;
480 for (cp = start; '\0' != *cp; cp++) {
481
482 /*
483 * Move the following text left
484 * after quoted quotes and after "\\" and "\t".
485 */
486 if (pairs)
487 cp[-pairs] = cp[0];
488
489 if ('\\' == cp[0]) {
490 /*
491 * In copy mode, translate double to single
492 * backslashes and backslash-t to literal tabs.
493 */
494 switch (cp[1]) {
495 case 'a':
496 case 't':
497 cp[0] = '\t';
498 /* FALLTHROUGH */
499 case '\\':
500 pairs++;
501 cp++;
502 break;
503 case ' ':
504 /* Skip escaped blanks. */
505 if (0 == quoted)
506 cp++;
507 break;
508 default:
509 break;
510 }
511 } else if (0 == quoted) {
512 if (' ' == cp[0]) {
513 /* Unescaped blanks end unquoted args. */
514 white = 1;
515 break;
516 }
517 } else if ('"' == cp[0]) {
518 if ('"' == cp[1]) {
519 /* Quoted quotes collapse. */
520 pairs++;
521 cp++;
522 } else {
523 /* Unquoted quotes end quoted args. */
524 quoted = 2;
525 break;
526 }
527 }
528 }
529
530 /* Quoted argument without a closing quote. */
531 if (1 == quoted)
532 mandoc_msg(MANDOCERR_ARG_QUOTE, ln, *pos, NULL);
533
534 /* NUL-terminate this argument and move to the next one. */
535 if (pairs)
536 cp[-pairs] = '\0';
537 if ('\0' != *cp) {
538 *cp++ = '\0';
539 while (' ' == *cp)
540 cp++;
541 }
542 *pos += (int)(cp - start) + (quoted ? 1 : 0);
543 *cpp = cp;
544
545 if ('\0' == *cp && (white || ' ' == cp[-1]))
546 mandoc_msg(MANDOCERR_SPACE_EOL, ln, *pos, NULL);
547
548 return start;
549 }
550
/*
 * Parse the date string p according to the strptime(3) format fmt.
 * On success, store the result of mktime(3) in *t and return 1.
 * Return 0 and leave *t untouched if the string does not match
 * the format completely or if strptime(3) is not available.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	memset(&parsed, 0, sizeof(parsed));

	rest = NULL;
#if HAVE_STRPTIME
	rest = strptime(p, fmt, &parsed);
#endif

	/* Both parse failures and trailing garbage are rejected. */
	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
570
/*
 * Format the time t, interpreted as local time, in the form
 * "Month D, YYYY", for example "December 15, 2018".
 * Return a newly allocated string that the caller has to free(3).
 * If the time cannot be converted or formatted, return an empty
 * allocated string rather than NULL, because the result is passed
 * to strcmp(3) and to "%s" format arguments without a NULL check.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	buf = NULL;
	tm = localtime(&t);
	if (tm == NULL)
		goto fail;

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */

	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0)
		goto fail;
	p += ssz;

	/*
	 * The output format is just "%d" here, not "%2d" or "%02d".
	 * That's also the reason why we can't just format the
	 * date as a whole with "%B %e, %Y" or "%B %d, %Y".
	 * Besides, the present approach is less prone to buffer
	 * overflows, in case anybody should ever introduce the bug
	 * of looking at LC_TIME.
	 */

	/*
	 * snprintf(3) returns the length that would have been written,
	 * so treat truncation as a failure, too, instead of advancing
	 * the pointer beyond the bytes actually stored.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	/* This fails for years that do not fit into four digits. */
	if (strftime(p, 4 + 1, "%Y", tm) == 0)
		goto fail;
	return buf;

fail:
	free(buf);
	return mandoc_strdup("");
}
617
618 char *
619 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos)
620 {
621 char *cp;
622 time_t t;
623
624 /* No date specified: use today's date. */
625
626 if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) {
627 mandoc_msg(MANDOCERR_DATE_MISSING, ln, pos, NULL);
628 return time2a(time(NULL));
629 }
630
631 /* Valid mdoc(7) date format. */
632
633 if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) ||
634 a2time(&t, "%b %d, %Y", in)) {
635 cp = time2a(t);
636 if (t > time(NULL) + 86400)
637 mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", cp);
638 else if (*in != '$' && strcmp(in, cp) != 0)
639 mandoc_msg(MANDOCERR_DATE_NORM, ln, pos, "%s", cp);
640 return cp;
641 }
642
643 /* In man(7), do not warn about the legacy format. */
644
645 if (a2time(&t, "%Y-%m-%d", in) == 0)
646 mandoc_msg(MANDOCERR_DATE_BAD, ln, pos, "%s", in);
647 else if (t > time(NULL) + 86400)
648 mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", in);
649 else if (man->macroset == MACROSET_MDOC)
650 mandoc_msg(MANDOCERR_DATE_LEGACY, ln, pos, "Dd %s", in);
651
652 /* Use any non-mdoc(7) date verbatim. */
653
654 return mandoc_strdup(in);
655 }
656
/*
 * Decide whether the sz bytes starting at p end a sentence.
 * Scanning backwards from the last byte, closing delimiters
 * (double quote, single quote, bracket, parenthesis) and
 * end-of-sentence punctuation (period, exclamation mark, question
 * mark) are skipped; the first other byte decides the result.
 * Return 1 if punctuation was found and either no closing delimiter
 * follows it or the byte preceding the punctuation is alphanumeric.
 * Return 0 otherwise, in particular for empty input.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	unsigned char	 ch;
	int		 enclosed, found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	enclosed = found = 0;

	/*
	 * Count the index down instead of decrementing a pointer:
	 * that way, no pointer to before the start of the buffer is
	 * ever formed, and empty input simply skips the loop.
	 */
	while (sz > 0) {
		ch = (unsigned char)p[--sz];
		switch (ch) {
		case '\"':
		case '\'':
		case ']':
		case ')':
			/* Delimiters only matter after the punctuation. */
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			return found &&
			    (!enclosed || isalnum(ch));
		}
	}

	/* The input consists of punctuation and delimiters only. */
	return found && !enclosed;
}
695
/*
 * Convert the first sz bytes of p to an int with strtol(3),
 * using the given base.
 * Return -1 if sz exceeds 31 bytes, if the input is empty,
 * or if strtol(3) does not consume all of it.
 * Results outside the range of int are clamped to INT_MIN or INT_MAX.
 * Note that valid negative numbers are returned as they are,
 * so a return value of -1 alone does not prove an error.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 numbuf[32];
	char	*endp;
	long	 val;

	/* Leave room for the terminating NUL byte. */
	if (sz >= sizeof(numbuf))
		return -1;

	/* The input need not be NUL-terminated, so work on a copy. */
	memcpy(numbuf, p, sz);
	numbuf[sz] = '\0';

	errno = 0;
	val = strtol(numbuf, &endp, base);

	/* Reject empty input and trailing garbage. */
	if (numbuf[0] == '\0' || *endp != '\0')
		return -1;

	if (val > INT_MAX)
		return INT_MAX;
	if (val < INT_MIN)
		return INT_MIN;
	return (int)val;
}