]> git.cameronkatri.com Git - mandoc.git/blob - mandoc.c
Bugfix:
[mandoc.git] / mandoc.c
1 /* $Id: mandoc.c,v 1.113 2018/12/18 22:00:02 schwarze Exp $ */
2 /*
3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2011-2015, 2017, 2018 Ingo Schwarze <schwarze@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18 #include "config.h"
19
20 #include <sys/types.h>
21
22 #include <assert.h>
23 #include <ctype.h>
24 #include <errno.h>
25 #include <limits.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <time.h>
30
31 #include "mandoc_aux.h"
32 #include "mandoc.h"
33 #include "roff.h"
34 #include "libmandoc.h"
35
/*
 * File-local helpers for date handling:
 * a2time() parses the string p according to the format fmt into *t,
 * time2a() renders a time value as a newly allocated string.
 */
static	int	 a2time(time_t *t, const char *fmt, const char *p);
static	char	*time2a(time_t t);
38
39
40 enum mandoc_esc
41 mandoc_font(const char *cp, int sz)
42 {
43 switch (sz) {
44 case 0:
45 return ESCAPE_FONTPREV;
46 case 1:
47 switch (cp[0]) {
48 case 'B':
49 case '3':
50 return ESCAPE_FONTBOLD;
51 case 'I':
52 case '2':
53 return ESCAPE_FONTITALIC;
54 case 'P':
55 return ESCAPE_FONTPREV;
56 case 'R':
57 case '1':
58 return ESCAPE_FONTROMAN;
59 case '4':
60 return ESCAPE_FONTBI;
61 default:
62 return ESCAPE_ERROR;
63 }
64 case 2:
65 switch (cp[0]) {
66 case 'B':
67 switch (cp[1]) {
68 case 'I':
69 return ESCAPE_FONTBI;
70 default:
71 return ESCAPE_ERROR;
72 }
73 case 'C':
74 switch (cp[1]) {
75 case 'B':
76 return ESCAPE_FONTBOLD;
77 case 'I':
78 return ESCAPE_FONTITALIC;
79 case 'R':
80 case 'W':
81 return ESCAPE_FONTCW;
82 default:
83 return ESCAPE_ERROR;
84 }
85 default:
86 return ESCAPE_ERROR;
87 }
88 default:
89 return ESCAPE_ERROR;
90 }
91 }
92
93 enum mandoc_esc
94 mandoc_escape(const char **end, const char **start, int *sz)
95 {
96 const char *local_start;
97 int local_sz, c, i;
98 char term;
99 enum mandoc_esc gly;
100
101 /*
102 * When the caller doesn't provide return storage,
103 * use local storage.
104 */
105
106 if (NULL == start)
107 start = &local_start;
108 if (NULL == sz)
109 sz = &local_sz;
110
111 /*
112 * Treat "\E" just like "\";
113 * it only makes a difference in copy mode.
114 */
115
116 if (**end == 'E')
117 ++*end;
118
119 /*
120 * Beyond the backslash, at least one input character
121 * is part of the escape sequence. With one exception
122 * (see below), that character won't be returned.
123 */
124
125 gly = ESCAPE_ERROR;
126 *start = ++*end;
127 *sz = 0;
128 term = '\0';
129
130 switch ((*start)[-1]) {
131 /*
132 * First the glyphs. There are several different forms of
133 * these, but each eventually returns a substring of the glyph
134 * name.
135 */
136 case '(':
137 gly = ESCAPE_SPECIAL;
138 *sz = 2;
139 break;
140 case '[':
141 if (**start == ' ') {
142 ++*end;
143 return ESCAPE_ERROR;
144 }
145 gly = ESCAPE_SPECIAL;
146 term = ']';
147 break;
148 case 'C':
149 if ('\'' != **start)
150 return ESCAPE_ERROR;
151 *start = ++*end;
152 gly = ESCAPE_SPECIAL;
153 term = '\'';
154 break;
155
156 /*
157 * Escapes taking no arguments at all.
158 */
159 case '!':
160 case '?':
161 return ESCAPE_UNSUPP;
162 case '%':
163 case '&':
164 case ')':
165 case ',':
166 case '/':
167 case '^':
168 case 'a':
169 case 'd':
170 case 'r':
171 case 't':
172 case 'u':
173 case '{':
174 case '|':
175 case '}':
176 return ESCAPE_IGNORE;
177 case 'c':
178 return ESCAPE_NOSPACE;
179 case 'p':
180 return ESCAPE_BREAK;
181
182 /*
183 * The \z escape is supposed to output the following
184 * character without advancing the cursor position.
185 * Since we are mostly dealing with terminal mode,
186 * let us just skip the next character.
187 */
188 case 'z':
189 return ESCAPE_SKIPCHAR;
190
191 /*
192 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
193 * 'X' is the trigger. These have opaque sub-strings.
194 */
195 case 'F':
196 case 'f':
197 case 'g':
198 case 'k':
199 case 'M':
200 case 'm':
201 case 'n':
202 case 'O':
203 case 'V':
204 case 'Y':
205 gly = (*start)[-1] == 'f' ? ESCAPE_FONT : ESCAPE_IGNORE;
206 switch (**start) {
207 case '(':
208 if ((*start)[-1] == 'O')
209 gly = ESCAPE_ERROR;
210 *start = ++*end;
211 *sz = 2;
212 break;
213 case '[':
214 if ((*start)[-1] == 'O')
215 gly = (*start)[1] == '5' ?
216 ESCAPE_UNSUPP : ESCAPE_ERROR;
217 *start = ++*end;
218 term = ']';
219 break;
220 default:
221 if ((*start)[-1] == 'O') {
222 switch (**start) {
223 case '0':
224 gly = ESCAPE_UNSUPP;
225 break;
226 case '1':
227 case '2':
228 case '3':
229 case '4':
230 break;
231 default:
232 gly = ESCAPE_ERROR;
233 break;
234 }
235 }
236 *sz = 1;
237 break;
238 }
239 break;
240 case '*':
241 if (strncmp(*start, "(.T", 3) != 0)
242 abort();
243 gly = ESCAPE_DEVICE;
244 *start = ++*end;
245 *sz = 2;
246 break;
247
248 /*
249 * These escapes are of the form \X'Y', where 'X' is the trigger
250 * and 'Y' is any string. These have opaque sub-strings.
251 * The \B and \w escapes are handled in roff.c, roff_res().
252 */
253 case 'A':
254 case 'b':
255 case 'D':
256 case 'R':
257 case 'X':
258 case 'Z':
259 gly = ESCAPE_IGNORE;
260 /* FALLTHROUGH */
261 case 'o':
262 if (**start == '\0')
263 return ESCAPE_ERROR;
264 if (gly == ESCAPE_ERROR)
265 gly = ESCAPE_OVERSTRIKE;
266 term = **start;
267 *start = ++*end;
268 break;
269
270 /*
271 * These escapes are of the form \X'N', where 'X' is the trigger
272 * and 'N' resolves to a numerical expression.
273 */
274 case 'h':
275 case 'H':
276 case 'L':
277 case 'l':
278 case 'S':
279 case 'v':
280 case 'x':
281 if (strchr(" %&()*+-./0123456789:<=>", **start)) {
282 if ('\0' != **start)
283 ++*end;
284 return ESCAPE_ERROR;
285 }
286 switch ((*start)[-1]) {
287 case 'h':
288 gly = ESCAPE_HORIZ;
289 break;
290 case 'l':
291 gly = ESCAPE_HLINE;
292 break;
293 default:
294 gly = ESCAPE_IGNORE;
295 break;
296 }
297 term = **start;
298 *start = ++*end;
299 break;
300
301 /*
302 * Special handling for the numbered character escape.
303 * XXX Do any other escapes need similar handling?
304 */
305 case 'N':
306 if ('\0' == **start)
307 return ESCAPE_ERROR;
308 (*end)++;
309 if (isdigit((unsigned char)**start)) {
310 *sz = 1;
311 return ESCAPE_IGNORE;
312 }
313 (*start)++;
314 while (isdigit((unsigned char)**end))
315 (*end)++;
316 *sz = *end - *start;
317 if ('\0' != **end)
318 (*end)++;
319 return ESCAPE_NUMBERED;
320
321 /*
322 * Sizes get a special category of their own.
323 */
324 case 's':
325 gly = ESCAPE_IGNORE;
326
327 /* See +/- counts as a sign. */
328 if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
329 *start = ++*end;
330
331 switch (**end) {
332 case '(':
333 *start = ++*end;
334 *sz = 2;
335 break;
336 case '[':
337 *start = ++*end;
338 term = ']';
339 break;
340 case '\'':
341 *start = ++*end;
342 term = '\'';
343 break;
344 case '3':
345 case '2':
346 case '1':
347 *sz = (*end)[-1] == 's' &&
348 isdigit((unsigned char)(*end)[1]) ? 2 : 1;
349 break;
350 default:
351 *sz = 1;
352 break;
353 }
354
355 break;
356
357 /*
358 * Several special characters can be encoded as
359 * one-byte escape sequences without using \[].
360 */
361 case ' ':
362 case '\'':
363 case '-':
364 case '.':
365 case '0':
366 case ':':
367 case '_':
368 case '`':
369 case 'e':
370 case '~':
371 gly = ESCAPE_SPECIAL;
372 /* FALLTHROUGH */
373 default:
374 if (gly == ESCAPE_ERROR)
375 gly = ESCAPE_UNDEF;
376 *start = --*end;
377 *sz = 1;
378 break;
379 }
380
381 /*
382 * Read up to the terminating character,
383 * paying attention to nested escapes.
384 */
385
386 if ('\0' != term) {
387 while (**end != term) {
388 switch (**end) {
389 case '\0':
390 return ESCAPE_ERROR;
391 case '\\':
392 (*end)++;
393 if (ESCAPE_ERROR ==
394 mandoc_escape(end, NULL, NULL))
395 return ESCAPE_ERROR;
396 break;
397 default:
398 (*end)++;
399 break;
400 }
401 }
402 *sz = (*end)++ - *start;
403
404 /*
405 * The file chars.c only provides one common list
406 * of character names, but \[-] == \- is the only
407 * one of the characters with one-byte names that
408 * allows enclosing the name in brackets.
409 */
410 if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
411 return ESCAPE_ERROR;
412 } else {
413 assert(*sz > 0);
414 if ((size_t)*sz > strlen(*start))
415 return ESCAPE_ERROR;
416 *end += *sz;
417 }
418
419 /* Run post-processors. */
420
421 switch (gly) {
422 case ESCAPE_FONT:
423 gly = mandoc_font(*start, *sz);
424 break;
425 case ESCAPE_SPECIAL:
426 if (**start == 'c') {
427 if (*sz < 6 || *sz > 7 ||
428 strncmp(*start, "char", 4) != 0 ||
429 (int)strspn(*start + 4, "0123456789") + 4 < *sz)
430 break;
431 c = 0;
432 for (i = 4; i < *sz; i++)
433 c = 10 * c + ((*start)[i] - '0');
434 if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
435 break;
436 *start += 4;
437 *sz -= 4;
438 gly = ESCAPE_NUMBERED;
439 break;
440 }
441
442 /*
443 * Unicode escapes are defined in groff as \[u0000]
444 * to \[u10FFFF], where the contained value must be
445 * a valid Unicode codepoint. Here, however, only
446 * check the length and range.
447 */
448 if (**start != 'u' || *sz < 5 || *sz > 7)
449 break;
450 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
451 break;
452 if (*sz == 6 && (*start)[1] == '0')
453 break;
454 if (*sz == 5 && (*start)[1] == 'D' &&
455 strchr("89ABCDEF", (*start)[2]) != NULL)
456 break;
457 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
458 + 1 == *sz)
459 gly = ESCAPE_UNICODE;
460 break;
461 default:
462 break;
463 }
464
465 return gly;
466 }
467
/*
 * Parse the string p according to the strptime(3) format fmt.
 * The whole string has to match the format; trailing characters
 * make the conversion fail.  On success, store the result of
 * mktime(3) in *t and return 1.  Otherwise, return 0 and leave
 * *t untouched.  When built without HAVE_STRPTIME, no parsing is
 * attempted and the function always returns 0.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	memset(&parsed, 0, sizeof(parsed));

	rest = NULL;
#if HAVE_STRPTIME
	rest = strptime(p, fmt, &parsed);
#endif

	/* Reject parse failures and incompletely consumed input. */

	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
487
/*
 * Format a time value in the local time zone as "Month day, year"
 * with an unpadded day of the month.
 * Returns a string allocated with mandoc_malloc() that the caller
 * has to free(3), or NULL if the time cannot be converted or if
 * one of the components does not fit into its part of the buffer.
 */
static char *
time2a(time_t t)
{
	/*
	 * Space reserved for the components:
	 * up to 9 characters for the month (September) + blank,
	 * up to 2 characters for the day + comma + blank,
	 * 4 characters for the year; the terminating '\0' is extra.
	 */
	enum { MONTH_SZ = 10, DAY_SZ = 4, YEAR_SZ = 4 };

	const struct tm	*when;
	char		*out, *cursor;
	size_t		 month_len;
	int		 day_len;

	if ((when = localtime(&t)) == NULL)
		return NULL;

	out = mandoc_malloc(MONTH_SZ + DAY_SZ + YEAR_SZ + 1);

	month_len = strftime(out, MONTH_SZ + 1, "%B ", when);
	if (month_len != 0) {
		cursor = out + (int)month_len;

		/*
		 * The day is printed with plain "%d", not "%2d" or
		 * "%02d", which is why the date cannot be formatted
		 * as a whole with "%B %e, %Y" or "%B %d, %Y".
		 * Besides, formatting piece by piece is less prone
		 * to buffer overflows, in case anybody should ever
		 * introduce the bug of looking at LC_TIME.
		 */

		day_len = snprintf(cursor, DAY_SZ + 1, "%d, ",
		    when->tm_mday);
		if (day_len != -1) {
			cursor += day_len;
			if (strftime(cursor, YEAR_SZ + 1, "%Y", when) != 0)
				return out;
		}
	}

	/* One of the components could not be formatted. */

	free(out);
	return NULL;
}
534
535 char *
536 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos)
537 {
538 char *cp;
539 time_t t;
540
541 /* No date specified: use today's date. */
542
543 if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) {
544 mandoc_msg(MANDOCERR_DATE_MISSING, ln, pos, NULL);
545 return time2a(time(NULL));
546 }
547
548 /* Valid mdoc(7) date format. */
549
550 if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) ||
551 a2time(&t, "%b %d, %Y", in)) {
552 cp = time2a(t);
553 if (t > time(NULL) + 86400)
554 mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", cp);
555 else if (*in != '$' && strcmp(in, cp) != 0)
556 mandoc_msg(MANDOCERR_DATE_NORM, ln, pos, "%s", cp);
557 return cp;
558 }
559
560 /* In man(7), do not warn about the legacy format. */
561
562 if (a2time(&t, "%Y-%m-%d", in) == 0)
563 mandoc_msg(MANDOCERR_DATE_BAD, ln, pos, "%s", in);
564 else if (t > time(NULL) + 86400)
565 mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", in);
566 else if (man->macroset == MACROSET_MDOC)
567 mandoc_msg(MANDOCERR_DATE_LEGACY, ln, pos, "Dd %s", in);
568
569 /* Use any non-mdoc(7) date verbatim. */
570
571 return mandoc_strdup(in);
572 }
573
/*
 * Decide whether the sz bytes at p end with end-of-sentence
 * punctuation.
 *
 * Scanning backwards from the last byte, closing delimiters
 * (double quote, single quote, bracket, parenthesis) and sentence
 * punctuation (period, exclamation mark, question mark) are
 * skipped.  The first other byte decides: the text ends a sentence
 * if punctuation was seen and either no closing delimiter followed
 * it or that byte is alphanumeric.  If the buffer consists of
 * nothing but such characters, it ends a sentence if it contains
 * punctuation that is not followed by a closing delimiter.
 *
 * Returns 1 at the end of a sentence and 0 otherwise;
 * an empty buffer never ends a sentence.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t		 left;
	int		 enclosed, found;
	unsigned char	 ch;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 *
	 * Count down an index rather than a pointer such that no
	 * pointer before the beginning of the buffer is ever formed,
	 * and such that sz is not squeezed into an int.
	 */

	enclosed = found = 0;
	for (left = sz; left > 0; left--) {
		ch = (unsigned char)p[left - 1];
		switch (ch) {
		case '\"':
		case '\'':
		case ']':
		case ')':
			/* Only delimiters after the punctuation count. */
			if (found == 0)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			return found && (!enclosed || isalnum(ch));
		}
	}

	return found && !enclosed;
}
612
/*
 * Convert the first sz bytes of p to an int with strtol(3),
 * using the given base.
 * Return -1 if sz exceeds 31 bytes, if the string is empty, or if
 * it is not a valid number in its entirety.  Values that do not
 * fit into an int are clamped to INT_MAX or INT_MIN.  Negative
 * numbers are converted normally, so a result of -1 is ambiguous
 * unless the caller knows that the input contains no sign.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 digits[32];
	char	*stop;
	long	 val;

	/* Leave room for the terminating '\0'. */

	if (sz >= sizeof(digits))
		return -1;

	/* The input need not be terminated, so work on a copy. */

	memcpy(digits, p, sz);
	digits[sz] = '\0';

	errno = 0;
	val = strtol(digits, &stop, base);

	/* Reject empty input and trailing garbage. */

	if (digits[0] == '\0' || *stop != '\0')
		return -1;

	if (val > INT_MAX)
		return INT_MAX;
	if (val < INT_MIN)
		return INT_MIN;
	return (int)val;
}