]> git.cameronkatri.com Git - mandoc.git/blob - mandoc.c
Cope with another one of the many kinds of DocBook stupidity:
[mandoc.git] / mandoc.c
1 /* $Id: mandoc.c,v 1.91 2015/01/21 20:33:25 schwarze Exp $ */
2 /*
3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2011-2015 Ingo Schwarze <schwarze@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18 #include "config.h"
19
20 #include <sys/types.h>
21
22 #include <assert.h>
23 #include <ctype.h>
24 #include <errno.h>
25 #include <limits.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <time.h>
30
31 #include "mandoc.h"
32 #include "mandoc_aux.h"
33 #include "libmandoc.h"
34
35 #define DATESIZE 32
36
37 static int a2time(time_t *, const char *, const char *);
38 static char *time2a(time_t);
39
40
41 enum mandoc_esc
42 mandoc_escape(const char **end, const char **start, int *sz)
43 {
44 const char *local_start;
45 int local_sz;
46 char term;
47 enum mandoc_esc gly;
48
49 /*
50 * When the caller doesn't provide return storage,
51 * use local storage.
52 */
53
54 if (NULL == start)
55 start = &local_start;
56 if (NULL == sz)
57 sz = &local_sz;
58
59 /*
60 * Beyond the backslash, at least one input character
61 * is part of the escape sequence. With one exception
62 * (see below), that character won't be returned.
63 */
64
65 gly = ESCAPE_ERROR;
66 *start = ++*end;
67 *sz = 0;
68 term = '\0';
69
70 switch ((*start)[-1]) {
71 /*
72 * First the glyphs. There are several different forms of
73 * these, but each eventually returns a substring of the glyph
74 * name.
75 */
76 case '(':
77 gly = ESCAPE_SPECIAL;
78 *sz = 2;
79 break;
80 case '[':
81 gly = ESCAPE_SPECIAL;
82 term = ']';
83 break;
84 case 'C':
85 if ('\'' != **start)
86 return(ESCAPE_ERROR);
87 *start = ++*end;
88 gly = ESCAPE_SPECIAL;
89 term = '\'';
90 break;
91
92 /*
93 * Escapes taking no arguments at all.
94 */
95 case 'd':
96 /* FALLTHROUGH */
97 case 'u':
98 return(ESCAPE_IGNORE);
99
100 /*
101 * The \z escape is supposed to output the following
102 * character without advancing the cursor position.
103 * Since we are mostly dealing with terminal mode,
104 * let us just skip the next character.
105 */
106 case 'z':
107 return(ESCAPE_SKIPCHAR);
108
109 /*
110 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
111 * 'X' is the trigger. These have opaque sub-strings.
112 */
113 case 'F':
114 /* FALLTHROUGH */
115 case 'g':
116 /* FALLTHROUGH */
117 case 'k':
118 /* FALLTHROUGH */
119 case 'M':
120 /* FALLTHROUGH */
121 case 'm':
122 /* FALLTHROUGH */
123 case 'n':
124 /* FALLTHROUGH */
125 case 'V':
126 /* FALLTHROUGH */
127 case 'Y':
128 gly = ESCAPE_IGNORE;
129 /* FALLTHROUGH */
130 case 'f':
131 if (ESCAPE_ERROR == gly)
132 gly = ESCAPE_FONT;
133 switch (**start) {
134 case '(':
135 *start = ++*end;
136 *sz = 2;
137 break;
138 case '[':
139 *start = ++*end;
140 term = ']';
141 break;
142 default:
143 *sz = 1;
144 break;
145 }
146 break;
147
148 /*
149 * These escapes are of the form \X'Y', where 'X' is the trigger
150 * and 'Y' is any string. These have opaque sub-strings.
151 * The \B and \w escapes are handled in roff.c, roff_res().
152 */
153 case 'A':
154 /* FALLTHROUGH */
155 case 'b':
156 /* FALLTHROUGH */
157 case 'D':
158 /* FALLTHROUGH */
159 case 'R':
160 /* FALLTHROUGH */
161 case 'X':
162 /* FALLTHROUGH */
163 case 'Z':
164 gly = ESCAPE_IGNORE;
165 /* FALLTHROUGH */
166 case 'o':
167 if (**start == '\0')
168 return(ESCAPE_ERROR);
169 if (gly == ESCAPE_ERROR)
170 gly = ESCAPE_OVERSTRIKE;
171 term = **start;
172 *start = ++*end;
173 break;
174
175 /*
176 * These escapes are of the form \X'N', where 'X' is the trigger
177 * and 'N' resolves to a numerical expression.
178 */
179 case 'h':
180 /* FALLTHROUGH */
181 case 'H':
182 /* FALLTHROUGH */
183 case 'L':
184 /* FALLTHROUGH */
185 case 'l':
186 /* FALLTHROUGH */
187 case 'S':
188 /* FALLTHROUGH */
189 case 'v':
190 /* FALLTHROUGH */
191 case 'x':
192 if (strchr(" %&()*+-./0123456789:<=>", **start)) {
193 if ('\0' != **start)
194 ++*end;
195 return(ESCAPE_ERROR);
196 }
197 gly = ESCAPE_IGNORE;
198 term = **start;
199 *start = ++*end;
200 break;
201
202 /*
203 * Special handling for the numbered character escape.
204 * XXX Do any other escapes need similar handling?
205 */
206 case 'N':
207 if ('\0' == **start)
208 return(ESCAPE_ERROR);
209 (*end)++;
210 if (isdigit((unsigned char)**start)) {
211 *sz = 1;
212 return(ESCAPE_IGNORE);
213 }
214 (*start)++;
215 while (isdigit((unsigned char)**end))
216 (*end)++;
217 *sz = *end - *start;
218 if ('\0' != **end)
219 (*end)++;
220 return(ESCAPE_NUMBERED);
221
222 /*
223 * Sizes get a special category of their own.
224 */
225 case 's':
226 gly = ESCAPE_IGNORE;
227
228 /* See +/- counts as a sign. */
229 if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
230 *start = ++*end;
231
232 switch (**end) {
233 case '(':
234 *start = ++*end;
235 *sz = 2;
236 break;
237 case '[':
238 *start = ++*end;
239 term = ']';
240 break;
241 case '\'':
242 *start = ++*end;
243 term = '\'';
244 break;
245 default:
246 *sz = 1;
247 break;
248 }
249
250 break;
251
252 /*
253 * Anything else is assumed to be a glyph.
254 * In this case, pass back the character after the backslash.
255 */
256 default:
257 gly = ESCAPE_SPECIAL;
258 *start = --*end;
259 *sz = 1;
260 break;
261 }
262
263 assert(ESCAPE_ERROR != gly);
264
265 /*
266 * Read up to the terminating character,
267 * paying attention to nested escapes.
268 */
269
270 if ('\0' != term) {
271 while (**end != term) {
272 switch (**end) {
273 case '\0':
274 return(ESCAPE_ERROR);
275 case '\\':
276 (*end)++;
277 if (ESCAPE_ERROR ==
278 mandoc_escape(end, NULL, NULL))
279 return(ESCAPE_ERROR);
280 break;
281 default:
282 (*end)++;
283 break;
284 }
285 }
286 *sz = (*end)++ - *start;
287 } else {
288 assert(*sz > 0);
289 if ((size_t)*sz > strlen(*start))
290 return(ESCAPE_ERROR);
291 *end += *sz;
292 }
293
294 /* Run post-processors. */
295
296 switch (gly) {
297 case ESCAPE_FONT:
298 if (2 == *sz) {
299 if ('C' == **start) {
300 /*
301 * Treat constant-width font modes
302 * just like regular font modes.
303 */
304 (*start)++;
305 (*sz)--;
306 } else {
307 if ('B' == (*start)[0] && 'I' == (*start)[1])
308 gly = ESCAPE_FONTBI;
309 break;
310 }
311 } else if (1 != *sz)
312 break;
313
314 switch (**start) {
315 case '3':
316 /* FALLTHROUGH */
317 case 'B':
318 gly = ESCAPE_FONTBOLD;
319 break;
320 case '2':
321 /* FALLTHROUGH */
322 case 'I':
323 gly = ESCAPE_FONTITALIC;
324 break;
325 case 'P':
326 gly = ESCAPE_FONTPREV;
327 break;
328 case '1':
329 /* FALLTHROUGH */
330 case 'R':
331 gly = ESCAPE_FONTROMAN;
332 break;
333 }
334 break;
335 case ESCAPE_SPECIAL:
336 if (1 == *sz && 'c' == **start)
337 gly = ESCAPE_NOSPACE;
338 /*
339 * Unicode escapes are defined in groff as \[u0000]
340 * to \[u10FFFF], where the contained value must be
341 * a valid Unicode codepoint. Here, however, only
342 * check the length and range.
343 */
344 if (**start != 'u' || *sz < 5 || *sz > 7)
345 break;
346 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
347 break;
348 if (*sz == 6 && (*start)[1] == '0')
349 break;
350 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
351 + 1 == *sz)
352 gly = ESCAPE_UNICODE;
353 break;
354 default:
355 break;
356 }
357
358 return(gly);
359 }
360
361 /*
362 * Parse a quoted or unquoted roff-style request or macro argument.
363 * Return a pointer to the parsed argument, which is either the original
364 * pointer or advanced by one byte in case the argument is quoted.
365 * NUL-terminate the argument in place.
366 * Collapse pairs of quotes inside quoted arguments.
367 * Advance the argument pointer to the next argument,
368 * or to the NUL byte terminating the argument line.
369 */
370 char *
371 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
372 {
373 char *start, *cp;
374 int quoted, pairs, white;
375
376 /* Quoting can only start with a new word. */
377 start = *cpp;
378 quoted = 0;
379 if ('"' == *start) {
380 quoted = 1;
381 start++;
382 }
383
384 pairs = 0;
385 white = 0;
386 for (cp = start; '\0' != *cp; cp++) {
387
388 /*
389 * Move the following text left
390 * after quoted quotes and after "\\" and "\t".
391 */
392 if (pairs)
393 cp[-pairs] = cp[0];
394
395 if ('\\' == cp[0]) {
396 /*
397 * In copy mode, translate double to single
398 * backslashes and backslash-t to literal tabs.
399 */
400 switch (cp[1]) {
401 case 't':
402 cp[0] = '\t';
403 /* FALLTHROUGH */
404 case '\\':
405 pairs++;
406 cp++;
407 break;
408 case ' ':
409 /* Skip escaped blanks. */
410 if (0 == quoted)
411 cp++;
412 break;
413 default:
414 break;
415 }
416 } else if (0 == quoted) {
417 if (' ' == cp[0]) {
418 /* Unescaped blanks end unquoted args. */
419 white = 1;
420 break;
421 }
422 } else if ('"' == cp[0]) {
423 if ('"' == cp[1]) {
424 /* Quoted quotes collapse. */
425 pairs++;
426 cp++;
427 } else {
428 /* Unquoted quotes end quoted args. */
429 quoted = 2;
430 break;
431 }
432 }
433 }
434
435 /* Quoted argument without a closing quote. */
436 if (1 == quoted)
437 mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL);
438
439 /* NUL-terminate this argument and move to the next one. */
440 if (pairs)
441 cp[-pairs] = '\0';
442 if ('\0' != *cp) {
443 *cp++ = '\0';
444 while (' ' == *cp)
445 cp++;
446 }
447 *pos += (int)(cp - start) + (quoted ? 1 : 0);
448 *cpp = cp;
449
450 if ('\0' == *cp && (white || ' ' == cp[-1]))
451 mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL);
452
453 return(start);
454 }
455
/*
 * Convert the string p to a time_t according to the strptime(3)
 * format fmt.  Return 1 and store the result in *t if the whole
 * string matches the format.  Otherwise, or when built without
 * strptime(3) support, return 0 and leave *t untouched.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	char		*rest = NULL;

	memset(&parsed, 0, sizeof(parsed));

#if HAVE_STRPTIME
	rest = strptime(p, fmt, &parsed);
#endif

	/* Reject parse failures and trailing garbage alike. */
	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
475
/*
 * Format the time t in the local time zone as "Month day, year".
 * Return a buffer obtained from mandoc_malloc(),
 * or NULL if the time cannot be converted or formatted.
 */
static char *
time2a(time_t t)
{
	struct tm	*broken;
	char		*out, *cur;
	size_t		 nmonth;
	int		 nday;

	if ((broken = localtime(&t)) == NULL)
		return NULL;

	/*
	 * The buffer provides room for
	 * the month name, at most 9 characters, and a blank,
	 * the day, at most 2 characters, a comma, and a blank,
	 * the year, 4 characters, and the terminating '\0'.
	 */
	out = mandoc_malloc(10 + 4 + 4 + 1);
	cur = out;

	nmonth = strftime(cur, 10 + 1, "%B ", broken);
	if (nmonth == 0)
		goto fail;
	cur += (int)nmonth;

	nday = snprintf(cur, 4 + 1, "%d, ", broken->tm_mday);
	if (nday == -1)
		goto fail;
	cur += nday;

	if (strftime(cur, 4 + 1, "%Y", broken) != 0)
		return out;

fail:
	free(out);
	return NULL;
}
512
513 char *
514 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
515 {
516 char *out;
517 time_t t;
518
519 if (NULL == in || '\0' == *in ||
520 0 == strcmp(in, "$" "Mdocdate$")) {
521 mandoc_msg(MANDOCERR_DATE_MISSING, parse, ln, pos, NULL);
522 time(&t);
523 }
524 else if (a2time(&t, "%Y-%m-%d", in))
525 t = 0;
526 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
527 !a2time(&t, "%b %d, %Y", in)) {
528 mandoc_msg(MANDOCERR_DATE_BAD, parse, ln, pos, in);
529 t = 0;
530 }
531 out = t ? time2a(t) : NULL;
532 return(out ? out : mandoc_strdup(in));
533 }
534
/*
 * Decide whether the sz bytes starting at p end a sentence.
 * Scanning backwards from the end, closing delimiters (double quote,
 * single quote, closing bracket, closing parenthesis) and sentence
 * punctuation (period, exclamation mark, question mark) are skipped.
 * Return 1 if sentence punctuation was found and either no closing
 * delimiter follows it or the first other character preceding the
 * run is alphanumeric; return 0 otherwise, including for sz == 0.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t	 i;
	int	 enclosed, found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	enclosed = found = 0;

	/*
	 * Count an index down to zero instead of moving a pointer
	 * backwards: the pointer would end up one element before
	 * the start of the buffer, which is undefined behaviour.
	 * Nothing is examined when sz is zero.
	 */
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
			/* FALLTHROUGH */
		case '\'':
			/* FALLTHROUGH */
		case ']':
			/* FALLTHROUGH */
		case ')':
			/* Only delimiters after the punctuation count. */
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
			/* FALLTHROUGH */
		case '!':
			/* FALLTHROUGH */
		case '?':
			found = 1;
			break;
		default:
			return(found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1])));
		}
	}

	return(found && !enclosed);
}
577
/*
 * Convert the first sz bytes of p to an int using strtol(3)
 * with the given base.
 * Return -1 if sz exceeds 31, if the string is empty, or if it is
 * not a number in its entirety.
 * Values outside the range of int are clamped to INT_MIN or INT_MAX.
 * Note that -1 is also the result of successfully parsing "-1".
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 tmp[32];
	char	*stop;
	long	 num;

	/* Leave room for the terminating NUL byte. */
	if (sz >= sizeof(tmp))
		return -1;

	/* strtol(3) needs a NUL-terminated copy. */
	memcpy(tmp, p, sz);
	tmp[sz] = '\0';

	errno = 0;
	num = strtol(tmp, &stop, base);

	if (tmp[0] == '\0' || *stop != '\0')
		return -1;

	if (num > INT_MAX)
		return INT_MAX;
	if (num < INT_MIN)
		return INT_MIN;
	return (int)num;
}