]> git.cameronkatri.com Git - mandoc.git/blob - mandoc.c
Stricter syntax checking of Unicode character names:
[mandoc.git] / mandoc.c
1 /* $Id: mandoc.c,v 1.87 2014/10/13 17:17:45 schwarze Exp $ */
2 /*
3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18 #include "config.h"
19
20 #include <sys/types.h>
21
22 #include <assert.h>
23 #include <ctype.h>
24 #include <errno.h>
25 #include <limits.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <time.h>
30
31 #include "mandoc.h"
32 #include "mandoc_aux.h"
33 #include "libmandoc.h"
34
35 #define DATESIZE 32
36
37 static int a2time(time_t *, const char *, const char *);
38 static char *time2a(time_t);
39
40
41 enum mandoc_esc
42 mandoc_escape(const char **end, const char **start, int *sz)
43 {
44 const char *local_start;
45 int local_sz;
46 char term;
47 enum mandoc_esc gly;
48
49 /*
50 * When the caller doesn't provide return storage,
51 * use local storage.
52 */
53
54 if (NULL == start)
55 start = &local_start;
56 if (NULL == sz)
57 sz = &local_sz;
58
59 /*
60 * Beyond the backslash, at least one input character
61 * is part of the escape sequence. With one exception
62 * (see below), that character won't be returned.
63 */
64
65 gly = ESCAPE_ERROR;
66 *start = ++*end;
67 *sz = 0;
68 term = '\0';
69
70 switch ((*start)[-1]) {
71 /*
72 * First the glyphs. There are several different forms of
73 * these, but each eventually returns a substring of the glyph
74 * name.
75 */
76 case '(':
77 gly = ESCAPE_SPECIAL;
78 *sz = 2;
79 break;
80 case '[':
81 gly = ESCAPE_SPECIAL;
82 term = ']';
83 break;
84 case 'C':
85 if ('\'' != **start)
86 return(ESCAPE_ERROR);
87 *start = ++*end;
88 gly = ESCAPE_SPECIAL;
89 term = '\'';
90 break;
91
92 /*
93 * Escapes taking no arguments at all.
94 */
95 case 'd':
96 /* FALLTHROUGH */
97 case 'u':
98 return(ESCAPE_IGNORE);
99
100 /*
101 * The \z escape is supposed to output the following
102 * character without advancing the cursor position.
103 * Since we are mostly dealing with terminal mode,
104 * let us just skip the next character.
105 */
106 case 'z':
107 return(ESCAPE_SKIPCHAR);
108
109 /*
110 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
111 * 'X' is the trigger. These have opaque sub-strings.
112 */
113 case 'F':
114 /* FALLTHROUGH */
115 case 'g':
116 /* FALLTHROUGH */
117 case 'k':
118 /* FALLTHROUGH */
119 case 'M':
120 /* FALLTHROUGH */
121 case 'm':
122 /* FALLTHROUGH */
123 case 'n':
124 /* FALLTHROUGH */
125 case 'V':
126 /* FALLTHROUGH */
127 case 'Y':
128 gly = ESCAPE_IGNORE;
129 /* FALLTHROUGH */
130 case 'f':
131 if (ESCAPE_ERROR == gly)
132 gly = ESCAPE_FONT;
133 switch (**start) {
134 case '(':
135 *start = ++*end;
136 *sz = 2;
137 break;
138 case '[':
139 *start = ++*end;
140 term = ']';
141 break;
142 default:
143 *sz = 1;
144 break;
145 }
146 break;
147
148 /*
149 * These escapes are of the form \X'Y', where 'X' is the trigger
150 * and 'Y' is any string. These have opaque sub-strings.
151 * The \B and \w escapes are handled in roff.c, roff_res().
152 */
153 case 'A':
154 /* FALLTHROUGH */
155 case 'b':
156 /* FALLTHROUGH */
157 case 'D':
158 /* FALLTHROUGH */
159 case 'o':
160 /* FALLTHROUGH */
161 case 'R':
162 /* FALLTHROUGH */
163 case 'X':
164 /* FALLTHROUGH */
165 case 'Z':
166 if ('\0' == **start)
167 return(ESCAPE_ERROR);
168 gly = ESCAPE_IGNORE;
169 term = **start;
170 *start = ++*end;
171 break;
172
173 /*
174 * These escapes are of the form \X'N', where 'X' is the trigger
175 * and 'N' resolves to a numerical expression.
176 */
177 case 'h':
178 /* FALLTHROUGH */
179 case 'H':
180 /* FALLTHROUGH */
181 case 'L':
182 /* FALLTHROUGH */
183 case 'l':
184 /* FALLTHROUGH */
185 case 'S':
186 /* FALLTHROUGH */
187 case 'v':
188 /* FALLTHROUGH */
189 case 'x':
190 if (strchr(" %&()*+-./0123456789:<=>", **start)) {
191 if ('\0' != **start)
192 ++*end;
193 return(ESCAPE_ERROR);
194 }
195 gly = ESCAPE_IGNORE;
196 term = **start;
197 *start = ++*end;
198 break;
199
200 /*
201 * Special handling for the numbered character escape.
202 * XXX Do any other escapes need similar handling?
203 */
204 case 'N':
205 if ('\0' == **start)
206 return(ESCAPE_ERROR);
207 (*end)++;
208 if (isdigit((unsigned char)**start)) {
209 *sz = 1;
210 return(ESCAPE_IGNORE);
211 }
212 (*start)++;
213 while (isdigit((unsigned char)**end))
214 (*end)++;
215 *sz = *end - *start;
216 if ('\0' != **end)
217 (*end)++;
218 return(ESCAPE_NUMBERED);
219
220 /*
221 * Sizes get a special category of their own.
222 */
223 case 's':
224 gly = ESCAPE_IGNORE;
225
226 /* See +/- counts as a sign. */
227 if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
228 (*end)++;
229
230 switch (**end) {
231 case '(':
232 *start = ++*end;
233 *sz = 2;
234 break;
235 case '[':
236 *start = ++*end;
237 term = ']';
238 break;
239 case '\'':
240 *start = ++*end;
241 term = '\'';
242 break;
243 default:
244 *sz = 1;
245 break;
246 }
247
248 break;
249
250 /*
251 * Anything else is assumed to be a glyph.
252 * In this case, pass back the character after the backslash.
253 */
254 default:
255 gly = ESCAPE_SPECIAL;
256 *start = --*end;
257 *sz = 1;
258 break;
259 }
260
261 assert(ESCAPE_ERROR != gly);
262
263 /*
264 * Read up to the terminating character,
265 * paying attention to nested escapes.
266 */
267
268 if ('\0' != term) {
269 while (**end != term) {
270 switch (**end) {
271 case '\0':
272 return(ESCAPE_ERROR);
273 case '\\':
274 (*end)++;
275 if (ESCAPE_ERROR ==
276 mandoc_escape(end, NULL, NULL))
277 return(ESCAPE_ERROR);
278 break;
279 default:
280 (*end)++;
281 break;
282 }
283 }
284 *sz = (*end)++ - *start;
285 } else {
286 assert(*sz > 0);
287 if ((size_t)*sz > strlen(*start))
288 return(ESCAPE_ERROR);
289 *end += *sz;
290 }
291
292 /* Run post-processors. */
293
294 switch (gly) {
295 case ESCAPE_FONT:
296 if (2 == *sz) {
297 if ('C' == **start) {
298 /*
299 * Treat constant-width font modes
300 * just like regular font modes.
301 */
302 (*start)++;
303 (*sz)--;
304 } else {
305 if ('B' == (*start)[0] && 'I' == (*start)[1])
306 gly = ESCAPE_FONTBI;
307 break;
308 }
309 } else if (1 != *sz)
310 break;
311
312 switch (**start) {
313 case '3':
314 /* FALLTHROUGH */
315 case 'B':
316 gly = ESCAPE_FONTBOLD;
317 break;
318 case '2':
319 /* FALLTHROUGH */
320 case 'I':
321 gly = ESCAPE_FONTITALIC;
322 break;
323 case 'P':
324 gly = ESCAPE_FONTPREV;
325 break;
326 case '1':
327 /* FALLTHROUGH */
328 case 'R':
329 gly = ESCAPE_FONTROMAN;
330 break;
331 }
332 break;
333 case ESCAPE_SPECIAL:
334 if (1 == *sz && 'c' == **start)
335 gly = ESCAPE_NOSPACE;
336 /*
337 * Unicode escapes are defined in groff as \[uXXXX]
338 * to \[u10FFFF], where the contained value must be
339 * a valid Unicode codepoint. Here, however, only
340 * check the length and the validity of all digits.
341 */
342 else if (*sz > 4 && *sz < 8 && **start == 'u' &&
343 (int)strspn(*start + 1, "0123456789ABCDEFabcdef")
344 + 1 == *sz)
345 gly = ESCAPE_UNICODE;
346 break;
347 default:
348 break;
349 }
350
351 return(gly);
352 }
353
354 /*
355 * Parse a quoted or unquoted roff-style request or macro argument.
356 * Return a pointer to the parsed argument, which is either the original
357 * pointer or advanced by one byte in case the argument is quoted.
358 * NUL-terminate the argument in place.
359 * Collapse pairs of quotes inside quoted arguments.
360 * Advance the argument pointer to the next argument,
361 * or to the NUL byte terminating the argument line.
362 */
363 char *
364 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
365 {
366 char *start, *cp;
367 int quoted, pairs, white;
368
369 /* Quoting can only start with a new word. */
370 start = *cpp;
371 quoted = 0;
372 if ('"' == *start) {
373 quoted = 1;
374 start++;
375 }
376
377 pairs = 0;
378 white = 0;
379 for (cp = start; '\0' != *cp; cp++) {
380
381 /*
382 * Move the following text left
383 * after quoted quotes and after "\\" and "\t".
384 */
385 if (pairs)
386 cp[-pairs] = cp[0];
387
388 if ('\\' == cp[0]) {
389 /*
390 * In copy mode, translate double to single
391 * backslashes and backslash-t to literal tabs.
392 */
393 switch (cp[1]) {
394 case 't':
395 cp[0] = '\t';
396 /* FALLTHROUGH */
397 case '\\':
398 pairs++;
399 cp++;
400 break;
401 case ' ':
402 /* Skip escaped blanks. */
403 if (0 == quoted)
404 cp++;
405 break;
406 default:
407 break;
408 }
409 } else if (0 == quoted) {
410 if (' ' == cp[0]) {
411 /* Unescaped blanks end unquoted args. */
412 white = 1;
413 break;
414 }
415 } else if ('"' == cp[0]) {
416 if ('"' == cp[1]) {
417 /* Quoted quotes collapse. */
418 pairs++;
419 cp++;
420 } else {
421 /* Unquoted quotes end quoted args. */
422 quoted = 2;
423 break;
424 }
425 }
426 }
427
428 /* Quoted argument without a closing quote. */
429 if (1 == quoted)
430 mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL);
431
432 /* NUL-terminate this argument and move to the next one. */
433 if (pairs)
434 cp[-pairs] = '\0';
435 if ('\0' != *cp) {
436 *cp++ = '\0';
437 while (' ' == *cp)
438 cp++;
439 }
440 *pos += (int)(cp - start) + (quoted ? 1 : 0);
441 *cpp = cp;
442
443 if ('\0' == *cp && (white || ' ' == cp[-1]))
444 mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL);
445
446 return(start);
447 }
448
/*
 * Try to parse the complete string p as a date in the strptime(3)
 * format fmt.  On success, store the result of mktime(3) in *t
 * and return 1.  Return 0 and leave *t alone if the string does
 * not match the format, if bytes remain after the match, or if
 * strptime(3) support is not compiled in.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	char		*rest;

	memset(&parsed, 0, sizeof(parsed));

	rest = NULL;
#if HAVE_STRPTIME
	rest = strptime(p, fmt, &parsed);
#endif

	/* Require a match that consumes the whole input. */
	if (NULL == rest || '\0' != *rest)
		return(0);

	*t = mktime(&parsed);
	return(1);
}
468
/*
 * Format the time t, converted with localtime(3), in the form
 * "Month day, year".  Return a string allocated with mandoc_malloc()
 * that the caller has to free(3), or NULL if the time cannot be
 * converted or does not fit into the space reserved.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* localtime(3) returns NULL if the time is not representable. */
	tm = localtime(&t);
	if (NULL == tm)
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
		goto fail;
	p += (int)ssz;

	/*
	 * snprintf(3) returns the length the output would have had
	 * without truncation, so values above 4 must not be used
	 * to advance the write position.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
503
504 char *
505 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
506 {
507 char *out;
508 time_t t;
509
510 if (NULL == in || '\0' == *in ||
511 0 == strcmp(in, "$" "Mdocdate$")) {
512 mandoc_msg(MANDOCERR_DATE_MISSING, parse, ln, pos, NULL);
513 time(&t);
514 }
515 else if (a2time(&t, "%Y-%m-%d", in))
516 t = 0;
517 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
518 !a2time(&t, "%b %d, %Y", in)) {
519 mandoc_msg(MANDOCERR_DATE_BAD, parse, ln, pos, in);
520 t = 0;
521 }
522 out = t ? time2a(t) : NULL;
523 return(out ? out : mandoc_strdup(in));
524 }
525
/*
 * Decide whether the sz bytes starting at p end a sentence.
 * Scan backwards from the last byte across closing delimiters and
 * end-of-sentence punctuation.  Return 1 if a '.', '!', or '?' was
 * found and is either not followed by closing delimiters or directly
 * preceded by an alphanumeric character; return 0 otherwise.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t	 i;
	int	 enclosed, found;

	if (0 == sz)
		return(0);

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	/*
	 * Count an index down rather than a pointer:  the scan then
	 * never forms a pointer before p, and sz is never narrowed
	 * to int.
	 */
	enclosed = found = 0;
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
			/* FALLTHROUGH */
		case '\'':
			/* FALLTHROUGH */
		case ']':
			/* FALLTHROUGH */
		case ')':
			/* Only delimiters after the punctuation count. */
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
			/* FALLTHROUGH */
		case '!':
			/* FALLTHROUGH */
		case '?':
			found = 1;
			break;
		default:
			/* First byte that is neither of the above. */
			return(found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1])));
		}
	}

	return(found && !enclosed);
}
568
/*
 * Convert the first sz bytes of p to an int using strtol(3)
 * with the given base.
 * Return -1 if more than 31 bytes are given, if no bytes are given,
 * or if the bytes do not form a number as a whole.
 * Otherwise, return the value, clamped to the range of int.
 * Negative values are passed through, so a result of -1 does not
 * always indicate an error.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 numbuf[32];
	char	*stop;
	long	 val;

	/* One byte must remain for the terminating NUL. */
	if (sz >= sizeof(numbuf))
		return(-1);

	memcpy(numbuf, p, sz);
	numbuf[sz] = '\0';

	errno = 0;
	val = strtol(numbuf, &stop, base);

	/* Reject empty input and trailing garbage. */
	if ('\0' == numbuf[0] || '\0' != *stop)
		return(-1);

	if (val > INT_MAX)
		return(INT_MAX);
	if (val < INT_MIN)
		return(INT_MIN);
	return((int)val);
}