]> git.cameronkatri.com Git - mandoc.git/blob - mandoc.c
Add *.gz support to apropos(1) -a, man(1), and even mandoc(1).
[mandoc.git] / mandoc.c
1 /* $Id: mandoc.c,v 1.86 2014/08/18 09:11:47 kristaps Exp $ */
2 /*
3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18 #include "config.h"
19
20 #include <sys/types.h>
21
22 #include <assert.h>
23 #include <ctype.h>
24 #include <errno.h>
25 #include <limits.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <time.h>
30
31 #include "mandoc.h"
32 #include "mandoc_aux.h"
33 #include "libmandoc.h"
34
35 #define DATESIZE 32
36
37 static int a2time(time_t *, const char *, const char *);
38 static char *time2a(time_t);
39
40
41 enum mandoc_esc
42 mandoc_escape(const char **end, const char **start, int *sz)
43 {
44 const char *local_start;
45 int local_sz;
46 char term;
47 enum mandoc_esc gly;
48
49 /*
50 * When the caller doesn't provide return storage,
51 * use local storage.
52 */
53
54 if (NULL == start)
55 start = &local_start;
56 if (NULL == sz)
57 sz = &local_sz;
58
59 /*
60 * Beyond the backslash, at least one input character
61 * is part of the escape sequence. With one exception
62 * (see below), that character won't be returned.
63 */
64
65 gly = ESCAPE_ERROR;
66 *start = ++*end;
67 *sz = 0;
68 term = '\0';
69
70 switch ((*start)[-1]) {
71 /*
72 * First the glyphs. There are several different forms of
73 * these, but each eventually returns a substring of the glyph
74 * name.
75 */
76 case '(':
77 gly = ESCAPE_SPECIAL;
78 *sz = 2;
79 break;
80 case '[':
81 gly = ESCAPE_SPECIAL;
82 /*
83 * Unicode escapes are defined in groff as \[uXXXX] to
84 * \[u10FFFF], where the contained value must be a valid
85 * Unicode codepoint. Here, however, only check whether
86 * it's not a zero-width escape.
87 */
88 if ('u' == (*start)[0] && ']' != (*start)[1])
89 gly = ESCAPE_UNICODE;
90 term = ']';
91 break;
92 case 'C':
93 if ('\'' != **start)
94 return(ESCAPE_ERROR);
95 *start = ++*end;
96 if ('u' == (*start)[0] && '\'' != (*start)[1])
97 gly = ESCAPE_UNICODE;
98 else
99 gly = ESCAPE_SPECIAL;
100 term = '\'';
101 break;
102
103 /*
104 * Escapes taking no arguments at all.
105 */
106 case 'd':
107 /* FALLTHROUGH */
108 case 'u':
109 return(ESCAPE_IGNORE);
110
111 /*
112 * The \z escape is supposed to output the following
113 * character without advancing the cursor position.
114 * Since we are mostly dealing with terminal mode,
115 * let us just skip the next character.
116 */
117 case 'z':
118 return(ESCAPE_SKIPCHAR);
119
120 /*
121 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
122 * 'X' is the trigger. These have opaque sub-strings.
123 */
124 case 'F':
125 /* FALLTHROUGH */
126 case 'g':
127 /* FALLTHROUGH */
128 case 'k':
129 /* FALLTHROUGH */
130 case 'M':
131 /* FALLTHROUGH */
132 case 'm':
133 /* FALLTHROUGH */
134 case 'n':
135 /* FALLTHROUGH */
136 case 'V':
137 /* FALLTHROUGH */
138 case 'Y':
139 gly = ESCAPE_IGNORE;
140 /* FALLTHROUGH */
141 case 'f':
142 if (ESCAPE_ERROR == gly)
143 gly = ESCAPE_FONT;
144 switch (**start) {
145 case '(':
146 *start = ++*end;
147 *sz = 2;
148 break;
149 case '[':
150 *start = ++*end;
151 term = ']';
152 break;
153 default:
154 *sz = 1;
155 break;
156 }
157 break;
158
159 /*
160 * These escapes are of the form \X'Y', where 'X' is the trigger
161 * and 'Y' is any string. These have opaque sub-strings.
162 * The \B and \w escapes are handled in roff.c, roff_res().
163 */
164 case 'A':
165 /* FALLTHROUGH */
166 case 'b':
167 /* FALLTHROUGH */
168 case 'D':
169 /* FALLTHROUGH */
170 case 'o':
171 /* FALLTHROUGH */
172 case 'R':
173 /* FALLTHROUGH */
174 case 'X':
175 /* FALLTHROUGH */
176 case 'Z':
177 if ('\0' == **start)
178 return(ESCAPE_ERROR);
179 gly = ESCAPE_IGNORE;
180 term = **start;
181 *start = ++*end;
182 break;
183
184 /*
185 * These escapes are of the form \X'N', where 'X' is the trigger
186 * and 'N' resolves to a numerical expression.
187 */
188 case 'h':
189 /* FALLTHROUGH */
190 case 'H':
191 /* FALLTHROUGH */
192 case 'L':
193 /* FALLTHROUGH */
194 case 'l':
195 /* FALLTHROUGH */
196 case 'S':
197 /* FALLTHROUGH */
198 case 'v':
199 /* FALLTHROUGH */
200 case 'x':
201 if (strchr(" %&()*+-./0123456789:<=>", **start)) {
202 if ('\0' != **start)
203 ++*end;
204 return(ESCAPE_ERROR);
205 }
206 gly = ESCAPE_IGNORE;
207 term = **start;
208 *start = ++*end;
209 break;
210
211 /*
212 * Special handling for the numbered character escape.
213 * XXX Do any other escapes need similar handling?
214 */
215 case 'N':
216 if ('\0' == **start)
217 return(ESCAPE_ERROR);
218 (*end)++;
219 if (isdigit((unsigned char)**start)) {
220 *sz = 1;
221 return(ESCAPE_IGNORE);
222 }
223 (*start)++;
224 while (isdigit((unsigned char)**end))
225 (*end)++;
226 *sz = *end - *start;
227 if ('\0' != **end)
228 (*end)++;
229 return(ESCAPE_NUMBERED);
230
231 /*
232 * Sizes get a special category of their own.
233 */
234 case 's':
235 gly = ESCAPE_IGNORE;
236
237 /* See +/- counts as a sign. */
238 if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
239 (*end)++;
240
241 switch (**end) {
242 case '(':
243 *start = ++*end;
244 *sz = 2;
245 break;
246 case '[':
247 *start = ++*end;
248 term = ']';
249 break;
250 case '\'':
251 *start = ++*end;
252 term = '\'';
253 break;
254 default:
255 *sz = 1;
256 break;
257 }
258
259 break;
260
261 /*
262 * Anything else is assumed to be a glyph.
263 * In this case, pass back the character after the backslash.
264 */
265 default:
266 gly = ESCAPE_SPECIAL;
267 *start = --*end;
268 *sz = 1;
269 break;
270 }
271
272 assert(ESCAPE_ERROR != gly);
273
274 /*
275 * Read up to the terminating character,
276 * paying attention to nested escapes.
277 */
278
279 if ('\0' != term) {
280 while (**end != term) {
281 switch (**end) {
282 case '\0':
283 return(ESCAPE_ERROR);
284 case '\\':
285 (*end)++;
286 if (ESCAPE_ERROR ==
287 mandoc_escape(end, NULL, NULL))
288 return(ESCAPE_ERROR);
289 break;
290 default:
291 (*end)++;
292 break;
293 }
294 }
295 *sz = (*end)++ - *start;
296 } else {
297 assert(*sz > 0);
298 if ((size_t)*sz > strlen(*start))
299 return(ESCAPE_ERROR);
300 *end += *sz;
301 }
302
303 /* Run post-processors. */
304
305 switch (gly) {
306 case ESCAPE_FONT:
307 if (2 == *sz) {
308 if ('C' == **start) {
309 /*
310 * Treat constant-width font modes
311 * just like regular font modes.
312 */
313 (*start)++;
314 (*sz)--;
315 } else {
316 if ('B' == (*start)[0] && 'I' == (*start)[1])
317 gly = ESCAPE_FONTBI;
318 break;
319 }
320 } else if (1 != *sz)
321 break;
322
323 switch (**start) {
324 case '3':
325 /* FALLTHROUGH */
326 case 'B':
327 gly = ESCAPE_FONTBOLD;
328 break;
329 case '2':
330 /* FALLTHROUGH */
331 case 'I':
332 gly = ESCAPE_FONTITALIC;
333 break;
334 case 'P':
335 gly = ESCAPE_FONTPREV;
336 break;
337 case '1':
338 /* FALLTHROUGH */
339 case 'R':
340 gly = ESCAPE_FONTROMAN;
341 break;
342 }
343 break;
344 case ESCAPE_SPECIAL:
345 if (1 == *sz && 'c' == **start)
346 gly = ESCAPE_NOSPACE;
347 break;
348 default:
349 break;
350 }
351
352 return(gly);
353 }
354
355 /*
356 * Parse a quoted or unquoted roff-style request or macro argument.
357 * Return a pointer to the parsed argument, which is either the original
358 * pointer or advanced by one byte in case the argument is quoted.
359 * NUL-terminate the argument in place.
360 * Collapse pairs of quotes inside quoted arguments.
361 * Advance the argument pointer to the next argument,
362 * or to the NUL byte terminating the argument line.
363 */
364 char *
365 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
366 {
367 char *start, *cp;
368 int quoted, pairs, white;
369
370 /* Quoting can only start with a new word. */
371 start = *cpp;
372 quoted = 0;
373 if ('"' == *start) {
374 quoted = 1;
375 start++;
376 }
377
378 pairs = 0;
379 white = 0;
380 for (cp = start; '\0' != *cp; cp++) {
381
382 /*
383 * Move the following text left
384 * after quoted quotes and after "\\" and "\t".
385 */
386 if (pairs)
387 cp[-pairs] = cp[0];
388
389 if ('\\' == cp[0]) {
390 /*
391 * In copy mode, translate double to single
392 * backslashes and backslash-t to literal tabs.
393 */
394 switch (cp[1]) {
395 case 't':
396 cp[0] = '\t';
397 /* FALLTHROUGH */
398 case '\\':
399 pairs++;
400 cp++;
401 break;
402 case ' ':
403 /* Skip escaped blanks. */
404 if (0 == quoted)
405 cp++;
406 break;
407 default:
408 break;
409 }
410 } else if (0 == quoted) {
411 if (' ' == cp[0]) {
412 /* Unescaped blanks end unquoted args. */
413 white = 1;
414 break;
415 }
416 } else if ('"' == cp[0]) {
417 if ('"' == cp[1]) {
418 /* Quoted quotes collapse. */
419 pairs++;
420 cp++;
421 } else {
422 /* Unquoted quotes end quoted args. */
423 quoted = 2;
424 break;
425 }
426 }
427 }
428
429 /* Quoted argument without a closing quote. */
430 if (1 == quoted)
431 mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL);
432
433 /* NUL-terminate this argument and move to the next one. */
434 if (pairs)
435 cp[-pairs] = '\0';
436 if ('\0' != *cp) {
437 *cp++ = '\0';
438 while (' ' == *cp)
439 cp++;
440 }
441 *pos += (int)(cp - start) + (quoted ? 1 : 0);
442 *cpp = cp;
443
444 if ('\0' == *cp && (white || ' ' == cp[-1]))
445 mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL);
446
447 return(start);
448 }
449
/*
 * Try to parse the complete string p as a date in strptime(3) format fmt.
 * On success, store the result of mktime(3) in *t and return 1.
 * Return 0 and leave *t alone if the string does not match, if input
 * remains after the match, or if strptime(3) is not compiled in.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	char		*rest;

	memset(&parsed, 0, sizeof(parsed));

#if HAVE_STRPTIME
	rest = strptime(p, fmt, &parsed);
#else
	rest = NULL;
#endif

	/* Reject both parse failures and trailing garbage. */
	if (NULL == rest || '\0' != *rest)
		return(0);

	*t = mktime(&parsed);
	return(1);
}
469
/*
 * Format the time t in the local time zone as "Month D, YYYY".
 * Return a newly allocated string that the caller must free(3),
 * or NULL if the time cannot be broken down or does not fit
 * into the space reserved for the individual fields.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* localtime(3) returns NULL if t cannot be represented. */
	tm = localtime(&t);
	if (NULL == tm)
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
		goto fail;
	p += (int)ssz;

	/*
	 * snprintf(3) reports the length it wanted to write;
	 * anything beyond four bytes was truncated and must not
	 * be used to advance the write position.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
504
505 char *
506 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
507 {
508 char *out;
509 time_t t;
510
511 if (NULL == in || '\0' == *in ||
512 0 == strcmp(in, "$" "Mdocdate$")) {
513 mandoc_msg(MANDOCERR_DATE_MISSING, parse, ln, pos, NULL);
514 time(&t);
515 }
516 else if (a2time(&t, "%Y-%m-%d", in))
517 t = 0;
518 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
519 !a2time(&t, "%b %d, %Y", in)) {
520 mandoc_msg(MANDOCERR_DATE_BAD, parse, ln, pos, in);
521 t = 0;
522 }
523 out = t ? time2a(t) : NULL;
524 return(out ? out : mandoc_strdup(in));
525 }
526
/*
 * Decide whether the sz bytes starting at p end a sentence.
 * The text is scanned backwards from its last byte:
 * closing quotes, brackets and parentheses are skipped, remembering
 * whether any of them follows the sentence punctuation, and any
 * of '.', '!' and '?' marks the punctuation as found.
 * The first other character ends the scan.
 * Return 1 if punctuation was found and it is either not followed
 * by a closing delimiter or directly preceded (ignoring the skipped
 * characters) by an alphanumeric character, 0 otherwise.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t	 i;
	int	 enclosed, found;
	char	 c;

	if (0 == sz)
		return(0);

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	enclosed = found = 0;

	/*
	 * Count down with an index rather than a pointer, such that
	 * no pointer to before the start of the text is ever formed.
	 */
	for (i = sz; i > 0; i--) {
		c = p[i - 1];
		switch (c) {
		case '\"':
			/* FALLTHROUGH */
		case '\'':
			/* FALLTHROUGH */
		case ']':
			/* FALLTHROUGH */
		case ')':
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
			/* FALLTHROUGH */
		case '!':
			/* FALLTHROUGH */
		case '?':
			found = 1;
			break;
		default:
			return(found &&
			    (!enclosed || isalnum((unsigned char)c)));
		}
	}

	/* Nothing but punctuation and closing delimiters. */
	return(found && !enclosed);
}
569
/*
 * Convert the first sz bytes of p to an int with strtol(3),
 * using the given base.
 * Return -1 if sz exceeds 31 bytes, if the text is empty,
 * or if the conversion does not consume all of it.
 * Values outside the range of int are clamped to INT_MIN or INT_MAX.
 * Negative numbers are returned as they are, so -1 is both the
 * error indicator and a possible valid result.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 text[32];
	char	*stop;
	long	 val;

	/* Leave room for the terminating NUL byte. */
	if (sz >= sizeof(text))
		return(-1);

	memcpy(text, p, sz);
	text[sz] = '\0';

	errno = 0;
	val = strtol(text, &stop, base);

	if ('\0' == text[0] || '\0' != *stop)
		return(-1);

	if (val > INT_MAX)
		return(INT_MAX);
	if (val < INT_MIN)
		return(INT_MIN);

	return((int)val);
}