]> git.cameronkatri.com Git - mandoc.git/blob - read.c
Have mandoc-db accumulate manual page descriptions (`Nd' in -mdoc parlance)
[mandoc.git] / read.c
1 /* $Id: read.c,v 1.13 2011/04/11 21:59:39 kristaps Exp $ */
2 /*
3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2010, 2011 Ingo Schwarze <schwarze@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21
22 #include <sys/stat.h>
23 #include <sys/mman.h>
24
25 #include <assert.h>
26 #include <ctype.h>
27 #include <fcntl.h>
28 #include <stdarg.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <unistd.h>
33
34 #include "mandoc.h"
35 #include "libmandoc.h"
36 #include "mdoc.h"
37 #include "man.h"
38
39 #ifndef MAP_FILE
40 #define MAP_FILE 0
41 #endif
42
43 #define REPARSE_LIMIT 1000
44
/*
 * A span of raw input bytes: used both for the contents of a whole
 * input file and for the single line currently being assembled.
 */
struct	buf {
	char		 *buf;	/* first byte of the input data */
	size_t		  sz;	/* number of bytes available at buf */
};
49
50 struct mparse {
51 enum mandoclevel file_status; /* status of current parse */
52 enum mandoclevel wlevel; /* ignore messages below this */
53 int line; /* line number in the file */
54 enum mparset inttype; /* which parser to use */
55 struct man *pman; /* persistent man parser */
56 struct mdoc *pmdoc; /* persistent mdoc parser */
57 struct man *man; /* man parser */
58 struct mdoc *mdoc; /* mdoc parser */
59 struct roff *roff; /* roff parser (!NULL) */
60 struct regset regs; /* roff registers */
61 int reparse_count; /* finite interp. stack */
62 mandocmsg mmsg; /* warning/error message handler */
63 void *arg; /* argument to mmsg */
64 const char *file;
65 };
66
/* Reading input into memory. */
static	void	  resize_buf(struct buf *, size_t);
static	int	  read_whole_file(const char *, int, struct buf *, int *);

/* Driving the parsers over the input. */
static	void	  pset(const char *, int, struct mparse *);
static	void	  pdesc(struct mparse *, const char *, int);
static	void	  mparse_buf_r(struct mparse *, struct buf, int);
static	void	  mparse_readfd_r(struct mparse *, int, const char *, int);
static	void	  mparse_end(struct mparse *);
74
75 static const enum mandocerr mandoclimits[MANDOCLEVEL_MAX] = {
76 MANDOCERR_OK,
77 MANDOCERR_WARNING,
78 MANDOCERR_WARNING,
79 MANDOCERR_ERROR,
80 MANDOCERR_FATAL,
81 MANDOCERR_MAX,
82 MANDOCERR_MAX
83 };
84
85 static const char * const mandocerrs[MANDOCERR_MAX] = {
86 "ok",
87
88 "generic warning",
89
90 /* related to the prologue */
91 "no title in document",
92 "document title should be all caps",
93 "unknown manual section",
94 "date missing, using today's date",
95 "cannot parse date, using it verbatim",
96 "prologue macros out of order",
97 "duplicate prologue macro",
98 "macro not allowed in prologue",
99 "macro not allowed in body",
100
101 /* related to document structure */
102 ".so is fragile, better use ln(1)",
103 "NAME section must come first",
104 "bad NAME section contents",
105 "manual name not yet set",
106 "sections out of conventional order",
107 "duplicate section name",
108 "section not in conventional manual section",
109
110 /* related to macros and nesting */
111 "skipping obsolete macro",
112 "skipping paragraph macro",
113 "skipping no-space macro",
114 "blocks badly nested",
115 "child violates parent syntax",
116 "nested displays are not portable",
117 "already in literal mode",
118 "line scope broken",
119
120 /* related to missing macro arguments */
121 "skipping empty macro",
122 "argument count wrong",
123 "missing display type",
124 "list type must come first",
125 "tag lists require a width argument",
126 "missing font type",
127 "skipping end of block that is not open",
128
129 /* related to bad macro arguments */
130 "skipping argument",
131 "duplicate argument",
132 "duplicate display type",
133 "duplicate list type",
134 "unknown AT&T UNIX version",
135 "bad Boolean value",
136 "unknown font",
137 "unknown standard specifier",
138 "bad width argument",
139
140 /* related to plain text */
141 "blank line in non-literal context",
142 "tab in non-literal context",
143 "end of line whitespace",
144 "bad comment style",
145 "bad escape sequence",
146 "unterminated quoted string",
147
148 "generic error",
149
150 /* related to tables */
151 "bad table syntax",
152 "bad table option",
153 "bad table layout",
154 "no table layout cells specified",
155 "no table data cells specified",
156 "ignore data in cell",
157 "data block still open",
158 "ignoring extra data cells",
159
160 "input stack limit exceeded, infinite loop?",
161 "skipping bad character",
162 "escaped character not allowed in a name",
163 "skipping text before the first section header",
164 "skipping unknown macro",
165 "NOT IMPLEMENTED, please use groff: skipping request",
166 "argument count wrong",
167 "skipping end of block that is not open",
168 "missing end of block",
169 "scope open on exit",
170 "uname(3) system call failed",
171 "macro requires line argument(s)",
172 "macro requires body argument(s)",
173 "macro requires argument(s)",
174 "missing list type",
175 "line argument(s) will be lost",
176 "body argument(s) will be lost",
177
178 "generic fatal error",
179
180 "not a manual",
181 "column syntax is inconsistent",
182 "NOT IMPLEMENTED: .Bd -file",
183 "line scope broken, syntax violated",
184 "argument count wrong, violates syntax",
185 "child violates parent syntax",
186 "argument count wrong, violates syntax",
187 "NOT IMPLEMENTED: .so with absolute path or \"..\"",
188 "no document body",
189 "no document prologue",
190 "static buffer exhausted",
191 };
192
193 static const char * const mandoclevels[MANDOCLEVEL_MAX] = {
194 "SUCCESS",
195 "RESERVED",
196 "WARNING",
197 "ERROR",
198 "FATAL",
199 "BADARG",
200 "SYSERR"
201 };
202
203 static void
204 resize_buf(struct buf *buf, size_t initial)
205 {
206
207 buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
208 buf->buf = mandoc_realloc(buf->buf, buf->sz);
209 }
210
211 static void
212 pset(const char *buf, int pos, struct mparse *curp)
213 {
214 int i;
215
216 /*
217 * Try to intuit which kind of manual parser should be used. If
218 * passed in by command-line (-man, -mdoc), then use that
219 * explicitly. If passed as -mandoc, then try to guess from the
220 * line: either skip dot-lines, use -mdoc when finding `.Dt', or
221 * default to -man, which is more lenient.
222 *
223 * Separate out pmdoc/pman from mdoc/man: the first persists
224 * through all parsers, while the latter is used per-parse.
225 */
226
227 if ('.' == buf[0] || '\'' == buf[0]) {
228 for (i = 1; buf[i]; i++)
229 if (' ' != buf[i] && '\t' != buf[i])
230 break;
231 if ('\0' == buf[i])
232 return;
233 }
234
235 switch (curp->inttype) {
236 case (MPARSE_MDOC):
237 if (NULL == curp->pmdoc)
238 curp->pmdoc = mdoc_alloc(&curp->regs, curp);
239 assert(curp->pmdoc);
240 curp->mdoc = curp->pmdoc;
241 return;
242 case (MPARSE_MAN):
243 if (NULL == curp->pman)
244 curp->pman = man_alloc(&curp->regs, curp);
245 assert(curp->pman);
246 curp->man = curp->pman;
247 return;
248 default:
249 break;
250 }
251
252 if (pos >= 3 && 0 == memcmp(buf, ".Dd", 3)) {
253 if (NULL == curp->pmdoc)
254 curp->pmdoc = mdoc_alloc(&curp->regs, curp);
255 assert(curp->pmdoc);
256 curp->mdoc = curp->pmdoc;
257 return;
258 }
259
260 if (NULL == curp->pman)
261 curp->pman = man_alloc(&curp->regs, curp);
262 assert(curp->pman);
263 curp->man = curp->pman;
264 }
265
266 /*
267 * Main parse routine for an opened file. This is called for each
268 * opened file and simply loops around the full input file, possibly
269 * nesting (i.e., with `so').
270 */
271 static void
272 mparse_buf_r(struct mparse *curp, struct buf blk, int start)
273 {
274 const struct tbl_span *span;
275 struct buf ln;
276 enum rofferr rr;
277 int i, of, rc;
278 int pos; /* byte number in the ln buffer */
279 int lnn; /* line number in the real file */
280 unsigned char c;
281
282 memset(&ln, 0, sizeof(struct buf));
283
284 lnn = curp->line;
285 pos = 0;
286
287 for (i = 0; i < (int)blk.sz; ) {
288 if (0 == pos && '\0' == blk.buf[i])
289 break;
290
291 if (start) {
292 curp->line = lnn;
293 curp->reparse_count = 0;
294 }
295
296 while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) {
297
298 /*
299 * When finding an unescaped newline character,
300 * leave the character loop to process the line.
301 * Skip a preceding carriage return, if any.
302 */
303
304 if ('\r' == blk.buf[i] && i + 1 < (int)blk.sz &&
305 '\n' == blk.buf[i + 1])
306 ++i;
307 if ('\n' == blk.buf[i]) {
308 ++i;
309 ++lnn;
310 break;
311 }
312
313 /*
314 * Warn about bogus characters. If you're using
315 * non-ASCII encoding, you're screwing your
316 * readers. Since I'd rather this not happen,
317 * I'll be helpful and drop these characters so
318 * we don't display gibberish. Note to manual
319 * writers: use special characters.
320 */
321
322 c = (unsigned char) blk.buf[i];
323
324 if ( ! (isascii(c) &&
325 (isgraph(c) || isblank(c)))) {
326 mandoc_msg(MANDOCERR_BADCHAR, curp,
327 curp->line, pos, "ignoring byte");
328 i++;
329 continue;
330 }
331
332 /* Trailing backslash = a plain char. */
333
334 if ('\\' != blk.buf[i] || i + 1 == (int)blk.sz) {
335 if (pos >= (int)ln.sz)
336 resize_buf(&ln, 256);
337 ln.buf[pos++] = blk.buf[i++];
338 continue;
339 }
340
341 /*
342 * Found escape and at least one other character.
343 * When it's a newline character, skip it.
344 * When there is a carriage return in between,
345 * skip that one as well.
346 */
347
348 if ('\r' == blk.buf[i + 1] && i + 2 < (int)blk.sz &&
349 '\n' == blk.buf[i + 2])
350 ++i;
351 if ('\n' == blk.buf[i + 1]) {
352 i += 2;
353 ++lnn;
354 continue;
355 }
356
357 if ('"' == blk.buf[i + 1] || '#' == blk.buf[i + 1]) {
358 i += 2;
359 /* Comment, skip to end of line */
360 for (; i < (int)blk.sz; ++i) {
361 if ('\n' == blk.buf[i]) {
362 ++i;
363 ++lnn;
364 break;
365 }
366 }
367
368 /* Backout trailing whitespaces */
369 for (; pos > 0; --pos) {
370 if (ln.buf[pos - 1] != ' ')
371 break;
372 if (pos > 2 && ln.buf[pos - 2] == '\\')
373 break;
374 }
375 break;
376 }
377
378 /* Some other escape sequence, copy & cont. */
379
380 if (pos + 1 >= (int)ln.sz)
381 resize_buf(&ln, 256);
382
383 ln.buf[pos++] = blk.buf[i++];
384 ln.buf[pos++] = blk.buf[i++];
385 }
386
387 if (pos >= (int)ln.sz)
388 resize_buf(&ln, 256);
389
390 ln.buf[pos] = '\0';
391
392 /*
393 * A significant amount of complexity is contained by
394 * the roff preprocessor. It's line-oriented but can be
395 * expressed on one line, so we need at times to
396 * readjust our starting point and re-run it. The roff
397 * preprocessor can also readjust the buffers with new
398 * data, so we pass them in wholesale.
399 */
400
401 of = 0;
402
403 rerun:
404 rr = roff_parseln
405 (curp->roff, curp->line,
406 &ln.buf, &ln.sz, of, &of);
407
408 switch (rr) {
409 case (ROFF_REPARSE):
410 if (REPARSE_LIMIT >= ++curp->reparse_count)
411 mparse_buf_r(curp, ln, 0);
412 else
413 mandoc_msg(MANDOCERR_ROFFLOOP, curp,
414 curp->line, pos, NULL);
415 pos = 0;
416 continue;
417 case (ROFF_APPEND):
418 pos = (int)strlen(ln.buf);
419 continue;
420 case (ROFF_RERUN):
421 goto rerun;
422 case (ROFF_IGN):
423 pos = 0;
424 continue;
425 case (ROFF_ERR):
426 assert(MANDOCLEVEL_FATAL <= curp->file_status);
427 break;
428 case (ROFF_SO):
429 mparse_readfd_r(curp, -1, ln.buf + of, 1);
430 if (MANDOCLEVEL_FATAL <= curp->file_status)
431 break;
432 pos = 0;
433 continue;
434 default:
435 break;
436 }
437
438 /*
439 * If we encounter errors in the recursive parse, make
440 * sure we don't continue parsing.
441 */
442
443 if (MANDOCLEVEL_FATAL <= curp->file_status)
444 break;
445
446 /*
447 * If input parsers have not been allocated, do so now.
448 * We keep these instanced betwen parsers, but set them
449 * locally per parse routine since we can use different
450 * parsers with each one.
451 */
452
453 if ( ! (curp->man || curp->mdoc))
454 pset(ln.buf + of, pos - of, curp);
455
456 /*
457 * Lastly, push down into the parsers themselves. One
458 * of these will have already been set in the pset()
459 * routine.
460 * If libroff returns ROFF_TBL, then add it to the
461 * currently open parse. Since we only get here if
462 * there does exist data (see tbl_data.c), we're
463 * guaranteed that something's been allocated.
464 * Do the same for ROFF_EQN.
465 */
466
467 rc = -1;
468
469 if (ROFF_TBL == rr)
470 while (NULL != (span = roff_span(curp->roff))) {
471 rc = curp->man ?
472 man_addspan(curp->man, span) :
473 mdoc_addspan(curp->mdoc, span);
474 if (0 == rc)
475 break;
476 }
477 else if (ROFF_EQN == rr)
478 rc = curp->mdoc ?
479 mdoc_addeqn(curp->mdoc,
480 roff_eqn(curp->roff)) :
481 man_addeqn(curp->man,
482 roff_eqn(curp->roff));
483 else if (curp->man || curp->mdoc)
484 rc = curp->man ?
485 man_parseln(curp->man,
486 curp->line, ln.buf, of) :
487 mdoc_parseln(curp->mdoc,
488 curp->line, ln.buf, of);
489
490 if (0 == rc) {
491 assert(MANDOCLEVEL_FATAL <= curp->file_status);
492 break;
493 }
494
495 /* Temporary buffers typically are not full. */
496
497 if (0 == start && '\0' == blk.buf[i])
498 break;
499
500 /* Start the next input line. */
501
502 pos = 0;
503 }
504
505 free(ln.buf);
506 }
507
508 static void
509 pdesc(struct mparse *curp, const char *file, int fd)
510 {
511 struct buf blk;
512 int with_mmap;
513
514 /*
515 * Run for each opened file; may be called more than once for
516 * each full parse sequence if the opened file is nested (i.e.,
517 * from `so'). Simply sucks in the whole file and moves into
518 * the parse phase for the file.
519 */
520
521 if ( ! read_whole_file(file, fd, &blk, &with_mmap)) {
522 curp->file_status = MANDOCLEVEL_SYSERR;
523 return;
524 }
525
526 /* Line number is per-file. */
527
528 curp->line = 1;
529
530 mparse_buf_r(curp, blk, 1);
531
532 if (with_mmap)
533 munmap(blk.buf, blk.sz);
534 else
535 free(blk.buf);
536 }
537
538 static int
539 read_whole_file(const char *file, int fd, struct buf *fb, int *with_mmap)
540 {
541 struct stat st;
542 size_t off;
543 ssize_t ssz;
544
545 if (-1 == fstat(fd, &st)) {
546 perror(file);
547 return(0);
548 }
549
550 /*
551 * If we're a regular file, try just reading in the whole entry
552 * via mmap(). This is faster than reading it into blocks, and
553 * since each file is only a few bytes to begin with, I'm not
554 * concerned that this is going to tank any machines.
555 */
556
557 if (S_ISREG(st.st_mode)) {
558 if (st.st_size >= (1U << 31)) {
559 fprintf(stderr, "%s: input too large\n", file);
560 return(0);
561 }
562 *with_mmap = 1;
563 fb->sz = (size_t)st.st_size;
564 fb->buf = mmap(NULL, fb->sz, PROT_READ,
565 MAP_FILE|MAP_SHARED, fd, 0);
566 if (fb->buf != MAP_FAILED)
567 return(1);
568 }
569
570 /*
571 * If this isn't a regular file (like, say, stdin), then we must
572 * go the old way and just read things in bit by bit.
573 */
574
575 *with_mmap = 0;
576 off = 0;
577 fb->sz = 0;
578 fb->buf = NULL;
579 for (;;) {
580 if (off == fb->sz) {
581 if (fb->sz == (1U << 31)) {
582 fprintf(stderr, "%s: input too large\n", file);
583 break;
584 }
585 resize_buf(fb, 65536);
586 }
587 ssz = read(fd, fb->buf + (int)off, fb->sz - off);
588 if (ssz == 0) {
589 fb->sz = off;
590 return(1);
591 }
592 if (ssz == -1) {
593 perror(file);
594 break;
595 }
596 off += (size_t)ssz;
597 }
598
599 free(fb->buf);
600 fb->buf = NULL;
601 return(0);
602 }
603
604 static void
605 mparse_end(struct mparse *curp)
606 {
607
608 if (MANDOCLEVEL_FATAL <= curp->file_status)
609 return;
610
611 if (curp->mdoc && ! mdoc_endparse(curp->mdoc)) {
612 assert(MANDOCLEVEL_FATAL <= curp->file_status);
613 return;
614 }
615
616 if (curp->man && ! man_endparse(curp->man)) {
617 assert(MANDOCLEVEL_FATAL <= curp->file_status);
618 return;
619 }
620
621 if ( ! (curp->man || curp->mdoc)) {
622 mandoc_msg(MANDOCERR_NOTMANUAL, curp, 1, 0, NULL);
623 curp->file_status = MANDOCLEVEL_FATAL;
624 return;
625 }
626
627 roff_endparse(curp->roff);
628 }
629
630 static void
631 mparse_readfd_r(struct mparse *curp, int fd, const char *file, int re)
632 {
633 const char *svfile;
634
635 if (-1 == fd)
636 if (-1 == (fd = open(file, O_RDONLY, 0))) {
637 perror(file);
638 curp->file_status = MANDOCLEVEL_SYSERR;
639 return;
640 }
641
642 svfile = curp->file;
643 curp->file = file;
644
645 pdesc(curp, file, fd);
646
647 if (0 == re && MANDOCLEVEL_FATAL > curp->file_status)
648 mparse_end(curp);
649
650 if (STDIN_FILENO != fd && -1 == close(fd))
651 perror(file);
652
653 curp->file = svfile;
654 }
655
656 enum mandoclevel
657 mparse_readfd(struct mparse *curp, int fd, const char *file)
658 {
659
660 mparse_readfd_r(curp, fd, file, 0);
661 return(curp->file_status);
662 }
663
664 struct mparse *
665 mparse_alloc(enum mparset inttype, enum mandoclevel wlevel, mandocmsg mmsg, void *arg)
666 {
667 struct mparse *curp;
668
669 assert(wlevel <= MANDOCLEVEL_FATAL);
670
671 curp = mandoc_calloc(1, sizeof(struct mparse));
672
673 curp->wlevel = wlevel;
674 curp->mmsg = mmsg;
675 curp->arg = arg;
676 curp->inttype = inttype;
677
678 curp->roff = roff_alloc(&curp->regs, curp);
679 return(curp);
680 }
681
682 void
683 mparse_reset(struct mparse *curp)
684 {
685
686 memset(&curp->regs, 0, sizeof(struct regset));
687
688 roff_reset(curp->roff);
689
690 if (curp->mdoc)
691 mdoc_reset(curp->mdoc);
692 if (curp->man)
693 man_reset(curp->man);
694
695 curp->file_status = MANDOCLEVEL_OK;
696 curp->mdoc = NULL;
697 curp->man = NULL;
698 }
699
700 void
701 mparse_free(struct mparse *curp)
702 {
703
704 if (curp->pmdoc)
705 mdoc_free(curp->pmdoc);
706 if (curp->pman)
707 man_free(curp->pman);
708 if (curp->roff)
709 roff_free(curp->roff);
710
711 free(curp);
712 }
713
714 void
715 mparse_result(struct mparse *curp, struct mdoc **mdoc, struct man **man)
716 {
717
718 if (mdoc)
719 *mdoc = curp->mdoc;
720 if (man)
721 *man = curp->man;
722 }
723
724 void
725 mandoc_vmsg(enum mandocerr t, struct mparse *m,
726 int ln, int pos, const char *fmt, ...)
727 {
728 char buf[256];
729 va_list ap;
730
731 va_start(ap, fmt);
732 vsnprintf(buf, sizeof(buf) - 1, fmt, ap);
733 va_end(ap);
734
735 mandoc_msg(t, m, ln, pos, buf);
736 }
737
738 void
739 mandoc_msg(enum mandocerr er, struct mparse *m,
740 int ln, int col, const char *msg)
741 {
742 enum mandoclevel level;
743
744 level = MANDOCLEVEL_FATAL;
745 while (er < mandoclimits[level])
746 level--;
747
748 if (level < m->wlevel)
749 return;
750
751 if (m->mmsg)
752 (*m->mmsg)(er, level, m->file, ln, col, msg);
753
754 if (m->file_status < level)
755 m->file_status = level;
756 }
757
758 const char *
759 mparse_strerror(enum mandocerr er)
760 {
761
762 return(mandocerrs[er]);
763 }
764
765 const char *
766 mparse_strlevel(enum mandoclevel lvl)
767 {
768 return(mandoclevels[lvl]);
769 }