]> git.cameronkatri.com Git - mandoc.git/blob - read.c
Add support for tdefine and ndefine. Consolidate some error messages. Add
[mandoc.git] / read.c
1 /* $Id: read.c,v 1.23 2011/07/23 18:41:18 kristaps Exp $ */
2 /*
3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2010, 2011 Ingo Schwarze <schwarze@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21
22 #ifdef HAVE_MMAP
23 # include <sys/stat.h>
24 # include <sys/mman.h>
25 #endif
26
27 #include <assert.h>
28 #include <ctype.h>
29 #include <fcntl.h>
30 #include <stdarg.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <unistd.h>
35
36 #include "mandoc.h"
37 #include "libmandoc.h"
38 #include "mdoc.h"
39 #include "man.h"
40
/* Fallback so MAP_FILE can be OR'ed into the mmap() flags even where it is not defined. */
41 #ifndef MAP_FILE
42 #define MAP_FILE 0
43 #endif
44
/*
 * Upper bound on curp->reparse_count: once a single top-level input
 * line has triggered more reparses than this, mparse_buf_r() reports
 * MANDOCERR_ROFFLOOP instead of recursing again.
 */
45 #define REPARSE_LIMIT 1000
46
/*
 * A sized byte buffer.  Used both for whole input files (allocated or
 * memory-mapped by read_whole_file()) and for the per-line work buffer
 * grown by resize_buf().
 */
47 struct buf {
48 char *buf; /* binary input buffer */
49 size_t sz; /* size of binary buffer */
50 };
51
/*
 * Parse state shared by all routines in this file.  Allocated by
 * mparse_alloc(), reset between documents by mparse_reset() and
 * released by mparse_free().
 */
52 struct mparse {
53 enum mandoclevel file_status; /* status of current parse */
54 enum mandoclevel wlevel; /* ignore messages below this */
55 int line; /* line number in the file */
56 enum mparset inttype; /* which parser to use */
57 struct man *pman; /* persistent man parser */
58 struct mdoc *pmdoc; /* persistent mdoc parser */
59 struct man *man; /* man parser */
60 struct mdoc *mdoc; /* mdoc parser */
61 struct roff *roff; /* roff parser (!NULL) */
62 int reparse_count; /* finite interp. stack */
63 mandocmsg mmsg; /* warning/error message handler */
64 void *arg; /* argument to mmsg */
65 const char *file; /* name of the file being read; handed to mmsg */
66 };
67
/* Forward declarations of the file-local helpers defined below. */
68 static void resize_buf(struct buf *, size_t);
69 static void mparse_buf_r(struct mparse *, struct buf, int);
70 static void mparse_readfd_r(struct mparse *, int, const char *, int);
71 static void pset(const char *, int, struct mparse *);
72 static void pdesc(struct mparse *, const char *, int);
73 static int read_whole_file(const char *, int, struct buf *, int *);
74 static void mparse_end(struct mparse *);
75
/*
 * Indexed by enum mandoclevel: the smallest enum mandocerr value that
 * belongs to each level.  mandoc_msg() starts at MANDOCLEVEL_FATAL and
 * steps down until the error code is no longer below the entry, so the
 * last two entries (MANDOCERR_MAX) are never selected by that search.
 * NOTE(review): the entry order presumably follows the enum mandoclevel
 * declaration, which is outside this file -- confirm.
 */
76 static const enum mandocerr mandoclimits[MANDOCLEVEL_MAX] = {
77 MANDOCERR_OK,
78 MANDOCERR_WARNING,
79 MANDOCERR_WARNING,
80 MANDOCERR_ERROR,
81 MANDOCERR_FATAL,
82 MANDOCERR_MAX,
83 MANDOCERR_MAX
84 };
85
/*
 * Message text for every enum mandocerr value, indexed by that value;
 * mparse_strerror() returns entries of this table.  The entries must
 * stay in the same order as the enum, which is declared outside this
 * file.
 * NOTE(review): the "generic warning", "generic error" and "generic
 * fatal error" entries presumably correspond to MANDOCERR_WARNING,
 * MANDOCERR_ERROR and MANDOCERR_FATAL, the level boundaries used in
 * mandoclimits -- confirm against the enum declaration.
 */
86 static const char * const mandocerrs[MANDOCERR_MAX] = {
87 "ok",
88
89 "generic warning",
90
91 /* related to the prologue */
92 "no title in document",
93 "document title should be all caps",
94 "unknown manual section",
95 "date missing, using today's date",
96 "cannot parse date, using it verbatim",
97 "prologue macros out of order",
98 "duplicate prologue macro",
99 "macro not allowed in prologue",
100 "macro not allowed in body",
101
102 /* related to document structure */
103 ".so is fragile, better use ln(1)",
104 "NAME section must come first",
105 "bad NAME section contents",
106 "manual name not yet set",
107 "sections out of conventional order",
108 "duplicate section name",
109 "section not in conventional manual section",
110
111 /* related to macros and nesting */
112 "skipping obsolete macro",
113 "skipping paragraph macro",
114 "skipping no-space macro",
115 "blocks badly nested",
116 "child violates parent syntax",
117 "nested displays are not portable",
118 "already in literal mode",
119 "line scope broken",
120
121 /* related to missing macro arguments */
122 "skipping empty macro",
123 "argument count wrong",
124 "missing display type",
125 "list type must come first",
126 "tag lists require a width argument",
127 "missing font type",
128 "skipping end of block that is not open",
129
130 /* related to bad macro arguments */
131 "skipping argument",
132 "duplicate argument",
133 "duplicate display type",
134 "duplicate list type",
135 "unknown AT&T UNIX version",
136 "bad Boolean value",
137 "unknown font",
138 "unknown standard specifier",
139 "bad width argument",
140
141 /* related to plain text */
142 "blank line in non-literal context",
143 "tab in non-literal context",
144 "end of line whitespace",
145 "bad comment style",
146 "bad escape sequence",
147 "unterminated quoted string",
148
149 /* related to equations */
150 "unexpected literal in equation",
151
152 "generic error",
153
154 /* related to equations */
155 "unexpected equation scope closure",
156 "equation scope open on exit",
157 "overlapping equation scopes",
158 "unexpected end of equation",
159 "equation syntax error",
160
161 /* related to tables */
162 "bad table syntax",
163 "bad table option",
164 "bad table layout",
165 "no table layout cells specified",
166 "no table data cells specified",
167 "ignore data in cell",
168 "data block still open",
169 "ignoring extra data cells",
170
171 "input stack limit exceeded, infinite loop?",
172 "skipping bad character",
173 "escaped character not allowed in a name",
174 "skipping text before the first section header",
175 "skipping unknown macro",
176 "NOT IMPLEMENTED, please use groff: skipping request",
177 "argument count wrong",
178 "skipping end of block that is not open",
179 "missing end of block",
180 "scope open on exit",
181 "uname(3) system call failed",
182 "macro requires line argument(s)",
183 "macro requires body argument(s)",
184 "macro requires argument(s)",
185 "missing list type",
186 "line argument(s) will be lost",
187 "body argument(s) will be lost",
188
189 "generic fatal error",
190
191 "not a manual",
192 "column syntax is inconsistent",
193 "NOT IMPLEMENTED: .Bd -file",
194 "line scope broken, syntax violated",
195 "argument count wrong, violates syntax",
196 "child violates parent syntax",
197 "argument count wrong, violates syntax",
198 "NOT IMPLEMENTED: .so with absolute path or \"..\"",
199 "no document body",
200 "no document prologue",
201 "static buffer exhausted",
202 };
203
/*
 * Printable name of every enum mandoclevel value, indexed by that
 * value; mparse_strlevel() returns entries of this table.  The order
 * must match the enum, which is declared outside this file.
 */
204 static const char * const mandoclevels[MANDOCLEVEL_MAX] = {
205 "SUCCESS",
206 "RESERVED",
207 "WARNING",
208 "ERROR",
209 "FATAL",
210 "BADARG",
211 "SYSERR"
212 };
213
214 static void
215 resize_buf(struct buf *buf, size_t initial)
216 {
217
218 buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
219 buf->buf = mandoc_realloc(buf->buf, buf->sz);
220 }
221
222 static void
223 pset(const char *buf, int pos, struct mparse *curp)
224 {
225 int i;
226
227 /*
228 * Try to intuit which kind of manual parser should be used. If
229 * passed in by command-line (-man, -mdoc), then use that
230 * explicitly. If passed as -mandoc, then try to guess from the
231 * line: either skip dot-lines, use -mdoc when finding `.Dt', or
232 * default to -man, which is more lenient.
233 *
234 * Separate out pmdoc/pman from mdoc/man: the first persists
235 * through all parsers, while the latter is used per-parse.
236 */
237
238 if ('.' == buf[0] || '\'' == buf[0]) {
239 for (i = 1; buf[i]; i++)
240 if (' ' != buf[i] && '\t' != buf[i])
241 break;
242 if ('\0' == buf[i])
243 return;
244 }
245
246 switch (curp->inttype) {
247 case (MPARSE_MDOC):
248 if (NULL == curp->pmdoc)
249 curp->pmdoc = mdoc_alloc(curp->roff, curp);
250 assert(curp->pmdoc);
251 curp->mdoc = curp->pmdoc;
252 return;
253 case (MPARSE_MAN):
254 if (NULL == curp->pman)
255 curp->pman = man_alloc(curp->roff, curp);
256 assert(curp->pman);
257 curp->man = curp->pman;
258 return;
259 default:
260 break;
261 }
262
263 if (pos >= 3 && 0 == memcmp(buf, ".Dd", 3)) {
264 if (NULL == curp->pmdoc)
265 curp->pmdoc = mdoc_alloc(curp->roff, curp);
266 assert(curp->pmdoc);
267 curp->mdoc = curp->pmdoc;
268 return;
269 }
270
271 if (NULL == curp->pman)
272 curp->pman = man_alloc(curp->roff, curp);
273 assert(curp->pman);
274 curp->man = curp->pman;
275 }
276
277 /*
278 * Main parse routine for an opened file. This is called for each
279 * opened file and simply loops around the full input file, possibly
280 * nesting (i.e., with `so').
281 */
282 static void
283 mparse_buf_r(struct mparse *curp, struct buf blk, int start)
284 {
285 const struct tbl_span *span;
286 struct buf ln;
287 enum rofferr rr;
288 int i, of, rc;
289 int pos; /* byte number in the ln buffer */
290 int lnn; /* line number in the real file */
291 unsigned char c;
292
293 memset(&ln, 0, sizeof(struct buf));
294
295 lnn = curp->line;
296 pos = 0;
297
298 for (i = 0; i < (int)blk.sz; ) {
299 if (0 == pos && '\0' == blk.buf[i])
300 break;
301
302 if (start) {
303 curp->line = lnn;
304 curp->reparse_count = 0;
305 }
306
307 while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) {
308
309 /*
310 * When finding an unescaped newline character,
311 * leave the character loop to process the line.
312 * Skip a preceding carriage return, if any.
313 */
314
315 if ('\r' == blk.buf[i] && i + 1 < (int)blk.sz &&
316 '\n' == blk.buf[i + 1])
317 ++i;
318 if ('\n' == blk.buf[i]) {
319 ++i;
320 ++lnn;
321 break;
322 }
323
324 /*
325 * Warn about bogus characters. If you're using
326 * non-ASCII encoding, you're screwing your
327 * readers. Since I'd rather this not happen,
328 * I'll be helpful and drop these characters so
329 * we don't display gibberish. Note to manual
330 * writers: use special characters.
331 */
332
333 c = (unsigned char) blk.buf[i];
334
335 if ( ! (isascii(c) &&
336 (isgraph(c) || isblank(c)))) {
337 mandoc_msg(MANDOCERR_BADCHAR, curp,
338 curp->line, pos, "ignoring byte");
339 i++;
340 continue;
341 }
342
343 /* Trailing backslash = a plain char. */
344
345 if ('\\' != blk.buf[i] || i + 1 == (int)blk.sz) {
346 if (pos >= (int)ln.sz)
347 resize_buf(&ln, 256);
348 ln.buf[pos++] = blk.buf[i++];
349 continue;
350 }
351
352 /*
353 * Found escape and at least one other character.
354 * When it's a newline character, skip it.
355 * When there is a carriage return in between,
356 * skip that one as well.
357 */
358
359 if ('\r' == blk.buf[i + 1] && i + 2 < (int)blk.sz &&
360 '\n' == blk.buf[i + 2])
361 ++i;
362 if ('\n' == blk.buf[i + 1]) {
363 i += 2;
364 ++lnn;
365 continue;
366 }
367
368 if ('"' == blk.buf[i + 1] || '#' == blk.buf[i + 1]) {
369 i += 2;
370 /* Comment, skip to end of line */
371 for (; i < (int)blk.sz; ++i) {
372 if ('\n' == blk.buf[i]) {
373 ++i;
374 ++lnn;
375 break;
376 }
377 }
378
379 /* Backout trailing whitespaces */
380 for (; pos > 0; --pos) {
381 if (ln.buf[pos - 1] != ' ')
382 break;
383 if (pos > 2 && ln.buf[pos - 2] == '\\')
384 break;
385 }
386 break;
387 }
388
389 /* Some other escape sequence, copy & cont. */
390
391 if (pos + 1 >= (int)ln.sz)
392 resize_buf(&ln, 256);
393
394 ln.buf[pos++] = blk.buf[i++];
395 ln.buf[pos++] = blk.buf[i++];
396 }
397
398 if (pos >= (int)ln.sz)
399 resize_buf(&ln, 256);
400
401 ln.buf[pos] = '\0';
402
403 /*
404 * A significant amount of complexity is contained by
405 * the roff preprocessor. It's line-oriented but can be
406 * expressed on one line, so we need at times to
407 * readjust our starting point and re-run it. The roff
408 * preprocessor can also readjust the buffers with new
409 * data, so we pass them in wholesale.
410 */
411
412 of = 0;
413
414 rerun:
415 rr = roff_parseln
416 (curp->roff, curp->line,
417 &ln.buf, &ln.sz, of, &of);
418
419 switch (rr) {
420 case (ROFF_REPARSE):
421 if (REPARSE_LIMIT >= ++curp->reparse_count)
422 mparse_buf_r(curp, ln, 0);
423 else
424 mandoc_msg(MANDOCERR_ROFFLOOP, curp,
425 curp->line, pos, NULL);
426 pos = 0;
427 continue;
428 case (ROFF_APPEND):
429 pos = (int)strlen(ln.buf);
430 continue;
431 case (ROFF_RERUN):
432 goto rerun;
433 case (ROFF_IGN):
434 pos = 0;
435 continue;
436 case (ROFF_ERR):
437 assert(MANDOCLEVEL_FATAL <= curp->file_status);
438 break;
439 case (ROFF_SO):
440 mparse_readfd_r(curp, -1, ln.buf + of, 1);
441 if (MANDOCLEVEL_FATAL <= curp->file_status)
442 break;
443 pos = 0;
444 continue;
445 default:
446 break;
447 }
448
449 /*
450 * If we encounter errors in the recursive parse, make
451 * sure we don't continue parsing.
452 */
453
454 if (MANDOCLEVEL_FATAL <= curp->file_status)
455 break;
456
457 /*
458 * If input parsers have not been allocated, do so now.
459 * We keep these instanced between parsers, but set them
460 * locally per parse routine since we can use different
461 * parsers with each one.
462 */
463
464 if ( ! (curp->man || curp->mdoc))
465 pset(ln.buf + of, pos - of, curp);
466
467 /*
468 * Lastly, push down into the parsers themselves. One
469 * of these will have already been set in the pset()
470 * routine.
471 * If libroff returns ROFF_TBL, then add it to the
472 * currently open parse. Since we only get here if
473 * there does exist data (see tbl_data.c), we're
474 * guaranteed that something's been allocated.
475 * Do the same for ROFF_EQN.
476 */
477
478 rc = -1;
479
480 if (ROFF_TBL == rr)
481 while (NULL != (span = roff_span(curp->roff))) {
482 rc = curp->man ?
483 man_addspan(curp->man, span) :
484 mdoc_addspan(curp->mdoc, span);
485 if (0 == rc)
486 break;
487 }
488 else if (ROFF_EQN == rr)
489 rc = curp->mdoc ?
490 mdoc_addeqn(curp->mdoc,
491 roff_eqn(curp->roff)) :
492 man_addeqn(curp->man,
493 roff_eqn(curp->roff));
494 else if (curp->man || curp->mdoc)
495 rc = curp->man ?
496 man_parseln(curp->man,
497 curp->line, ln.buf, of) :
498 mdoc_parseln(curp->mdoc,
499 curp->line, ln.buf, of);
500
501 if (0 == rc) {
502 assert(MANDOCLEVEL_FATAL <= curp->file_status);
503 break;
504 }
505
506 /* Temporary buffers typically are not full. */
507
508 if (0 == start && '\0' == blk.buf[i])
509 break;
510
511 /* Start the next input line. */
512
513 pos = 0;
514 }
515
516 free(ln.buf);
517 }
518
519 static void
520 pdesc(struct mparse *curp, const char *file, int fd)
521 {
522 struct buf blk;
523 int with_mmap;
524
525 /*
526 * Run for each opened file; may be called more than once for
527 * each full parse sequence if the opened file is nested (i.e.,
528 * from `so'). Simply sucks in the whole file and moves into
529 * the parse phase for the file.
530 */
531
532 if ( ! read_whole_file(file, fd, &blk, &with_mmap)) {
533 curp->file_status = MANDOCLEVEL_SYSERR;
534 return;
535 }
536
537 /* Line number is per-file. */
538
539 curp->line = 1;
540
541 mparse_buf_r(curp, blk, 1);
542
543 #ifdef HAVE_MMAP
544 if (with_mmap)
545 munmap(blk.buf, blk.sz);
546 else
547 #endif
548 free(blk.buf);
549 }
550
551 static int
552 read_whole_file(const char *file, int fd, struct buf *fb, int *with_mmap)
553 {
554 size_t off;
555 ssize_t ssz;
556
557 #ifdef HAVE_MMAP
558 struct stat st;
559 if (-1 == fstat(fd, &st)) {
560 perror(file);
561 return(0);
562 }
563
564 /*
565 * If we're a regular file, try just reading in the whole entry
566 * via mmap(). This is faster than reading it into blocks, and
567 * since each file is only a few bytes to begin with, I'm not
568 * concerned that this is going to tank any machines.
569 */
570
571 if (S_ISREG(st.st_mode)) {
572 if (st.st_size >= (1U << 31)) {
573 fprintf(stderr, "%s: input too large\n", file);
574 return(0);
575 }
576 *with_mmap = 1;
577 fb->sz = (size_t)st.st_size;
578 fb->buf = mmap(NULL, fb->sz, PROT_READ,
579 MAP_FILE|MAP_SHARED, fd, 0);
580 if (fb->buf != MAP_FAILED)
581 return(1);
582 }
583 #endif
584
585 /*
586 * If this isn't a regular file (like, say, stdin), then we must
587 * go the old way and just read things in bit by bit.
588 */
589
590 *with_mmap = 0;
591 off = 0;
592 fb->sz = 0;
593 fb->buf = NULL;
594 for (;;) {
595 if (off == fb->sz) {
596 if (fb->sz == (1U << 31)) {
597 fprintf(stderr, "%s: input too large\n", file);
598 break;
599 }
600 resize_buf(fb, 65536);
601 }
602 ssz = read(fd, fb->buf + (int)off, fb->sz - off);
603 if (ssz == 0) {
604 fb->sz = off;
605 return(1);
606 }
607 if (ssz == -1) {
608 perror(file);
609 break;
610 }
611 off += (size_t)ssz;
612 }
613
614 free(fb->buf);
615 fb->buf = NULL;
616 return(0);
617 }
618
619 static void
620 mparse_end(struct mparse *curp)
621 {
622
623 if (MANDOCLEVEL_FATAL <= curp->file_status)
624 return;
625
626 if (curp->mdoc && ! mdoc_endparse(curp->mdoc)) {
627 assert(MANDOCLEVEL_FATAL <= curp->file_status);
628 return;
629 }
630
631 if (curp->man && ! man_endparse(curp->man)) {
632 assert(MANDOCLEVEL_FATAL <= curp->file_status);
633 return;
634 }
635
636 if ( ! (curp->man || curp->mdoc)) {
637 mandoc_msg(MANDOCERR_NOTMANUAL, curp, 1, 0, NULL);
638 curp->file_status = MANDOCLEVEL_FATAL;
639 return;
640 }
641
642 roff_endparse(curp->roff);
643 }
644
645 static void
646 mparse_readfd_r(struct mparse *curp, int fd, const char *file, int re)
647 {
648 const char *svfile;
649
650 if (-1 == fd)
651 if (-1 == (fd = open(file, O_RDONLY, 0))) {
652 perror(file);
653 curp->file_status = MANDOCLEVEL_SYSERR;
654 return;
655 }
656
657 svfile = curp->file;
658 curp->file = file;
659
660 pdesc(curp, file, fd);
661
662 if (0 == re && MANDOCLEVEL_FATAL > curp->file_status)
663 mparse_end(curp);
664
665 if (STDIN_FILENO != fd && -1 == close(fd))
666 perror(file);
667
668 curp->file = svfile;
669 }
670
671 enum mandoclevel
672 mparse_readfd(struct mparse *curp, int fd, const char *file)
673 {
674
675 mparse_readfd_r(curp, fd, file, 0);
676 return(curp->file_status);
677 }
678
679 struct mparse *
680 mparse_alloc(enum mparset inttype, enum mandoclevel wlevel, mandocmsg mmsg, void *arg)
681 {
682 struct mparse *curp;
683
684 assert(wlevel <= MANDOCLEVEL_FATAL);
685
686 curp = mandoc_calloc(1, sizeof(struct mparse));
687
688 curp->wlevel = wlevel;
689 curp->mmsg = mmsg;
690 curp->arg = arg;
691 curp->inttype = inttype;
692
693 curp->roff = roff_alloc(curp);
694 return(curp);
695 }
696
697 void
698 mparse_reset(struct mparse *curp)
699 {
700
701 roff_reset(curp->roff);
702
703 if (curp->mdoc)
704 mdoc_reset(curp->mdoc);
705 if (curp->man)
706 man_reset(curp->man);
707
708 curp->file_status = MANDOCLEVEL_OK;
709 curp->mdoc = NULL;
710 curp->man = NULL;
711 }
712
713 void
714 mparse_free(struct mparse *curp)
715 {
716
717 if (curp->pmdoc)
718 mdoc_free(curp->pmdoc);
719 if (curp->pman)
720 man_free(curp->pman);
721 if (curp->roff)
722 roff_free(curp->roff);
723
724 free(curp);
725 }
726
727 void
728 mparse_result(struct mparse *curp, struct mdoc **mdoc, struct man **man)
729 {
730
731 if (mdoc)
732 *mdoc = curp->mdoc;
733 if (man)
734 *man = curp->man;
735 }
736
737 void
738 mandoc_vmsg(enum mandocerr t, struct mparse *m,
739 int ln, int pos, const char *fmt, ...)
740 {
741 char buf[256];
742 va_list ap;
743
744 va_start(ap, fmt);
745 vsnprintf(buf, sizeof(buf) - 1, fmt, ap);
746 va_end(ap);
747
748 mandoc_msg(t, m, ln, pos, buf);
749 }
750
751 void
752 mandoc_msg(enum mandocerr er, struct mparse *m,
753 int ln, int col, const char *msg)
754 {
755 enum mandoclevel level;
756
757 level = MANDOCLEVEL_FATAL;
758 while (er < mandoclimits[level])
759 level--;
760
761 if (level < m->wlevel)
762 return;
763
764 if (m->mmsg)
765 (*m->mmsg)(er, level, m->file, ln, col, msg);
766
767 if (m->file_status < level)
768 m->file_status = level;
769 }
770
771 const char *
772 mparse_strerror(enum mandocerr er)
773 {
774
775 return(mandocerrs[er]);
776 }
777
778 const char *
779 mparse_strlevel(enum mandoclevel lvl)
780 {
781 return(mandoclevels[lvl]);
782 }