-/* $Id: read.c,v 1.89 2014/10/11 21:14:16 schwarze Exp $ */
+/* $Id: read.c,v 1.93 2014/10/25 01:03:52 schwarze Exp $ */
/*
* Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2010-2014 Ingo Schwarze <schwarze@openbsd.org>
#define REPARSE_LIMIT 1000
-struct buf {
- char *buf; /* binary input buffer */
- size_t sz; /* size of binary buffer */
-};
-
struct mparse {
struct man *pman; /* persistent man parser */
struct mdoc *pmdoc; /* persistent mdoc parser */
enum mandoclevel file_status; /* status of current parse */
enum mandoclevel wlevel; /* ignore messages below this */
int options; /* parser options */
+ int filenc; /* encoding of the current file */
int reparse_count; /* finite interp. stack */
int line; /* line number in the file */
};
"missing font type, using \\fR",
"unknown font type, using \\fR",
"missing -std argument, adding it",
+ "missing eqn box, using \"\"",
/* related to bad macro arguments */
"unterminated quoted argument",
"equation scope open on exit",
"overlapping equation scopes",
"unexpected end of equation",
- "equation syntax error",
/* related to tables */
"bad table syntax",
"skipping request without numeric argument",
"skipping all arguments",
"skipping excess arguments",
+ "divide by zero",
"generic fatal error",
lnn = curp->line;
pos = 0;
- for (i = 0; i < (int)blk.sz; ) {
+ for (i = blk.offs; i < (int)blk.sz; ) {
if (0 == pos && '\0' == blk.buf[i])
break;
if (start) {
curp->line = lnn;
curp->reparse_count = 0;
+
+ if (lnn < 3 &&
+ curp->filenc & MPARSE_UTF8 &&
+ curp->filenc & MPARSE_LATIN1) {
+ blk.offs = i;
+ curp->filenc = preconv_cue(&blk);
+ }
}
while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) {
}
/*
- * Make sure we have space for at least
- * one backslash and one other character
- * and the trailing NUL byte.
+ * Make sure we have space for the worst
+ * case of 11 bytes: "\\[u10ffff]\0"
*/
- if (pos + 2 >= (int)ln.sz)
+ if (pos + 11 > (int)ln.sz)
resize_buf(&ln, 256);
/*
- * Warn about bogus characters. If you're using
- * non-ASCII encoding, you're screwing your
- * readers. Since I'd rather this not happen,
- * I'll be helpful and replace these characters
- * with "?", so we don't display gibberish.
- * Note to manual writers: use special characters.
+ * Encode 8-bit input.
*/
- c = (unsigned char) blk.buf[i];
+ c = blk.buf[i];
+ if (c & 0x80) {
+ blk.offs = i;
+ ln.offs = pos;
+ if (curp->filenc && preconv_encode(
+ &blk, &ln, &curp->filenc)) {
+ pos = ln.offs;
+ i = blk.offs;
+ } else {
+ mandoc_vmsg(MANDOCERR_BADCHAR,
+ curp, curp->line, pos,
+ "0x%x", c);
+ ln.buf[pos++] = '?';
+ i++;
+ }
+ continue;
+ }
+
+ /*
+ * Exclude control characters.
+ */
- if ( ! (isascii(c) &&
- (isgraph(c) || isblank(c)))) {
+ if (c == 0x7f || (c < 0x20 && c != 0x09)) {
mandoc_vmsg(MANDOCERR_BADCHAR, curp,
curp->line, pos, "0x%x", c);
i++;
return(0);
}
*with_mmap = 1;
+ fb->offs = 0;
fb->sz = (size_t)st.st_size;
fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
if (fb->buf != MAP_FAILED)
ssz = read(fd, fb->buf + (int)off, fb->sz - off);
if (ssz == 0) {
fb->sz = off;
+ fb->offs = 0;
return(1);
}
if (ssz == -1) {
curp->line = 1;
recursion_depth++;
+ /* Skip a UTF-8 byte order mark. */
+ if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
+ (unsigned char)blk.buf[0] == 0xef &&
+ (unsigned char)blk.buf[1] == 0xbb &&
+ (unsigned char)blk.buf[2] == 0xbf) {
+ blk.offs = 3;
+ curp->filenc &= ~MPARSE_LATIN1;
+ }
+
mparse_buf_r(curp, blk, 1);
if (0 == --recursion_depth && MANDOCLEVEL_FATAL > curp->file_status)
blk.buf = UNCONST(buf);
blk.sz = len;
+ blk.offs = 0;
mparse_parse_buffer(curp, blk, file);
return(curp->file_status);
{
struct buf blk;
int with_mmap;
+ int save_filenc;
if (-1 == fd && -1 == (fd = open(file, O_RDONLY, 0))) {
curp->file_status = MANDOCLEVEL_SYSERR;
(*curp->mmsg)(MANDOCERR_SYSOPEN,
curp->file_status,
file, 0, 0, strerror(errno));
- goto out;
+ return(curp->file_status);
}
/*
* the parse phase for the file.
*/
- if ( ! read_whole_file(curp, file, fd, &blk, &with_mmap))
- goto out;
-
- mparse_parse_buffer(curp, blk, file);
-
+ if (read_whole_file(curp, file, fd, &blk, &with_mmap)) {
+ save_filenc = curp->filenc;
+ curp->filenc = curp->options &
+ (MPARSE_UTF8 | MPARSE_LATIN1);
+ mparse_parse_buffer(curp, blk, file);
+ curp->filenc = save_filenc;
#if HAVE_MMAP
- if (with_mmap)
- munmap(blk.buf, blk.sz);
- else
+ if (with_mmap)
+ munmap(blk.buf, blk.sz);
+ else
#endif
- free(blk.buf);
+ free(blk.buf);
+ }
if (STDIN_FILENO != fd && -1 == close(fd))
perror(file);
-out:
+
return(curp->file_status);
}