about summary refs log tree commit diff stats homepage
path: root/read.c
diff options
context:
space:
mode:
author Ingo Schwarze <schwarze@openbsd.org> 2014-10-25 01:03:52 +0000
committer Ingo Schwarze <schwarze@openbsd.org> 2014-10-25 01:03:52 +0000
commit b862a185d00afc5c08a5353e6bc4633818b058cc (patch)
tree 712aa647f086a4f6ef629c3e09e859ea13d21122 /read.c
parent 586bd579d5085e1a046f1f463315281ff3f09037 (diff)
download mandoc-b862a185d00afc5c08a5353e6bc4633818b058cc.tar.gz
mandoc-b862a185d00afc5c08a5353e6bc4633818b058cc.tar.zst
mandoc-b862a185d00afc5c08a5353e6bc4633818b058cc.zip
integrate preconv(1) into mandoc(1);
enhances functionality and reduces code and docs by more than 300 lines
Diffstat (limited to 'read.c')
-rw-r--r-- read.c 73
1 files changed, 53 insertions, 20 deletions
diff --git a/read.c b/read.c
index 6c042e3f..67e6a838 100644
--- a/read.c
+++ b/read.c
@@ -1,4 +1,4 @@
-/* $Id: read.c,v 1.92 2014/10/20 19:04:45 kristaps Exp $ */
+/* $Id: read.c,v 1.93 2014/10/25 01:03:52 schwarze Exp $ */
/*
* Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2010-2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -45,11 +45,6 @@
#define REPARSE_LIMIT 1000
-struct buf {
- char *buf; /* binary input buffer */
- size_t sz; /* size of binary buffer */
-};
-
struct mparse {
struct man *pman; /* persistent man parser */
struct mdoc *pmdoc; /* persistent mdoc parser */
@@ -65,6 +60,7 @@ struct mparse {
enum mandoclevel file_status; /* status of current parse */
enum mandoclevel wlevel; /* ignore messages below this */
int options; /* parser options */
+ int filenc; /* encoding of the current file */
int reparse_count; /* finite interp. stack */
int line; /* line number in the file */
};
@@ -326,13 +322,20 @@ mparse_buf_r(struct mparse *curp, struct buf blk, int start)
lnn = curp->line;
pos = 0;
- for (i = 0; i < (int)blk.sz; ) {
+ for (i = blk.offs; i < (int)blk.sz; ) {
if (0 == pos && '\0' == blk.buf[i])
break;
if (start) {
curp->line = lnn;
curp->reparse_count = 0;
+
+ if (lnn < 3 &&
+ curp->filenc & MPARSE_UTF8 &&
+ curp->filenc & MPARSE_LATIN1) {
+ blk.offs = i;
+ curp->filenc = preconv_cue(&blk);
+ }
}
while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) {
@@ -353,27 +356,40 @@ mparse_buf_r(struct mparse *curp, struct buf blk, int start)
}
/*
- * Make sure we have space for at least
- * one backslash and one other character
- * and the trailing NUL byte.
+ * Make sure we have space for the worst
+ * case of 11 bytes: "\\[u10ffff]\0"
*/
- if (pos + 2 >= (int)ln.sz)
+ if (pos + 11 > (int)ln.sz)
resize_buf(&ln, 256);
/*
- * Warn about bogus characters. If you're using
- * non-ASCII encoding, you're screwing your
- * readers. Since I'd rather this not happen,
- * I'll be helpful and replace these characters
- * with "?", so we don't display gibberish.
- * Note to manual writers: use special characters.
+ * Encode 8-bit input.
*/
- c = (unsigned char) blk.buf[i];
+ c = blk.buf[i];
+ if (c & 0x80) {
+ blk.offs = i;
+ ln.offs = pos;
+ if (curp->filenc && preconv_encode(
+ &blk, &ln, &curp->filenc)) {
+ pos = ln.offs;
+ i = blk.offs;
+ } else {
+ mandoc_vmsg(MANDOCERR_BADCHAR,
+ curp, curp->line, pos,
+ "0x%x", c);
+ ln.buf[pos++] = '?';
+ i++;
+ }
+ continue;
+ }
+
+ /*
+ * Exclude control characters.
+ */
- if ( ! (isascii(c) &&
- (isgraph(c) || isblank(c)))) {
+ if (c == 0x7f || (c < 0x20 && c != 0x09)) {
mandoc_vmsg(MANDOCERR_BADCHAR, curp,
curp->line, pos, "0x%x", c);
i++;
@@ -633,6 +649,7 @@ read_whole_file(struct mparse *curp, const char *file, int fd,
return(0);
}
*with_mmap = 1;
+ fb->offs = 0;
fb->sz = (size_t)st.st_size;
fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
if (fb->buf != MAP_FAILED)
@@ -664,6 +681,7 @@ read_whole_file(struct mparse *curp, const char *file, int fd,
ssz = read(fd, fb->buf + (int)off, fb->sz - off);
if (ssz == 0) {
fb->sz = off;
+ fb->offs = 0;
return(1);
}
if (ssz == -1) {
@@ -735,6 +753,15 @@ mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file)
curp->line = 1;
recursion_depth++;
+ /* Skip a UTF-8 byte order mark. */
+ if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
+ (unsigned char)blk.buf[0] == 0xef &&
+ (unsigned char)blk.buf[1] == 0xbb &&
+ (unsigned char)blk.buf[2] == 0xbf) {
+ blk.offs = 3;
+ curp->filenc &= ~MPARSE_LATIN1;
+ }
+
mparse_buf_r(curp, blk, 1);
if (0 == --recursion_depth && MANDOCLEVEL_FATAL > curp->file_status)
@@ -752,6 +779,7 @@ mparse_readmem(struct mparse *curp, const void *buf, size_t len,
blk.buf = UNCONST(buf);
blk.sz = len;
+ blk.offs = 0;
mparse_parse_buffer(curp, blk, file);
return(curp->file_status);
@@ -762,6 +790,7 @@ mparse_readfd(struct mparse *curp, int fd, const char *file)
{
struct buf blk;
int with_mmap;
+ int save_filenc;
if (-1 == fd && -1 == (fd = open(file, O_RDONLY, 0))) {
curp->file_status = MANDOCLEVEL_SYSERR;
@@ -780,7 +809,11 @@ mparse_readfd(struct mparse *curp, int fd, const char *file)
*/
if (read_whole_file(curp, file, fd, &blk, &with_mmap)) {
+ save_filenc = curp->filenc;
+ curp->filenc = curp->options &
+ (MPARSE_UTF8 | MPARSE_LATIN1);
mparse_parse_buffer(curp, blk, file);
+ curp->filenc = save_filenc;
#if HAVE_MMAP
if (with_mmap)
munmap(blk.buf, blk.sz);