From f860cf56ff373daca5cc5c911a731f157674f2b1 Mon Sep 17 00:00:00 2001 From: Kristaps Dzonsons Date: Thu, 26 May 2011 14:43:07 +0000 Subject: [PATCH] preconv is now on encoding-recognition parity with groff. This last commit adds parsing of "File Variables" in the first two lines in order to grok the encoding. This completes groff's recognition sequence (-e, BOM, File variables, -D, default). I've also cleaned up the manual to indicate this and for some general readability. preconv is now compiled by default in the Makefile. --- Makefile | 2 +- preconv.1 | 73 ++++++++++++++++++++++-------------- preconv.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 152 insertions(+), 31 deletions(-) diff --git a/Makefile b/Makefile index 5f139b8e..8ee84ed6 100644 --- a/Makefile +++ b/Makefile @@ -32,7 +32,7 @@ INSTALL_DATA = $(INSTALL) -m 0444 INSTALL_LIB = $(INSTALL) -m 0644 INSTALL_MAN = $(INSTALL_DATA) -all: mandoc +all: mandoc preconv SRCS = Makefile \ arch.c \ diff --git a/preconv.1 b/preconv.1 index 266cbc7c..12ea6f80 100644 --- a/preconv.1 +++ b/preconv.1 @@ -1,4 +1,4 @@ -.\" $Id: preconv.1,v 1.2 2011/05/26 12:14:46 kristaps Exp $ +.\" $Id: preconv.1,v 1.3 2011/05/26 14:43:07 kristaps Exp $ .\" .\" Copyright (c) 2011 Kristaps Dzonsons .\" @@ -42,18 +42,8 @@ Its arguments are as follows: .Bl -tag -width Ds .It Fl D Ar enc The default encoding. -This is case-insensitive. -See -.Sx Algorithm -and -.Sx Encodings . .It Fl e Ar enc The document's encoding. -This is case-insensitive. -See -.Sx Algorithm -and -.Sx Encodings . .It Ar file The input file. .El @@ -63,27 +53,23 @@ If is not provided, .Nm accepts standard input. -Output is written to standard output. -Unicode characters in the ASCII range are printed as regular ASCII -characters; those above this range are printed using the +See +.Sx Algorithm +for encoding choice. 
+.Pp +The recoded input is written to standard output: Unicode characters in +the ASCII range are printed as regular ASCII characters, while those +above this range are printed using the .Sq \e[uNNNN] format documented in .Xr mandoc_char 7 . .Pp If input bytes are improperly formed in the current encoding, they're passed unmodified to standard output. -.Ss Encodings -The +For some encodings, such as UTF-8, unrecoverable input sequences will +cause .Nm -utility accepts the -.Ar utf\-8 , -.Ar us\-ascii , -and -.Ar latin\-1 -encodings as arguments to -.Fl D Ar enc -or -.Fl e Ar enc . +to stop processing and exit. .Ss Algorithm An encoding is chosen according to the following steps: .Bl -enum @@ -91,13 +77,41 @@ An encoding is chosen according to the following steps: From the argument passed to .Fl e Ar enc . .It -If a BOM exists, utf\-8 encoding is selected. +If a BOM exists, UTF\-8 encoding is selected. +.It +From the coding tags parsed from +.Qq File Variables +on the first two lines of input. +A file variable is an input line of the form +.Pp +.Dl \%.\e\(dq -*- key: val [; key: val ]* -*- +.Pp +where +.Cm key +is +.Qq coding +and +.Cm val +is the name of the encoding. +A typical usage may be +.Pp +.Dl \%.\e\(dq -*- mode: troff; coding: utf-8 -*- .It From the argument passed to .Fl D Ar enc . .It If all else fails, Latin\-1 is used. .El +.Pp +The +.Nm +utility recognises the UTF\-8, us\-ascii, and latin\-1 encodings as +passed to the +.Fl e +and +.Fl D +arguments, or as coding tags. +Encodings are matched case-insensitively. .\" .Sh IMPLEMENTATION NOTES .\" Not used in OpenBSD. .\" .Sh RETURN VALUES @@ -107,7 +121,12 @@ If all else fails, Latin\-1 is used. .\" .Sh FILES .Sh EXIT STATUS .Ex -std -.\" .Sh EXAMPLES +.Sh EXAMPLES +Explicitly page a UTF\-8 manual +.Pa foo.1 +in the current locale: +.Pp +.Dl $ preconv \-e utf\-8 foo.1 | mandoc \-Tlocale | less .\" .Sh DIAGNOSTICS .\" For sections 1, 4, 6, 7, & 8 only. 
.\" .Sh ERRORS diff --git a/preconv.c b/preconv.c index da5af9b4..59c34349 100644 --- a/preconv.c +++ b/preconv.c @@ -1,4 +1,4 @@ -/* $Id: preconv.c,v 1.2 2011/05/26 12:01:14 kristaps Exp $ */ +/* $Id: preconv.c,v 1.3 2011/05/26 14:43:07 kristaps Exp $ */ /* * Copyright (c) 2011 Kristaps Dzonsons * @@ -55,6 +55,7 @@ struct encode { int (*conv)(const struct buf *); }; +static int cue_enc(const struct buf *, size_t *, enum enc *); static int conv_latin_1(const struct buf *); static int conv_us_ascii(const struct buf *); static int conv_utf_8(const struct buf *); @@ -94,11 +95,13 @@ conv_latin_1(const struct buf *b) * Latin-1 falls into the first 256 code-points of Unicode, so * there's no need for any sort of translation. Just make the * 8-bit characters use the Unicode escape. + * Note that binary values 128 <= v < 160 are passed through + * unmodified to mandoc. */ for (i = b->offs; i < b->sz; i++) { cu = (unsigned char)*cp++; - cu < 128U ? putchar(cu) : printf("\\[u%.4X]", cu); + cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu); } return(1); @@ -306,6 +309,93 @@ read_whole_file(const char *f, int fd, return(0); } +/* Examine the input line that starts at *offs for a File Variables cue, i.e. a line that begins with the seven-byte header compared below and ends in -*-, and look for a coding phrase inside it. Once a newline is found, *offs is advanced to the byte after it, whether or not the line turns out to be a cue. Returns -1 if no newline remains in the buffer, 0 if the line is not a cue or carries no coding value, and 1 if a coding value was found: *enc is then the index of the matching encs[] entry, or ENC__MAX when the name is not a known one. */ static int +cue_enc(const struct buf *b, size_t *offs, enum enc *enc) +{ + const char *ln, *eoln, *eoph; + size_t sz, phsz, nsz; + int i; + /* NOTE(review): the (int) cast narrows the size_t offset before the pointer addition; it looks unnecessary, confirm. */ + ln = b->buf + (int)*offs; + sz = b->sz - *offs; + + /* Look for the end-of-line; without one there is no complete line to inspect. */ + + if (NULL == (eoln = memchr(ln, '\n', sz))) + return(-1); + + /* Set next-line marker, so that a second call with the same offs inspects the following line. */ + + *offs = (size_t)((eoln + 1) - b->buf); + + /* Check if we have the correct header/trailer: 7 header bytes plus the 3-byte -*- trailer give the minimum length of 10. */ + + if ((sz = (size_t)(eoln - ln)) < 10 || + memcmp(ln, ".\\\" -*-", 7) || + memcmp(eoln - 3, "-*-", 3)) + return(0); + + /* Move after the header and adjust for the trailer, so that sz counts only the bytes between the two -*- markers. */ + + ln += 7; + sz -= 10; + + while (sz > 0) { + while (sz > 0 && ' ' == *ln) { + ln++; + sz--; + } + if (0 == sz) + break; + + /* Find the end-of-phrase marker (or eoln): a phrase runs through its terminating semicolon, or up to the trailer when there is none. 
*/ + + if (NULL == (eoph = memchr(ln, ';', sz))) + eoph = eoln - 3; + else + eoph++; + + /* Only account for the "coding" phrase; any other phrase is skipped whole. */ + + if ((phsz = (size_t)(eoph - ln)) < 7 || + strncasecmp(ln, "coding:", 7)) { + sz -= phsz; + ln += phsz; + continue; + } + /* Step over the coding: key and any blanks before its value. */ + sz -= 7; + ln += 7; + + while (sz > 0 && ' ' == *ln) { + ln++; + sz--; + } + if (0 == sz) + break; + + /* Check us against known encodings, case-insensitively, as a prefix match of each encs[] name. NOTE(review): phsz still counts the whole phrase including the key, and the comparison length is not bounded by sz; confirm this is intended. */ + + for (i = 0; i < ENC__MAX; i++) { + nsz = strlen(encs[i].name); + if (phsz < nsz) + continue; + if (strncasecmp(ln, encs[i].name, nsz)) + continue; + + *enc = (enum enc)i; + return(1); + } + + /* Unknown encoding: still report that a coding phrase was seen, with *enc set to ENC__MAX. */ + + *enc = ENC__MAX; + return(1); + } + + return(0); +} + int main(int argc, char *argv[]) { @@ -314,6 +404,7 @@ main(int argc, char *argv[]) const char *fn; enum enc enc, def; const char bom[3] = { 0xEF, 0xBB, 0xBF }; + size_t offs; extern int optind; extern char *optarg; @@ -394,6 +485,15 @@ main(int argc, char *argv[]) enc = ENC_UTF_8; } + /* Try reading from the "-*-" cue. */ + + if (ENC__MAX == enc) { + offs = b.offs; + ch = cue_enc(&b, &offs, &enc); + if (0 == ch) + ch = cue_enc(&b, &offs, &enc); + } + /* * No encoding has been detected. * Thus, we either fall into our default encoder, if specified, @@ -403,8 +503,10 @@ main(int argc, char *argv[]) if (ENC__MAX == enc) enc = ENC__MAX == def ? ENC_LATIN_1 : def; - if ( ! (*encs[(int)enc].conv)(&b)) + if ( ! (*encs[(int)enc].conv)(&b)) { + fprintf(stderr, "%s: Bad encoding\n", fn); goto out; + } rc = EXIT_SUCCESS; out: -- 2.47.1