From: Kristaps Dzonsons Date: Tue, 17 May 2011 22:32:45 +0000 (+0000) Subject: Locale support. I'm checking this in to clean up fall-out in-tree, but X-Git-Tag: VERSION_1_11_3~30 X-Git-Url: https://git.cameronkatri.com/mandoc.git/commitdiff_plain/9fbd9ce5cadeb91ed28f18559e80d0bb5a2e1d54?ds=inline Locale support. I'm checking this in to clean up fall-out in-tree, but it looks pretty good. Basically, the -Tlocale option propagates into term_ascii.c, where we set locale-specific console call-backs IFF (1) setlocale() works; (2) locale support is compiled in (see Makefile for -DUSE_WCHAR); (3) the internal structure of wchar_t maps directly to Unicode codepoints as defined by __STDC_ISO_10646__; and (4) the console supports multi-byte characters. To date, this configuration only supports GNU/Linux. OpenBSD doesn't export __STDC_ISO_10646__ although I'm told by stsp@openbsd.org that it should (it has the correct map). Apparently FreeBSD is the same way. NetBSD? Don't know. Apple also supports this, but doesn't define the macro. Special-casing! Benchmark: -Tlocale incurs less than 0.2 factor overhead when run through several thousand manuals when UTF8 output is enabled. Native mode (whether directly -Tascii or through no locale or whatever) is UNCHANGED: the function callbacks are the same as before. Note. If the underlying system does NOT support __STDC_ISO_10646__, there is a "slow" version possible with iconv or other means of flipping from a Unicode codepoint to a wchar_t. --- diff --git a/Makefile b/Makefile index cc3fb743..a88b57b6 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,10 @@ VERSION = 1.11.2 VDATE = 12 May 2011 -CFLAGS += -g -DHAVE_CONFIG_H -DVERSION="\"$(VERSION)\"" +# If your system doesn't support multi-byte functions (specifically +# setlocale(), wcwidth(), putwchar()), then remove -DUSE_WCHAR. You'll +# still be able to use -Tlocale, but it becomes a synonym for -Tascii. 
+CFLAGS += -g -DUSE_WCHAR -DHAVE_CONFIG_H -DVERSION="\"$(VERSION)\"" CFLAGS += -W -Wall -Wstrict-prototypes -Wno-unused-parameter -Wwrite-strings PREFIX = /usr/local BINDIR = $(PREFIX)/bin diff --git a/mandoc.1 b/mandoc.1 index be863ea4..de4b970d 100644 --- a/mandoc.1 +++ b/mandoc.1 @@ -1,4 +1,4 @@ -.\" $Id: mandoc.1,v 1.86 2011/05/17 12:22:15 kristaps Exp $ +.\" $Id: mandoc.1,v 1.87 2011/05/17 22:32:45 kristaps Exp $ .\" .\" Copyright (c) 2009, 2010 Kristaps Dzonsons .\" @@ -158,6 +158,12 @@ utility accepts the following .Fl T arguments, which correspond to output modes: .Bl -tag -width Ds +.It Fl T Ns Cm locale +This option encodes output characters using the current +.Xr locale 1 +configuration. +See +.Sx Locale Output . .It Fl T Ns Cm ascii Produce 7-bit ASCII output. This is the default. @@ -189,6 +195,16 @@ See .Pp If multiple input files are specified, these will be processed by the corresponding filter in-order. +.Ss Locale Output +Locale-dependent output encoding is triggered with +.Fl T Ns Cm locale . +This option is not available on all systems: systems without locale +support, or those whose internal representation is not natively UCS-4, +will fall back to +.Fl T Ns Cm ascii . +See +.Sx ASCII Output +for font style specification and available command-line arguments. .Ss ASCII Output Output produced by .Fl T Ns Cm ascii , @@ -209,6 +225,9 @@ Emboldened characters are rendered as The special characters documented in .Xr mandoc_char 7 are rendered best-effort in an ASCII equivalent. +If no equivalent is found, +.Sq \&? +is used instead. .Pp Output width is limited to 78 visible columns unless literal input lines exceed this limit. @@ -460,7 +479,7 @@ Each input and output format is separately noted. .Ss ASCII Compatibility .Bl -bullet -compact .It -Unicode codepoints specified with +Unrenderable Unicode codepoints specified with .Sq \e[uNNNN] escapes are printed as .Sq \&? 
diff --git a/term.c b/term.c index 922385ae..04d55d67 100644 --- a/term.c +++ b/term.c @@ -1,4 +1,4 @@ -/* $Id: term.c,v 1.193 2011/05/17 14:38:34 kristaps Exp $ */ +/* $Id: term.c,v 1.194 2011/05/17 22:32:45 kristaps Exp $ */ /* * Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons * Copyright (c) 2010, 2011 Ingo Schwarze @@ -36,6 +36,7 @@ static void adjbuf(struct termp *p, int); static void bufferc(struct termp *, char); static void encode(struct termp *, const char *, size_t); +static void encode1(struct termp *, int); void term_free(struct termp *p) @@ -403,7 +404,7 @@ term_word(struct termp *p, const char *word) { const char *seq, *cp; char c; - int sz; + int sz, uc; size_t ssz; enum mandoc_esc esc; @@ -440,7 +441,13 @@ term_word(struct termp *p, const char *word) switch (esc) { case (ESCAPE_UNICODE): - encode(p, "?", 1); + if (TERMENC_ASCII == p->enc) { + encode1(p, '?'); + break; + } + uc = mchars_num2uc(seq + 1, sz - 1); + if ('\0' != uc) + encode1(p, uc); break; case (ESCAPE_NUMBERED): if ('\0' != (c = mchars_num2char(seq, sz))) @@ -503,6 +510,33 @@ bufferc(struct termp *p, char c) p->buf[p->col++] = c; } +/* + * See encode(). + * Do this for a single (probably unicode) value. + * Does not check for non-decorated glyphs. 
+ */ +static void +encode1(struct termp *p, int c) +{ + enum termfont f; + + if (p->col + 4 >= p->maxcols) + adjbuf(p, p->col + 4); + + f = term_fonttop(p); + + if (TERMFONT_NONE == f) { + p->buf[p->col++] = c; + return; + } else if (TERMFONT_UNDER == f) { + p->buf[p->col++] = '_'; + } else + p->buf[p->col++] = c; + + p->buf[p->col++] = 8; + p->buf[p->col++] = c; +} + static void encode(struct termp *p, const char *word, size_t sz) { @@ -584,11 +618,16 @@ term_strlen(const struct termp *p, const char *cp) case (ESCAPE_ERROR): return(sz); case (ESCAPE_UNICODE): - c = '?'; - /* FALLTHROUGH */ - case (ESCAPE_NUMBERED): + if (TERMENC_ASCII == p->enc) { /* term_word() emits '?' for this escape in ASCII mode, so measure '?' here too */ + sz += (*p->width)(p, '?'); + break; + } + c = mchars_num2uc(seq + 1, ssz - 1); if ('\0' != c) - c = mchars_num2char(seq, ssz); + sz += (*p->width)(p, c); + break; + case (ESCAPE_NUMBERED): + c = mchars_num2char(seq, ssz); if ('\0' != c) sz += (*p->width)(p, c); break; diff --git a/term_ascii.c b/term_ascii.c index 5b2ee847..7619907e 100644 --- a/term_ascii.c +++ b/term_ascii.c @@ -1,4 +1,4 @@ -/* $Id: term_ascii.c,v 1.14 2011/05/17 14:38:34 kristaps Exp $ */ +/* $Id: term_ascii.c,v 1.15 2011/05/17 22:32:45 kristaps Exp $ */ /* * Copyright (c) 2010 Kristaps Dzonsons * @@ -21,16 +21,26 @@ #include #include +#ifdef USE_WCHAR +# include +#endif #include #include #include #include +#ifdef USE_WCHAR +# include +#endif #include "mandoc.h" #include "out.h" #include "term.h" #include "main.h" +#if ! 
defined(__STDC_ISO_10646__) +# undef USE_WCHAR +#endif + static struct termp *ascii_init(enum termenc, char *); static double ascii_hspan(const struct termp *, const struct roffsu *); @@ -41,6 +51,13 @@ static void ascii_end(struct termp *); static void ascii_endline(struct termp *); static void ascii_letter(struct termp *, int); +#ifdef USE_WCHAR +static void locale_advance(struct termp *, size_t); +static void locale_endline(struct termp *); +static void locale_letter(struct termp *, int); +static size_t locale_width(const struct termp *, int); +#endif + static struct termp * ascii_init(enum termenc enc, char *outopts) { @@ -54,15 +71,28 @@ ascii_init(enum termenc enc, char *outopts) p->tabwidth = 5; p->defrmargin = 78; - p->advance = ascii_advance; p->begin = ascii_begin; p->end = ascii_end; - p->endline = ascii_endline; p->hspan = ascii_hspan; - p->letter = ascii_letter; p->type = TERMTYPE_CHAR; + + p->enc = TERMENC_ASCII; + p->advance = ascii_advance; + p->endline = ascii_endline; + p->letter = ascii_letter; p->width = ascii_width; +#if defined (USE_WCHAR) + if (TERMENC_LOCALE == enc) + if (setlocale(LC_ALL, "") && MB_CUR_MAX > 1) { + p->enc = enc; + p->advance = locale_advance; + p->endline = locale_endline; + p->letter = locale_letter; + p->width = locale_width; + } +#endif + toks[0] = "width"; toks[1] = NULL; @@ -104,7 +134,6 @@ ascii_width(const struct termp *p, int c) return(1); } - void ascii_free(void *arg) { @@ -112,17 +141,14 @@ ascii_free(void *arg) term_free((struct termp *)arg); } - /* ARGSUSED */ static void ascii_letter(struct termp *p, int c) { - /* LINTED */ putchar(c); } - static void ascii_begin(struct termp *p) { @@ -130,7 +156,6 @@ ascii_begin(struct termp *p) (*p->headf)(p, p->argf); } - static void ascii_end(struct termp *p) { @@ -138,7 +163,6 @@ ascii_end(struct termp *p) (*p->footf)(p, p->argf); } - /* ARGSUSED */ static void ascii_endline(struct termp *p) @@ -147,19 +171,16 @@ ascii_endline(struct termp *p) putchar('\n'); } - /* 
ARGSUSED */ static void ascii_advance(struct termp *p, size_t len) { size_t i; - /* Just print whitespace on the terminal. */ for (i = 0; i < len; i++) putchar(' '); } - /* ARGSUSED */ static double ascii_hspan(const struct termp *p, const struct roffsu *su) @@ -198,3 +219,39 @@ ascii_hspan(const struct termp *p, const struct roffsu *su) return(r); } +#ifdef USE_WCHAR /* wide-character callbacks that ascii_init() installs for TERMENC_LOCALE */ +/* ARGSUSED */ +static size_t +locale_width(const struct termp *p, int c) +{ + int rc; + + return((rc = wcwidth(c)) < 0 ? 0 : rc); /* a negative wcwidth() result is counted as zero columns */ +} + +/* ARGSUSED */ +static void +locale_advance(struct termp *p, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) + putwchar(L' '); /* one wide space per requested column */ +} + +/* ARGSUSED */ +static void +locale_endline(struct termp *p) +{ + + putwchar(L'\n'); +} + +/* ARGSUSED */ +static void +locale_letter(struct termp *p, int c) +{ + + putwchar(c); /* c is written as-is; this file undefines USE_WCHAR unless __STDC_ISO_10646__ is defined */ +} +#endif