it looks pretty good. Basically, the -Tlocale option propagates into
term_ascii.c, where we set locale-specific console call-backs IFF (1)
setlocale() works; (2) locale support is compiled in (see Makefile for
-DUSE_WCHAR); (3) the internal structure of wchar_t maps directly to
Unicode codepoints as defined by __STDC_ISO_10646__; and (4) the console
supports multi-byte characters.
To date, this configuration only supports GNU/Linux. OpenBSD doesn't
export __STDC_ISO_10646__ although I'm told by stsp@openbsd.org that it
should (it has the correct map). Apparently FreeBSD is the same way.
NetBSD? Don't know. Apple also supports this, but doesn't define the
macro. Special-casing!
Benchmark: -Tlocale incurs less than 0.2 factor overhead when run
through several thousand manuals when UTF8 output is enabled. Native
mode (whether directly -Tascii or through no locale or whatever) is
UNCHANGED: the function callbacks are the same as before.
Note. If the underlying system does NOT support __STDC_ISO_10646__, there
is a "slow" version possible with iconv or other means of flipping from
a Unicode codepoint to a wchar_t.
VERSION = 1.11.2
VDATE = 12 May 2011
VERSION = 1.11.2
VDATE = 12 May 2011
-CFLAGS += -g -DHAVE_CONFIG_H -DVERSION="\"$(VERSION)\""
+# If your system doesn't support multi-byte functions (specifically
+# setlocale(), wcwidth(), putwchar()), then remove -DUSE_WCHAR. You'll
+# still be able to use -Tlocale, but it becomes a synonym for -Tascii.
+CFLAGS += -g -DUSE_WCHAR -DHAVE_CONFIG_H -DVERSION="\"$(VERSION)\""
CFLAGS += -W -Wall -Wstrict-prototypes -Wno-unused-parameter -Wwrite-strings
PREFIX = /usr/local
BINDIR = $(PREFIX)/bin
CFLAGS += -W -Wall -Wstrict-prototypes -Wno-unused-parameter -Wwrite-strings
PREFIX = /usr/local
BINDIR = $(PREFIX)/bin
-.\" $Id: mandoc.1,v 1.86 2011/05/17 12:22:15 kristaps Exp $
+.\" $Id: mandoc.1,v 1.87 2011/05/17 22:32:45 kristaps Exp $
.\"
.\" Copyright (c) 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
.\"
.\"
.\" Copyright (c) 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
.\"
.Fl T
arguments, which correspond to output modes:
.Bl -tag -width Ds
.Fl T
arguments, which correspond to output modes:
.Bl -tag -width Ds
+.It Fl T Ns Cm locale
+This option encodes output characters using the current
+.Xr locale 1
+configuration.
+See
+.Sx Locale Output .
.It Fl T Ns Cm ascii
Produce 7-bit ASCII output.
This is the default.
.It Fl T Ns Cm ascii
Produce 7-bit ASCII output.
This is the default.
.Pp
If multiple input files are specified, these will be processed by the
corresponding filter in-order.
.Pp
If multiple input files are specified, these will be processed by the
corresponding filter in-order.
+.Ss Locale Output
+Locale-dependent output encoding is triggered with
+.Fl T Ns Cm locale .
+This option is not available on all systems: systems without locale
+support, or those whose internal representation is not natively UCS-4,
+will fall back to
+.Fl T Ns Cm ascii .
+See
+.Sx ASCII Output
+for font style specification and available command-line arguments.
.Ss ASCII Output
Output produced by
.Fl T Ns Cm ascii ,
.Ss ASCII Output
Output produced by
.Fl T Ns Cm ascii ,
The special characters documented in
.Xr mandoc_char 7
are rendered best-effort in an ASCII equivalent.
The special characters documented in
.Xr mandoc_char 7
are rendered best-effort in an ASCII equivalent.
+If no equivalent is found,
+.Sq \&?
+is used instead.
.Pp
Output width is limited to 78 visible columns unless literal input lines
exceed this limit.
.Pp
Output width is limited to 78 visible columns unless literal input lines
exceed this limit.
.Ss ASCII Compatibility
.Bl -bullet -compact
.It
.Ss ASCII Compatibility
.Bl -bullet -compact
.It
-Unicode codepoints specified with
+Unrenderable Unicode codepoints specified with
.Sq \e[uNNNN]
escapes are printed as
.Sq \&?
.Sq \e[uNNNN]
escapes are printed as
.Sq \&?
-/* $Id: term.c,v 1.193 2011/05/17 14:38:34 kristaps Exp $ */
+/* $Id: term.c,v 1.194 2011/05/17 22:32:45 kristaps Exp $ */
/*
* Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2010, 2011 Ingo Schwarze <schwarze@openbsd.org>
/*
* Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2010, 2011 Ingo Schwarze <schwarze@openbsd.org>
static void adjbuf(struct termp *p, int);
static void bufferc(struct termp *, char);
static void encode(struct termp *, const char *, size_t);
static void adjbuf(struct termp *p, int);
static void bufferc(struct termp *, char);
static void encode(struct termp *, const char *, size_t);
+static void encode1(struct termp *, int);
void
term_free(struct termp *p)
void
term_free(struct termp *p)
{
const char *seq, *cp;
char c;
{
const char *seq, *cp;
char c;
size_t ssz;
enum mandoc_esc esc;
size_t ssz;
enum mandoc_esc esc;
switch (esc) {
case (ESCAPE_UNICODE):
switch (esc) {
case (ESCAPE_UNICODE):
+ if (TERMENC_ASCII == p->enc) {
+ encode1(p, '?');
+ break;
+ }
+ uc = mchars_num2uc(seq + 1, sz - 1);
+ if ('\0' != uc)
+ encode1(p, uc);
break;
case (ESCAPE_NUMBERED):
if ('\0' != (c = mchars_num2char(seq, sz)))
break;
case (ESCAPE_NUMBERED):
if ('\0' != (c = mchars_num2char(seq, sz)))
+/*
+ * Single-character counterpart of encode(): buffer one (probably
+ * Unicode) value, decorated according to the font that
+ * term_fonttop() reports.
+ * With TERMFONT_NONE the value is stored bare.  Otherwise it is
+ * stored as an overstrike triple: '_' (for TERMFONT_UNDER) or the
+ * value itself (for any other font), then a backspace (8), then
+ * the value.
+ * Does not check for non-decorated glyphs.
+ */
+static void
+encode1(struct termp *p, int c)
+{
+	enum termfont	  f;
+
+	/* At most three cells are stored below; ask adjbuf() for room. */
+	if (p->col + 4 >= p->maxcols)
+		adjbuf(p, p->col + 4);
+
+	f = term_fonttop(p);
+
+	/* Decorated fonts get a leading glyph and a backspace. */
+	if (TERMFONT_NONE != f) {
+		p->buf[p->col++] = TERMFONT_UNDER == f ? '_' : c;
+		p->buf[p->col++] = 8;
+	}
+
+	p->buf[p->col++] = c;
+}
+
static void
encode(struct termp *p, const char *word, size_t sz)
{
static void
encode(struct termp *p, const char *word, size_t sz)
{
case (ESCAPE_ERROR):
return(sz);
case (ESCAPE_UNICODE):
case (ESCAPE_ERROR):
return(sz);
case (ESCAPE_UNICODE):
- c = '?';
- /* FALLTHROUGH */
- case (ESCAPE_NUMBERED):
+ /*
+  * Measure what is actually printed: when the encoding is
+  * TERMENC_ASCII the escape is output as '?' (via encode1()),
+  * so only in that mode is the width of '?' the right answer.
+  * Otherwise measure the decoded codepoint itself.
+  */
+ if (TERMENC_ASCII == p->enc) {
+ sz += (*p->width)(p, '?');
+ break;
+ }
+ c = mchars_num2uc(seq + 1, ssz - 1);
- c = mchars_num2char(seq, ssz);
+ sz += (*p->width)(p, c);
+ break;
+ case (ESCAPE_NUMBERED):
+ c = mchars_num2char(seq, ssz);
if ('\0' != c)
sz += (*p->width)(p, c);
break;
if ('\0' != c)
sz += (*p->width)(p, c);
break;
-/* $Id: term_ascii.c,v 1.14 2011/05/17 14:38:34 kristaps Exp $ */
+/* $Id: term_ascii.c,v 1.15 2011/05/17 22:32:45 kristaps Exp $ */
/*
* Copyright (c) 2010 Kristaps Dzonsons <kristaps@bsd.lv>
*
/*
* Copyright (c) 2010 Kristaps Dzonsons <kristaps@bsd.lv>
*
#include <sys/types.h>
#include <assert.h>
#include <sys/types.h>
#include <assert.h>
+#ifdef USE_WCHAR
+# include <locale.h>
+#endif
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
+#ifdef USE_WCHAR
+# include <wchar.h>
+#endif
#include "mandoc.h"
#include "out.h"
#include "term.h"
#include "main.h"
#include "mandoc.h"
#include "out.h"
#include "term.h"
#include "main.h"
+#if ! defined(__STDC_ISO_10646__)
+# undef USE_WCHAR
+#endif
+
static struct termp *ascii_init(enum termenc, char *);
static double ascii_hspan(const struct termp *,
const struct roffsu *);
static struct termp *ascii_init(enum termenc, char *);
static double ascii_hspan(const struct termp *,
const struct roffsu *);
static void ascii_endline(struct termp *);
static void ascii_letter(struct termp *, int);
static void ascii_endline(struct termp *);
static void ascii_letter(struct termp *, int);
+#ifdef USE_WCHAR
+static void locale_advance(struct termp *, size_t);
+static void locale_endline(struct termp *);
+static void locale_letter(struct termp *, int);
+static size_t locale_width(const struct termp *, int);
+#endif
+
static struct termp *
ascii_init(enum termenc enc, char *outopts)
{
static struct termp *
ascii_init(enum termenc enc, char *outopts)
{
p->tabwidth = 5;
p->defrmargin = 78;
p->tabwidth = 5;
p->defrmargin = 78;
- p->advance = ascii_advance;
p->begin = ascii_begin;
p->end = ascii_end;
p->begin = ascii_begin;
p->end = ascii_end;
- p->endline = ascii_endline;
- p->letter = ascii_letter;
+
+ p->enc = TERMENC_ASCII;
+ p->advance = ascii_advance;
+ p->endline = ascii_endline;
+ p->letter = ascii_letter;
+#if defined (USE_WCHAR)
+ if (TERMENC_LOCALE == enc)
+ if (setlocale(LC_ALL, "") && MB_CUR_MAX > 1) {
+ p->enc = enc;
+ p->advance = locale_advance;
+ p->endline = locale_endline;
+ p->letter = locale_letter;
+ p->width = locale_width;
+ }
+#endif
+
toks[0] = "width";
toks[1] = NULL;
toks[0] = "width";
toks[1] = NULL;
void
ascii_free(void *arg)
{
void
ascii_free(void *arg)
{
term_free((struct termp *)arg);
}
term_free((struct termp *)arg);
}
/* ARGSUSED */
static void
ascii_letter(struct termp *p, int c)
{
/* ARGSUSED */
static void
ascii_letter(struct termp *p, int c)
{
static void
ascii_begin(struct termp *p)
{
static void
ascii_begin(struct termp *p)
{
(*p->headf)(p, p->argf);
}
(*p->headf)(p, p->argf);
}
static void
ascii_end(struct termp *p)
{
static void
ascii_end(struct termp *p)
{
(*p->footf)(p, p->argf);
}
(*p->footf)(p, p->argf);
}
/* ARGSUSED */
static void
ascii_endline(struct termp *p)
/* ARGSUSED */
static void
ascii_endline(struct termp *p)
/* ARGSUSED */
static void
ascii_advance(struct termp *p, size_t len)
{
size_t i;
/* ARGSUSED */
static void
ascii_advance(struct termp *p, size_t len)
{
size_t i;
- /* Just print whitespace on the terminal. */
for (i = 0; i < len; i++)
putchar(' ');
}
for (i = 0; i < len; i++)
putchar(' ');
}
/* ARGSUSED */
static double
ascii_hspan(const struct termp *p, const struct roffsu *su)
/* ARGSUSED */
static double
ascii_hspan(const struct termp *p, const struct roffsu *su)
+#ifdef USE_WCHAR
+/*
+ * Width callback installed for locale output: the number of columns
+ * wcwidth(3) reports for c, with a negative result mapped to zero.
+ */
+/* ARGSUSED */
+static size_t
+locale_width(const struct termp *p, int c)
+{
+	int		  cols;
+
+	cols = wcwidth(c);
+	if (cols < 0)
+		return(0);
+
+	return(cols);
+}
+
+/*
+ * Advance callback installed for locale output: write len
+ * wide-character spaces with putwchar(3).
+ */
+/* ARGSUSED */
+static void
+locale_advance(struct termp *p, size_t len)
+{
+
+	/* len is a by-value copy, so it doubles as the countdown. */
+	while (len-- > 0)
+		putwchar(L' ');
+}
+
+/*
+ * End-of-line callback installed for locale output: write a single
+ * wide-character newline with putwchar(3).
+ */
+/* ARGSUSED */
+static void
+locale_endline(struct termp *p)
+{
+
+	putwchar(L'\n');
+}
+
+/*
+ * Letter callback installed for locale output: write c as one wide
+ * character with putwchar(3).
+ */
+/* ARGSUSED */
+static void
+locale_letter(struct termp *p, int c)
+{
+
+	putwchar((wchar_t)c);
+}
+#endif