Locale support. I'm checking this in to clean up fall-out in-tree, but

author Kristaps Dzonsons <kristaps@bsd.lv>

Tue, 17 May 2011 22:32:45 +0000 (22:32 +0000)

committer Kristaps Dzonsons <kristaps@bsd.lv>

Tue, 17 May 2011 22:32:45 +0000 (22:32 +0000)
author Kristaps Dzonsons <kristaps@bsd.lv>
Tue, 17 May 2011 22:32:45 +0000 (22:32 +0000)
committer Kristaps Dzonsons <kristaps@bsd.lv>
Tue, 17 May 2011 22:32:45 +0000 (22:32 +0000)
diff --git a/Makefile b/Makefile

index cc3fb743b6f27ee12b7413a4457692caca4574b8..a88b57b680ba394af955c63841006e8829c70012 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -13,7 +13,10 @@
  
  VERSION                 = 1.11.2
  VDATE           = 12 May 2011
-CFLAGS         += -g -DHAVE_CONFIG_H -DVERSION="\"$(VERSION)\""
+# If your system doesn't support multi-byte functions (specifically
+# setlocale(), wcwidth(), putwchar()), then remove -DUSE_WCHAR.  You'll
+# still be able to use -Tlocale, but it becomes a synonym for -Tascii.
+CFLAGS         += -g -DUSE_WCHAR -DHAVE_CONFIG_H -DVERSION="\"$(VERSION)\""
  CFLAGS         += -W -Wall -Wstrict-prototypes -Wno-unused-parameter -Wwrite-strings
  PREFIX          = /usr/local
  BINDIR          = $(PREFIX)/bin
diff --git a/mandoc.1 b/mandoc.1

index be863ea497285a0ae32bd491a65c251970c5cef2..de4b970d6b79d84a387dcf23e4eef1fefb334c9b 100644 (file)
--- a/mandoc.1
+++ b/mandoc.1
@@ -1,4 +1,4 @@
-.\"    $Id: mandoc.1,v 1.86 2011/05/17 12:22:15 kristaps Exp $
+.\"    $Id: mandoc.1,v 1.87 2011/05/17 22:32:45 kristaps Exp $
  .\"
  .\" Copyright (c) 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
  .\"
@@ -158,6 +158,12 @@ utility accepts the following
  .Fl T
  arguments, which correspond to output modes:
  .Bl -tag -width Ds
+.It Fl T Ns Cm locale
+This option encodes output characters using the current
+.Xr locale 1
+configuration.
+See
+.Sx Locale Output .
  .It Fl T Ns Cm ascii
  Produce 7-bit ASCII output.
  This is the default.
@@ -189,6 +195,16 @@ See
  .Pp
  If multiple input files are specified, these will be processed by the
  corresponding filter in-order.
+.Ss Locale Output
+Locale-dependent output encoding is triggered with
+.Fl T Ns Cm locale .
+This option is not available on all systems: systems without locale
+support, or those whose internal representation is not natively UCS-4,
+will fall back to
+.Fl T Ns Cm ascii .
+See
+.Sx ASCII Output
+for font style specification and available command-line arguments.
  .Ss ASCII Output
  Output produced by
  .Fl T Ns Cm ascii ,
@@ -209,6 +225,9 @@ Emboldened characters are rendered as
  The special characters documented in
  .Xr mandoc_char 7
  are rendered best-effort in an ASCII equivalent.
+If no equivalent is found,
+.Sq \&?
+is used instead.
  .Pp
  Output width is limited to 78 visible columns unless literal input lines
  exceed this limit.
@@ -460,7 +479,7 @@ Each input and output format is separately noted.
  .Ss ASCII Compatibility
  .Bl -bullet -compact
  .It
-Unicode codepoints specified with
+Unrenderable Unicode codepoints specified with
  .Sq \e[uNNNN]
  escapes are printed as
  .Sq \&?
diff --git a/term.c b/term.c

index 922385ae90c64e38180c388ac26850bc84b8960a..04d55d6730583bb940c467172906cdb979397be4 100644 (file)
--- a/term.c
+++ b/term.c
@@ -1,4 +1,4 @@
-/*     $Id: term.c,v 1.193 2011/05/17 14:38:34 kristaps Exp $ */
+/*     $Id: term.c,v 1.194 2011/05/17 22:32:45 kristaps Exp $ */
  /*
   * Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
   * Copyright (c) 2010, 2011 Ingo Schwarze <schwarze@openbsd.org>
@@ -36,6 +36,7 @@
  static void             adjbuf(struct termp *p, int);
  static void             bufferc(struct termp *, char);
  static void             encode(struct termp *, const char *, size_t);
+static void             encode1(struct termp *, int);
  
  void
  term_free(struct termp *p)
@@ -403,7 +404,7 @@ term_word(struct termp *p, const char *word)
  {
         const char      *seq, *cp;
         char             c;
-       int              sz;
+       int              sz, uc;
         size_t           ssz;
         enum mandoc_esc  esc;
  
@@ -440,7 +441,13 @@ term_word(struct termp *p, const char *word)
  
                 switch (esc) {
                 case (ESCAPE_UNICODE):
-                       encode(p, "?", 1);
+                       if (TERMENC_ASCII == p->enc) {
+                               encode1(p, '?');
+                               break;
+                       }
+                       uc = mchars_num2uc(seq + 1, sz - 1);
+                       if ('\0' != uc)
+                               encode1(p, uc);
                         break;
                 case (ESCAPE_NUMBERED):
                         if ('\0' != (c = mchars_num2char(seq, sz)))
@@ -503,6 +510,33 @@ bufferc(struct termp *p, char c)
         p->buf[p->col++] = c;
  }
  
+/*
+ * See encode().
+ * Do this for a single (probably unicode) value.
+ * Does not check for non-decorated glyphs.
+ */
+static void
+encode1(struct termp *p, int c)
+{
+       enum termfont     f;
+
+       if (p->col + 4 >= p->maxcols)
+               adjbuf(p, p->col + 4);
+
+       f = term_fonttop(p);
+
+       if (TERMFONT_NONE == f) {
+               p->buf[p->col++] = c;
+               return;
+       } else if (TERMFONT_UNDER == f) {
+               p->buf[p->col++] = '_';
+       } else
+               p->buf[p->col++] = c;
+
+       p->buf[p->col++] = 8;
+       p->buf[p->col++] = c;
+}
+
  static void
  encode(struct termp *p, const char *word, size_t sz)
  {
@@ -584,11 +618,16 @@ term_strlen(const struct termp *p, const char *cp)
                         case (ESCAPE_ERROR):
                                 return(sz);
                         case (ESCAPE_UNICODE):
-                               c = '?';
-                               /* FALLTHROUGH */
-                       case (ESCAPE_NUMBERED):
+                               if (TERMENC_ASCII != p->enc) {
+                                       sz += (*p->width)(p, '?');
+                                       break;
+                               }
+                               c = mchars_num2uc(seq + 1, ssz - 1);
                                 if ('\0' != c)
-                                       c = mchars_num2char(seq, ssz);
+                                       sz += (*p->width)(p, c);
+                               break;
+                       case (ESCAPE_NUMBERED):
+                               c = mchars_num2char(seq, ssz);
                                 if ('\0' != c)
                                         sz += (*p->width)(p, c);
                                 break;
diff --git a/term_ascii.c b/term_ascii.c

index 5b2ee847aca070d405092ad4ba3e6a53eed3b291..7619907ed14479d348d35cbbdebe4d6a9d9d48e0 100644 (file)
--- a/term_ascii.c
+++ b/term_ascii.c
@@ -1,4 +1,4 @@
-/*     $Id: term_ascii.c,v 1.14 2011/05/17 14:38:34 kristaps Exp $ */
+/*     $Id: term_ascii.c,v 1.15 2011/05/17 22:32:45 kristaps Exp $ */
  /*
   * Copyright (c) 2010 Kristaps Dzonsons <kristaps@bsd.lv>
   *
@@ -21,16 +21,26 @@
  #include <sys/types.h>
  
  #include <assert.h>
+#ifdef USE_WCHAR
+# include <locale.h>
+#endif
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
+#ifdef USE_WCHAR
+# include <wchar.h>
+#endif
  
  #include "mandoc.h"
  #include "out.h"
  #include "term.h"
  #include "main.h"
  
+#if ! defined(__STDC_ISO_10646__)
+# undef USE_WCHAR
+#endif
+
  static struct termp     *ascii_init(enum termenc, char *);
  static double            ascii_hspan(const struct termp *,
                                 const struct roffsu *);
@@ -41,6 +51,13 @@ static       void              ascii_end(struct termp *);
  static void              ascii_endline(struct termp *);
  static void              ascii_letter(struct termp *, int);
  
+#ifdef USE_WCHAR
+static void              locale_advance(struct termp *, size_t);
+static void              locale_endline(struct termp *);
+static void              locale_letter(struct termp *, int);
+static size_t            locale_width(const struct termp *, int);
+#endif
+
  static struct termp *
  ascii_init(enum termenc enc, char *outopts)
  {
@@ -54,15 +71,28 @@ ascii_init(enum termenc enc, char *outopts)
         p->tabwidth = 5;
         p->defrmargin = 78;
  
-       p->advance = ascii_advance;
         p->begin = ascii_begin;
         p->end = ascii_end;
-       p->endline = ascii_endline;
         p->hspan = ascii_hspan;
-       p->letter = ascii_letter;
         p->type = TERMTYPE_CHAR;
+
+       p->enc = TERMENC_ASCII;
+       p->advance = ascii_advance;
+       p->endline = ascii_endline;
+       p->letter = ascii_letter;
         p->width = ascii_width;
  
+#if defined (USE_WCHAR)
+       if (TERMENC_LOCALE == enc)
+               if (setlocale(LC_ALL, "") && MB_CUR_MAX > 1) {
+                       p->enc = enc;
+                       p->advance = locale_advance;
+                       p->endline = locale_endline;
+                       p->letter = locale_letter;
+                       p->width = locale_width;
+               }
+#endif
+
         toks[0] = "width";
         toks[1] = NULL;
  
@@ -104,7 +134,6 @@ ascii_width(const struct termp *p, int c)
         return(1);
  }
  
-
  void
  ascii_free(void *arg)
  {
@@ -112,17 +141,14 @@ ascii_free(void *arg)
         term_free((struct termp *)arg);
  }
  
-
  /* ARGSUSED */
  static void
  ascii_letter(struct termp *p, int c)
  {
         
-       /* LINTED */
         putchar(c);
  }
  
-
  static void
  ascii_begin(struct termp *p)
  {
@@ -130,7 +156,6 @@ ascii_begin(struct termp *p)
         (*p->headf)(p, p->argf);
  }
  
-
  static void
  ascii_end(struct termp *p)
  {
@@ -138,7 +163,6 @@ ascii_end(struct termp *p)
         (*p->footf)(p, p->argf);
  }
  
-
  /* ARGSUSED */
  static void
  ascii_endline(struct termp *p)
@@ -147,19 +171,16 @@ ascii_endline(struct termp *p)
         putchar('\n');
  }
  
-
  /* ARGSUSED */
  static void
  ascii_advance(struct termp *p, size_t len)
  {
         size_t          i;
  
-       /* Just print whitespace on the terminal. */
         for (i = 0; i < len; i++)
                 putchar(' ');
  }
  
-
  /* ARGSUSED */
  static double
  ascii_hspan(const struct termp *p, const struct roffsu *su)
@@ -198,3 +219,39 @@ ascii_hspan(const struct termp *p, const struct roffsu *su)
         return(r);
  }
  
+#ifdef USE_WCHAR
+/* ARGSUSED */
+static size_t
+locale_width(const struct termp *p, int c)
+{
+       int             rc;
+
+       return((rc = wcwidth(c)) < 0 ? 0 : rc);
+}
+
+/* ARGSUSED */
+static void
+locale_advance(struct termp *p, size_t len)
+{
+       size_t          i;
+
+       for (i = 0; i < len; i++)
+               putwchar(L' ');
+}
+
+/* ARGSUSED */
+static void
+locale_endline(struct termp *p)
+{
+
+       putwchar(L'\n');
+}
+
+/* ARGSUSED */
+static void
+locale_letter(struct termp *p, int c)
+{
+       
+       putwchar(c);
+}
+#endif
author	Kristaps Dzonsons <kristaps@bsd.lv>
	Tue, 17 May 2011 22:32:45 +0000 (22:32 +0000)
committer	Kristaps Dzonsons <kristaps@bsd.lv>
	Tue, 17 May 2011 22:32:45 +0000 (22:32 +0000)
Makefile		patch \| blob \| history
mandoc.1		patch \| blob \| history
term.c		patch \| blob \| history
term_ascii.c		patch \| blob \| history