Locale support. I'm checking this in to clean up fall-out in-tree, but

author Kristaps Dzonsons <kristaps@bsd.lv>

Tue, 17 May 2011 22:32:45 +0000 (22:32 +0000)

committer Kristaps Dzonsons <kristaps@bsd.lv>

Tue, 17 May 2011 22:32:45 +0000 (22:32 +0000)
author Kristaps Dzonsons <kristaps@bsd.lv>
Tue, 17 May 2011 22:32:45 +0000 (22:32 +0000)
committer Kristaps Dzonsons <kristaps@bsd.lv>
Tue, 17 May 2011 22:32:45 +0000 (22:32 +0000)
diff --git a/Makefile b/Makefile

index cc3fb743b6f27ee12b7413a4457692caca4574b8..a88b57b680ba394af955c63841006e8829c70012 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -13,7 +13,10 @@
  
  VERSION                 = 1.11.2
  VDATE           = 12 May 2011
  
  VERSION                 = 1.11.2
  VDATE           = 12 May 2011
-CFLAGS         += -g -DHAVE_CONFIG_H -DVERSION="\"$(VERSION)\""
+# If your system doesn't support multi-byte functions (specifically
+# setlocale(), wcwidth(), putwchar()), then remove -DUSE_WCHAR.  You'll
+# still be able to use -Tlocale, but it becomes a synonym for -Tascii.
+CFLAGS         += -g -DUSE_WCHAR -DHAVE_CONFIG_H -DVERSION="\"$(VERSION)\""
  CFLAGS         += -W -Wall -Wstrict-prototypes -Wno-unused-parameter -Wwrite-strings
  PREFIX          = /usr/local
  BINDIR          = $(PREFIX)/bin
  CFLAGS         += -W -Wall -Wstrict-prototypes -Wno-unused-parameter -Wwrite-strings
  PREFIX          = /usr/local
  BINDIR          = $(PREFIX)/bin
diff --git a/mandoc.1 b/mandoc.1

index be863ea497285a0ae32bd491a65c251970c5cef2..de4b970d6b79d84a387dcf23e4eef1fefb334c9b 100644 (file)
--- a/mandoc.1
+++ b/mandoc.1
@@ -1,4 +1,4 @@
-.\"    $Id: mandoc.1,v 1.86 2011/05/17 12:22:15 kristaps Exp $
+.\"    $Id: mandoc.1,v 1.87 2011/05/17 22:32:45 kristaps Exp $
  .\"
  .\" Copyright (c) 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
  .\"
  .\"
  .\" Copyright (c) 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
  .\"
@@ -158,6 +158,12 @@ utility accepts the following
  .Fl T
  arguments, which correspond to output modes:
  .Bl -tag -width Ds
  .Fl T
  arguments, which correspond to output modes:
  .Bl -tag -width Ds
+.It Fl T Ns Cm locale
+This option encodes output characters using the current
+.Xr locale 1
+configuration.
+See
+.Sx Locale Output .
  .It Fl T Ns Cm ascii
  Produce 7-bit ASCII output.
  This is the default.
  .It Fl T Ns Cm ascii
  Produce 7-bit ASCII output.
  This is the default.
@@ -189,6 +195,16 @@ See
  .Pp
  If multiple input files are specified, these will be processed by the
  corresponding filter in-order.
  .Pp
  If multiple input files are specified, these will be processed by the
  corresponding filter in-order.
+.Ss Locale Output
+Locale-dependent output encoding is triggered with
+.Fl T Ns Cm locale .
+This option is not available on all systems: systems without locale
+support, or those whose internal representation is not natively UCS-4,
+will fall back to
+.Fl T Ns Cm ascii .
+See
+.Sx ASCII Output
+for font style specification and available command-line arguments.
  .Ss ASCII Output
  Output produced by
  .Fl T Ns Cm ascii ,
  .Ss ASCII Output
  Output produced by
  .Fl T Ns Cm ascii ,
@@ -209,6 +225,9 @@ Emboldened characters are rendered as
  The special characters documented in
  .Xr mandoc_char 7
  are rendered best-effort in an ASCII equivalent.
  The special characters documented in
  .Xr mandoc_char 7
  are rendered best-effort in an ASCII equivalent.
+If no equivalent is found,
+.Sq \&?
+is used instead.
  .Pp
  Output width is limited to 78 visible columns unless literal input lines
  exceed this limit.
  .Pp
  Output width is limited to 78 visible columns unless literal input lines
  exceed this limit.
@@ -460,7 +479,7 @@ Each input and output format is separately noted.
  .Ss ASCII Compatibility
  .Bl -bullet -compact
  .It
  .Ss ASCII Compatibility
  .Bl -bullet -compact
  .It
-Unicode codepoints specified with
+Unrenderable Unicode codepoints specified with
  .Sq \e[uNNNN]
  escapes are printed as
  .Sq \&?
  .Sq \e[uNNNN]
  escapes are printed as
  .Sq \&?
diff --git a/term.c b/term.c

index 922385ae90c64e38180c388ac26850bc84b8960a..04d55d6730583bb940c467172906cdb979397be4 100644 (file)
--- a/term.c
+++ b/term.c
@@ -1,4 +1,4 @@
-/*     $Id: term.c,v 1.193 2011/05/17 14:38:34 kristaps Exp $ */
+/*     $Id: term.c,v 1.194 2011/05/17 22:32:45 kristaps Exp $ */
  /*
   * Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
   * Copyright (c) 2010, 2011 Ingo Schwarze <schwarze@openbsd.org>
  /*
   * Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
   * Copyright (c) 2010, 2011 Ingo Schwarze <schwarze@openbsd.org>
@@ -36,6 +36,7 @@
  static void             adjbuf(struct termp *p, int);
  static void             bufferc(struct termp *, char);
  static void             encode(struct termp *, const char *, size_t);
  static void             adjbuf(struct termp *p, int);
  static void             bufferc(struct termp *, char);
  static void             encode(struct termp *, const char *, size_t);
+static void             encode1(struct termp *, int);
  
  void
  term_free(struct termp *p)
  
  void
  term_free(struct termp *p)
@@ -403,7 +404,7 @@ term_word(struct termp *p, const char *word)
  {
         const char      *seq, *cp;
         char             c;
  {
         const char      *seq, *cp;
         char             c;
-       int              sz;
+       int              sz, uc;
         size_t           ssz;
         enum mandoc_esc  esc;
  
         size_t           ssz;
         enum mandoc_esc  esc;
  
@@ -440,7 +441,13 @@ term_word(struct termp *p, const char *word)
  
                 switch (esc) {
                 case (ESCAPE_UNICODE):
  
                 switch (esc) {
                 case (ESCAPE_UNICODE):
-                       encode(p, "?", 1);
+                       if (TERMENC_ASCII == p->enc) {
+                               encode1(p, '?');
+                               break;
+                       }
+                       uc = mchars_num2uc(seq + 1, sz - 1);
+                       if ('\0' != uc)
+                               encode1(p, uc);
                         break;
                 case (ESCAPE_NUMBERED):
                         if ('\0' != (c = mchars_num2char(seq, sz)))
                         break;
                 case (ESCAPE_NUMBERED):
                         if ('\0' != (c = mchars_num2char(seq, sz)))
@@ -503,6 +510,33 @@ bufferc(struct termp *p, char c)
         p->buf[p->col++] = c;
  }
  
         p->buf[p->col++] = c;
  }
  
+/*
+ * See encode().
+ * Do this for a single (probably unicode) value.
+ * Does not check for non-decorated glyphs.
+ */
+static void
+encode1(struct termp *p, int c)
+{
+       enum termfont     f;
+
+       if (p->col + 4 >= p->maxcols)
+               adjbuf(p, p->col + 4);
+
+       f = term_fonttop(p);
+
+       if (TERMFONT_NONE == f) {
+               p->buf[p->col++] = c;
+               return;
+       } else if (TERMFONT_UNDER == f) {
+               p->buf[p->col++] = '_';
+       } else
+               p->buf[p->col++] = c;
+
+       p->buf[p->col++] = 8;
+       p->buf[p->col++] = c;
+}
+
  static void
  encode(struct termp *p, const char *word, size_t sz)
  {
  static void
  encode(struct termp *p, const char *word, size_t sz)
  {
@@ -584,11 +618,16 @@ term_strlen(const struct termp *p, const char *cp)
                         case (ESCAPE_ERROR):
                                 return(sz);
                         case (ESCAPE_UNICODE):
                         case (ESCAPE_ERROR):
                                 return(sz);
                         case (ESCAPE_UNICODE):
-                               c = '?';
-                               /* FALLTHROUGH */
-                       case (ESCAPE_NUMBERED):
+                               if (TERMENC_ASCII != p->enc) {
+                                       sz += (*p->width)(p, '?');
+                                       break;
+                               }
+                               c = mchars_num2uc(seq + 1, ssz - 1);
                                 if ('\0' != c)
                                 if ('\0' != c)
-                                       c = mchars_num2char(seq, ssz);
+                                       sz += (*p->width)(p, c);
+                               break;
+                       case (ESCAPE_NUMBERED):
+                               c = mchars_num2char(seq, ssz);
                                 if ('\0' != c)
                                         sz += (*p->width)(p, c);
                                 break;
                                 if ('\0' != c)
                                         sz += (*p->width)(p, c);
                                 break;
diff --git a/term_ascii.c b/term_ascii.c

index 5b2ee847aca070d405092ad4ba3e6a53eed3b291..7619907ed14479d348d35cbbdebe4d6a9d9d48e0 100644 (file)
--- a/term_ascii.c
+++ b/term_ascii.c
@@ -1,4 +1,4 @@
-/*     $Id: term_ascii.c,v 1.14 2011/05/17 14:38:34 kristaps Exp $ */
+/*     $Id: term_ascii.c,v 1.15 2011/05/17 22:32:45 kristaps Exp $ */
  /*
   * Copyright (c) 2010 Kristaps Dzonsons <kristaps@bsd.lv>
   *
  /*
   * Copyright (c) 2010 Kristaps Dzonsons <kristaps@bsd.lv>
   *
@@ -21,16 +21,26 @@
  #include <sys/types.h>
  
  #include <assert.h>
  #include <sys/types.h>
  
  #include <assert.h>
+#ifdef USE_WCHAR
+# include <locale.h>
+#endif
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
+#ifdef USE_WCHAR
+# include <wchar.h>
+#endif
  
  #include "mandoc.h"
  #include "out.h"
  #include "term.h"
  #include "main.h"
  
  
  #include "mandoc.h"
  #include "out.h"
  #include "term.h"
  #include "main.h"
  
+#if ! defined(__STDC_ISO_10646__)
+# undef USE_WCHAR
+#endif
+
  static struct termp     *ascii_init(enum termenc, char *);
  static double            ascii_hspan(const struct termp *,
                                 const struct roffsu *);
  static struct termp     *ascii_init(enum termenc, char *);
  static double            ascii_hspan(const struct termp *,
                                 const struct roffsu *);
@@ -41,6 +51,13 @@ static       void              ascii_end(struct termp *);
  static void              ascii_endline(struct termp *);
  static void              ascii_letter(struct termp *, int);
  
  static void              ascii_endline(struct termp *);
  static void              ascii_letter(struct termp *, int);
  
+#ifdef USE_WCHAR
+static void              locale_advance(struct termp *, size_t);
+static void              locale_endline(struct termp *);
+static void              locale_letter(struct termp *, int);
+static size_t            locale_width(const struct termp *, int);
+#endif
+
  static struct termp *
  ascii_init(enum termenc enc, char *outopts)
  {
  static struct termp *
  ascii_init(enum termenc enc, char *outopts)
  {
@@ -54,15 +71,28 @@ ascii_init(enum termenc enc, char *outopts)
         p->tabwidth = 5;
         p->defrmargin = 78;
  
         p->tabwidth = 5;
         p->defrmargin = 78;
  
-       p->advance = ascii_advance;
         p->begin = ascii_begin;
         p->end = ascii_end;
         p->begin = ascii_begin;
         p->end = ascii_end;
-       p->endline = ascii_endline;
         p->hspan = ascii_hspan;
         p->hspan = ascii_hspan;
-       p->letter = ascii_letter;
         p->type = TERMTYPE_CHAR;
         p->type = TERMTYPE_CHAR;
+
+       p->enc = TERMENC_ASCII;
+       p->advance = ascii_advance;
+       p->endline = ascii_endline;
+       p->letter = ascii_letter;
         p->width = ascii_width;
  
         p->width = ascii_width;
  
+#if defined (USE_WCHAR)
+       if (TERMENC_LOCALE == enc)
+               if (setlocale(LC_ALL, "") && MB_CUR_MAX > 1) {
+                       p->enc = enc;
+                       p->advance = locale_advance;
+                       p->endline = locale_endline;
+                       p->letter = locale_letter;
+                       p->width = locale_width;
+               }
+#endif
+
         toks[0] = "width";
         toks[1] = NULL;
  
         toks[0] = "width";
         toks[1] = NULL;
  
@@ -104,7 +134,6 @@ ascii_width(const struct termp *p, int c)
         return(1);
  }
  
         return(1);
  }
  
-
  void
  ascii_free(void *arg)
  {
  void
  ascii_free(void *arg)
  {
@@ -112,17 +141,14 @@ ascii_free(void *arg)
         term_free((struct termp *)arg);
  }
  
         term_free((struct termp *)arg);
  }
  
-
  /* ARGSUSED */
  static void
  ascii_letter(struct termp *p, int c)
  {
         
  /* ARGSUSED */
  static void
  ascii_letter(struct termp *p, int c)
  {
         
-       /* LINTED */
         putchar(c);
  }
  
         putchar(c);
  }
  
-
  static void
  ascii_begin(struct termp *p)
  {
  static void
  ascii_begin(struct termp *p)
  {
@@ -130,7 +156,6 @@ ascii_begin(struct termp *p)
         (*p->headf)(p, p->argf);
  }
  
         (*p->headf)(p, p->argf);
  }
  
-
  static void
  ascii_end(struct termp *p)
  {
  static void
  ascii_end(struct termp *p)
  {
@@ -138,7 +163,6 @@ ascii_end(struct termp *p)
         (*p->footf)(p, p->argf);
  }
  
         (*p->footf)(p, p->argf);
  }
  
-
  /* ARGSUSED */
  static void
  ascii_endline(struct termp *p)
  /* ARGSUSED */
  static void
  ascii_endline(struct termp *p)
@@ -147,19 +171,16 @@ ascii_endline(struct termp *p)
         putchar('\n');
  }
  
         putchar('\n');
  }
  
-
  /* ARGSUSED */
  static void
  ascii_advance(struct termp *p, size_t len)
  {
         size_t          i;
  
  /* ARGSUSED */
  static void
  ascii_advance(struct termp *p, size_t len)
  {
         size_t          i;
  
-       /* Just print whitespace on the terminal. */
         for (i = 0; i < len; i++)
                 putchar(' ');
  }
  
         for (i = 0; i < len; i++)
                 putchar(' ');
  }
  
-
  /* ARGSUSED */
  static double
  ascii_hspan(const struct termp *p, const struct roffsu *su)
  /* ARGSUSED */
  static double
  ascii_hspan(const struct termp *p, const struct roffsu *su)
@@ -198,3 +219,39 @@ ascii_hspan(const struct termp *p, const struct roffsu *su)
         return(r);
  }
  
         return(r);
  }
  
+#ifdef USE_WCHAR
+/* ARGSUSED */
+static size_t
+locale_width(const struct termp *p, int c)
+{
+       int             rc;
+
+       return((rc = wcwidth(c)) < 0 ? 0 : rc);
+}
+
+/* ARGSUSED */
+static void
+locale_advance(struct termp *p, size_t len)
+{
+       size_t          i;
+
+       for (i = 0; i < len; i++)
+               putwchar(L' ');
+}
+
+/* ARGSUSED */
+static void
+locale_endline(struct termp *p)
+{
+
+       putwchar(L'\n');
+}
+
+/* ARGSUSED */
+static void
+locale_letter(struct termp *p, int c)
+{
+       
+       putwchar(c);
+}
+#endif
author	Kristaps Dzonsons <kristaps@bsd.lv>
	Tue, 17 May 2011 22:32:45 +0000 (22:32 +0000)
committer	Kristaps Dzonsons <kristaps@bsd.lv>
	Tue, 17 May 2011 22:32:45 +0000 (22:32 +0000)
Makefile		patch \| blob \| history
mandoc.1		patch \| blob \| history
term.c		patch \| blob \| history
term_ascii.c		patch \| blob \| history