From 1ee1eeda195d12cccd87b8fdfca9e982035a89e7 Mon Sep 17 00:00:00 2001 From: Kristaps Dzonsons Date: Tue, 17 May 2011 11:50:20 +0000 Subject: [PATCH] Flip on unicode output (via \[uNNNN]) in -T[x]html. Here we go! --- chars.c | 20 +++++++++++++++++--- html.c | 10 +++++++++- mandoc.3 | 18 ++++++++++++++---- mandoc.h | 3 ++- 4 files changed, 42 insertions(+), 9 deletions(-) diff --git a/chars.c b/chars.c index 06a2a923..808d70d7 100644 --- a/chars.c +++ b/chars.c @@ -1,4 +1,4 @@ -/* $Id: chars.c,v 1.43 2011/05/15 22:29:50 kristaps Exp $ */ +/* $Id: chars.c,v 1.44 2011/05/17 11:50:20 kristaps Exp $ */ /* * Copyright (c) 2009, 2010 Kristaps Dzonsons * Copyright (c) 2011 Ingo Schwarze @@ -138,7 +138,7 @@ mchars_res2cp(struct mchars *arg, const char *p, size_t sz) } /* - * Numbered character to literal character. + * Numbered character string to ASCII codepoint. * This can only be a printable character (i.e., alnum, punct, space) so * prevent the character from ruining our state (backspace, newline, and * so on). @@ -151,10 +151,24 @@ mchars_num2char(const char *p, size_t sz) if ((i = mandoc_strntou(p, sz, 10)) < 0) return('\0'); - return(isprint(i) ? i : '\0'); } +/* + * Hex character string to Unicode codepoint. + * If the character is illegal, returns '\0'. + */ +int +mchars_num2uc(const char *p, size_t sz) +{ + int i; + + if ((i = mandoc_strntou(p, sz, 16)) < 0) + return('\0'); + /* FIXME: make sure we're not in a bogus range. */ + return(i > 0x80 && i <= 0x10FFFF ? i : '\0'); +} + /* * Special character to string array. 
*/ diff --git a/html.c b/html.c index a50d2eb0..fd696284 100644 --- a/html.c +++ b/html.c @@ -1,4 +1,4 @@ -/* $Id: html.c,v 1.143 2011/05/17 11:38:18 kristaps Exp $ */ +/* $Id: html.c,v 1.144 2011/05/17 11:50:20 kristaps Exp $ */ /* * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons * Copyright (c) 2011 Ingo Schwarze @@ -315,6 +315,8 @@ html_strlen(const char *cp) switch (mandoc_escape(&cp, &seq, &ssz)) { case (ESCAPE_ERROR): return(sz); + case (ESCAPE_UNICODE): + /* FALLTHROUGH */ case (ESCAPE_NUMBERED): /* FALLTHROUGH */ case (ESCAPE_PREDEF): @@ -373,6 +375,12 @@ print_encode(struct html *h, const char *p, int norecurse) break; switch (esc) { + case (ESCAPE_UNICODE): + /* Skip past "u" header. */ + c = mchars_num2uc(seq + 1, len - 1); + if ('\0' != c) + printf("&#x%x;", c); + break; case (ESCAPE_NUMBERED): c = mchars_num2char(seq, len); if ('\0' != c) diff --git a/mandoc.3 b/mandoc.3 index 66148c4d..0521e391 100644 --- a/mandoc.3 +++ b/mandoc.3 @@ -1,4 +1,4 @@ -.\" $Id: mandoc.3,v 1.6 2011/05/01 10:40:52 kristaps Exp $ +.\" $Id: mandoc.3,v 1.7 2011/05/17 11:50:20 kristaps Exp $ .\" .\" Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons .\" Copyright (c) 2010 Ingo Schwarze @@ -15,7 +15,7 @@ .\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF .\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
.\" -.Dd $Mdocdate: May 1 2011 $ +.Dd $Mdocdate: May 17 2011 $ .Dt MANDOC 3 .Os .Sh NAME @@ -26,6 +26,7 @@ .Nm mchars_alloc , .Nm mchars_free , .Nm mchars_num2char , +.Nm mchars_num2uc , .Nm mchars_res2cp , .Nm mchars_res2str , .Nm mchars_spec2cp , @@ -64,6 +65,8 @@ .Fn mchars_free "struct mchars *p" .Ft char .Fn mchars_num2char "const char *cp" "size_t sz" +.Ft int +.Fn mchars_num2uc "const char *cp" "size_t sz" .Ft "const char *" .Fo mchars_res2str .Fa "struct mchars *p" @@ -188,6 +191,9 @@ library also contains routines for translating character strings into glyphs .Pq see Fn mchars_alloc and parsing escape sequences from strings .Pq see Fn mandoc_escape . +.Pp +This library is +.Ud .Sh REFERENCE This section documents the functions, types, and variables available via @@ -247,8 +253,12 @@ The object must be freed with Free an object created with .Fn mchars_alloc . .It Fn mchars_num2char -Convert a character index as found in \eN\(aq\(aq into a printable -character. +Convert a character index (e.g., the \eN\(aq\(aq escape) into a +printable ASCII character. +Returns \e0 (the nil character) if the input sequence is malformed. +.It Fn mchars_num2uc +Convert a hexadecimal character index (e.g., the \e[uNNNN] escape) into +a Unicode codepoint. Returns \e0 (the nil character) if the input sequence is malformed. .It Fn mchars_res2cp Convert a predefined character into a valid Unicode codepoint. 
diff --git a/mandoc.h b/mandoc.h index db7b30bf..5f01644c 100644 --- a/mandoc.h +++ b/mandoc.h @@ -1,4 +1,4 @@ -/* $Id: mandoc.h,v 1.75 2011/05/15 15:30:33 kristaps Exp $ */ +/* $Id: mandoc.h,v 1.76 2011/05/17 11:50:20 kristaps Exp $ */ /* * Copyright (c) 2010, 2011 Kristaps Dzonsons * @@ -330,6 +330,7 @@ enum mandoc_esc mandoc_escape(const char **, const char **, int *); struct mchars *mchars_alloc(void); char mchars_num2char(const char *, size_t); +int mchars_num2uc(const char *, size_t); const char *mchars_spec2str(struct mchars *, const char *, size_t, size_t *); int mchars_spec2cp(struct mchars *, const char *, size_t); const char *mchars_res2str(struct mchars *, const char *, size_t, size_t *); -- 2.47.1