From c462999d899acf87741657ed63dff7612559ce20 Mon Sep 17 00:00:00 2001 From: Kristaps Dzonsons Date: Sun, 15 May 2011 15:30:33 +0000 Subject: [PATCH] Support groff's escape for Unicode input. See http://mdocml.bsd.lv/archives/tech/0368.html For the time being, we just throw it away. --- mandoc.c | 10 +++++++++- mandoc.h | 3 ++- mandoc_char.7 | 18 ++++++++++++++++-- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/mandoc.c b/mandoc.c index 671f059a..c9290214 100644 --- a/mandoc.c +++ b/mandoc.c @@ -1,4 +1,4 @@ -/* $Id: mandoc.c,v 1.51 2011/05/14 17:54:42 kristaps Exp $ */ +/* $Id: mandoc.c,v 1.52 2011/05/15 15:30:33 kristaps Exp $ */ /* * Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons * Copyright (c) 2011 Ingo Schwarze @@ -125,6 +125,14 @@ mandoc_escape(const char **end, const char **start, int *sz) break; case ('['): gly = ESCAPE_SPECIAL; + /* + * Unicode escapes are defined in groff as \[uXXXX] to + * \[u10FFFF], where the contained value must be a valid + * Unicode codepoint. Here, however, only check whether + * it's not a zero-width escape. 
+ */ + if ('u' == cp[i] && ']' != cp[i + 1]) + gly = ESCAPE_UNICODE; term = ']'; break; case ('C'): diff --git a/mandoc.h b/mandoc.h index 55878b62..db7b30bf 100644 --- a/mandoc.h +++ b/mandoc.h @@ -1,4 +1,4 @@ -/* $Id: mandoc.h,v 1.74 2011/04/30 22:24:31 kristaps Exp $ */ +/* $Id: mandoc.h,v 1.75 2011/05/15 15:30:33 kristaps Exp $ */ /* * Copyright (c) 2010, 2011 Kristaps Dzonsons * @@ -299,6 +299,7 @@ enum mandoc_esc { ESCAPE_FONTROMAN, /* roman font mode */ ESCAPE_FONTPREV, /* previous font mode */ ESCAPE_NUMBERED, /* a numbered glyph */ + ESCAPE_UNICODE, /* a unicode codepoint */ ESCAPE_NOSPACE /* suppress space if the last on a line */ }; diff --git a/mandoc_char.7 b/mandoc_char.7 index c52d1e78..d0c5dd7f 100644 --- a/mandoc_char.7 +++ b/mandoc_char.7 @@ -1,4 +1,4 @@ -.\" $Id: mandoc_char.7,v 1.44 2011/05/01 08:45:10 kristaps Exp $ +.\" $Id: mandoc_char.7,v 1.45 2011/05/15 15:30:33 kristaps Exp $ .\" .\" Copyright (c) 2009 Kristaps Dzonsons .\" @@ -14,7 +14,7 @@ .\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF .\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. .\" -.Dd $Mdocdate: May 1 2011 $ +.Dd $Mdocdate: May 15 2011 $ .Dt MANDOC_CHAR 7 .Os .Sh NAME @@ -520,6 +520,20 @@ portable. .It \e*(Px Ta \*(Px Ta POSIX standard name .It \e*(Ai Ta \*(Ai Ta ANSI standard name .El +.Sh UNICODE CHARACTERS +The escape sequence +.Pp +.Dl \e[uXXXX] +.Pp +is interpreted as a Unicode codepoint. +The codepoint must be greater than U+0080 and less than U+10FFFF. +For compatibility, codepoints must be zero-padded to four characters; if +longer than four characters, no zero padding is allowed. +Unicode surrogates are not allowed. +.\" .Pp +.\" Unicode glyphs attenuate to the +.\" .Sq \&? +.\" character if invalid or not rendered by current output media. .Sh NUMBERED CHARACTERS For backward compatibility with existing manuals, .Xr mandoc 1 -- 2.47.1