aboutsummaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
authorIngo Schwarze <schwarze@openbsd.org>2014-10-26 17:12:03 +0000
committerIngo Schwarze <schwarze@openbsd.org>2014-10-26 17:12:03 +0000
commit5faa62e2445541401f9bee1667d1cd2b2e443e53 (patch)
treefd737f26543e4c9e9e08db9bc3b51103c61736a1
parenteb1d4be7915b314c92a4c377c4a09a06e811fc57 (diff)
downloadmandoc-5faa62e2445541401f9bee1667d1cd2b2e443e53.tar.gz
mandoc-5faa62e2445541401f9bee1667d1cd2b2e443e53.tar.zst
mandoc-5faa62e2445541401f9bee1667d1cd2b2e443e53.zip
Improve -Tascii output for Unicode escape sequences: For the first 512
code points, provide ASCII approximations. This is already much better than what groff does, which prints nothing for most code points. A few minor fixes while here: * Handle Unicode escape sequences in the ASCII range. * In case of errors, use the REPLACEMENT CHARACTER U+FFFD for -Tutf8 and the string "<?>" for -Tascii output. * Handle all one-character escape sequences in mchars_spec2{cp,str}() and remove the workarounds on the higher level.
-rw-r--r--chars.c23
-rw-r--r--html.c16
-rw-r--r--term.c94
-rw-r--r--term.h4
-rw-r--r--term_ascii.c77
5 files changed, 139 insertions, 75 deletions
diff --git a/chars.c b/chars.c
index acea7db6..a7b34b21 100644
--- a/chars.c
+++ b/chars.c
@@ -1,7 +1,7 @@
-/* $Id: chars.c,v 1.59 2014/08/10 23:54:41 schwarze Exp $ */
+/* $Id: chars.c,v 1.60 2014/10/26 17:12:03 schwarze Exp $ */
/*
* Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
- * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2011, 2014 Ingo Schwarze <schwarze@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
@@ -104,9 +104,7 @@ mchars_spec2cp(const struct mchars *arg, const char *p, size_t sz)
const struct ln *ln;
ln = find(arg, p, sz);
- if (NULL == ln)
- return(-1);
- return(ln->unicode);
+ return(ln != NULL ? ln->unicode : sz == 1 ? *p : -1);
}
char
@@ -126,20 +124,13 @@ mchars_num2uc(const char *p, size_t sz)
int i;
if ((i = mandoc_strntoi(p, sz, 16)) < 0)
- return('\0');
+ return(0xFFFD);
/*
- * Security warning:
- * Never extend the range of accepted characters
- * to overlap with the ASCII range, 0x00-0x7F
- * without re-auditing the callers of this function.
- * Some callers might relay on the fact that we never
- * return ASCII characters for their escaping decisions.
- *
* XXX Code is missing here to exclude bogus ranges.
*/
- return(i > 0x80 && i <= 0x10FFFF ? i : '\0');
+ return(i <= 0x10FFFF ? i : 0xFFFD);
}
const char *
@@ -149,9 +140,9 @@ mchars_spec2str(const struct mchars *arg,
const struct ln *ln;
ln = find(arg, p, sz);
- if (NULL == ln) {
+ if (ln == NULL) {
*rsz = 1;
- return(NULL);
+ return(sz == 1 ? p : NULL);
}
*rsz = strlen(ln->ascii);
diff --git a/html.c b/html.c
index 1830a12e..8d8d1130 100644
--- a/html.c
+++ b/html.c
@@ -1,4 +1,4 @@
-/* $Id: html.c,v 1.176 2014/10/10 15:26:29 schwarze Exp $ */
+/* $Id: html.c,v 1.177 2014/10/26 17:12:03 schwarze Exp $ */
/*
* Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -437,8 +437,18 @@ print_encode(struct html *h, const char *p, int norecurse)
case ESCAPE_UNICODE:
/* Skip past "u" header. */
c = mchars_num2uc(seq + 1, len - 1);
- if ('\0' != c)
- printf("&#x%x;", c);
+
+ /*
+ * XXX Security warning:
+ * For now, forbid Unicode obfuscation of ASCII
+ * characters. An audit of the callers is
+ * required before this can be removed.
+ */
+
+ if (c < 0x80)
+ c = 0xFFFD;
+
+ printf("&#x%x;", c);
break;
case ESCAPE_NUMBERED:
c = mchars_num2char(seq, len);
diff --git a/term.c b/term.c
index da1f20c9..017961e0 100644
--- a/term.c
+++ b/term.c
@@ -1,4 +1,4 @@
-/* $Id: term.c,v 1.228 2014/08/18 21:07:53 kristaps Exp $ */
+/* $Id: term.c,v 1.229 2014/10/26 17:12:03 schwarze Exp $ */
/*
* Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2010-2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -444,27 +444,14 @@ term_word(struct termp *p, const char *word)
if (ESCAPE_ERROR == esc)
continue;
- if (TERMENC_ASCII != p->enc)
- switch (esc) {
- case ESCAPE_UNICODE:
- uc = mchars_num2uc(seq + 1, sz - 1);
- if ('\0' == uc)
- break;
- encode1(p, uc);
- continue;
- case ESCAPE_SPECIAL:
- uc = mchars_spec2cp(p->symtab, seq, sz);
- if (uc <= 0)
- break;
- encode1(p, uc);
- continue;
- default:
- break;
- }
-
switch (esc) {
case ESCAPE_UNICODE:
- encode1(p, '?');
+ uc = mchars_num2uc(seq + 1, sz - 1);
+ if (p->enc == TERMENC_ASCII) {
+ cp = ascii_uc2str(uc);
+ encode(p, cp, strlen(cp));
+ } else
+ encode1(p, uc);
break;
case ESCAPE_NUMBERED:
c = mchars_num2char(seq, sz);
@@ -472,11 +459,19 @@ term_word(struct termp *p, const char *word)
encode(p, &c, 1);
break;
case ESCAPE_SPECIAL:
- cp = mchars_spec2str(p->symtab, seq, sz, &ssz);
- if (NULL != cp)
- encode(p, cp, ssz);
- else if (1 == ssz)
- encode(p, seq, sz);
+ if (p->enc == TERMENC_ASCII) {
+ cp = mchars_spec2str(p->symtab,
+ seq, sz, &ssz);
+ if (cp == NULL)
+ encode(p, "<?>", 3);
+ else
+ encode(p, cp, ssz);
+ } else {
+ uc = mchars_spec2cp(p->symtab, seq, sz);
+ if (uc <= 0)
+ uc = 0xFFFD;
+ encode1(p, uc);
+ }
break;
case ESCAPE_FONTBOLD:
term_fontrepl(p, TERMFONT_BOLD);
@@ -683,31 +678,16 @@ term_strlen(const struct termp *p, const char *cp)
if (ESCAPE_ERROR == esc)
continue;
- if (TERMENC_ASCII != p->enc)
- switch (esc) {
- case ESCAPE_UNICODE:
- c = mchars_num2uc(seq + 1,
- ssz - 1);
- if ('\0' == c)
- break;
- sz += cond_width(p, c, &skip);
- continue;
- case ESCAPE_SPECIAL:
- c = mchars_spec2cp(p->symtab,
- seq, ssz);
- if (c <= 0)
- break;
- sz += cond_width(p, c, &skip);
- continue;
- default:
- break;
- }
-
rhs = NULL;
switch (esc) {
case ESCAPE_UNICODE:
- sz += cond_width(p, '?', &skip);
+ /*
+ * Skip past the "u" header.  The escape length
+ * is ssz; sz is the width accumulated so far.
+ */
+ c = mchars_num2uc(seq + 1, ssz - 1);
+ if (p->enc == TERMENC_ASCII) {
+ rhs = ascii_uc2str(c);
+ rsz = strlen(rhs);
+ } else
+ sz += cond_width(p, c, &skip);
break;
case ESCAPE_NUMBERED:
c = mchars_num2char(seq, ssz);
@@ -715,14 +695,20 @@ term_strlen(const struct termp *p, const char *cp)
sz += cond_width(p, c, &skip);
break;
case ESCAPE_SPECIAL:
- rhs = mchars_spec2str(p->symtab,
- seq, ssz, &rsz);
-
- if (ssz != 1 || rhs)
- break;
-
- rhs = seq;
- rsz = ssz;
+ if (p->enc == TERMENC_ASCII) {
+ rhs = mchars_spec2str(p->symtab,
+ seq, ssz, &rsz);
+ if (rhs == NULL) {
+ rhs = "<?>";
+ rsz = 3;
+ }
+ } else {
+ c = mchars_spec2cp(p->symtab,
+ seq, ssz);
+ if (c <= 0)
+ c = 0xFFFD;
+ sz += cond_width(p, c, &skip);
+ }
break;
case ESCAPE_SKIPCHAR:
skip = 1;
diff --git a/term.h b/term.h
index 9c4b431c..ecb21689 100644
--- a/term.h
+++ b/term.h
@@ -1,4 +1,4 @@
-/* $Id: term.h,v 1.103 2014/09/17 20:18:58 schwarze Exp $ */
+/* $Id: term.h,v 1.104 2014/10/26 17:12:03 schwarze Exp $ */
/*
* Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -104,6 +104,8 @@ struct termp {
struct termp_ps *ps;
};
+const char *ascii_uc2str(int);
+
void term_eqn(struct termp *, const struct eqn *);
void term_tbl(struct termp *, const struct tbl_span *);
void term_free(struct termp *);
diff --git a/term_ascii.c b/term_ascii.c
index 887710a2..64f9f686 100644
--- a/term_ascii.c
+++ b/term_ascii.c
@@ -1,4 +1,4 @@
-/* $Id: term_ascii.c,v 1.33 2014/09/03 05:22:45 schwarze Exp $ */
+/* $Id: term_ascii.c,v 1.34 2014/10/26 17:12:03 schwarze Exp $ */
/*
* Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -166,6 +166,81 @@ ascii_setwidth(struct termp *p, int iop, size_t width)
p->rmargin = p->maxrmargin = p->defrmargin;
}
+/*
+ * Return an ASCII approximation for the Unicode code point uc.
+ * The table is indexed directly by code point and covers the first
+ * 512 code points (U+0000 to U+01FF): control characters are shown
+ * as bracketed names, U+0080 to U+009F as bracketed hex numbers, and
+ * U+00A0 as the internal non-breaking space character ASCII_NBRSP.
+ * Negative values and code points beyond the table, including the
+ * REPLACEMENT CHARACTER U+FFFD, yield the string "<?>".
+ * The returned string is static and must not be modified or freed.
+ */
+const char *
+ascii_uc2str(int uc)
+{
+ static const char nbrsp[2] = { ASCII_NBRSP, '\0' };
+ static const char *tab[] = {
+ "<NUL>","<SOH>","<STX>","<ETX>","<EOT>","<ENQ>","<ACK>","<BEL>",
+ "<BS>", "\t", "<LF>", "<VT>", "<FF>", "<CR>", "<SO>", "<SI>",
+ "<DLE>","<DC1>","<DC2>","<DC3>","<DC4>","<NAK>","<SYN>","<ETB>",
+ "<CAN>","<EM>", "<SUB>","<ESC>","<FS>", "<GS>", "<RS>", "<US>",
+ " ", "!", "\"", "#", "$", "%", "&", "'",
+ "(", ")", "*", "+", ",", "-", ".", "/",
+ "0", "1", "2", "3", "4", "5", "6", "7",
+ "8", "9", ":", ";", "<", "=", ">", "?",
+ "@", "A", "B", "C", "D", "E", "F", "G",
+ "H", "I", "J", "K", "L", "M", "N", "O",
+ "P", "Q", "R", "S", "T", "U", "V", "W",
+ "X", "Y", "Z", "[", "\\", "]", "^", "_",
+ "`", "a", "b", "c", "d", "e", "f", "g",
+ "h", "i", "j", "k", "l", "m", "n", "o",
+ "p", "q", "r", "s", "t", "u", "v", "w",
+ "x", "y", "z", "{", "|", "}", "~", "<DEL>",
+ "<80>", "<81>", "<82>", "<83>", "<84>", "<85>", "<86>", "<87>",
+ "<88>", "<89>", "<8A>", "<8B>", "<8C>", "<8D>", "<8E>", "<8F>",
+ "<90>", "<91>", "<92>", "<93>", "<94>", "<95>", "<96>", "<97>",
+ "<98>", "<99>", "<9A>", "<9B>", "<9C>", "<9D>", "<9E>", "<9F>",
+ nbrsp, "!", "c", "GBP", "$?", "Y=", "|", "<sec>",
+ "\"", "(C)", "a.", "<<", "<not>","", "(R)", "-",
+ "<deg>","+-", "^2", "^3", "'", "<my>", "<par>","*",
+ ",", "^1", "o.", ">>", "1/4", "1/2", "3/4", "?",
+ "A", "A", "A", "A", "Ae", "Aa", "AE", "C",
+ "E", "E", "E", "E", "I", "I", "I", "I",
+ "D", "N", "O", "O", "O", "O", "Oe", "*",
+ "Oe", "U", "U", "U", "Ue", "Y", "Th", "ss",
+ "a", "a", "a", "a", "ae", "aa", "ae", "c",
+ "e", "e", "e", "e", "i", "i", "i", "i",
+ "d", "n", "o", "o", "o", "o", "oe", "/",
+ "oe", "u", "u", "u", "ue", "y", "th", "y",
+ "A", "a", "A", "a", "A", "a", "C", "c",
+ "C", "c", "C", "c", "C", "c", "D", "d",
+ "D", "d", "E", "e", "E", "e", "E", "e",
+ "E", "e", "E", "e", "G", "g", "G", "g",
+ "G", "g", "G", "g", "H", "h", "H", "h",
+ "I", "i", "I", "i", "I", "i", "I", "i",
+ "I", "i", "IJ", "ij", "J", "j", "K", "k",
+ "q", "L", "l", "L", "l", "L", "l", "L",
+ "l", "L", "l", "N", "n", "N", "n", "N",
+ "n", "'n", "Ng", "ng", "O", "o", "O", "o",
+ "O", "o", "OE", "oe", "R", "r", "R", "r",
+ "R", "r", "S", "s", "S", "s", "S", "s",
+ "S", "s", "T", "t", "T", "t", "T", "t",
+ "U", "u", "U", "u", "U", "u", "U", "u",
+ "U", "u", "U", "u", "W", "w", "Y", "y",
+ "Y", "Z", "z", "Z", "z", "Z", "z", "s",
+ "b", "B", "B", "b", "6", "6", "O", "C",
+ "c", "D", "D", "D", "d", "d", "3", "@",
+ "E", "F", "f", "G", "G", "hv", "I", "I",
+ "K", "k", "l", "l", "W", "N", "n", "O",
+ "O", "o", "OI", "oi", "P", "p", "YR", "2",
+ "2", "SH", "sh", "t", "T", "t", "T", "U",
+ "u", "Y", "V", "Y", "y", "Z", "z", "ZH",
+ "ZH", "zh", "zh", "2", "5", "5", "ts", "w",
+ "|", "||", "|=", "!", "DZ", "Dz", "dz", "LJ",
+ "Lj", "lj", "NJ", "Nj", "nj", "A", "a", "I",
+ "i", "O", "o", "U", "u", "U", "u", "U",
+ "u", "U", "u", "U", "u", "@", "A", "a",
+ "A", "a", "AE", "ae", "G", "g", "G", "g",
+ "K", "k", "O", "o", "O", "o", "ZH", "zh",
+ "j", "DZ", "D", "dz", "G", "g", "HV", "W",
+ "N", "n", "A", "a", "AE", "ae", "O", "o"};
+
+ /* Anything outside the table has no approximation. */
+ if (uc < 0 || (size_t)uc >= sizeof(tab)/sizeof(tab[0]))
+ return("<?>");
+ return(tab[uc]);
+}
+
+
static size_t
ascii_width(const struct termp *p, int c)
{