From 5faa62e2445541401f9bee1667d1cd2b2e443e53 Mon Sep 17 00:00:00 2001
From: Ingo Schwarze <schwarze@openbsd.org>
Date: Sun, 26 Oct 2014 17:12:03 +0000
Subject: Improve -Tascii output for Unicode escape sequences: For the first
 512 code points, provide ASCII approximations.  This is already much better
 than what groff does, which prints nothing for most code points.

A few minor fixes while here:
* Handle Unicode escape sequences in the ASCII range.
* In case of errors, use the REPLACEMENT CHARACTER U+FFFD for -Tutf8
and the string "<?>" for -Tascii output.
* Handle all one-character escape sequences in mchars_spec2{cp,str}()
and remove the workarounds on the higher level.
---
 html.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'html.c')

diff --git a/html.c b/html.c
index 1830a12e..8d8d1130 100644
--- a/html.c
+++ b/html.c
@@ -1,4 +1,4 @@
-/*	$Id: html.c,v 1.176 2014/10/10 15:26:29 schwarze Exp $ */
+/*	$Id: html.c,v 1.177 2014/10/26 17:12:03 schwarze Exp $ */
 /*
  * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
  * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -437,8 +437,18 @@ print_encode(struct html *h, const char *p, int norecurse)
 		case ESCAPE_UNICODE:
 			/* Skip past "u" header. */
 			c = mchars_num2uc(seq + 1, len - 1);
-			if ('\0' != c)
-				printf("&#x%x;", c);
+
+			/*
+			 * XXX Security warning:
+			 * For now, forbid Unicode obfuscation of ASCII
+			 * characters.  An audit of the callers is
+			 * required before this can be removed.
+			 */
+
+			if (c < 0x80)
+				c = 0xFFFD;
+
+			printf("&#x%x;", c);
 			break;
 		case ESCAPE_NUMBERED:
 			c = mchars_num2char(seq, len);
-- 
cgit v1.2.3