Stricter syntax checking of Unicode character names:

After the leading 'u', require exactly 4, 5, or 6 hexadecimal digits and allow nothing else. This avoids misinterpreting escapes such as \[ua] and \C'uA' as Unicode, and it also fixes underlining in eqn(7) -Thtml output, which uses \[ul]. Problem found and semantics suggested by kristaps@.
author: Ingo Schwarze <schwarze@openbsd.org> 2014-10-13 17:17:45 +0000
committer: Ingo Schwarze <schwarze@openbsd.org> 2014-10-13 17:17:45 +0000
commit: ccddf0063047104179acf0697799f81a7f19f6ac (patch)
tree: 08d7cdac16e7c26cb68d1196e2e9ffb24c43c6e5 /mandoc.c
parent: 2623b4393620b67d1251b6af13d35b5fe9a6be80 (diff)
download: mandoc-ccddf0063047104179acf0697799f81a7f19f6ac.tar.gz
mandoc-ccddf0063047104179acf0697799f81a7f19f6ac.tar.zst
mandoc-ccddf0063047104179acf0697799f81a7f19f6ac.zip
1 files changed, 12 insertions, 13 deletions
diff --git a/mandoc.c b/mandoc.c
index be3e264c..e82093b9 100644
--- a/mandoc.c
+++ b/mandoc.c
@@ -1,4 +1,4 @@
-/*	$Id: mandoc.c,v 1.86 2014/08/18 09:11:47 kristaps Exp $ */
+/*	$Id: mandoc.c,v 1.87 2014/10/13 17:17:45 schwarze Exp $ */
 /*
  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
  * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -79,24 +79,13 @@ mandoc_escape(const char **end, const char **start, int *sz)
 		break;
 	case '[':
 		gly = ESCAPE_SPECIAL;
-		/*
-		 * Unicode escapes are defined in groff as \[uXXXX] to
-		 * \[u10FFFF], where the contained value must be a valid
-		 * Unicode codepoint.  Here, however, only check whether
-		 * it's not a zero-width escape.
-		 */
-		if ('u' == (*start)[0] && ']' != (*start)[1])
-			gly = ESCAPE_UNICODE;
 		term = ']';
 		break;
 	case 'C':
 		if ('\'' != **start)
 			return(ESCAPE_ERROR);
 		*start = ++*end;
-		if ('u' == (*start)[0] && '\'' != (*start)[1])
-			gly = ESCAPE_UNICODE;
-		else
-			gly = ESCAPE_SPECIAL;
+		gly = ESCAPE_SPECIAL;
 		term = '\'';
 		break;
 
@@ -344,6 +333,16 @@ mandoc_escape(const char **end, const char **start, int *sz)
 	case ESCAPE_SPECIAL:
 		if (1 == *sz && 'c' == **start)
 			gly = ESCAPE_NOSPACE;
+		/*
+		 * Unicode escapes are defined in groff as \[uXXXX]
+		 * to \[u10FFFF], where the contained value must be
+		 * a valid Unicode codepoint.  Here, however, only
+		 * check the length and the validity of all digits.
+		 */
+		else if (*sz > 4 && *sz < 8 && **start == 'u' &&
+		    (int)strspn(*start + 1, "0123456789ABCDEFabcdef")
+		    + 1 == *sz)
+			gly = ESCAPE_UNICODE;
 		break;
 	default:
 		break;
author	Ingo Schwarze <schwarze@openbsd.org>	2014-10-13 17:17:45 +0000
committer	Ingo Schwarze <schwarze@openbsd.org>	2014-10-13 17:17:45 +0000
commit	ccddf0063047104179acf0697799f81a7f19f6ac (patch)
tree	08d7cdac16e7c26cb68d1196e2e9ffb24c43c6e5 /mandoc.c
parent	2623b4393620b67d1251b6af13d35b5fe9a6be80 (diff)
download	mandoc-ccddf0063047104179acf0697799f81a7f19f6ac.tar.gz mandoc-ccddf0063047104179acf0697799f81a7f19f6ac.tar.zst mandoc-ccddf0063047104179acf0697799f81a7f19f6ac.zip