Rewrite the low-level UTF-8 parser from scratch.

The old parser accepted invalid byte sequences such as 0xc080-c1bf, 0xe08080-e09fbf, 0xeda080-edbfbf, and 0xf0808080-f08fbfbf (overlong encodings and UTF-16 surrogates) and produced valid roff Unicode escape sequences from them; moreover, its algorithm contained strong defenses against any attempt to fix it. The rewrite cures an assertion failure in the terminal formatter that was caused by sneaking in ASCII 0x08 (backspace) by "encoding" it as an (invalid) multibyte UTF-8 sequence, found by jsg@ with afl. As a bonus, the new algorithm also reduces the code in the function by about 20%.
author: Ingo Schwarze <schwarze@openbsd.org> 2014-12-19 04:58:35 +0000
committer: Ingo Schwarze <schwarze@openbsd.org> 2014-12-19 04:58:35 +0000
commit: 3e2c5875ed63b44494405e85c9aa899ca0b2c53e (patch)
tree: 82cb686feae71c826eb445272d09b27f82693eb2 /preconv.c
parent: b30c3c3ead7f853efa6119905f0f545a0895b1d7 (diff)
download: mandoc-3e2c5875ed63b44494405e85c9aa899ca0b2c53e.tar.gz
mandoc-3e2c5875ed63b44494405e85c9aa899ca0b2c53e.tar.zst
mandoc-3e2c5875ed63b44494405e85c9aa899ca0b2c53e.zip
1 files changed, 59 insertions, 76 deletions
diff --git a/preconv.c b/preconv.c
index 0c6076ec..a2bbe9ca 100644
--- a/preconv.c
+++ b/preconv.c
@@ -1,4 +1,4 @@
-/*	$Id: preconv.c,v 1.12 2014/11/14 04:24:04 schwarze Exp $ */
+/*	$Id: preconv.c,v 1.13 2014/12/19 04:58:35 schwarze Exp $ */
 /*
  * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
  * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -19,6 +19,7 @@
 
 #include <sys/types.h>
 
+#include <assert.h>
 #include <stdio.h>
 #include <string.h>
 #include "mandoc.h"
@@ -28,88 +29,70 @@ int
 preconv_encode(struct buf *ib, size_t *ii, struct buf *ob, size_t *oi,
     int *filenc)
 {
-	size_t		 i;
-	int		 state;
+	unsigned char	*cu;
+	int		 nby;
 	unsigned int	 accum;
-	unsigned char	 cu;
+
+	cu = ib->buf + *ii;
+	assert(*cu & 0x80);
 
 	if ( ! (*filenc & MPARSE_UTF8))
 		goto latin;
 
-	state = 0;
-	accum = 0U;
-
-	for (i = *ii; i < ib->sz; i++) {
-		cu = ib->buf[i];
-		if (state) {
-			if ( ! (cu & 128) || (cu & 64)) {
-				/* Bad sequence header. */
-				break;
-			}
-
-			/* Accept only legitimate bit patterns. */
-
-			if (cu > 191 || cu < 128) {
-				/* Bad in-sequence bits. */
-				break;
-			}
-
-			accum |= (cu & 63) << --state * 6;
-
-			if (state)
-				continue;
-
-			if (accum < 0x80)
-				ob->buf[(*oi)++] = accum;
-			else
-				*oi += snprintf(ob->buf + *oi,
-				    11, "\\[u%.4X]", accum);
-			*ii = i + 1;
-			*filenc &= ~MPARSE_LATIN1;
-			return(1);
-		} else {
-			/*
-			 * Entering a UTF-8 state:  if we encounter a
-			 * UTF-8 bitmask, calculate the expected UTF-8
-			 * state from it.
-			 */
-			for (state = 0; state < 7; state++)
-				if ( ! (cu & (1 << (7 - state))))
-					break;
-
-			/* Accept only legitimate bit patterns. */
-
-			switch (state--) {
-			case (4):
-				if (cu <= 244 && cu >= 240) {
-					accum = (cu & 7) << 18;
-					continue;
-				}
-				/* Bad 4-sequence start bits. */
-				break;
-			case (3):
-				if (cu <= 239 && cu >= 224) {
-					accum = (cu & 15) << 12;
-					continue;
-				}
-				/* Bad 3-sequence start bits. */
-				break;
-			case (2):
-				if (cu <= 223 && cu >= 194) {
-					accum = (cu & 31) << 6;
-					continue;
-				}
-				/* Bad 2-sequence start bits. */
-				break;
-			default:
-				/* Bad sequence bit mask. */
-				break;
-			}
-			break;
-		}
+	nby = 1;
+	while (nby < 5 && *cu & (1 << (7 - nby)))
+		nby++;
+
+	switch (nby) {
+	case 2:
+		accum = *cu & 0x1f;
+		if (accum < 0x02)  /* Obfuscated ASCII. */
+			goto latin;
+		break;
+	case 3:
+		accum = *cu & 0x0f;
+		break;
+	case 4:
+		accum = *cu & 0x07;
+		if (accum > 0x04) /* Beyond Unicode. */
+			goto latin;
+		break;
+	default:  /* Bad sequence header. */
+		goto latin;
+	}
+
+	cu++;
+	switch (nby) {
+	case 3:
+		if ((accum == 0x00 && ! (*cu & 0x20)) ||  /* Use 2-byte. */
+		    (accum == 0x0d && *cu & 0x20))  /* Surrogates. */
+			goto latin;
+		break;
+	case 4:
+		if ((accum == 0x00 && ! (*cu & 0x30)) ||  /* Use 3-byte. */
+		    (accum == 0x04 && *cu & 0x30))  /* Beyond Unicode. */
+			goto latin;
+		break;
+	default:
+		break;
+	}
+
+	while (--nby) {
+		if ((*cu & 0xc0) != 0x80)  /* Invalid continuation. */
+			goto latin;
+		accum <<= 6;
+		accum += *cu & 0x3f;
+		cu++;
 	}
 
-	/* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
+	assert(accum > 0x7f);
+	assert(accum < 0x110000);
+	assert(accum < 0xd800 || accum > 0xdfff);
+
+	*oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum);
+	*ii = (char *)cu - ib->buf;
+	*filenc &= ~MPARSE_LATIN1;
+	return(1);
 
 latin:
 	if ( ! (*filenc & MPARSE_LATIN1))
author	Ingo Schwarze <schwarze@openbsd.org>	2014-12-19 04:58:35 +0000
committer	Ingo Schwarze <schwarze@openbsd.org>	2014-12-19 04:58:35 +0000
commit	3e2c5875ed63b44494405e85c9aa899ca0b2c53e (patch)
tree	82cb686feae71c826eb445272d09b27f82693eb2 /preconv.c
parent	b30c3c3ead7f853efa6119905f0f545a0895b1d7 (diff)
download	mandoc-3e2c5875ed63b44494405e85c9aa899ca0b2c53e.tar.gz mandoc-3e2c5875ed63b44494405e85c9aa899ca0b2c53e.tar.zst mandoc-3e2c5875ed63b44494405e85c9aa899ca0b2c53e.zip