From mboxrd@z Thu Jan  1 00:00:00 1970
Received: from localhost (fantadrom.bsd.lv [local]);
	by fantadrom.bsd.lv (OpenSMTPD) with ESMTPA id c4b2025e;
	for <source@mdocml.bsd.lv>;
	Thu, 18 Dec 2014 23:59:05 -0500 (EST)
Date: Thu, 18 Dec 2014 23:59:05 -0500 (EST)
Message-Id: <2504663034529504146.enqueue@fantadrom.bsd.lv>
X-Mailinglist: mdocml-source
Reply-To: source@mdocml.bsd.lv
MIME-Version: 1.0
From: schwarze@mdocml.bsd.lv
To: source@mdocml.bsd.lv
Subject: mdocml: Rewrite the low-level UTF-8 parser from scratch.
X-Mailer: activitymail 1.26, http://search.cpan.org/dist/activitymail/
Content-Type: text/plain; charset=utf-8

Log Message:
-----------
Rewrite the low-level UTF-8 parser from scratch.
It accepted invalid byte sequences like 0xc080-0xc1bf,
0xe08080-0xe09fbf, 0xeda080-0xedbfbf, and 0xf0808080-0xf08fbfbf,
produced valid roff Unicode escape sequences from them, and the
algorithm contained strong defenses against any attempt to fix it.

This cures an assertion failure in the terminal formatter that was
triggered by sneaking in ASCII 0x08 (backspace) "encoded" as an
(invalid) multibyte UTF-8 sequence; found by jsg@ with afl.

As a bonus, the new algorithm also reduces the code in the function
by about 20%.

Modified Files:
--------------
    mdocml:
        preconv.c

Revision Data
-------------
Index: preconv.c
===================================================================
RCS file: /home/cvs/mdocml/mdocml/preconv.c,v
retrieving revision 1.12
retrieving revision 1.13
diff -Lpreconv.c -Lpreconv.c -u -p -r1.12 -r1.13
--- preconv.c
+++ preconv.c
@@ -19,6 +19,7 @@
 
 #include <sys/types.h>
 
+#include <assert.h>
 #include <stdio.h>
 #include <string.h>
 #include "mandoc.h"
@@ -28,88 +29,70 @@ int
 preconv_encode(struct buf *ib, size_t *ii, struct buf *ob, size_t *oi,
     int *filenc)
 {
-	size_t		 i;
-	int		 state;
+	unsigned char	*cu;
+	int		 nby;
 	unsigned int	 accum;
-	unsigned char	 cu;
+
+	cu = ib->buf + *ii;
+	assert(*cu & 0x80);
 
 	if ( ! (*filenc & MPARSE_UTF8))
 		goto latin;
 
-	state = 0;
-	accum = 0U;
+	nby = 1;
+	while (nby < 5 && *cu & (1 << (7 - nby)))
+		nby++;
+
+	switch (nby) {
+	case 2:
+		accum = *cu & 0x1f;
+		if (accum < 0x02)  /* Obfuscated ASCII. */
+			goto latin;
+		break;
+	case 3:
+		accum = *cu & 0x0f;
+		break;
+	case 4:
+		accum = *cu & 0x07;
+		if (accum > 0x04) /* Beyond Unicode. */
+			goto latin;
+		break;
+	default:  /* Bad sequence header. */
+		goto latin;
+	}
+
+	cu++;
+	switch (nby) {
+	case 3:
+		if ((accum == 0x00 && ! (*cu & 0x20)) ||  /* Use 2-byte. */
+		    (accum == 0x0d && *cu & 0x20))  /* Surrogates. */
+			goto latin;
+		break;
+	case 4:
+		if ((accum == 0x00 && ! (*cu & 0x30)) ||  /* Use 3-byte. */
+		    (accum == 0x04 && *cu & 0x30))  /* Beyond Unicode. */
+			goto latin;
+		break;
+	default:
+		break;
+	}
 
-	for (i = *ii; i < ib->sz; i++) {
-		cu = ib->buf[i];
-		if (state) {
-			if ( ! (cu & 128) || (cu & 64)) {
-				/* Bad sequence header. */
-				break;
-			}
-
-			/* Accept only legitimate bit patterns. */
-
-			if (cu > 191 || cu < 128) {
-				/* Bad in-sequence bits. */
-				break;
-			}
-
-			accum |= (cu & 63) << --state * 6;
-
-			if (state)
-				continue;
-
-			if (accum < 0x80)
-				ob->buf[(*oi)++] = accum;
-			else
-				*oi += snprintf(ob->buf + *oi,
-				    11, "\\[u%.4X]", accum);
-			*ii = i + 1;
-			*filenc &= ~MPARSE_LATIN1;
-			return(1);
-		} else {
-			/*
-			 * Entering a UTF-8 state:  if we encounter a
-			 * UTF-8 bitmask, calculate the expected UTF-8
-			 * state from it.
-			 */
-			for (state = 0; state < 7; state++)
-				if ( ! (cu & (1 << (7 - state))))
-					break;
-
-			/* Accept only legitimate bit patterns. */
-
-			switch (state--) {
-			case (4):
-				if (cu <= 244 && cu >= 240) {
-					accum = (cu & 7) << 18;
-					continue;
-				}
-				/* Bad 4-sequence start bits. */
-				break;
-			case (3):
-				if (cu <= 239 && cu >= 224) {
-					accum = (cu & 15) << 12;
-					continue;
-				}
-				/* Bad 3-sequence start bits. */
-				break;
-			case (2):
-				if (cu <= 223 && cu >= 194) {
-					accum = (cu & 31) << 6;
-					continue;
-				}
-				/* Bad 2-sequence start bits. */
-				break;
-			default:
-				/* Bad sequence bit mask. */
-				break;
-			}
-			break;
-		}
+	while (--nby) {
+		if ((*cu & 0xc0) != 0x80)  /* Invalid continuation. */
+			goto latin;
+		accum <<= 6;
+		accum += *cu & 0x3f;
+		cu++;
 	}
 
-	/* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
+	assert(accum > 0x7f);
+	assert(accum < 0x110000);
+	assert(accum < 0xd800 || accum > 0xdfff);
+
+	*oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum);
+	*ii = (char *)cu - ib->buf;
+	*filenc &= ~MPARSE_LATIN1;
+	return(1);
 
 latin:
 	if ( ! (*filenc & MPARSE_LATIN1))
--
 To unsubscribe send an email to source+unsubscribe@mdocml.bsd.lv