source@mandoc.bsd.lv
 help / color / mirror / Atom feed
* mandoc: Garbage collect dead code intended to write five- and six-byte UTF-8 sequences
@ 2024-05-14 21:19 schwarze
  0 siblings, 0 replies; only message in thread
From: schwarze @ 2024-05-14 21:19 UTC (permalink / raw)
  To: source

Log Message:
-----------
Garbage collect dead code intended to write five- and six-byte UTF-8
sequences since the Unicode standard has been explicitly prohibiting
the use of such sequences when encoding Unicode characters for more
than 20 years now.

While here, also weed out UTF-16 surrogates and codepoints in the
invalid range 0x110000 to 0x1FFFFF if any are encountered.  I hoped to
write "no functional change", but to my shame it turns out there
are unrelated bugs with \[uXXXX] parsing in roff_escape.c, so this
new anti-surrogate check is actually reachable until those other
bugs get fixed, and even after fixing those other bugs, it will
remain useful as a defense in depth.

Modified Files:
--------------
    mandoc:
        mandocdb.c

Revision Data
-------------
Index: mandocdb.c
===================================================================
RCS file: /home/cvs/mandoc/mandoc/mandocdb.c,v
diff -Lmandocdb.c -Lmandocdb.c -u -p -r1.273 -r1.274
--- mandocdb.c
+++ mandocdb.c
@@ -1,6 +1,6 @@
 /* $Id$ */
 /*
- * Copyright (c) 2011-2021 Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2011-2021, 2024 Ingo Schwarze <schwarze@openbsd.org>
  * Copyright (c) 2011, 2012 Kristaps Dzonsons <kristaps@bsd.lv>
  * Copyright (c) 2016 Ed Maste <emaste@freebsd.org>
  *
@@ -173,7 +173,7 @@ static	void	 say(const char *, const cha
 			__attribute__((__format__ (__printf__, 2, 3)));
 static	int	 set_basedir(const char *, int);
 static	int	 treescan(void);
-static	size_t	 utf8(unsigned int, char [7]);
+static	size_t	 utf8(unsigned int, char[5]);
 
 static	int		 nodb; /* no database changes */
 static	int		 mparse_options; /* abort the parse early */
@@ -1907,49 +1907,35 @@ putkeys(const struct mpage *mpage, char 
  * Take a Unicode codepoint and produce its UTF-8 encoding.
  * This isn't the best way to do this, but it works.
  * The magic numbers are from the UTF-8 packaging.
- * They're not as scary as they seem: read the UTF-8 spec for details.
+ * Read the UTF-8 spec or the utf8(7) manual page for details.
  */
 static size_t
-utf8(unsigned int cp, char out[7])
+utf8(unsigned int cp, char out[5])
 {
 	size_t		 rc;
 
-	rc = 0;
-	if (cp <= 0x0000007F) {
+	if (cp <= 0x7f) {
 		rc = 1;
 		out[0] = (char)cp;
-	} else if (cp <= 0x000007FF) {
+	} else if (cp <= 0x7ff) {
 		rc = 2;
 		out[0] = (cp >> 6  & 31) | 192;
 		out[1] = (cp       & 63) | 128;
-	} else if (cp <= 0x0000FFFF) {
+	} else if (cp >= 0xd800 && cp <= 0xdfff) {
+		rc = 0; /* reject UTF-16 surrogate */
+	} else if (cp <= 0xffff) {
 		rc = 3;
 		out[0] = (cp >> 12 & 15) | 224;
 		out[1] = (cp >> 6  & 63) | 128;
 		out[2] = (cp       & 63) | 128;
-	} else if (cp <= 0x001FFFFF) {
+	} else if (cp <= 0x10ffff) {
 		rc = 4;
 		out[0] = (cp >> 18 &  7) | 240;
 		out[1] = (cp >> 12 & 63) | 128;
 		out[2] = (cp >> 6  & 63) | 128;
 		out[3] = (cp       & 63) | 128;
-	} else if (cp <= 0x03FFFFFF) {
-		rc = 5;
-		out[0] = (cp >> 24 &  3) | 248;
-		out[1] = (cp >> 18 & 63) | 128;
-		out[2] = (cp >> 12 & 63) | 128;
-		out[3] = (cp >> 6  & 63) | 128;
-		out[4] = (cp       & 63) | 128;
-	} else if (cp <= 0x7FFFFFFF) {
-		rc = 6;
-		out[0] = (cp >> 30 &  1) | 252;
-		out[1] = (cp >> 24 & 63) | 128;
-		out[2] = (cp >> 18 & 63) | 128;
-		out[3] = (cp >> 12 & 63) | 128;
-		out[4] = (cp >> 6  & 63) | 128;
-		out[5] = (cp       & 63) | 128;
 	} else
-		return 0;
+		rc = 0;
 
 	out[rc] = '\0';
 	return rc;
--
 To unsubscribe send an email to source+unsubscribe@mandoc.bsd.lv


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2024-05-14 21:19 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-14 21:19 mandoc: Garbage collect dead code intended to write five- and six-byte UTF-8 sequences schwarze

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).