mailing list of musl libc
 help / color / mirror / code / Atom feed
From: Reini Urban <rurban@cpan.org>
To: musl@lists.openwall.com
Subject: Re: Re: [PATCH] towupper/towlower: Update to Unicode 9.0
Date: Fri, 20 Oct 2017 11:00:04 +0200	[thread overview]
Message-ID: <CAHiT=DFLK4NQBkxhZKv8W2-W7UJj7etmJxyxwrU+6okjCGRVhw@mail.gmail.com> (raw)
In-Reply-To: <20170913181334.GT1627@brightrain.aerifal.cx>


[-- Attachment #1.1: Type: text/plain, Size: 1690 bytes --]

On Wed, Sep 13, 2017 at 8:13 PM, Rich Felker wrote:

> On Wed, Sep 13, 2017 at 12:05:19PM +0200, Reini Urban wrote:
> > Wait a bit with that. I think I found some more Unicode 9.0 issues with
> the tables,
> > and I’ve found a huge performance opportunity by sorting the 3 tables
> (mostly pairs),
> > and break the loops earlier.
> > This should come close to glibc table performance then, without the huge
> memory costs they have.
> >
> > I’ll write a perl regression testing script not to miss any more
> mappings, and maybe
> > improve the current musl logic. This will need 1-2 days.
> > I’ll also use it for cperl then.
>
> Thanks for the update. I still need to publish the table generation
> code for all the other tables -- I got it mostly dug up and cleaned up
> but got interrupted last time so it's still not posted. With that it
> will be possible to update other things too, not just case mappings.
>
> A few of the existing tables are using an older version of the
> tabulation code that formats the big arrays differently, so I'll
> probably first make a commit to reformat them, so that it's possible
> to mechanically check that this commit does not change the generated
> .o files, then use the uniform formatting as the basis the subsequent
> update to Unicode 9.0. That should not affect the case mapping file
> though since it's not machine-generated.
>


I haven't yet seen your table generator, so I updated the tables with my
version, as I
use them in safeclib.
Unicode 10.0 support plus sort tables for double search speed.

I also added a harmless patch to a check-syntax target for emacs flymake
support.

-- Reini

[-- Attachment #1.2: Type: text/html, Size: 2139 bytes --]

[-- Attachment #2: 0001-towupper-towlower-Update-to-Unicode-10.0-and-sort.patch --]
[-- Type: application/octet-stream, Size: 9420 bytes --]

From bd9f1e60ac55143c507c767ba070ab99a5760baa Mon Sep 17 00:00:00 2001
From: Reini Urban <rurban@cpan.org>
Date: Wed, 13 Sep 2017 10:09:03 +0200
Subject: [PATCH 1/2] towupper/towlower: Update to Unicode 10.0 and sort

taken from safeclib and cross-checked with the perl unicode tables.
sort the tables and exit when found. O(n) -> O(n/2)
---
 src/ctype/towctrans.c | 213 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 163 insertions(+), 50 deletions(-)

diff --git src/ctype/towctrans.c src/ctype/towctrans.c
index cf13a86..4745487 100644
--- src/ctype/towctrans.c
+++ src/ctype/towctrans.c
@@ -1,16 +1,21 @@
 #include <ctype.h>
 #include <wctype.h>
+#include <assert.h>
 #include "libc.h"
 
 #define CASEMAP(u1,u2,l) { (u1), (l)-(u1), (u2)-(u1)+1 }
 #define CASELACE(u1,u2) CASEMAP((u1),(u2),(u1)+1)
 
+/* Unicode 10.0 */
+
+/* must be sorted */
 static const struct {
 	unsigned short upper;
 	signed char lower;
 	unsigned char len;
 } casemaps[] = {
-	CASEMAP(0xc0,0xde,0xe0),
+	CASEMAP (0x00c0,0xd6,0xe0),
+	CASEMAP (0x00d8,0xde,0xf8),
 
 	CASELACE(0x0100,0x012e),
 	CASELACE(0x0132,0x0136),
@@ -18,11 +23,21 @@ static const struct {
 	CASELACE(0x014a,0x0176),
 	CASELACE(0x0179,0x017d),
 
-	CASELACE(0x370,0x372),
-	CASEMAP(0x391,0x3a1,0x3b1),
-	CASEMAP(0x3a3,0x3ab,0x3c3),
-	CASEMAP(0x400,0x40f,0x450),
-	CASEMAP(0x410,0x42f,0x430),
+	CASELACE(0x01a0,0x1a4),
+	CASELACE(0x01b3,0x1b5),
+	CASELACE(0x01cd,0x1db),
+	CASELACE(0x01de,0x1ee),
+	CASELACE(0x01f8,0x21e),
+	CASELACE(0x0222,0x232),
+	CASELACE(0x0246,0x24e),
+
+	CASELACE(0x0370,0x372),
+	CASEMAP (0x0388,0x38a,0x3ad),
+	CASEMAP (0x0393,0x39f,0x3b3),
+	CASEMAP (0x03a7,0x3ab,0x3c7),
+	CASELACE(0x03d8,0x3ee),
+	CASEMAP (0x0400,0x40f,0x450),
+	CASEMAP (0x0410,0x42f,0x430),
 
 	CASELACE(0x460,0x480),
 	CASELACE(0x48a,0x4be),
@@ -80,17 +95,40 @@ static const struct {
 	CASELACE(0xa77e,0xa786),
 
 	CASELACE(0xa790,0xa792),
+	CASELACE(0xa796,0xa79e),
 	CASELACE(0xa7a0,0xa7a8),
 
+	CASELACE(0xa7b4,0xa7b6), /* Unicode 8 */
+
 	CASEMAP(0xff21,0xff3a,0xff41),
 	{ 0,0,0 }
 };
 
+/* must be sorted */
+static const struct {
+    unsigned int upper;
+    int lower;
+    unsigned short len;
+} casemapsl[] = {
+	CASEMAP(0x13a0,0x13ef,0xab70),    /* CHEROKEE reverse */
+	CASEMAP(0xab70,0xabbf,0x13a0),    /* CHEROKEE */
+	CASEMAP(0x10400,0x10427,0x10428),
+	CASEMAP(0x104b0,0x104d3,0x104d8), /* Unicode 9 */
+	CASEMAP(0x10c80,0x10cb2,0x10cc0), /* Unicode 8 */
+	CASEMAP(0x118a0,0x118bf,0x118c0), /* Unicode 7 */
+	CASEMAP(0x1e900,0x1e921,0x1e922), /* Unicode 9 */
+	{ 0,0,0 }
+};
+
+/* must now be sorted */
 static const unsigned short pairs[][2] = {
+	/* upper - lower */
 	{ 'I',    0x0131 },
 	{ 'S',    0x017f },
+	{ 0x00b5, 0x03bc },
 	{ 0x0130, 'i'    },
 	{ 0x0178, 0x00ff },
+	{ 0x017f, 0x73 },
 	{ 0x0181, 0x0253 },
 	{ 0x0182, 0x0183 },
 	{ 0x0184, 0x0185 },
@@ -111,6 +149,7 @@ static const unsigned short pairs[][2] = {
 	{ 0x019c, 0x026f },
 	{ 0x019d, 0x0272 },
 	{ 0x019f, 0x0275 },
+	/*CASELACE(0x01a0,0x01a4),*/
 	{ 0x01a6, 0x0280 },
 	{ 0x01a7, 0x01a8 },
 	{ 0x01a9, 0x0283 },
@@ -119,38 +158,108 @@ static const unsigned short pairs[][2] = {
 	{ 0x01af, 0x01b0 },
 	{ 0x01b1, 0x028a },
 	{ 0x01b2, 0x028b },
+	{ 0x01b3, 0x01b4 },
+	{ 0x01b5, 0x01b6 },
 	{ 0x01b7, 0x0292 },
 	{ 0x01b8, 0x01b9 },
 	{ 0x01bc, 0x01bd },
 	{ 0x01c4, 0x01c6 },
-	{ 0x01c4, 0x01c5 },
+	/*{ 0x01c4, 0x01c5 },*/
 	{ 0x01c5, 0x01c6 },
 	{ 0x01c7, 0x01c9 },
-	{ 0x01c7, 0x01c8 },
+	/*{ 0x01c7, 0x01c8 },*/
 	{ 0x01c8, 0x01c9 },
 	{ 0x01ca, 0x01cc },
-	{ 0x01ca, 0x01cb },
+	/*{ 0x01ca, 0x01cb },*/
+	/*CASELACE(0x01cb,0x01db),*/
 	{ 0x01cb, 0x01cc },
+
 	{ 0x01f1, 0x01f3 },
-	{ 0x01f1, 0x01f2 },
+	/*{ 0x01f1, 0x01f2 },*/
 	{ 0x01f2, 0x01f3 },
 	{ 0x01f4, 0x01f5 },
 	{ 0x01f6, 0x0195 },
 	{ 0x01f7, 0x01bf },
+	/*CASELACE(0x01f8,0x021e),*/
 	{ 0x0220, 0x019e },
-	{ 0x0386, 0x03ac },
-	{ 0x0388, 0x03ad },
-	{ 0x0389, 0x03ae },
-	{ 0x038a, 0x03af },
+	/*CASELACE(0x0222,0x0232),*/
+	{ 0x023a, 0x2c65 },
+	{ 0x023b, 0x23c },
+	{ 0x023d, 0x19a },
+	{ 0x023e, 0x2c66 },
+	{ 0x0241, 0x242 },
+	{ 0x0243, 0x180 },
+	{ 0x0244, 0x289 },
+	{ 0x0245, 0x28c },
+
+	{ 0x0345, 0x3b9 },
+	{ 0x0376, 0x377 }, /* bogus greek 'symbol' */
+	{ 0x037f, 0x3f3 },
+	{ 0x0386, 0x3ac },
 	{ 0x038c, 0x03cc },
 	{ 0x038e, 0x03cd },
 	{ 0x038f, 0x03ce },
-	{ 0x0399, 0x0345 },
-	{ 0x0399, 0x1fbe },
-	{ 0x03a3, 0x03c2 },
+	{ 0x0391, 0x3b1 },
+	{ 0x0392, 0x3b2 },
+	{ 0x0392, 0x3d0 }, /* reverse */
+	/*CASEMAP (0x0393,0x39f,0x3b3),*/
+	{ 0x0395, 0x3f5 }, /* reverse */
+	{ 0x0398, 0x3d1 },
+	{ 0x0399, 0x1fbe },/* reverse */
+	{ 0x039a, 0x3f0 }, /* reverse */
+	{ 0x03a0, 0x3c0 },
+	{ 0x03a0, 0x3d6 }, /* reverse */
+	{ 0x03a1, 0x3c1 },
+	{ 0x03a1, 0x3f1 }, /* reverse */
+	{ 0x03a3, 0x3c3 },
+	{ 0x03a3, 0x3c2 }, /* reverse */
+	{ 0x03a4, 0x3c4 },
+	{ 0x03a5, 0x3c5 },
+	{ 0x03a6, 0x3c6 },
+	{ 0x03a6, 0x3d5 }, /* reverse */
+	/*CASEMAP(0x0391,0x3a1,0x3b1),*/
+	{ 0x03c2, 0x3c3 },
+	{ 0x03cf, 0x3d7 },
+	{ 0x03d0, 0x3b2 },
+	{ 0x03d1, 0x3b8 },
+	{ 0x03d5, 0x3c6 },
+	{ 0x03d6, 0x3c0 },
+	/*CASELACE(0x03d8,0x3ee),*/
+	/*CASEMAP(0x03da,0x3ee,0x3db),*/
+	{ 0x03f0, 0x03ba },
+	{ 0x03f1, 0x03c1 },
+	{ 0x03f4, 0x03b8 },
+	{ 0x03f5, 0x03b5 },
 	{ 0x03f7, 0x03f8 },
+	{ 0x03f9, 0x03f2 },
 	{ 0x03fa, 0x03fb },
+	{ 0x03fd, 0x037b },
+	{ 0x03fe, 0x037c },
+	{ 0x03ff, 0x037d },
+	/*CASEMAP(0x0400,0x40f,0x450),
+	  CASEMAP(0x0410,0x42f,0x430),*/
+	{ 0x412, 0x1c80 }, /* reverse */
+	{ 0x414, 0x1c81 }, /* reverse */
+	{ 0x41e, 0x1c82 }, /* reverse */
+	{ 0x421, 0x1c83 }, /* reverse */
+	{ 0x422, 0x1c84 }, /* reverse */
+	{ 0x422, 0x1c85 }, /* reverse */
+	{ 0x42a, 0x1c86 }, /* reverse */
+	{ 0x462, 0x463 },
+	{ 0x462, 0x1c87 }, /* reverse */
+
+	{ 0x04c0, 0x04cf},
+	/*CASELACE(0x04c1,0x4cd),*/
+	{ 0x0528, 0x0529},
+	{ 0x052a, 0x052b},
+	{ 0x052c, 0x052d},
+	{ 0x052e, 0x052f},
+
+	{ 0x10c7, 0x2d27 },
+	{ 0x10cd, 0x2d2d },
+
 	{ 0x1e60, 0x1e9b },
+	{ 0x1e9b, 0x1e61 },
 	{ 0x1e9e, 0xdf },
 
 	{ 0x1f59, 0x1f51 },
@@ -158,25 +267,11 @@ static const unsigned short pairs[][2] = {
 	{ 0x1f5d, 0x1f55 },
 	{ 0x1f5f, 0x1f57 },
 	{ 0x1fbc, 0x1fb3 },
+	{ 0x1fbe, 0x3b9 },
 	{ 0x1fcc, 0x1fc3 },
 	{ 0x1fec, 0x1fe5 },
 	{ 0x1ffc, 0x1ff3 },
 
-	{ 0x23a, 0x2c65 },
-	{ 0x23b, 0x23c },
-	{ 0x23d, 0x19a },
-	{ 0x23e, 0x2c66 },
-	{ 0x241, 0x242 },
-	{ 0x243, 0x180 },
-	{ 0x244, 0x289 },
-	{ 0x245, 0x28c },
-	{ 0x3f4, 0x3b8 },
-	{ 0x3f9, 0x3f2 },
-	{ 0x3fd, 0x37b },
-	{ 0x3fe, 0x37c },
-	{ 0x3ff, 0x37d },
-	{ 0x4c0, 0x4cf },
-
 	{ 0x2126, 0x3c9 },
 	{ 0x212a, 'k' },
 	{ 0x212b, 0xe5 },
@@ -196,25 +291,25 @@ static const unsigned short pairs[][2] = {
 	{ 0x2c7f, 0x240 },
 	{ 0x2cf2, 0x2cf3 },
 
+	{ 0xa64a, 0xa64b },
+	{ 0xa64a, 0x1c88 }, /* reverse */
+
 	{ 0xa77d, 0x1d79 },
 	{ 0xa78b, 0xa78c },
 	{ 0xa78d, 0x265 },
 	{ 0xa7aa, 0x266 },
 
-	{ 0x10c7, 0x2d27 },
-	{ 0x10cd, 0x2d2d },
+	{ 0xa7ab, 0x25c }, /* Unicode 7.0 */
+	{ 0xa7ac, 0x261 }, /* Unicode 7.0 */
+	{ 0xa7ad, 0x26c }, /* Unicode 7.0 */
+	{ 0xa7ae, 0x26a }, /* Unicode 9.0 */
+	{ 0xa7b0, 0x29e }, /* Unicode 7.0 */
+	{ 0xa7b1, 0x287 }, /* Unicode 7.0 */
+	{ 0xa7b2, 0x29d }, /* Unicode 7.0 */
+	{ 0xa7b3, 0xab53 }, /* Unicode 8.0 */
+	{ 0xa7b4, 0xa7b5 }, /* Unicode 8.0 */
 
-	/* bogus greek 'symbol' letters */
-	{ 0x376, 0x377 },
-	{ 0x39c, 0xb5 },
-	{ 0x392, 0x3d0 },
-	{ 0x398, 0x3d1 },
-	{ 0x3a6, 0x3d5 },
-	{ 0x3a0, 0x3d6 },
-	{ 0x39a, 0x3f0 },
-	{ 0x3a1, 0x3f1 },
-	{ 0x395, 0x3f5 },
-	{ 0x3cf, 0x3d7 },
+        { 0xa7b6, 0xa7b7 }, /* Unicode 8.0 */
 
 	{ 0,0 }
 };
@@ -229,29 +324,47 @@ static wchar_t __towcase(wchar_t wc, int lower)
 	if (!iswalpha(wc)
 	 || (unsigned)wc - 0x0600 <= 0x0fff-0x0600
 	 || (unsigned)wc - 0x2e00 <= 0xa63f-0x2e00
-	 || (unsigned)wc - 0xa800 <= 0xfeff-0xa800)
+	 || (unsigned)wc - 0xa800 <= 0xab69-0xa800
+	 || (unsigned)wc - 0xabc0 <= 0xfeff-0xabc0)
 		return wc;
 	/* special case because the diff between upper/lower is too big */
-	if (lower && (unsigned)wc - 0x10a0 < 0x2e)
+	if (lower && (unsigned)wc - 0x10a0 < 0x2e) {
 		if (wc>0x10c5 && wc != 0x10c7 && wc != 0x10cd) return wc;
 		else return wc + 0x2d00 - 0x10a0;
-	if (!lower && (unsigned)wc - 0x2d00 < 0x26)
+        }
+	if (!lower && (unsigned)wc - 0x2d00 < 0x26) {
 		if (wc>0x2d25 && wc != 0x2d27 && wc != 0x2d2d) return wc;
 		else return wc + 0x10a0 - 0x2d00;
+        }
 	for (i=0; casemaps[i].len; i++) {
 		int base = casemaps[i].upper + (lmask & casemaps[i].lower);
+		assert(i>0 ? casemaps[i].upper >= casemaps[i-1].upper : 1);
 		if ((unsigned)wc-base < casemaps[i].len) {
 			if (casemaps[i].lower == 1)
 				return wc + lower - ((wc-casemaps[i].upper)&1);
 			return wc + lmul*casemaps[i].lower;
 		}
+		if (lower && casemaps[i].upper > wc)
+			break;
 	}
 	for (i=0; pairs[i][1-lower]; i++) {
+		assert(i>0 ? pairs[i][0] >= pairs[i-1][0] : 1);
 		if (pairs[i][1-lower] == wc)
 			return pairs[i][lower];
+		if (lower && pairs[i][0] > wc)
+			break;
+	}
+	for (i=0; casemapsl[i].len; i++) {
+		unsigned long base = casemapsl[i].upper + (lmask & casemapsl[i].lower);
+		assert(i>0 ? casemapsl[i].upper >= casemapsl[i-1].upper : 1);
+		if ((unsigned)wc-base < casemapsl[i].len) {
+			if (casemapsl[i].lower == 1)
+				return wc + lower - ((wc-casemapsl[i].upper)&1);
+			return wc + lmul*casemapsl[i].lower;
+		}
+		if (lower && casemaps[i].upper > wc)
+			break;
 	}
-	if ((unsigned)wc - (0x10428 - 0x28*lower) < 0x28)
-		return wc - 0x28 + 0x50*lower;
 	return wc;
 }
 
-- 
2.8.4 (Apple Git-73)


[-- Attachment #3: 0002-add-emacs-flymake-support.patch --]
[-- Type: application/octet-stream, Size: 1031 bytes --]

From 347be94765fe4993e143ed33ae874c642446f3ca Mon Sep 17 00:00:00 2001
From: Reini Urban <rurban@cpan.org>
Date: Fri, 20 Oct 2017 10:46:44 +0200
Subject: [PATCH 2/2] add emacs flymake support

---
 Makefile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git Makefile Makefile
index d2e8997..9eb0cd6 100644
--- Makefile
+++ Makefile
@@ -193,6 +193,11 @@ obj/%-clang: $(srcdir)/tools/%-clang.in config.mak
 	sed -e 's!@CC@!$(WRAPCC_CLANG)!g' -e 's!@PREFIX@!$(prefix)!g' -e 's!@INCDIR@!$(includedir)!g' -e 's!@LIBDIR@!$(libdir)!g' -e 's!@LDSO@!$(LDSO_PATHNAME)!g' $< > $@
 	chmod +x $@
 
+# emacs flymake-mode
+check-syntax:
+	test -n "$(CHK_SOURCES)" && \
+	  $(CC) $(CFLAGS_ALL) -o /dev/null -S $(CHK_SOURCES)
+
 $(DESTDIR)$(bindir)/%: obj/%
 	$(INSTALL) -D $< $@
 
@@ -239,4 +244,4 @@ clean:
 distclean: clean
 	rm -f config.mak
 
-.PHONY: all clean install install-libs install-headers install-tools
+.PHONY: all clean install install-libs install-headers install-tools check-syntax
-- 
2.8.4 (Apple Git-73)


  reply	other threads:[~2017-10-20  9:00 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-09-13  8:25 Reini Urban
2017-09-13 10:05 ` Reini Urban
2017-09-13 18:13   ` Rich Felker
2017-10-20  9:00     ` Reini Urban [this message]
2017-10-25 18:38       ` Rich Felker

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAHiT=DFLK4NQBkxhZKv8W2-W7UJj7etmJxyxwrU+6okjCGRVhw@mail.gmail.com' \
    --to=rurban@cpan.org \
    --cc=musl@lists.openwall.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).