From: Reini Urban <rurban@cpan.org>
To: musl@lists.openwall.com
Subject: Re: Re: [PATCH] towupper/towlower: Update to Unicode 9.0
Date: Fri, 20 Oct 2017 11:00:04 +0200 [thread overview]
Message-ID: <CAHiT=DFLK4NQBkxhZKv8W2-W7UJj7etmJxyxwrU+6okjCGRVhw@mail.gmail.com> (raw)
In-Reply-To: <20170913181334.GT1627@brightrain.aerifal.cx>
[-- Attachment #1.1: Type: text/plain, Size: 1690 bytes --]
On Wed, Sep 13, 2017 at 8:13 PM, Rich Felker wrote:
> On Wed, Sep 13, 2017 at 12:05:19PM +0200, Reini Urban wrote:
> > Wait a bit with that. I think I found some more Unicode 9.0 issues with the tables,
> > and I’ve found a huge performance opportunity by sorting the 3 tables (mostly pairs),
> > and break the loops earlier.
> > This should come close to glibc table performance then, without the huge memory costs they have.
> >
> > I’ll write a perl regression testing script not to miss any more mappings, and maybe
> > improve the current musl logic. This will need 1-2 days.
> > I’ll also use it for cperl then.
>
> Thanks for the update. I still need to publish the table generation
> code for all the other tables -- I got it mostly dug up and cleaned up
> but got interrupted last time so it's still not posted. With that it
> will be possible to update other things too, not just case mappings.
>
> A few of the existing tables are using an older version of the
> tabulation code that formats the big arrays differently, so I'll
> probably first make a commit to reformat them, so that it's possible
> to mechanically check that this commit does not change the generated
> .o files, then use the uniform formatting as the basis for the subsequent
> update to Unicode 9.0. That should not affect the case mapping file
> though since it's not machine-generated.
>
I haven't yet seen your table generator, so I updated the tables with my own
version, as I use them in safeclib.
This adds Unicode 10.0 support and sorts the tables, which roughly doubles search speed.
I also added a harmless patch that adds a check-syntax target for emacs flymake
support.
-- Reini
[-- Attachment #1.2: Type: text/html, Size: 2139 bytes --]
[-- Attachment #2: 0001-towupper-towlower-Update-to-Unicode-10.0-and-sort.patch --]
[-- Type: application/octet-stream, Size: 9420 bytes --]
From bd9f1e60ac55143c507c767ba070ab99a5760baa Mon Sep 17 00:00:00 2001
From: Reini Urban <rurban@cpan.org>
Date: Wed, 13 Sep 2017 10:09:03 +0200
Subject: [PATCH 1/2] towupper/towlower: Update to Unicode 10.0 and sort
taken from safeclib and cross-checked with the perl unicode tables.
sort the tables and exit when found. O(n) -> O(n/2)
---
src/ctype/towctrans.c | 213 ++++++++++++++++++++++++++++++++++++++------------
1 file changed, 163 insertions(+), 50 deletions(-)
diff --git src/ctype/towctrans.c src/ctype/towctrans.c
index cf13a86..4745487 100644
--- src/ctype/towctrans.c
+++ src/ctype/towctrans.c
@@ -1,16 +1,21 @@
#include <ctype.h>
#include <wctype.h>
+#include <assert.h>
#include "libc.h"
#define CASEMAP(u1,u2,l) { (u1), (l)-(u1), (u2)-(u1)+1 }
#define CASELACE(u1,u2) CASEMAP((u1),(u2),(u1)+1)
+/* Unicode 10.0 */
+
+/* must be sorted */
static const struct {
unsigned short upper;
signed char lower;
unsigned char len;
} casemaps[] = {
- CASEMAP(0xc0,0xde,0xe0),
+ CASEMAP (0x00c0,0xd6,0xe0),
+ CASEMAP (0x00d8,0xde,0xf8),
CASELACE(0x0100,0x012e),
CASELACE(0x0132,0x0136),
@@ -18,11 +23,21 @@ static const struct {
CASELACE(0x014a,0x0176),
CASELACE(0x0179,0x017d),
- CASELACE(0x370,0x372),
- CASEMAP(0x391,0x3a1,0x3b1),
- CASEMAP(0x3a3,0x3ab,0x3c3),
- CASEMAP(0x400,0x40f,0x450),
- CASEMAP(0x410,0x42f,0x430),
+ CASELACE(0x01a0,0x1a4),
+ CASELACE(0x01b3,0x1b5),
+ CASELACE(0x01cd,0x1db),
+ CASELACE(0x01de,0x1ee),
+ CASELACE(0x01f8,0x21e),
+ CASELACE(0x0222,0x232),
+ CASELACE(0x0246,0x24e),
+
+ CASELACE(0x0370,0x372),
+ CASEMAP (0x0388,0x38a,0x3ad),
+ CASEMAP (0x0393,0x39f,0x3b3),
+ CASEMAP (0x03a7,0x3ab,0x3c7),
+ CASELACE(0x03d8,0x3ee),
+ CASEMAP (0x0400,0x40f,0x450),
+ CASEMAP (0x0410,0x42f,0x430),
CASELACE(0x460,0x480),
CASELACE(0x48a,0x4be),
@@ -80,17 +95,40 @@ static const struct {
CASELACE(0xa77e,0xa786),
CASELACE(0xa790,0xa792),
+ CASELACE(0xa796,0xa79e),
CASELACE(0xa7a0,0xa7a8),
+ CASELACE(0xa7b4,0xa7b6), /* Unicode 8 */
+
CASEMAP(0xff21,0xff3a,0xff41),
{ 0,0,0 }
};
+/* must be sorted */
+static const struct {
+ unsigned int upper;
+ int lower;
+ unsigned short len;
+} casemapsl[] = {
+ CASEMAP(0x13a0,0x13ef,0xab70), /* CHEROKEE reverse */
+ CASEMAP(0xab70,0xabbf,0x13a0), /* CHEROKEE */
+ CASEMAP(0x10400,0x10427,0x10428),
+ CASEMAP(0x104b0,0x104d3,0x104d8), /* Unicode 9 */
+ CASEMAP(0x10c80,0x10cb2,0x10cc0), /* Unicode 8 */
+ CASEMAP(0x118a0,0x118bf,0x118c0), /* Unicode 7 */
+ CASEMAP(0x1e900,0x1e921,0x1e922), /* Unicode 9 */
+ { 0,0,0 }
+};
+
+/* must now be sorted */
static const unsigned short pairs[][2] = {
+ /* upper - lower */
{ 'I', 0x0131 },
{ 'S', 0x017f },
+ { 0x00b5, 0x03bc },
{ 0x0130, 'i' },
{ 0x0178, 0x00ff },
+ { 0x017f, 0x73 },
{ 0x0181, 0x0253 },
{ 0x0182, 0x0183 },
{ 0x0184, 0x0185 },
@@ -111,6 +149,7 @@ static const unsigned short pairs[][2] = {
{ 0x019c, 0x026f },
{ 0x019d, 0x0272 },
{ 0x019f, 0x0275 },
+ /*CASELACE(0x01a0,0x01a4),*/
{ 0x01a6, 0x0280 },
{ 0x01a7, 0x01a8 },
{ 0x01a9, 0x0283 },
@@ -119,38 +158,108 @@ static const unsigned short pairs[][2] = {
{ 0x01af, 0x01b0 },
{ 0x01b1, 0x028a },
{ 0x01b2, 0x028b },
+ { 0x01b3, 0x01b4 },
+ { 0x01b5, 0x01b6 },
{ 0x01b7, 0x0292 },
{ 0x01b8, 0x01b9 },
{ 0x01bc, 0x01bd },
{ 0x01c4, 0x01c6 },
- { 0x01c4, 0x01c5 },
+ /*{ 0x01c4, 0x01c5 },*/
{ 0x01c5, 0x01c6 },
{ 0x01c7, 0x01c9 },
- { 0x01c7, 0x01c8 },
+ /*{ 0x01c7, 0x01c8 },*/
{ 0x01c8, 0x01c9 },
{ 0x01ca, 0x01cc },
- { 0x01ca, 0x01cb },
+ /*{ 0x01ca, 0x01cb },*/
+ /*CASELACE(0x01cb,0x01db),*/
{ 0x01cb, 0x01cc },
+
{ 0x01f1, 0x01f3 },
- { 0x01f1, 0x01f2 },
+ /*{ 0x01f1, 0x01f2 },*/
{ 0x01f2, 0x01f3 },
{ 0x01f4, 0x01f5 },
{ 0x01f6, 0x0195 },
{ 0x01f7, 0x01bf },
+ /*CASELACE(0x01f8,0x021e),*/
{ 0x0220, 0x019e },
- { 0x0386, 0x03ac },
- { 0x0388, 0x03ad },
- { 0x0389, 0x03ae },
- { 0x038a, 0x03af },
+ /*CASELACE(0x0222,0x0232),*/
+ { 0x023a, 0x2c65 },
+ { 0x023b, 0x23c },
+ { 0x023d, 0x19a },
+ { 0x023e, 0x2c66 },
+ { 0x0241, 0x242 },
+ { 0x0243, 0x180 },
+ { 0x0244, 0x289 },
+ { 0x0245, 0x28c },
+
+ { 0x0345, 0x3b9 },
+ { 0x0376, 0x377 }, /* bogus greek 'symbol' */
+ { 0x037f, 0x3f3 },
+ { 0x0386, 0x3ac },
{ 0x038c, 0x03cc },
{ 0x038e, 0x03cd },
{ 0x038f, 0x03ce },
- { 0x0399, 0x0345 },
- { 0x0399, 0x1fbe },
- { 0x03a3, 0x03c2 },
+ { 0x0391, 0x3b1 },
+ { 0x0392, 0x3b2 },
+ { 0x0392, 0x3d0 }, /* reverse */
+ /*CASEMAP (0x0393,0x39f,0x3b3),*/
+ { 0x0395, 0x3f5 }, /* reverse */
+ { 0x0398, 0x3d1 },
+ { 0x0399, 0x1fbe },/* reverse */
+ { 0x039a, 0x3f0 }, /* reverse */
+ { 0x03a0, 0x3c0 },
+ { 0x03a0, 0x3d6 }, /* reverse */
+ { 0x03a1, 0x3c1 },
+ { 0x03a1, 0x3f1 }, /* reverse */
+ { 0x03a3, 0x3c3 },
+ { 0x03a3, 0x3c2 }, /* reverse */
+ { 0x03a4, 0x3c4 },
+ { 0x03a5, 0x3c5 },
+ { 0x03a6, 0x3c6 },
+ { 0x03a6, 0x3d5 }, /* reverse */
+ /*CASEMAP(0x0391,0x3a1,0x3b1),*/
+ { 0x03c2, 0x3c3 },
+ { 0x03cf, 0x3d7 },
+ { 0x03d0, 0x3b2 },
+ { 0x03d1, 0x3b8 },
+ { 0x03d5, 0x3c6 },
+ { 0x03d6, 0x3c0 },
+ /*CASELACE(0x03d8,0x3ee),*/
+ /*CASEMAP(0x03da,0x3ee,0x3db),*/
+ { 0x03f0, 0x03ba },
+ { 0x03f1, 0x03c1 },
+ { 0x03f4, 0x03b8 },
+ { 0x03f5, 0x03b5 },
{ 0x03f7, 0x03f8 },
+ { 0x03f9, 0x03f2 },
{ 0x03fa, 0x03fb },
+ { 0x03fd, 0x037b },
+ { 0x03fe, 0x037c },
+ { 0x03ff, 0x037d },
+ /*CASEMAP(0x0400,0x40f,0x450),
+ CASEMAP(0x0410,0x42f,0x430),*/
+ { 0x412, 0x1c80 }, /* reverse */
+ { 0x414, 0x1c81 }, /* reverse */
+ { 0x41e, 0x1c82 }, /* reverse */
+ { 0x421, 0x1c83 }, /* reverse */
+ { 0x422, 0x1c84 }, /* reverse */
+ { 0x422, 0x1c85 }, /* reverse */
+ { 0x42a, 0x1c86 }, /* reverse */
+ { 0x462, 0x463 },
+ { 0x462, 0x1c87 }, /* reverse */
+
+ { 0x04c0, 0x04cf},
+ /*CASELACE(0x04c1,0x4cd),*/
+ { 0x0528, 0x0529},
+ { 0x052a, 0x052b},
+ { 0x052c, 0x052d},
+ { 0x052e, 0x052f},
+
+ { 0x10c7, 0x2d27 },
+ { 0x10cd, 0x2d2d },
+
{ 0x1e60, 0x1e9b },
+ { 0x1e9b, 0x1e61 },
{ 0x1e9e, 0xdf },
{ 0x1f59, 0x1f51 },
@@ -158,25 +267,11 @@ static const unsigned short pairs[][2] = {
{ 0x1f5d, 0x1f55 },
{ 0x1f5f, 0x1f57 },
{ 0x1fbc, 0x1fb3 },
+ { 0x1fbe, 0x3b9 },
{ 0x1fcc, 0x1fc3 },
{ 0x1fec, 0x1fe5 },
{ 0x1ffc, 0x1ff3 },
- { 0x23a, 0x2c65 },
- { 0x23b, 0x23c },
- { 0x23d, 0x19a },
- { 0x23e, 0x2c66 },
- { 0x241, 0x242 },
- { 0x243, 0x180 },
- { 0x244, 0x289 },
- { 0x245, 0x28c },
- { 0x3f4, 0x3b8 },
- { 0x3f9, 0x3f2 },
- { 0x3fd, 0x37b },
- { 0x3fe, 0x37c },
- { 0x3ff, 0x37d },
- { 0x4c0, 0x4cf },
-
{ 0x2126, 0x3c9 },
{ 0x212a, 'k' },
{ 0x212b, 0xe5 },
@@ -196,25 +291,25 @@ static const unsigned short pairs[][2] = {
{ 0x2c7f, 0x240 },
{ 0x2cf2, 0x2cf3 },
+ { 0xa64a, 0xa64b },
+ { 0xa64a, 0x1c88 }, /* reverse */
+
{ 0xa77d, 0x1d79 },
{ 0xa78b, 0xa78c },
{ 0xa78d, 0x265 },
{ 0xa7aa, 0x266 },
- { 0x10c7, 0x2d27 },
- { 0x10cd, 0x2d2d },
+ { 0xa7ab, 0x25c }, /* Unicode 7.0 */
+ { 0xa7ac, 0x261 }, /* Unicode 7.0 */
+ { 0xa7ad, 0x26c }, /* Unicode 7.0 */
+ { 0xa7ae, 0x26a }, /* Unicode 9.0 */
+ { 0xa7b0, 0x29e }, /* Unicode 7.0 */
+ { 0xa7b1, 0x287 }, /* Unicode 7.0 */
+ { 0xa7b2, 0x29d }, /* Unicode 7.0 */
+ { 0xa7b3, 0xab53 }, /* Unicode 8.0 */
+ { 0xa7b4, 0xa7b5 }, /* Unicode 8.0 */
- /* bogus greek 'symbol' letters */
- { 0x376, 0x377 },
- { 0x39c, 0xb5 },
- { 0x392, 0x3d0 },
- { 0x398, 0x3d1 },
- { 0x3a6, 0x3d5 },
- { 0x3a0, 0x3d6 },
- { 0x39a, 0x3f0 },
- { 0x3a1, 0x3f1 },
- { 0x395, 0x3f5 },
- { 0x3cf, 0x3d7 },
+ { 0xa7b6, 0xa7b7 }, /* Unicode 8.0 */
{ 0,0 }
};
@@ -229,29 +324,47 @@ static wchar_t __towcase(wchar_t wc, int lower)
if (!iswalpha(wc)
|| (unsigned)wc - 0x0600 <= 0x0fff-0x0600
|| (unsigned)wc - 0x2e00 <= 0xa63f-0x2e00
- || (unsigned)wc - 0xa800 <= 0xfeff-0xa800)
+ || (unsigned)wc - 0xa800 <= 0xab69-0xa800
+ || (unsigned)wc - 0xabc0 <= 0xfeff-0xabc0)
return wc;
/* special case because the diff between upper/lower is too big */
- if (lower && (unsigned)wc - 0x10a0 < 0x2e)
+ if (lower && (unsigned)wc - 0x10a0 < 0x2e) {
if (wc>0x10c5 && wc != 0x10c7 && wc != 0x10cd) return wc;
else return wc + 0x2d00 - 0x10a0;
- if (!lower && (unsigned)wc - 0x2d00 < 0x26)
+ }
+ if (!lower && (unsigned)wc - 0x2d00 < 0x26) {
if (wc>0x2d25 && wc != 0x2d27 && wc != 0x2d2d) return wc;
else return wc + 0x10a0 - 0x2d00;
+ }
for (i=0; casemaps[i].len; i++) {
int base = casemaps[i].upper + (lmask & casemaps[i].lower);
+ assert(i>0 ? casemaps[i].upper >= casemaps[i-1].upper : 1);
if ((unsigned)wc-base < casemaps[i].len) {
if (casemaps[i].lower == 1)
return wc + lower - ((wc-casemaps[i].upper)&1);
return wc + lmul*casemaps[i].lower;
}
+ if (lower && casemaps[i].upper > wc)
+ break;
}
for (i=0; pairs[i][1-lower]; i++) {
+ assert(i>0 ? pairs[i][0] >= pairs[i-1][0] : 1);
if (pairs[i][1-lower] == wc)
return pairs[i][lower];
+ if (lower && pairs[i][0] > wc)
+ break;
+ }
+ for (i=0; casemapsl[i].len; i++) {
+ unsigned long base = casemapsl[i].upper + (lmask & casemapsl[i].lower);
+ assert(i>0 ? casemapsl[i].upper >= casemapsl[i-1].upper : 1);
+ if ((unsigned)wc-base < casemapsl[i].len) {
+ if (casemapsl[i].lower == 1)
+ return wc + lower - ((wc-casemapsl[i].upper)&1);
+ return wc + lmul*casemapsl[i].lower;
+ }
+ if (lower && casemaps[i].upper > wc)
+ break;
}
- if ((unsigned)wc - (0x10428 - 0x28*lower) < 0x28)
- return wc - 0x28 + 0x50*lower;
return wc;
}
--
2.8.4 (Apple Git-73)
[-- Attachment #3: 0002-add-emacs-flymake-support.patch --]
[-- Type: application/octet-stream, Size: 1031 bytes --]
From 347be94765fe4993e143ed33ae874c642446f3ca Mon Sep 17 00:00:00 2001
From: Reini Urban <rurban@cpan.org>
Date: Fri, 20 Oct 2017 10:46:44 +0200
Subject: [PATCH 2/2] add emacs flymake support
---
Makefile | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git Makefile Makefile
index d2e8997..9eb0cd6 100644
--- Makefile
+++ Makefile
@@ -193,6 +193,11 @@ obj/%-clang: $(srcdir)/tools/%-clang.in config.mak
sed -e 's!@CC@!$(WRAPCC_CLANG)!g' -e 's!@PREFIX@!$(prefix)!g' -e 's!@INCDIR@!$(includedir)!g' -e 's!@LIBDIR@!$(libdir)!g' -e 's!@LDSO@!$(LDSO_PATHNAME)!g' $< > $@
chmod +x $@
+# emacs flymake-mode
+check-syntax:
+ test -n "$(CHK_SOURCES)" && \
+ $(CC) $(CFLAGS_ALL) -o /dev/null -S $(CHK_SOURCES)
+
$(DESTDIR)$(bindir)/%: obj/%
$(INSTALL) -D $< $@
@@ -239,4 +244,4 @@ clean:
distclean: clean
rm -f config.mak
-.PHONY: all clean install install-libs install-headers install-tools
+.PHONY: all clean install install-libs install-headers install-tools check-syntax
--
2.8.4 (Apple Git-73)
next prev parent reply other threads:[~2017-10-20 9:00 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-09-13 8:25 Reini Urban
2017-09-13 10:05 ` Reini Urban
2017-09-13 18:13 ` Rich Felker
2017-10-20 9:00 ` Reini Urban [this message]
2017-10-25 18:38 ` Rich Felker
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to='CAHiT=DFLK4NQBkxhZKv8W2-W7UJj7etmJxyxwrU+6okjCGRVhw@mail.gmail.com' \
--to=rurban@cpan.org \
--cc=musl@lists.openwall.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://git.vuxu.org/mirror/musl/
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).