mailing list of musl libc
 help / color / mirror / code / Atom feed
* Possible bytelocale patch
@ 2014-07-03  7:13 Rich Felker
  0 siblings, 0 replies; only message in thread
From: Rich Felker @ 2014-07-03  7:13 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 835 bytes --]

As mentioned in the LC_CTYPE conformance thread, I may hold off on
committing these changes, but I figure it's best to go ahead and post
the patch. As-is, the patch introduces a conformance regression; I had
hoped mapping high bytes to a high PUA range would be acceptable, but
since musl returns true for iswprint() for PUA codepoints, this makes
isprint() and iswprint() inconsistent and thus non-conforming. As
there are fewer than 128 permanent noncharacters in Unicode (too few
to give each of the 128 high bytes its own non-printable codepoint), I
think at least iswprint() would need to have locale-specific behavior
to fix this problem.

Other than that, the patch works and seems mostly complete (let me
know if anything is missing), and it presumably pleases the people
who want to grep binary files (and all of the conservative corporate
titans who get the final word in the Austin Group)...

Rich

[-- Attachment #2: locale_part2.diff --]
[-- Type: text/plain, Size: 11776 bytes --]

diff --git a/include/stdlib.h b/include/stdlib.h
index f034c6e..fbe7a21 100644
--- a/include/stdlib.h
+++ b/include/stdlib.h
@@ -76,7 +76,8 @@ size_t wcstombs (char *__restrict, const wchar_t *__restrict, size_t);
 #define EXIT_FAILURE 1
 #define EXIT_SUCCESS 0
 
-#define MB_CUR_MAX ((size_t)+4)
+size_t __ctype_get_mb_cur_max(void);
+#define MB_CUR_MAX (__ctype_get_mb_cur_max())
 
 #define RAND_MAX (0x7fffffff)
 
diff --git a/src/ctype/__ctype_get_mb_cur_max.c b/src/ctype/__ctype_get_mb_cur_max.c
index d235f4d..94b0bd4 100644
--- a/src/ctype/__ctype_get_mb_cur_max.c
+++ b/src/ctype/__ctype_get_mb_cur_max.c
@@ -1,6 +1,7 @@
 #include <stddef.h>
+#include "locale_impl.h"
 
 size_t __ctype_get_mb_cur_max()
 {
-	return 4;
+	return MB_CUR_MAX;
 }
diff --git a/src/internal/stdio_impl.h b/src/internal/stdio_impl.h
index 79be9fd..aafdc08 100644
--- a/src/internal/stdio_impl.h
+++ b/src/internal/stdio_impl.h
@@ -35,7 +35,8 @@ struct _IO_FILE {
 	int fd;
 	int pipe_pid;
 	long lockcount;
-	short dummy3;
+	unsigned char dummy3;
+	unsigned char utf8;
 	signed char mode;
 	signed char lbf;
 	int lock;
diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index a0b0232..138d596 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -152,8 +152,10 @@ static void put_32(unsigned char *s, unsigned c, int e)
 }
 
 /* Adapt as needed */
-#define mbrtowc_utf8 mbrtowc
-#define wctomb_utf8 wctomb
+size_t __utf8rtowc(wchar_t *, const char *, size_t, mbstate_t *);
+size_t __wctoutf8(char *, wchar_t);
+#define mbrtowc_utf8 __utf8rtowc
+#define wctomb_utf8 __wctoutf8
 
 size_t iconv(iconv_t cd0, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
 {
diff --git a/src/locale/langinfo.c b/src/locale/langinfo.c
index 13abf45..0b130e7 100644
--- a/src/locale/langinfo.c
+++ b/src/locale/langinfo.c
@@ -32,7 +32,7 @@ char *__nl_langinfo_l(nl_item item, locale_t loc)
 	int idx = item & 65535;
 	const char *str;
 
-	if (item == CODESET) return "UTF-8";
+	if (item == CODESET) return loc->ctype_utf8 ? "UTF-8" : "ASCII+8BIT";
 	
 	switch (cat) {
 	case LC_NUMERIC:
diff --git a/src/multibyte/btowc.c b/src/multibyte/btowc.c
index 9d2c3b1..695964c 100644
--- a/src/multibyte/btowc.c
+++ b/src/multibyte/btowc.c
@@ -1,7 +1,10 @@
 #include <stdio.h>
 #include <wchar.h>
+#include "locale_impl.h"
 
 wint_t btowc(int c)
 {
-	return c<128U ? c : EOF;
+	if (c+1U <= 128) return c;
+	if (MB_CUR_MAX == 1) return (unsigned char)c + 0x10f000;
+	return WEOF;
 }
diff --git a/src/multibyte/mbrtowc.c b/src/multibyte/mbrtowc.c
index e7b3654..c96d6e7 100644
--- a/src/multibyte/mbrtowc.c
+++ b/src/multibyte/mbrtowc.c
@@ -6,6 +6,7 @@
 
 #include <wchar.h>
 #include <errno.h>
+#include "locale_impl.h"
 #include "internal.h"
 
 size_t mbrtowc(wchar_t *restrict wc, const char *restrict src, size_t n, mbstate_t *restrict st)
@@ -27,6 +28,7 @@ size_t mbrtowc(wchar_t *restrict wc, const char *restrict src, size_t n, mbstate
 	if (!n) return -2;
 	if (!c) {
 		if (*s < 0x80) return !!(*wc = *s);
+		if (MB_CUR_MAX==1) return (*wc = *s + 0x10f000), 1;
 		if (*s-SA > SB-SA) goto ilseq;
 		c = bittab[*s++-SA]; n--;
 	}
diff --git a/src/multibyte/mbsrtowcs.c b/src/multibyte/mbsrtowcs.c
index 3c1343a..f4e46f3 100644
--- a/src/multibyte/mbsrtowcs.c
+++ b/src/multibyte/mbsrtowcs.c
@@ -7,6 +7,8 @@
 #include <stdint.h>
 #include <wchar.h>
 #include <errno.h>
+#include <string.h>
+#include "locale_impl.h"
 #include "internal.h"
 
 size_t mbsrtowcs(wchar_t *restrict ws, const char **restrict src, size_t wn, mbstate_t *restrict st)
@@ -24,6 +26,23 @@ size_t mbsrtowcs(wchar_t *restrict ws, const char **restrict src, size_t wn, mbs
 		}
 	}
 
+	if (MB_CUR_MAX==1) {
+		if (!ws) return strlen((const char *)s);
+		for (;;) {
+			if (!wn) {
+				*src = (const void *)s;
+				return wn0;
+			}
+			if (!*s) break;
+			c = *s++;
+			*ws++ = c + (0x10f000 & -(c>>7));
+			wn--;
+		}
+		*ws = 0;
+		*src = 0;
+		return wn0-wn;
+	}
+
 	if (!ws) for (;;) {
 		if (*s-1u < 0x7f && (uintptr_t)s%4 == 0) {
 			while (!(( *(uint32_t*)s | *(uint32_t*)s-0x01010101) & 0x80808080)) {
diff --git a/src/multibyte/mbtowc.c b/src/multibyte/mbtowc.c
index 803d221..6a2e3f9 100644
--- a/src/multibyte/mbtowc.c
+++ b/src/multibyte/mbtowc.c
@@ -6,6 +6,7 @@
 
 #include <wchar.h>
 #include <errno.h>
+#include "locale_impl.h"
 #include "internal.h"
 
 int mbtowc(wchar_t *restrict wc, const char *restrict src, size_t n)
@@ -19,6 +20,7 @@ int mbtowc(wchar_t *restrict wc, const char *restrict src, size_t n)
 	if (!wc) wc = &dummy;
 
 	if (*s < 0x80) return !!(*wc = *s);
+	if (MB_CUR_MAX==1) return (*wc = *s + 0x10f000), 1;
 	if (*s-SA > SB-SA) goto ilseq;
 	c = bittab[*s++-SA];
 
diff --git a/src/multibyte/utf8rtowc.c b/src/multibyte/utf8rtowc.c
new file mode 100644
index 0000000..6bb5220
--- /dev/null
+++ b/src/multibyte/utf8rtowc.c
@@ -0,0 +1,48 @@
+/* 
+ * This code was written by Rich Felker in 2010; no copyright is claimed.
+ * This code is in the public domain. Attribution is appreciated but
+ * unnecessary.
+ */
+
+#include <wchar.h>
+#include <errno.h>
+#include "internal.h"
+
+size_t __utf8rtowc(wchar_t *restrict wc, const char *restrict src, size_t n, mbstate_t *restrict st)
+{
+	unsigned c;
+	const unsigned char *s = (const void *)src;
+	const unsigned N = n;
+
+	c = *(unsigned *)st;
+	if (!wc) wc = (void *)&wc;
+
+	if (!n) return -2;
+	if (!c) {
+		if (*s < 0x80) return !!(*wc = *s);
+		if (*s-SA > SB-SA) goto ilseq;
+		c = bittab[*s++-SA]; n--;
+	}
+
+	if (n) {
+		if (OOB(c,*s)) goto ilseq;
+loop:
+		c = c<<6 | *s++-0x80; n--;
+		if (!(c&(1U<<31))) {
+			*(unsigned *)st = 0;
+			*wc = c;
+			return N-n;
+		}
+		if (n) {
+			if (*s-0x80u >= 0x40) goto ilseq;
+			goto loop;
+		}
+	}
+
+	*(unsigned *)st = c;
+	return -2;
+ilseq:
+	*(unsigned *)st = 0;
+	errno = EILSEQ;
+	return -1;
+}
diff --git a/src/multibyte/wcrtomb.c b/src/multibyte/wcrtomb.c
index 59f733d..0c7d0f0 100644
--- a/src/multibyte/wcrtomb.c
+++ b/src/multibyte/wcrtomb.c
@@ -6,6 +6,7 @@
 
 #include <wchar.h>
 #include <errno.h>
+#include "locale_impl.h"
 
 size_t wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict st)
 {
@@ -13,6 +14,13 @@ size_t wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict st)
 	if ((unsigned)wc < 0x80) {
 		*s = wc;
 		return 1;
+	} else if (MB_CUR_MAX == 1) {
+		if ((unsigned)wc - 0x10f080 >= 0x80) {
+			errno = EILSEQ;
+			return -1;
+		}
+		*s = wc;
+		return 1;
 	} else if ((unsigned)wc < 0x800) {
 		*s++ = 0xc0 | (wc>>6);
 		*s = 0x80 | (wc&0x3f);
diff --git a/src/multibyte/wctob.c b/src/multibyte/wctob.c
index d6353ee..c365016 100644
--- a/src/multibyte/wctob.c
+++ b/src/multibyte/wctob.c
@@ -1,8 +1,10 @@
 #include <stdio.h>
 #include <wchar.h>
+#include "locale_impl.h"
 
 int wctob(wint_t c)
 {
 	if (c < 128U) return c;
+	if (MB_CUR_MAX == 1 && c-0x10f080U < 128) return (unsigned char)c;
 	return EOF;
 }
diff --git a/src/multibyte/wctoraw8.c b/src/multibyte/wctoraw8.c
new file mode 100644
index 0000000..24c4d8c
--- /dev/null
+++ b/src/multibyte/wctoraw8.c
@@ -0,0 +1,13 @@
+#include <wchar.h>
+#include <errno.h>
+
+int __wctoraw8(wchar_t wc)
+{
+	if ((unsigned)wc < 0x80) {
+		return wc;
+	} else if ((unsigned)wc - 0x10f080 < 0x80) {
+		return wc - 0x10f000;
+	}
+	errno = EILSEQ;
+	return -1;
+}
diff --git a/src/multibyte/wctoutf8.c b/src/multibyte/wctoutf8.c
new file mode 100644
index 0000000..bf05783
--- /dev/null
+++ b/src/multibyte/wctoutf8.c
@@ -0,0 +1,33 @@
+/* 
+ * This code was written by Rich Felker in 2010; no copyright is claimed.
+ * This code is in the public domain. Attribution is appreciated but
+ * unnecessary.
+ */
+
+#include <wchar.h>
+#include <errno.h>
+
+size_t __wctoutf8(char * s, wchar_t wc)
+{
+	if ((unsigned)wc < 0x80) {
+		*s = wc;
+		return 1;
+	} else if ((unsigned)wc < 0x800) {
+		*s++ = 0xc0 | (wc>>6);
+		*s = 0x80 | (wc&0x3f);
+		return 2;
+	} else if ((unsigned)wc < 0xd800 || (unsigned)wc-0xe000 < 0x2000) {
+		*s++ = 0xe0 | (wc>>12);
+		*s++ = 0x80 | ((wc>>6)&0x3f);
+		*s = 0x80 | (wc&0x3f);
+		return 3;
+	} else if ((unsigned)wc-0x10000 < 0x100000) {
+		*s++ = 0xf0 | (wc>>18);
+		*s++ = 0x80 | ((wc>>12)&0x3f);
+		*s++ = 0x80 | ((wc>>6)&0x3f);
+		*s = 0x80 | (wc&0x3f);
+		return 4;
+	}
+	errno = EILSEQ;
+	return -1;
+}
diff --git a/src/stdio/fgetwc.c b/src/stdio/fgetwc.c
index 8626d54..b3f10f9 100644
--- a/src/stdio/fgetwc.c
+++ b/src/stdio/fgetwc.c
@@ -2,6 +2,8 @@
 #include <wchar.h>
 #include <errno.h>
 
+size_t __utf8rtowc(wchar_t *, const char *, size_t, mbstate_t *);
+
 wint_t __fgetwc_unlocked(FILE *f)
 {
 	mbstate_t st = { 0 };
@@ -10,7 +12,14 @@ wint_t __fgetwc_unlocked(FILE *f)
 	unsigned char b;
 	size_t l;
 
-	f->mode |= f->mode+1;
+	if (!f->utf8) {
+		fwide(f, 1);
+		if (!f->utf8) {
+			c = getc_unlocked(f);
+			if (c >= 128) c += 0x10f000;
+			return c;
+		}
+	}
 
 	/* Convert character from buffer if possible */
 	if (f->rpos < f->rend) {
diff --git a/src/stdio/fputwc.c b/src/stdio/fputwc.c
index 7b621dd..dfb9b8d 100644
--- a/src/stdio/fputwc.c
+++ b/src/stdio/fputwc.c
@@ -3,14 +3,22 @@
 #include <limits.h>
 #include <ctype.h>
 
+int __wctoraw8(wchar_t);
+
 wint_t __fputwc_unlocked(wchar_t c, FILE *f)
 {
 	char mbc[MB_LEN_MAX];
 	int l;
 
-	f->mode |= f->mode+1;
+	if (!f->utf8) {
+		fwide(f, 1);
+		if (!f->utf8) {
+			c = __wctoraw8(c);
+			return c<0 ? WEOF : putc_unlocked(c, f);
+		}
+	}
 
-	if (isascii(c)) {
+	if (c < 128U) {
 		c = putc_unlocked(c, f);
 	} else if (f->wpos + MB_LEN_MAX < f->wend) {
 		l = wctomb((void *)f->wpos, c);
diff --git a/src/stdio/fwide.c b/src/stdio/fwide.c
index 8088e7a..0ebaff4 100644
--- a/src/stdio/fwide.c
+++ b/src/stdio/fwide.c
@@ -1,5 +1,6 @@
 #include <wchar.h>
 #include "stdio_impl.h"
+#include "locale_impl.h"
 
 #define SH (8*sizeof(int)-1)
 #define NORMALIZE(x) ((x)>>SH | -((-(x))>>SH))
@@ -7,7 +8,10 @@
 int fwide(FILE *f, int mode)
 {
 	FLOCK(f);
-	if (!f->mode) f->mode = NORMALIZE(mode);
+	if (!f->mode && mode) {
+		f->mode = NORMALIZE(mode);
+		f->utf8 = (MB_CUR_MAX > 1);
+	}
 	mode = f->mode;
 	FUNLOCK(f);
 	return mode;
diff --git a/src/stdio/ungetwc.c b/src/stdio/ungetwc.c
index 8cc85a6..2b6137e 100644
--- a/src/stdio/ungetwc.c
+++ b/src/stdio/ungetwc.c
@@ -4,6 +4,8 @@
 #include <ctype.h>
 #include <string.h>
 
+int __wctoraw8(wchar_t);
+
 wint_t ungetwc(wint_t c, FILE *f)
 {
 	unsigned char mbc[MB_LEN_MAX];
@@ -11,20 +13,25 @@ wint_t ungetwc(wint_t c, FILE *f)
 
 	if (c == WEOF) return c;
 
-	/* Try conversion early so we can fail without locking if invalid */
-	if (!isascii(c) && (l = wctomb((void *)mbc, c)) < 0)
-		return WEOF;
-
 	FLOCK(f);
 
-	f->mode |= f->mode+1;
+	if (!f->utf8) {
+		fwide(f, 1);
+		if (!f->utf8) {
+			c = __wctoraw8(c);
+			c = ungetc(c, f);
+			FUNLOCK(f);
+			return c;
+		}
+	}
 
-	if ((!f->rend && __toread(f)) || f->rpos < f->buf - UNGET + l) {
+	if ((c>=128U && (l = wctomb((void *)mbc, c)) < 0) ||
+	    (!f->rend && __toread(f)) || f->rpos < f->buf - UNGET + l) {
 		FUNLOCK(f);
 		return EOF;
 	}
 
-	if (isascii(c)) *--f->rpos = c;
+	if (c<128U) *--f->rpos = c;
 	else memcpy(f->rpos -= l, mbc, l);
 
 	f->flags &= ~F_EOF;
diff --git a/src/stdio/vfwprintf.c b/src/stdio/vfwprintf.c
index c640059..ec0565d 100644
--- a/src/stdio/vfwprintf.c
+++ b/src/stdio/vfwprintf.c
@@ -355,7 +355,7 @@ int vfwprintf(FILE *restrict f, const wchar_t *restrict fmt, va_list ap)
 	}
 
 	FLOCK(f);
-	f->mode |= f->mode+1;
+	fwide(f, 1);
 	ret = wprintf_core(f, fmt, &ap2, nl_arg, nl_type);
 	FUNLOCK(f);
 	va_end(ap2);
diff --git a/src/stdio/vfwscanf.c b/src/stdio/vfwscanf.c
index ac5c2c2..223aad4 100644
--- a/src/stdio/vfwscanf.c
+++ b/src/stdio/vfwscanf.c
@@ -104,7 +104,7 @@ int vfwscanf(FILE *restrict f, const wchar_t *restrict fmt, va_list ap)
 
 	FLOCK(f);
 
-	f->mode |= f->mode+1;
+	fwide(f, 1);
 
 	for (p=fmt; *p; p++) {
 

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2014-07-03  7:13 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-07-03  7:13 Possible bytelocale patch Rich Felker

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).