mailing list of musl libc
 help / color / mirror / code / Atom feed
* [PATCH] Byte-based C locale, draft 1
@ 2015-06-06 21:40 Rich Felker
  2015-06-06 22:39 ` Harald Becker
                   ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Rich Felker @ 2015-06-06 21:40 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 2828 bytes --]

Attached is the first draft of a proposed byte-based C locale. The
patch is about 400 lines but most of it is context, because it's
basically a lot of tiny changes spread out over lots of files.

With this patch applied, the plain "C" (or "POSIX") locale
converts each of the bytes in the range 0x80 to 0xff to a wchar_t
value in the range 0xdf80 to 0xdfff, the end of the low surrogates
range. I had originally intended to use the range 0x7fffff80 to
0x7fffffff, but C11 introduced mbrtoc16 and c16rtomb, imposing a
requirement that all characters in the locale's character set have a
mapping into char16_t. The easiest way to achieve this was to use a
range of wchar_t values that are already representable in char16_t but
that don't overlap with valid characters, and in turn the only way to
do that was with unpaired surrogates.

The intent is that the wchar_t values produced for high bytes in the C
locale should not be treated as having any meaning as characters. They
are simply UTF-8 code units (in the language of Unicode) and, to
reflect this, nl_langinfo(CODESET) returns "UTF-8-CODE-UNITS". Their
usefulness is that programs that process data through wchar_t can
safely round-trip arbitrary bytes, and, more importantly, regex and
fnmatch patterns can be used to match byte patterns instead of
character patterns.

The logic for how locales are chosen is unchanged, so roughly
speaking, the C locale only gets used in applications which either
don't use the locale API at all (in which case they should not expect
functions that depend on LC_CTYPE to work as expected) or which end up
requesting it explicitly or via environment defaults. In particular,
the C locale is active only when one of the following applies:

- The application has not called setlocale at all for LC_CTYPE.

- The application has explicitly requested "C" or "POSIX" for LC_CTYPE
  in a call to setlocale or newlocale followed by uselocale.

- The application has requested the default locale for LC_CTYPE, via
  an empty string as the locale name or a base of (locale_t)0 and a
  mask omitting LC_CTYPE_MASK, in a call to setlocale or newlocale
  followed by uselocale, and the contents of the standard
  locale-related environment variables yield "C" or "POSIX" for
  LC_CTYPE.

Before applying this I should probably overhaul fnmatch.c again. I
believe it has some hard-coded UTF-8 processing code in it for the
useless "check the tail before middle" step that I've been wanting to
eliminate. Alternatively I could just apply a quick fix to make it
work right without any invasive changes.

Other than possible weird cases with fnmatch (which are largely
harmless but might inhibit matching high bytes in non-UTF-8 mode),
this code should be ready for testing. I'd appreciate some feedback
from anyone interested in the feature.

Rich

[-- Attachment #2: bytelocale_v1.diff --]
[-- Type: text/plain, Size: 10221 bytes --]

diff --git a/include/stdlib.h b/include/stdlib.h
index 97ce5a7..d2c911f 100644
--- a/include/stdlib.h
+++ b/include/stdlib.h
@@ -76,7 +76,8 @@ size_t wcstombs (char *__restrict, const wchar_t *__restrict, size_t);
 #define EXIT_FAILURE 1
 #define EXIT_SUCCESS 0
 
-#define MB_CUR_MAX ((size_t)+4)
+size_t __ctype_get_mb_cur_max(void);
+#define MB_CUR_MAX (__ctype_get_mb_cur_max())
 
 #define RAND_MAX (0x7fffffff)
 
diff --git a/src/ctype/__ctype_get_mb_cur_max.c b/src/ctype/__ctype_get_mb_cur_max.c
index d235f4d..94b0bd4 100644
--- a/src/ctype/__ctype_get_mb_cur_max.c
+++ b/src/ctype/__ctype_get_mb_cur_max.c
@@ -1,6 +1,7 @@
 #include <stddef.h>
+#include "locale_impl.h"
 
 size_t __ctype_get_mb_cur_max()
 {
-	return 4;
+	return MB_CUR_MAX;
 }
diff --git a/src/internal/locale_impl.h b/src/internal/locale_impl.h
index f15e156..7577b51 100644
--- a/src/internal/locale_impl.h
+++ b/src/internal/locale_impl.h
@@ -33,3 +33,6 @@ const char *__lctrans_cur(const char *);
 
 #undef MB_CUR_MAX
 #define MB_CUR_MAX (CURRENT_UTF8 ? 4 : 1)
+
+#define CODEUNIT(c) (0xdfff & (signed char)(c))
+#define IS_CODEUNIT(c) ((unsigned)(c)-0xdf80 < 0x80)
\ No newline at end of file
diff --git a/src/internal/stdio_impl.h b/src/internal/stdio_impl.h
index e1325fe..72c5519 100644
--- a/src/internal/stdio_impl.h
+++ b/src/internal/stdio_impl.h
@@ -47,6 +47,7 @@ struct _IO_FILE {
 	unsigned char *shend;
 	off_t shlim, shcnt;
 	FILE *prev_locked, *next_locked;
+	struct __locale_struct *locale;
 };
 
 size_t __stdio_read(FILE *, unsigned char *, size_t);
diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index e6121ae..1eeea94 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -5,6 +5,7 @@
 #include <stdlib.h>
 #include <limits.h>
 #include <stdint.h>
+#include "locale_impl.h"
 
 #define UTF_32BE    0300
 #define UTF_16LE    0301
@@ -165,9 +166,12 @@ size_t iconv(iconv_t cd0, char **restrict in, size_t *restrict inb, char **restr
 	int err;
 	unsigned char type = map[-1];
 	unsigned char totype = tomap[-1];
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
 
 	if (!in || !*in || !*inb) return 0;
 
+	*ploc = UTF8_LOCALE;
+
 	for (; *inb; *in+=l, *inb-=l) {
 		c = *(unsigned char *)*in;
 		l = 1;
@@ -431,6 +435,7 @@ size_t iconv(iconv_t cd0, char **restrict in, size_t *restrict inb, char **restr
 			break;
 		}
 	}
+	*ploc = loc;
 	return x;
 ilseq:
 	err = EILSEQ;
@@ -445,5 +450,6 @@ starved:
 	x = -1;
 end:
 	errno = err;
+	*ploc = loc;
 	return x;
 }
diff --git a/src/locale/langinfo.c b/src/locale/langinfo.c
index a1ada24..776b447 100644
--- a/src/locale/langinfo.c
+++ b/src/locale/langinfo.c
@@ -33,7 +33,8 @@ char *__nl_langinfo_l(nl_item item, locale_t loc)
 	int idx = item & 65535;
 	const char *str;
 
-	if (item == CODESET) return "UTF-8";
+	if (item == CODESET)
+		return MB_CUR_MAX==1 ? "UTF-8-CODE-UNITS" : "UTF-8";
 	
 	switch (cat) {
 	case LC_NUMERIC:
diff --git a/src/multibyte/btowc.c b/src/multibyte/btowc.c
index 9d2c3b1..dc088a2 100644
--- a/src/multibyte/btowc.c
+++ b/src/multibyte/btowc.c
@@ -1,7 +1,10 @@
 #include <stdio.h>
 #include <wchar.h>
+#include "locale_impl.h"
 
 wint_t btowc(int c)
 {
-	return c<128U ? c : EOF;
+	if (c+1U <= 128) return c;
+	if (MB_CUR_MAX==1) return CODEUNIT(c);
+	return WEOF;
 }
diff --git a/src/multibyte/mbrtowc.c b/src/multibyte/mbrtowc.c
index e7b3654..40e2e1a 100644
--- a/src/multibyte/mbrtowc.c
+++ b/src/multibyte/mbrtowc.c
@@ -6,6 +6,7 @@
 
 #include <wchar.h>
 #include <errno.h>
+#include "locale_impl.h"
 #include "internal.h"
 
 size_t mbrtowc(wchar_t *restrict wc, const char *restrict src, size_t n, mbstate_t *restrict st)
@@ -27,6 +28,7 @@ size_t mbrtowc(wchar_t *restrict wc, const char *restrict src, size_t n, mbstate
 	if (!n) return -2;
 	if (!c) {
 		if (*s < 0x80) return !!(*wc = *s);
+		if (MB_CUR_MAX==1) return (*wc = CODEUNIT(*s)), 1;
 		if (*s-SA > SB-SA) goto ilseq;
 		c = bittab[*s++-SA]; n--;
 	}
diff --git a/src/multibyte/mbsrtowcs.c b/src/multibyte/mbsrtowcs.c
index 3c1343a..eb8f72a 100644
--- a/src/multibyte/mbsrtowcs.c
+++ b/src/multibyte/mbsrtowcs.c
@@ -7,6 +7,8 @@
 #include <stdint.h>
 #include <wchar.h>
 #include <errno.h>
+#include <string.h>
+#include "locale_impl.h"
 #include "internal.h"
 
 size_t mbsrtowcs(wchar_t *restrict ws, const char **restrict src, size_t wn, mbstate_t *restrict st)
@@ -24,6 +26,23 @@ size_t mbsrtowcs(wchar_t *restrict ws, const char **restrict src, size_t wn, mbs
 		}
 	}
 
+	if (MB_CUR_MAX==1) {
+		if (!ws) return strlen((const char *)s);
+		for (;;) {
+			if (!wn) {
+				*src = (const void *)s;
+				return wn0;
+			}
+			if (!*s) break;
+			c = *s++;
+			*ws++ = CODEUNIT(c);
+			wn--;
+		}
+		*ws = 0;
+		*src = 0;
+		return wn0-wn;
+	}
+
 	if (!ws) for (;;) {
 		if (*s-1u < 0x7f && (uintptr_t)s%4 == 0) {
 			while (!(( *(uint32_t*)s | *(uint32_t*)s-0x01010101) & 0x80808080)) {
diff --git a/src/multibyte/mbtowc.c b/src/multibyte/mbtowc.c
index 803d221..c147754 100644
--- a/src/multibyte/mbtowc.c
+++ b/src/multibyte/mbtowc.c
@@ -6,6 +6,7 @@
 
 #include <wchar.h>
 #include <errno.h>
+#include "locale_impl.h"
 #include "internal.h"
 
 int mbtowc(wchar_t *restrict wc, const char *restrict src, size_t n)
@@ -19,6 +20,7 @@ int mbtowc(wchar_t *restrict wc, const char *restrict src, size_t n)
 	if (!wc) wc = &dummy;
 
 	if (*s < 0x80) return !!(*wc = *s);
+	if (MB_CUR_MAX==1) return (*wc = CODEUNIT(*s)), 1;
 	if (*s-SA > SB-SA) goto ilseq;
 	c = bittab[*s++-SA];
 
diff --git a/src/multibyte/wcrtomb.c b/src/multibyte/wcrtomb.c
index 59f733d..75c972c 100644
--- a/src/multibyte/wcrtomb.c
+++ b/src/multibyte/wcrtomb.c
@@ -6,6 +6,7 @@
 
 #include <wchar.h>
 #include <errno.h>
+#include "locale_impl.h"
 
 size_t wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict st)
 {
@@ -13,6 +14,13 @@ size_t wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict st)
 	if ((unsigned)wc < 0x80) {
 		*s = wc;
 		return 1;
+	} else if (MB_CUR_MAX == 1) {
+		if (!IS_CODEUNIT(wc)) {
+			errno = EILSEQ;
+			return -1;
+		}
+		*s = wc;
+		return 1;
 	} else if ((unsigned)wc < 0x800) {
 		*s++ = 0xc0 | (wc>>6);
 		*s = 0x80 | (wc&0x3f);
diff --git a/src/multibyte/wctob.c b/src/multibyte/wctob.c
index d6353ee..412e3c8 100644
--- a/src/multibyte/wctob.c
+++ b/src/multibyte/wctob.c
@@ -1,8 +1,10 @@
 #include <stdio.h>
 #include <wchar.h>
+#include "locale_impl.h"
 
 int wctob(wint_t c)
 {
 	if (c < 128U) return c;
+	if (MB_CUR_MAX==1 && IS_CODEUNIT(c)) return (unsigned char)c;
 	return EOF;
 }
diff --git a/src/stdio/fgetwc.c b/src/stdio/fgetwc.c
index 8626d54..e455cfe 100644
--- a/src/stdio/fgetwc.c
+++ b/src/stdio/fgetwc.c
@@ -1,8 +1,9 @@
 #include "stdio_impl.h"
+#include "locale_impl.h"
 #include <wchar.h>
 #include <errno.h>
 
-wint_t __fgetwc_unlocked(FILE *f)
+static wint_t __fgetwc_unlocked_internal(FILE *f)
 {
 	mbstate_t st = { 0 };
 	wchar_t wc;
@@ -10,8 +11,6 @@ wint_t __fgetwc_unlocked(FILE *f)
 	unsigned char b;
 	size_t l;
 
-	f->mode |= f->mode+1;
-
 	/* Convert character from buffer if possible */
 	if (f->rpos < f->rend) {
 		l = mbrtowc(&wc, (void *)f->rpos, f->rend - f->rpos, &st);
@@ -39,6 +38,16 @@ wint_t __fgetwc_unlocked(FILE *f)
 	return wc;
 }
 
+wint_t __fgetwc_unlocked(FILE *f)
+{
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
+	if (f->mode <= 0) fwide(f, 1);
+	*ploc = f->locale;
+	wchar_t wc = __fgetwc_unlocked_internal(f);
+	*ploc = loc;
+	return wc;
+}
+
 wint_t fgetwc(FILE *f)
 {
 	wint_t c;
diff --git a/src/stdio/fputwc.c b/src/stdio/fputwc.c
index 7b621dd..a1c8ac8 100644
--- a/src/stdio/fputwc.c
+++ b/src/stdio/fputwc.c
@@ -1,4 +1,5 @@
 #include "stdio_impl.h"
+#include "locale_impl.h"
 #include <wchar.h>
 #include <limits.h>
 #include <ctype.h>
@@ -7,8 +8,10 @@ wint_t __fputwc_unlocked(wchar_t c, FILE *f)
 {
 	char mbc[MB_LEN_MAX];
 	int l;
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
 
-	f->mode |= f->mode+1;
+	if (f->mode <= 0) fwide(f, 1);
+	*ploc = f->locale;
 
 	if (isascii(c)) {
 		c = putc_unlocked(c, f);
@@ -20,6 +23,7 @@ wint_t __fputwc_unlocked(wchar_t c, FILE *f)
 		l = wctomb(mbc, c);
 		if (l < 0 || __fwritex((void *)mbc, l, f) < l) c = WEOF;
 	}
+	*ploc = loc;
 	return c;
 }
 
diff --git a/src/stdio/fputws.c b/src/stdio/fputws.c
index 5723cbc..0ed02f1 100644
--- a/src/stdio/fputws.c
+++ b/src/stdio/fputws.c
@@ -1,23 +1,28 @@
 #include "stdio_impl.h"
+#include "locale_impl.h"
 #include <wchar.h>
 
 int fputws(const wchar_t *restrict ws, FILE *restrict f)
 {
 	unsigned char buf[BUFSIZ];
 	size_t l=0;
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
 
 	FLOCK(f);
 
-	f->mode |= f->mode+1;
+	fwide(f, 1);
+	*ploc = f->locale;
 
 	while (ws && (l = wcsrtombs((void *)buf, (void*)&ws, sizeof buf, 0))+1 > 1)
 		if (__fwritex(buf, l, f) < l) {
 			FUNLOCK(f);
+			*ploc = loc;
 			return -1;
 		}
 
 	FUNLOCK(f);
 
+	*ploc = loc;
 	return l; /* 0 or -1 */
 }
 
diff --git a/src/stdio/fwide.c b/src/stdio/fwide.c
index 8088e7a..8410b15 100644
--- a/src/stdio/fwide.c
+++ b/src/stdio/fwide.c
@@ -1,13 +1,14 @@
-#include <wchar.h>
 #include "stdio_impl.h"
-
-#define SH (8*sizeof(int)-1)
-#define NORMALIZE(x) ((x)>>SH | -((-(x))>>SH))
+#include "locale_impl.h"
 
 int fwide(FILE *f, int mode)
 {
 	FLOCK(f);
-	if (!f->mode) f->mode = NORMALIZE(mode);
+	if (mode) {
+		if (!f->locale) f->locale = MB_CUR_MAX==1
+			? C_LOCALE : UTF8_LOCALE;
+		if (!f->mode) f->mode = mode>0 ? 1 : -1;
+	}
 	mode = f->mode;
 	FUNLOCK(f);
 	return mode;
diff --git a/src/stdio/ungetwc.c b/src/stdio/ungetwc.c
index 394f92a..80d6e20 100644
--- a/src/stdio/ungetwc.c
+++ b/src/stdio/ungetwc.c
@@ -1,4 +1,5 @@
 #include "stdio_impl.h"
+#include "locale_impl.h"
 #include <wchar.h>
 #include <limits.h>
 #include <ctype.h>
@@ -8,15 +9,18 @@ wint_t ungetwc(wint_t c, FILE *f)
 {
 	unsigned char mbc[MB_LEN_MAX];
 	int l=1;
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
 
 	FLOCK(f);
 
-	f->mode |= f->mode+1;
+	if (f->mode <= 0) fwide(f, 1);
+	*ploc = f->locale;
 
 	if (!f->rpos) __toread(f);
 	if (!f->rpos || f->rpos < f->buf - UNGET + l || c == WEOF ||
 	    (!isascii(c) && (l = wctomb((void *)mbc, c)) < 0)) {
 		FUNLOCK(f);
+		*ploc = loc;
 		return WEOF;
 	}
 
@@ -26,5 +30,6 @@ wint_t ungetwc(wint_t c, FILE *f)
 	f->flags &= ~F_EOF;
 
 	FUNLOCK(f);
+	*ploc = loc;
 	return c;
 }

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2015-06-16  4:35 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-06 21:40 [PATCH] Byte-based C locale, draft 1 Rich Felker
2015-06-06 22:39 ` Harald Becker
2015-06-06 23:10   ` Rich Felker
2015-06-06 23:59     ` Harald Becker
2015-06-07  0:24       ` Rich Felker
2015-06-07 23:59         ` Build option to disable locale [was: Byte-based C locale, draft 1] Harald Becker
2015-06-08  0:28           ` Josiah Worcester
2015-06-08  1:57             ` Harald Becker
2015-06-08  2:36               ` Rich Felker
2015-06-08  3:35                 ` Harald Becker
2015-06-08  3:51                   ` Josiah Worcester
2015-06-08  0:33           ` Rich Felker
2015-06-08  2:46             ` Harald Becker
2015-06-08  4:06               ` Rich Felker
2015-06-09  3:20               ` Isaac Dunham
2015-06-09  4:27                 ` Rich Felker
2015-06-07  1:17 ` [PATCH] Byte-based C locale, draft 1 Rich Felker
2015-06-07  2:50 ` Rich Felker
2015-06-13  7:06   ` [PATCH] Byte-based C locale, draft 2 Rich Felker
2015-06-16  4:26     ` Rich Felker
2015-06-16  4:35       ` Rich Felker

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).