[PATCH] Byte-based C locale, draft 1

mailing list of musl libc
 help / color / mirror / code / Atom feed

* [PATCH] Byte-based C locale, draft 1
@ 2015-06-06 21:40 Rich Felker
  2015-06-06 22:39 ` Harald Becker
                   ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Rich Felker @ 2015-06-06 21:40 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 2828 bytes --]

Attached is the first draft of a proposed byte-based C locale. The
patch is about 400 lines but most of it is context, because it's
basically a lot of tiny changes spread out over lots of files.

With this patch applied, the plain "C" (or "POSIX") locale
converts each of the bytes in the range 0x80 to 0xff to a wchar_t
value in the range 0xdf80 to 0xdfff, the end of the low surrogates
range. I had originally intended to use the range 0x7fffff80 to
0x7fffffff, but C11 introduced mbrtoc16 and c16rtomb, imposing a
requirement that all characters in the locale's character set have a
mapping into char16_t. The easiest way to achieve this was to use a
range of wchar_t values that are already representable in char16_t but
that don't overlap with valid characters, and in turn the only way to
do that was with unpaired surrogates.

The intent is that the wchar_t values produced for high bytes in the C
locale should not be treated as having any meaning as characters. They
are simply UTF-8 code units (in the language of Unicode) and, to
reflect this, nl_langinfo(CODESET) returns "UTF-8-CODE-UNITS". Their
usefulness is that programs that process data through wchar_t can
safely round-trip arbitrary bytes, and, more importantly, regex and
fnmatch patterns can be used to match byte patterns instead of
character patterns.

The logic for how locales are chosen is unchanged, so roughly
speaking, the C locale only gets used in applications which either
don't use the locale API at all (in which case they should not expect
functions that depend on LC_CTYPE to work as expected) or which end up
requesting it explicitly or via environment defaults. In particular,
the C locale is active only when one of the following applies:

- The application has not called setlocale at all for LC_CTYPE.

- The application has explicitly requested "C" or "POSIX" for LC_CTYPE
  in a call to setlocale or newlocale followed by uselocale.

- The application has requested the default locale for LC_CTYPE, via
  an empty string as the locale name or a base of (locale_t)0 and a
  mask omitting LC_CTYPE_MASK, in a call to setlocale or newlocale
  followed by uselocale, and the contents of the standard
  locale-related environment variables yield "C" or "POSIX" for
  LC_CTYPE.

Before applying this I should probably overhaul fnmatch.c again. I
believe it has some hard-coded UTF-8 processing code in it for the
useless "check the tail before middle" step that I've been wanting to
eliminate. Alternatively I could just apply a quick fix to make it
work right without any invasive changes.

Other than possible weird cases with fnmatch (which are largely
harmless but might inhibit matching high bytes in non-UTF-8 mode),
this code should be ready for testing. I'd appreciate some feedback
from anyone interested in the feature.

Rich

[-- Attachment #2: bytelocale_v1.diff --]
[-- Type: text/plain, Size: 10221 bytes --]

diff --git a/include/stdlib.h b/include/stdlib.h
index 97ce5a7..d2c911f 100644
--- a/include/stdlib.h
+++ b/include/stdlib.h
@@ -76,7 +76,8 @@ size_t wcstombs (char *__restrict, const wchar_t *__restrict, size_t);
 #define EXIT_FAILURE 1
 #define EXIT_SUCCESS 0
 
-#define MB_CUR_MAX ((size_t)+4)
+size_t __ctype_get_mb_cur_max(void);
+#define MB_CUR_MAX (__ctype_get_mb_cur_max())
 
 #define RAND_MAX (0x7fffffff)
 
diff --git a/src/ctype/__ctype_get_mb_cur_max.c b/src/ctype/__ctype_get_mb_cur_max.c
index d235f4d..94b0bd4 100644
--- a/src/ctype/__ctype_get_mb_cur_max.c
+++ b/src/ctype/__ctype_get_mb_cur_max.c
@@ -1,6 +1,7 @@
 #include <stddef.h>
+#include "locale_impl.h"
 
 size_t __ctype_get_mb_cur_max()
 {
-	return 4;
+	return MB_CUR_MAX;
 }
diff --git a/src/internal/locale_impl.h b/src/internal/locale_impl.h
index f15e156..7577b51 100644
--- a/src/internal/locale_impl.h
+++ b/src/internal/locale_impl.h
@@ -33,3 +33,6 @@ const char *__lctrans_cur(const char *);
 
 #undef MB_CUR_MAX
 #define MB_CUR_MAX (CURRENT_UTF8 ? 4 : 1)
+
+#define CODEUNIT(c) (0xdfff & (signed char)(c))
+#define IS_CODEUNIT(c) ((unsigned)(c)-0xdf80 < 0x80)
\ No newline at end of file
diff --git a/src/internal/stdio_impl.h b/src/internal/stdio_impl.h
index e1325fe..72c5519 100644
--- a/src/internal/stdio_impl.h
+++ b/src/internal/stdio_impl.h
@@ -47,6 +47,7 @@ struct _IO_FILE {
 	unsigned char *shend;
 	off_t shlim, shcnt;
 	FILE *prev_locked, *next_locked;
+	struct __locale_struct *locale;
 };
 
 size_t __stdio_read(FILE *, unsigned char *, size_t);
diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index e6121ae..1eeea94 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -5,6 +5,7 @@
 #include <stdlib.h>
 #include <limits.h>
 #include <stdint.h>
+#include "locale_impl.h"
 
 #define UTF_32BE    0300
 #define UTF_16LE    0301
@@ -165,9 +166,12 @@ size_t iconv(iconv_t cd0, char **restrict in, size_t *restrict inb, char **restr
 	int err;
 	unsigned char type = map[-1];
 	unsigned char totype = tomap[-1];
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
 
 	if (!in || !*in || !*inb) return 0;
 
+	*ploc = UTF8_LOCALE;
+
 	for (; *inb; *in+=l, *inb-=l) {
 		c = *(unsigned char *)*in;
 		l = 1;
@@ -431,6 +435,7 @@ size_t iconv(iconv_t cd0, char **restrict in, size_t *restrict inb, char **restr
 			break;
 		}
 	}
+	*ploc = loc;
 	return x;
 ilseq:
 	err = EILSEQ;
@@ -445,5 +450,6 @@ starved:
 	x = -1;
 end:
 	errno = err;
+	*ploc = loc;
 	return x;
 }
diff --git a/src/locale/langinfo.c b/src/locale/langinfo.c
index a1ada24..776b447 100644
--- a/src/locale/langinfo.c
+++ b/src/locale/langinfo.c
@@ -33,7 +33,8 @@ char *__nl_langinfo_l(nl_item item, locale_t loc)
 	int idx = item & 65535;
 	const char *str;
 
-	if (item == CODESET) return "UTF-8";
+	if (item == CODESET)
+		return MB_CUR_MAX==1 ? "UTF-8-CODE-UNITS" : "UTF-8";
 	
 	switch (cat) {
 	case LC_NUMERIC:
diff --git a/src/multibyte/btowc.c b/src/multibyte/btowc.c
index 9d2c3b1..dc088a2 100644
--- a/src/multibyte/btowc.c
+++ b/src/multibyte/btowc.c
@@ -1,7 +1,10 @@
 #include <stdio.h>
 #include <wchar.h>
+#include "locale_impl.h"
 
 wint_t btowc(int c)
 {
-	return c<128U ? c : EOF;
+	if (c+1U <= 128) return c;
+	if (MB_CUR_MAX==1) return CODEUNIT(c);
+	return WEOF;
 }
diff --git a/src/multibyte/mbrtowc.c b/src/multibyte/mbrtowc.c
index e7b3654..40e2e1a 100644
--- a/src/multibyte/mbrtowc.c
+++ b/src/multibyte/mbrtowc.c
@@ -6,6 +6,7 @@
 
 #include <wchar.h>
 #include <errno.h>
+#include "locale_impl.h"
 #include "internal.h"
 
 size_t mbrtowc(wchar_t *restrict wc, const char *restrict src, size_t n, mbstate_t *restrict st)
@@ -27,6 +28,7 @@ size_t mbrtowc(wchar_t *restrict wc, const char *restrict src, size_t n, mbstate
 	if (!n) return -2;
 	if (!c) {
 		if (*s < 0x80) return !!(*wc = *s);
+		if (MB_CUR_MAX==1) return (*wc = CODEUNIT(*s)), 1;
 		if (*s-SA > SB-SA) goto ilseq;
 		c = bittab[*s++-SA]; n--;
 	}
diff --git a/src/multibyte/mbsrtowcs.c b/src/multibyte/mbsrtowcs.c
index 3c1343a..eb8f72a 100644
--- a/src/multibyte/mbsrtowcs.c
+++ b/src/multibyte/mbsrtowcs.c
@@ -7,6 +7,8 @@
 #include <stdint.h>
 #include <wchar.h>
 #include <errno.h>
+#include <string.h>
+#include "locale_impl.h"
 #include "internal.h"
 
 size_t mbsrtowcs(wchar_t *restrict ws, const char **restrict src, size_t wn, mbstate_t *restrict st)
@@ -24,6 +26,23 @@ size_t mbsrtowcs(wchar_t *restrict ws, const char **restrict src, size_t wn, mbs
 		}
 	}
 
+	if (MB_CUR_MAX==1) {
+		if (!ws) return strlen((const char *)s);
+		for (;;) {
+			if (!wn) {
+				*src = (const void *)s;
+				return wn0;
+			}
+			if (!*s) break;
+			c = *s++;
+			*ws++ = CODEUNIT(c);
+			wn--;
+		}
+		*ws = 0;
+		*src = 0;
+		return wn0-wn;
+	}
+
 	if (!ws) for (;;) {
 		if (*s-1u < 0x7f && (uintptr_t)s%4 == 0) {
 			while (!(( *(uint32_t*)s | *(uint32_t*)s-0x01010101) & 0x80808080)) {
diff --git a/src/multibyte/mbtowc.c b/src/multibyte/mbtowc.c
index 803d221..c147754 100644
--- a/src/multibyte/mbtowc.c
+++ b/src/multibyte/mbtowc.c
@@ -6,6 +6,7 @@
 
 #include <wchar.h>
 #include <errno.h>
+#include "locale_impl.h"
 #include "internal.h"
 
 int mbtowc(wchar_t *restrict wc, const char *restrict src, size_t n)
@@ -19,6 +20,7 @@ int mbtowc(wchar_t *restrict wc, const char *restrict src, size_t n)
 	if (!wc) wc = &dummy;
 
 	if (*s < 0x80) return !!(*wc = *s);
+	if (MB_CUR_MAX==1) return (*wc = CODEUNIT(*s)), 1;
 	if (*s-SA > SB-SA) goto ilseq;
 	c = bittab[*s++-SA];
 
diff --git a/src/multibyte/wcrtomb.c b/src/multibyte/wcrtomb.c
index 59f733d..75c972c 100644
--- a/src/multibyte/wcrtomb.c
+++ b/src/multibyte/wcrtomb.c
@@ -6,6 +6,7 @@
 
 #include <wchar.h>
 #include <errno.h>
+#include "locale_impl.h"
 
 size_t wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict st)
 {
@@ -13,6 +14,13 @@ size_t wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict st)
 	if ((unsigned)wc < 0x80) {
 		*s = wc;
 		return 1;
+	} else if (MB_CUR_MAX == 1) {
+		if (!IS_CODEUNIT(wc)) {
+			errno = EILSEQ;
+			return -1;
+		}
+		*s = wc;
+		return 1;
 	} else if ((unsigned)wc < 0x800) {
 		*s++ = 0xc0 | (wc>>6);
 		*s = 0x80 | (wc&0x3f);
diff --git a/src/multibyte/wctob.c b/src/multibyte/wctob.c
index d6353ee..412e3c8 100644
--- a/src/multibyte/wctob.c
+++ b/src/multibyte/wctob.c
@@ -1,8 +1,10 @@
 #include <stdio.h>
 #include <wchar.h>
+#include "locale_impl.h"
 
 int wctob(wint_t c)
 {
 	if (c < 128U) return c;
+	if (MB_CUR_MAX==1 && IS_CODEUNIT(c)) return (unsigned char)c;
 	return EOF;
 }
diff --git a/src/stdio/fgetwc.c b/src/stdio/fgetwc.c
index 8626d54..e455cfe 100644
--- a/src/stdio/fgetwc.c
+++ b/src/stdio/fgetwc.c
@@ -1,8 +1,9 @@
 #include "stdio_impl.h"
+#include "locale_impl.h"
 #include <wchar.h>
 #include <errno.h>
 
-wint_t __fgetwc_unlocked(FILE *f)
+static wint_t __fgetwc_unlocked_internal(FILE *f)
 {
 	mbstate_t st = { 0 };
 	wchar_t wc;
@@ -10,8 +11,6 @@ wint_t __fgetwc_unlocked(FILE *f)
 	unsigned char b;
 	size_t l;
 
-	f->mode |= f->mode+1;
-
 	/* Convert character from buffer if possible */
 	if (f->rpos < f->rend) {
 		l = mbrtowc(&wc, (void *)f->rpos, f->rend - f->rpos, &st);
@@ -39,6 +38,16 @@ wint_t __fgetwc_unlocked(FILE *f)
 	return wc;
 }
 
+wint_t __fgetwc_unlocked(FILE *f)
+{
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
+	if (f->mode <= 0) fwide(f, 1);
+	*ploc = f->locale;
+	wchar_t wc = __fgetwc_unlocked_internal(f);
+	*ploc = loc;
+	return wc;
+}
+
 wint_t fgetwc(FILE *f)
 {
 	wint_t c;
diff --git a/src/stdio/fputwc.c b/src/stdio/fputwc.c
index 7b621dd..a1c8ac8 100644
--- a/src/stdio/fputwc.c
+++ b/src/stdio/fputwc.c
@@ -1,4 +1,5 @@
 #include "stdio_impl.h"
+#include "locale_impl.h"
 #include <wchar.h>
 #include <limits.h>
 #include <ctype.h>
@@ -7,8 +8,10 @@ wint_t __fputwc_unlocked(wchar_t c, FILE *f)
 {
 	char mbc[MB_LEN_MAX];
 	int l;
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
 
-	f->mode |= f->mode+1;
+	if (f->mode <= 0) fwide(f, 1);
+	*ploc = f->locale;
 
 	if (isascii(c)) {
 		c = putc_unlocked(c, f);
@@ -20,6 +23,7 @@ wint_t __fputwc_unlocked(wchar_t c, FILE *f)
 		l = wctomb(mbc, c);
 		if (l < 0 || __fwritex((void *)mbc, l, f) < l) c = WEOF;
 	}
+	*ploc = loc;
 	return c;
 }
 
diff --git a/src/stdio/fputws.c b/src/stdio/fputws.c
index 5723cbc..0ed02f1 100644
--- a/src/stdio/fputws.c
+++ b/src/stdio/fputws.c
@@ -1,23 +1,28 @@
 #include "stdio_impl.h"
+#include "locale_impl.h"
 #include <wchar.h>
 
 int fputws(const wchar_t *restrict ws, FILE *restrict f)
 {
 	unsigned char buf[BUFSIZ];
 	size_t l=0;
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
 
 	FLOCK(f);
 
-	f->mode |= f->mode+1;
+	fwide(f, 1);
+	*ploc = f->locale;
 
 	while (ws && (l = wcsrtombs((void *)buf, (void*)&ws, sizeof buf, 0))+1 > 1)
 		if (__fwritex(buf, l, f) < l) {
 			FUNLOCK(f);
+			*ploc = loc;
 			return -1;
 		}
 
 	FUNLOCK(f);
 
+	*ploc = loc;
 	return l; /* 0 or -1 */
 }
 
diff --git a/src/stdio/fwide.c b/src/stdio/fwide.c
index 8088e7a..8410b15 100644
--- a/src/stdio/fwide.c
+++ b/src/stdio/fwide.c
@@ -1,13 +1,14 @@
-#include <wchar.h>
 #include "stdio_impl.h"
-
-#define SH (8*sizeof(int)-1)
-#define NORMALIZE(x) ((x)>>SH | -((-(x))>>SH))
+#include "locale_impl.h"
 
 int fwide(FILE *f, int mode)
 {
 	FLOCK(f);
-	if (!f->mode) f->mode = NORMALIZE(mode);
+	if (mode) {
+		if (!f->locale) f->locale = MB_CUR_MAX==1
+			? C_LOCALE : UTF8_LOCALE;
+		if (!f->mode) f->mode = mode>0 ? 1 : -1;
+	}
 	mode = f->mode;
 	FUNLOCK(f);
 	return mode;
diff --git a/src/stdio/ungetwc.c b/src/stdio/ungetwc.c
index 394f92a..80d6e20 100644
--- a/src/stdio/ungetwc.c
+++ b/src/stdio/ungetwc.c
@@ -1,4 +1,5 @@
 #include "stdio_impl.h"
+#include "locale_impl.h"
 #include <wchar.h>
 #include <limits.h>
 #include <ctype.h>
@@ -8,15 +9,18 @@ wint_t ungetwc(wint_t c, FILE *f)
 {
 	unsigned char mbc[MB_LEN_MAX];
 	int l=1;
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
 
 	FLOCK(f);
 
-	f->mode |= f->mode+1;
+	if (f->mode <= 0) fwide(f, 1);
+	*ploc = f->locale;
 
 	if (!f->rpos) __toread(f);
 	if (!f->rpos || f->rpos < f->buf - UNGET + l || c == WEOF ||
 	    (!isascii(c) && (l = wctomb((void *)mbc, c)) < 0)) {
 		FUNLOCK(f);
+		*ploc = loc;
 		return WEOF;
 	}
 
@@ -26,5 +30,6 @@ wint_t ungetwc(wint_t c, FILE *f)
 	f->flags &= ~F_EOF;
 
 	FUNLOCK(f);
+	*ploc = loc;
 	return c;
 }

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Byte-based C locale, draft 1
  2015-06-06 21:40 [PATCH] Byte-based C locale, draft 1 Rich Felker
@ 2015-06-06 22:39 ` Harald Becker
  2015-06-06 23:10   ` Rich Felker
  2015-06-07  1:17 ` [PATCH] Byte-based C locale, draft 1 Rich Felker
  2015-06-07  2:50 ` Rich Felker
  2 siblings, 1 reply; 21+ messages in thread
From: Harald Becker @ 2015-06-06 22:39 UTC (permalink / raw)
  To: musl

Hi Rich !

On 06.06.2015 23:40, Rich Felker wrote:
> Attached is the first draft of a proposed byte-based C locale. The
> patch is about 400 lines but most of it is context, because it's
> basically a lot of tiny changes spread out over lots of files.

Sorry for hopping in. I like musl, but I really dislike all this
wchar_t and locale management. Ok, I see it is required for POSIX 
compatibility, so I have to accept the existence of this stuff. If 
linking statically, I'm able to avoid most of this and this code never 
gets in, but when linking shared it makes all into the shared library 
(which is my point of criticism).

So I like to see a build switch to disable all this locale stuff, 
assuming just bare char type and sole only C / UTF-8 locale, with a 
minimum on locale overhead (all locale functions should just fail except
for their very simple / standard usage cases). Is there any chance to
see such a build switch in the mainstream, or do I really need to
create and maintain a separate patch for this (the hard way)?

--
Harald

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Byte-based C locale, draft 1
  2015-06-06 22:39 ` Harald Becker
@ 2015-06-06 23:10   ` Rich Felker
  2015-06-06 23:59     ` Harald Becker
  0 siblings, 1 reply; 21+ messages in thread
From: Rich Felker @ 2015-06-06 23:10 UTC (permalink / raw)
  To: musl

On Sun, Jun 07, 2015 at 12:39:00AM +0200, Harald Becker wrote:
> Hi Rich !
> 
> On 06.06.2015 23:40, Rich Felker wrote:
> >Attached is the first draft of a proposed byte-based C locale. The
> >patch is about 400 lines but most of it is context, because it's
> >basically a lot of tiny changes spread out over lots of files.
> 
> Sorry for my hopping in. I like  musl, but I really dislike all this
> wchar_t and locale management. Ok, I see it is required for POSIX
> compatibility, so I have to accept the existence of this stuff. If
> linking statically, I'm able to avoid most of this and this code
> never gets in, but when linking shared it makes all into the shared
> library (which is my point of criticism).
> 
> So I like to see a build switch to disable all this locale stuff,
> assuming just bare char type and sole only C / UTF-8 locale, with a
> minimum on locale overhead (all locale functions should just fail
> except for their very simple / standard usage cases). Is there any
> chance to see such a build switch in the mainstream, or do I really
> need to create and maintain a separate patch for this (the hard way)?

I'm not clear whether your comments are about the proposed byte-based
C locale (the patch you're replying to) or about existing code in
musl. Could you be more specific?

Rich


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Byte-based C locale, draft 1
  2015-06-06 23:10   ` Rich Felker
@ 2015-06-06 23:59     ` Harald Becker
  2015-06-07  0:24       ` Rich Felker
  0 siblings, 1 reply; 21+ messages in thread
From: Harald Becker @ 2015-06-06 23:59 UTC (permalink / raw)
  To: musl

On 07.06.2015 01:10, Rich Felker wrote:
> I'm not clear whether your comments are about the proposed byte-based
> C locale (the patch you're replying to) or about existing code in
> musl. Could you be more specific?

I used the discussion about the locale stuff to hop in and ask for 
getting the most wanted feature I'm looking for:

A bare bone (shared) library without all this locale, wchar_t and multi 
byte overhead. Just having a library which always assume C locale and 
UTF-8 byte streams, as I ought all this locale stuff is a big block of 
code, which I heavily dislike and never use (got my own UTF-8 handling 
when and where required). This library with disabled locale stuff should 
otherwise still be compatible, just adding the bare minimum of overhead 
on those locale functions (allowing only the single C locale).

Is that specific enough?

--
Harald

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Byte-based C locale, draft 1
  2015-06-06 23:59     ` Harald Becker
@ 2015-06-07  0:24       ` Rich Felker
  2015-06-07 23:59         ` Build option to disable locale [was: Byte-based C locale, draft 1] Harald Becker
  0 siblings, 1 reply; 21+ messages in thread
From: Rich Felker @ 2015-06-07  0:24 UTC (permalink / raw)
  To: musl

On Sun, Jun 07, 2015 at 01:59:53AM +0200, Harald Becker wrote:
> On 07.06.2015 01:10, Rich Felker wrote:
> >I'm not clear whether your comments are about the proposed byte-based
> >C locale (the patch you're replying to) or about existing code in
> >musl. Could you be more specific?
> 
> I used the discussion about the locale stuff to hop in and ask for
> getting the most wanted feature I'm looking for:

OK, so I'll take this as mostly unrelated to the patch.

> A bare bone (shared) library without all this locale, wchar_t and
> multi byte overhead. Just having a library which always assume C
> locale and UTF-8 byte streams, as I ought all this locale stuff is a
> big block of code, which I heavily dislike and never use (got my own
> UTF-8 handling when and where required). This library with disabled
> locale stuff should otherwise still be compatible, just adding the
> bare minimum of overhead on those locale functions (allowing only
> the single C locale).
> 
> Is that specific enough?

It's somewhat more clear what you're talking about, but I'm still not
sure what specific pieces of code you would want to omit from libc.so.
Which of the following would you want to remove or keep?

- UTF-8 encoding and decoding
- Character properties
- Case mappings
- Internal message translation (nl_langinfo strings, errors, etc.)
- Message translation API (gettext)
- Charset conversion (iconv)
- Non-ASCII characters in regex and fnmatch patterns/brackets

Rich


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Byte-based C locale, draft 1
  2015-06-06 21:40 [PATCH] Byte-based C locale, draft 1 Rich Felker
  2015-06-06 22:39 ` Harald Becker
@ 2015-06-07  1:17 ` Rich Felker
  2015-06-07  2:50 ` Rich Felker
  2 siblings, 0 replies; 21+ messages in thread
From: Rich Felker @ 2015-06-07  1:17 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 1130 bytes --]

On Sat, Jun 06, 2015 at 05:40:07PM -0400, Rich Felker wrote:
> Before applying this I should probably overhaul fnmatch.c again. I
> believe it has some hard-coded UTF-8 processing code in it for the
> useless "check the tail before middle" step that I've been wanting to
> eliminate. Alternatively I could just apply a quick fix to make it
> work right without any invasive changes.
> 
> Other than possible weird cases with fnmatch (which are largely
> harmless but might inhibit matching high bytes in non-UTF-8 mode),
> this code should be ready for testing. I'd appreciate some feedback
> from anyone interested in the feature.

On further review, the special last-component handling fnmatch does is
not wrong, just wrongly ordered. It should take place after the "sea
of stars" component is processed, rather than before, to avoid O(n)
operation (essentially strlen) when an early failure could be
detected. But since only the ordering is wrong, I think fixing it is
orthogonal to the bytelocale work, and a single-line patch to add a
case for MB_CUR_MAX==1 should just be added to this proposed patch
(see attached).

Rich

[-- Attachment #2: bytelocale_v1_fnmatch.diff --]
[-- Type: text/plain, Size: 717 bytes --]

diff --git a/src/regex/fnmatch.c b/src/regex/fnmatch.c
index 7f6b65f..978fff8 100644
--- a/src/regex/fnmatch.c
+++ b/src/regex/fnmatch.c
@@ -18,6 +18,7 @@
 #include <stdlib.h>
 #include <wchar.h>
 #include <wctype.h>
+#include "locale_impl.h"
 
 #define END 0
 #define UNMATCHABLE -2
@@ -229,7 +230,7 @@ static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n
 	 * On illegal sequences we may get it wrong, but in that case
 	 * we necessarily have a matching failure anyway. */
 	for (s=endstr; s>str && tailcnt; tailcnt--) {
-		if (s[-1] < 128U) s--;
+		if (s[-1] < 128U || MB_CUR_MAX==1) s--;
 		else while ((unsigned char)*--s-0x80U<0x40 && s>str);
 	}
 	if (tailcnt) return FNM_NOMATCH;

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Byte-based C locale, draft 1
  2015-06-06 21:40 [PATCH] Byte-based C locale, draft 1 Rich Felker
  2015-06-06 22:39 ` Harald Becker
  2015-06-07  1:17 ` [PATCH] Byte-based C locale, draft 1 Rich Felker
@ 2015-06-07  2:50 ` Rich Felker
  2015-06-13  7:06   ` [PATCH] Byte-based C locale, draft 2 Rich Felker
  2 siblings, 1 reply; 21+ messages in thread
From: Rich Felker @ 2015-06-07  2:50 UTC (permalink / raw)
  To: musl

On Sat, Jun 06, 2015 at 05:40:07PM -0400, Rich Felker wrote:
> Attached is the first draft of a proposed byte-based C locale. The
> patch is about 400 lines but most of it is context, because it's
> basically a lot of tiny changes spread out over lots of files.
> [...]

If we go forward with this, I think I can factor it into 3 parts:

1. Add checks for MB_CUR_MAX==1 and the bytelocale support they would
   activate, and the CODEUNIT/IS_CODEUNIT macros needed for these code
   paths. This patch would be a complete nop and would not even affect
   codegen with a decent compiler since MB_CUR_MAX==4 is a constant
   right now.

2. Introduce stdio saving of active LC_CTYPE at the time of stream
   orientation (fwide) and save/restore of current locale around stdio
   ops that need it (fputwc, fgetwc, ungetwc) and iconv usage of
   multibyte functions. This patch would increase code size in a few
   places but would not change behavior.

3. Replace the constant MB_CUR_MAX macro with a runtime-variable value
   dependent on CURRENT_LOCALE->cat[LC_CTYPE]. This would actually
   activate the byte-based C locale support. locale_impl.h is actually
   already doing this, so I think I should remove that definition
   before making any changes and only bring it back if/when stage 3
   here is committed.

In principle stages 1 and 2 could be committed in either order;
they're independent. Stage 3 is also independent in what it touches,
but if it's already committed before stage 1/2, then committing stage
1 without stage 2 is a functional regression (stdio functions no
longer behave according to spec; iconv stops working in C locale).

Rich

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: Build option to disable locale [was: Byte-based C locale, draft 1]
  2015-06-07  0:24       ` Rich Felker
@ 2015-06-07 23:59         ` Harald Becker
  2015-06-08  0:28           ` Josiah Worcester
  2015-06-08  0:33           ` Rich Felker
  0 siblings, 2 replies; 21+ messages in thread
From: Harald Becker @ 2015-06-07 23:59 UTC (permalink / raw)
  To: musl

On 07.06.2015 02:24, Rich Felker wrote:
> It's somewhat more clear what you're talking about, but I'm still not
> sure what specific pieces of code you would want to omit from libc.so.
> Which of the following would you want to remove or keep?

I did not look into all the details ...

In general: Keep the API, but add stubs with minimal operation or fail 
for non-C locales (etc.).

> - UTF-8 encoding and decoding

May be of use to keep, if on bare minimum.

> - Character properties
 > - Case mappings

Keep ASCII, map all non-ASCII to a single value.

> - Internal message translation (nl_langinfo strings, errors, etc.)
 > - Message translation API (gettext)

No translation at all, keep the English messages (as short as possible).

> - Charset conversion (iconv)

Copy ASCII / UTF-8, but fail for all other.

> - Non-ASCII characters in regex and fnmatch patterns/brackets

May be the question to allow for UTF-8, but only those, no other 
charsets (should allow to do some optimization and avoid all the 
extended overhead).

fnmatch: Match non-ASCII just 1:1, no other special operation.

regex: Don't have the experience on the internals of this topic. In 
general allow for 1:1 matching of non-ASCII characters, but otherwise
behave as C locale (e.g. equivalence classes).

--
Harald

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: Build option to disable locale [was: Byte-based C locale, draft 1]
  2015-06-07 23:59         ` Build option to disable locale [was: Byte-based C locale, draft 1] Harald Becker
@ 2015-06-08  0:28           ` Josiah Worcester
  2015-06-08  1:57             ` Harald Becker
  2015-06-08  0:33           ` Rich Felker
  1 sibling, 1 reply; 21+ messages in thread
From: Josiah Worcester @ 2015-06-08  0:28 UTC (permalink / raw)
  To: musl

On Sun, Jun 7, 2015 at 6:59 PM, Harald Becker <ralda@gmx.de> wrote:
> On 07.06.2015 02:24, Rich Felker wrote:
>>
>> It's somewhat more clear what you're talking about, but I'm still not
>> sure what specific pieces of code you would want to omit from libc.so.
>> Which of the following would you want to remove or keep?
>
>
> I did not look into all the details ...
>

To start with: keep in mind that in the case of static linking most of
this is not at all pulled in except when strictly necessary. Static
linking might be more relevant to your needs.

> In general: Keep the API, but add stubs with minimal operation or fail for
> none C locale (etc.).
>
>> - UTF-8 encoding and decoding
>
>
> May be of use to keep, if on bare minimum.

Seeing as the UTF-8 decoder is very small already, I'd be shocked if
you could make an argument for removing that.

>> - Character properties
>
>> - Case mappings
>
> Keep ASCII, map all none ASCII to a single value.

This would be not-quite-right. Also, the case mapping tables are quite
small. towctrans.lo which contains the case mappings is 1106 bytes.

>> - Internal message translation (nl_langinfo strings, errors, etc.)
>
>> - Message translation API (gettext)
>
> No translation at all, keep the English messages (as short as possible).

musl does not have any translations in it at all. It only has a small
portion of logic able to load external translations. locale_map.lo and
__mo_lookup.lo which are together responsible for this, are a total of
1471 bytes.

>> - Charset conversion (iconv)
>
>
> Copy ASCII / UTF-8, but fail for all other.

Though quite possible, it's worth noting that musl iconv is not very
large. iconv.lo is 128408 bytes, or 125k.

>> - Non-ASCII characters in regex and fnmatch patterns/brackets
>
>
> May be the question to allow for UTF-8, but only those, no other charsets
> (should allow to do some optimization and avoid all the extended overhead).

This is already the case.

> fnmatch: Match None ASCII just 1:1, no other special operation.

fnmatch.lo itself is 2227 bytes right now and none of that is in UTF-8
handling. The body of that is in mbtowc.lo and mbsrtowcs.lo, which are
227 bytes and 636 bytes respectively.

> regex: Don't have the experience on the internals of this topic. In general
> allow for 1:1 matching of none ASCII characters, but otherwise behave as C
> locale (e.g. equivalence classes).
>

The regex equivalence classes are handled via the isw* functions which
(as mentioned above) are quite small.

In short, it seems like if we made these changes we'd maybe be able to
trim out 135k and almost all of that would be in iconv. Though I
appreciate the desire for smaller code, this doesn't quite seem like
the place to go looking.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: Build option to disable locale [was: Byte-based C locale, draft 1]
  2015-06-07 23:59         ` Build option to disable locale [was: Byte-based C locale, draft 1] Harald Becker
  2015-06-08  0:28           ` Josiah Worcester
@ 2015-06-08  0:33           ` Rich Felker
  2015-06-08  2:46             ` Harald Becker
  1 sibling, 1 reply; 21+ messages in thread
From: Rich Felker @ 2015-06-08  0:33 UTC (permalink / raw)
  To: musl

On Mon, Jun 08, 2015 at 01:59:35AM +0200, Harald Becker wrote:
> On 07.06.2015 02:24, Rich Felker wrote:
> >It's somewhat more clear what you're talking about, but I'm still not
> >sure what specific pieces of code you would want to omit from libc.so.
> >Which of the following would you want to remove or keep?
> 
> I did not look into all the details ...
> 
> In general: Keep the API, but add stubs with minimal operation or
> fail for none C locale (etc.).
> 
> >- UTF-8 encoding and decoding
> 
> May be of use to keep, if on bare minimum.

This is roughly 3k of code, and is mandatory if you want to say you
"support UTF-8" at all. I'll note the other parts that fundamentally
depend on it.

> >- Character properties
> > - Case mappings
> 
> Keep ASCII, map all none ASCII to a single value.

I assume by "map to a single value" you mean uniform properties for
all non-ASCII Unicode characters, e.g. just printable but nothing
else. Case-mapping everything down to one character would not be a
good idea. :-)

Character properties are roughly 11k of code. Case mappings are 1k of
code.

Note that while some of the properties are arguably not very useful
(the wctype system does not give you enough information to do serious
text processing with them), without the wcwidth property, you cannot
properly display non-ASCII text on a terminal. So at least this one,
which takes 3k, is pretty critical to "UTF-8 support".

> >- Internal message translation (nl_langinfo strings, errors, etc.)
> > - Message translation API (gettext)
> 
> No translation at all, keep the English messages (as short as possible).

The internal translation support is about 2k. The gettext system is
roughly another 2k on top of that (and depends on the former).

I agree this is completely non-mandatory for "UTF-8 support" and
that's why musl originally didn't have it.

> >- Charset conversion (iconv)
> 
> Copy ASCII / UTF-8, but fail for all other.

iconv is big. About 128k. The ability to selectively omit some or all
legacy charsets from iconv is a long-term goal.

Of course if you have an actual need for character set conversion,
e.g. reading email in mutt, then your alternative to musl's 128k iconv
is GNU libiconv weighing in at several MB...

> >- Non-ASCII characters in regex and fnmatch patterns/brackets
> 
> May be the question to allow for UTF-8, but only those, no other
> charsets (should allow to do some optimization and avoid all the
> extended overhead).

That's how it is now.

> fnmatch: Match None ASCII just 1:1, no other special operation.
> 
> regex: Don't have the experience on the internals of this topic. In
> general allow for 1:1 matching of none ASCII characters, but
> otherwise behave as C locale (e.g. equivalence classes).

For both fnmatch and regex, the single-character-match (? or .
respectively) matches characters, not bytes. Likewise bracket
expressions match characters. In order for this to work at all, you
need UTF-8 decoding (see above).

There's no directly measurable code size cost for these items; the
savings from not doing UTF-8 would come from completely different code
that doesn't now exist in musl for bypassing mbtowc and just working
directly on input bytes.

So aside from iconv, the above seem to total around 19k, and at least
6k of that is mandatory if you want to be able to claim to support
UTF-8. So the topic at hand seems to be whether you can save <13k of
libc.so size by hacking out character handling/locale related features
that are non-essential to basic UTF-8 support...

Rich

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: Build option to disable locale [was: Byte-based C locale, draft 1]
  2015-06-08  0:28           ` Josiah Worcester
@ 2015-06-08  1:57             ` Harald Becker
  2015-06-08  2:36               ` Rich Felker
  0 siblings, 1 reply; 21+ messages in thread
From: Harald Becker @ 2015-06-08  1:57 UTC (permalink / raw)
  To: musl

Hi Josiah !

On 08.06.2015 02:28, Josiah Worcester wrote:
> To start with: keep in mind that in the case of static linking most of
> this is not at all pulled in except when strictly necessary. Static
> linking might be more relevant to your needs.

You missed my lead in message: I don't talk about static linking (which 
is what I currently do, replacing other stuff). I'm looking for a really 
small shared C library.

As I think, the time for many different char sets is gone, we should 
step toward a small lib which fits better for two purposes:

1) really small systems with a minimal set of operation (usually those 
systems work with ASCII only, pure C locale).

2) Systems which use ASCII for its majority, and use only a base set of 
UTF-8 operation.

This should be compatible for all systems using C locale, but does not 
fit full desktop system setups.

The problem is: Stripping all the functions on every new release is too 
much work, so it needs some minimal support in the library build 
process. Which should be pure optional, enabling the full version as 
default.

>>> - UTF-8 encoding and decoding
>> May be of use to keep, if on bare minimum.
>
> Seeing as the UTF-8 decoder is very small already, I'd be shocked if
> you could make an argument for removing that.

That's why I told "keep". I know it to be small, so keep it small and 
fast, and UTF-8 only (not supporting other multi byte char sets).

>
>>> - Character properties
>>
>>> - Case mappings
>>
>> Keep ASCII, map all none ASCII to a single value.
>
> This would be not-quite-right. Also, the case mapping tables are quite
> small. towctrans.lo which contains the case mappings is 1106 bytes.

Sorry for my poor English. I mean, let all functions behave as for 
locale "C" (ASCII), but don't fail / break when there is an embedded 
UTF-8 sequence. Just say "this is not ASCII" or may be: "this is UTF-8".

>>> - Internal message translation (nl_langinfo strings, errors, etc.)
>>
>>> - Message translation API (gettext)
>>
>> No translation at all, keep the English messages (as short as possible).
>
> musl does not have any translations in it at all. It only has a small
> portion of logic able to load external translations. locale_map.lo and
> __mo_lookup.lo which are together responsible for this, are a total of
> 1471 bytes.

I mean drop the portion to load external translations.

>>> - Charset conversion (iconv)
>>
>>
>> Copy ASCII / UTF-8, but fail for all other.
>
> Though quite possible, it's worth noting that musl iconv is not very
> large. iconv.lo is 128408 bytes, or 125k.

A big hunk to kick off. Just keep a stub that allows for ASCII and UTF-8 
and 1:1 copy operation. Should be possible to draw that down to not more 
than 1k.

> The regex equivalence classes are handled via the isw* functions which
> (as mentioned above) are quite small.

So handle them as for the "C" locale, but don't fail/break when someone 
enters an UTF-8 sequence.

> In short, it seems like if we made these changes we'd maybe be able to
> trim out 135k and almost all of that would be in iconv. Though I
> appreciate the desire for smaller code, this doesn't quite seem like
> the place to go looking.

My request is, to get a shared lib which does not need to include all 
that other char set and locale code. An optional build option, but with 
the need of some minimal support, so it can persist for future releases. 
I dislike doing all the strip down work over and over again for any new 
release.

The resulting library shall be standards conform and fully operational, 
as long as the applications only use the bare C locale, but should allow 
to pass through embedded UTF-8 sequences.

--
Harald

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: Build option to disable locale [was: Byte-based C locale, draft 1]
  2015-06-08  1:57             ` Harald Becker
@ 2015-06-08  2:36               ` Rich Felker
  2015-06-08  3:35                 ` Harald Becker
  0 siblings, 1 reply; 21+ messages in thread
From: Rich Felker @ 2015-06-08  2:36 UTC (permalink / raw)
  To: musl

On Mon, Jun 08, 2015 at 03:57:01AM +0200, Harald Becker wrote:
> Hi Josiah !
> 
> On 08.06.2015 02:28, Josiah Worcester wrote:
> >To start with: keep in mind that in the case of static linking most of
> >this is not at all pulled in except when strictly necessary. Static
> >linking might be more relevant to your needs.
> 
> You missed my lead in message: I don't talk about static linking
> (which is what I currently do, replacing other stuff). I'm looking
> for a really small shared C library.

Do you have an application in mind where saving ~13k in libc.so would
make the difference between being able to use it or not?

> The problem is: Stripping all the functions on every new release is
> too much work, so it needs some minimal support in the library build
> process. Which should be pure optional, enabling the full version as
> default.

Maintaining useless configuration knobs and having to test them all is
also too much work, and the main reason why uClibc is dying.

> >>>- UTF-8 encoding and decoding
> >>May be of use to keep, if on bare minimum.
> >
> >Seeing as the UTF-8 decoder is very small already, I'd be shocked if
> >you could make an argument for removing that.
> 
> That's why I told "keep". I know it to be small, so keep it small
> and fast, and UTF-8 only (not supporting other multi byte char
> sets).

That's how it always has been. You could actually make it smaller by
writing the string functions as trivial character-at-a-time loops on
top of mbtowc/wctomb, but you'd be sacrificing performance to save a
few hundred bytes at most. This does not seem like a worthwhile
trade-off.

> >>>- Character properties
> >>
> >>>- Case mappings
> >>
> >>Keep ASCII, map all none ASCII to a single value.
> >
> >This would be not-quite-right. Also, the case mapping tables are quite
> >small. towctrans.lo which contains the case mappings is 1106 bytes.
> 
> Sorry for my poor English. I mean, let all functions behave as for
> locale "C" (ASCII), but don't fail / break when there is an embedded
> UTF-8 sequence. Just say "this is not ASCII" or may be: "this is
> UTF-8".

musl's "C" locale is _not_ ASCII but is UTF-8. Supporting both
byte-at-a-time operation in the C locale and proper UTF-8 handling
otherwise is the topic of this thread, and it very mildly increases
code size.

> >>>- Internal message translation (nl_langinfo strings, errors, etc.)
> >>
> >>>- Message translation API (gettext)
> >>
> >>No translation at all, keep the English messages (as short as possible).
> >
> >musl does not have any translations in it at all. It only has a small
> >portion of logic able to load external translations. locale_map.lo and
> >__mo_lookup.lo which are together responsible for this, are a total of
> >1471 bytes.
> 
> I mean drop the portion to load external translations.

This probably could be a configurable option at some point, but it's
not going to save you any meaningful amount of space. Best case would
be saving about 4k.

> >>>- Charset conversion (iconv)
> >>
> >>
> >>Copy ASCII / UTF-8, but fail for all other.
> >
> >Though quite possible, it's worth noting that musl iconv is not very
> >large. iconv.lo is 128408 bytes, or 125k.
> 
> A big hunk to kick off. Just keep a stub that allows for ASCII and
> UTF-8 and 1:1 copy operation. Should be possible to draw that down
> to not more than 1k.

119k of the 128k for iconv is legacy CJK character set tables. The
code for them is only 1k or so. Legacy 8bit codepages are another 6k.
The easy way to make things optional here would be just dropping
tables for configured-out charsets.

> >The regex equivalence classes are handled via the isw* functions which
> >(as mentioned above) are quite small.
> 
> So handle them as for the "C" locale, but don't fail/break when
> someone enters an UTF-8 sequence.

"Don't break" means handling UTF-8. Thankfully that's small and
changing this code is not i

> >In short, it seems like if we made these changes we'd maybe be able to
> >trim out 135k and almost all of that would be in iconv. Though I
> >appreciate the desire for smaller code, this doesn't quite seem like
> >the place to go looking.
> 
> My request is, to get a shared lib which does not need to include
> all that other char set and locale code. An optional build option,
> but with the need of some minimal support, so it can persist for
> future releases. I dislike doing all the strip down work over and
> over again for any new release.

Is this a practical need or an ideological one?

> The resulting library shall be standards conform and fully
> operational, as long as the applications only use the bare C locale,
> but should allow to pass through embedded UTF-8 sequences.

Your goal keeps shifting back between supporting UTF-8 and "passing
through sequences" which is _NOT_ supporting UTF-8. If I accidentally
type a non-ASCII character on a command line then press backspace and
the character disappears but there are still two hidden junk bytes on
the command line, that is a broken system. At many levels, you can get
by with just treating text as abstract byte strings that can pass
through anything, but for any kind of visual presentation (even on a
terminal) or entry/editing, you need to know character identity.
Fortunately, as I've been trying to say, that's extremely cheap. You
could save more space by making the string functions or qsort or
something naive and slow...

Rich

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: Build option to disable locale [was: Byte-based C locale, draft 1]
  2015-06-08  0:33           ` Rich Felker
@ 2015-06-08  2:46             ` Harald Becker
  2015-06-08  4:06               ` Rich Felker
  2015-06-09  3:20               ` Isaac Dunham
  0 siblings, 2 replies; 21+ messages in thread
From: Harald Becker @ 2015-06-08  2:46 UTC (permalink / raw)
  To: musl

On 08.06.2015 02:33, Rich Felker wrote:
> On Mon, Jun 08, 2015 at 01:59:35AM +0200, Harald Becker wrote:
>> On 07.06.2015 02:24, Rich Felker wrote:
>>> It's somewhat more clear what you're talking about, but I'm still not
>>> sure what specific pieces of code you would want to omit from libc.so.
>>> Which of the following would you want to remove or keep?
>>
>> I did not look into all the details ...
>>
>> In general: Keep the API, but add stubs with minimal operation or
>> fail for none C locale (etc.).
>>
>>> - UTF-8 encoding and decoding
>>
>> May be of use to keep, if on bare minimum.
>
> This is roughly 3k of code, and is mandatory if you want to say you
> "support UTF-8" at all. I'll note the other parts that fundamentally
> depend on it.

3k ? Which functions do you add to this? ... and I don't see it is so 
mandatory for pure C locale. UTF-8 shall only pass through and not break 
the base operation.

>>> - Character properties
>>> - Case mappings
>>
>> Keep ASCII, map all none ASCII to a single value.
>
> I assume by "map to a single value" you mean uniform properties for
> all non-ASCII Unicode characters, e.g. just printable but nothing
> else. Case-mapping everything down to one character would not be a
> good idea. :-)

I mean any none ASCII say "NOT ASCII" (or may be "UTF-8"). Don't do any 
case mapping for none "C" locale, etc.

> Character properties are roughly 11k of code. Case mappings are 1k of
> code.

When narrowing things to pure ASCII / C locale, is there still no 
chance to cut this down?

> Note that while some of the properties are arguably not very useful
> (the wctype system does not give you enough information to do serious
> text processing with them), without the wcwidth property, you cannot
> properly display non-ASCII text on a terminal. So at least this one,
> which takes 3k, is pretty critical to "UTF-8 support".

There are not so many applications which require full text processing. 
And the resulting lib shall work correct for any text in the base C 
locale, but shall allow to embed UTF-8 sequences. If an application 
needs to handle those sequences special it has to be done in the 
application.

Again: This not to build a lib for a general purpose desktop system. It 
is for an optional stripped down version.

>>> - Internal message translation (nl_langinfo strings, errors, etc.)
>>> - Message translation API (gettext)
>>
>> No translation at all, keep the English messages (as short as possible).
>
> The internal translation support is about 2k. The gettext system is
> roughly another 2k on top of that (and depends on the former).

Strip down to nearly nothing. Return the key string as result of 
translation, just as if there is no translation available.

> iconv is big. About 128k. The ability to selectively omit some or all
> legacy charsets from iconv is a long-term goal.

This is why I like to cut that down. With dropping everything except 
ASCII / C locale support and may be a base set of UTF-8 operation, it 
should be possible to do heavy optimization.

> Of course if you have an actual need for character set conversion,
> e.g. reading email in mutt, then your alternative to musl's 128k iconv
> is GNU libiconv weighing in at several MB...

If you really have a need for conversion, this option is not for you, or 
you need to do the conversion in the application ... there are not so 
many applications which require such full and flexible character set 
conversions, and even those work with the stripped down version as long 
as you stay at pure ASCII text. For many dedicated and small systems a 
pure ASCII operation may be all required (e.g. emulator sets, container, 
etc.).

> For both fnmatch and regex, the single-character-match (? or .
> respectively) matches characters, not bytes. Likewise bracket
> expressions match characters. In order for this to work at all, you
> need UTF-8 decoding (see above).

No! When optimizing for pure UTF-8 support, you can do clever 
optimizations, eliminating that for and back of UTF-8 character 
conversion completely. That is what I like to get.

e.g. it is simple to match characters not bytes without decoding the 
UTF-8 sequences by just skipping the extension bytes (0x80..0xBF) where 
required.

> There's no directly measurable code size cost for these items; the
> savings from not doing UTF-8 would come from completely different code
> that doesn't now exist in musl for bypassing mbtowc and just working
> directly on input bytes.

That is what i like to get, just working on the stream of input bytes, 
and leave UTF-8 a sequence of bytes in the string. Many applications 
don't care about those sequences either, and for other applications they may 
be acted on correctly, when using some clever programming.

> So aside from iconv, the above seem to total around 19k, and at least
> 6k of that is mandatory if you want to be able to claim to support
> UTF-8. So the topic at hand seems to be whether you can save <13k of
> libc.so size by hacking out character handling/locale related features
> that are non-essential to basic UTF-8 support...

I like to get a stripped down version, which eliminate all the 
unnecessary char set handling code used in dedicated systems, but 
stripping that on every release is too much work to do.

The benefit may be for:

- embedded systems
- small initramfs based systems
- container systems
- minimal chroot environments

Its intention is not as a lib for general purpose desktop systems, but 
musl has reached a state where it's standard compliance boosts it over 
other small libs. The caveat is every new functionality make it 
completely into the shared library. So I'm looking for a possibility to 
drop all that char set and locale handling stuff, without losing API 
compatibility for base C locale operation, and passing through or doing 
some clever handling on UTF-8 sequences.

--
Harald

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: Build option to disable locale [was: Byte-based C locale, draft 1]
  2015-06-08  2:36               ` Rich Felker
@ 2015-06-08  3:35                 ` Harald Becker
  2015-06-08  3:51                   ` Josiah Worcester
  0 siblings, 1 reply; 21+ messages in thread
From: Harald Becker @ 2015-06-08  3:35 UTC (permalink / raw)
  To: musl

On 08.06.2015 04:36, Rich Felker wrote:
> Do you have an application in mind where saving ~13k in libc.so would
> make the difference between being able to use it or not?

I think I answered this already in the other message.

> Maintaining useless configuration knobs and having to test them all is
> also too much work, and the main reason why uClibc is dying.

Too many knobs make it difficult to maintain, this is always a problem, 
but without controllability you get a monolithic hunk which may not fit 
all needs. That is why I look for a knob to remove all the char set and 
locale handling, dropping to a C locale only ... with some clever 
optimization for dedicated UTF-8 handling.

> That's how it always has been. You could actually make it smaller by
> writing the string functions as trivial character-at-a-time loops on
> top of mbtowc/wctomb, but you'd be sacrificing performance to save a
> few hundred bytes at most. This does not seem like a worthwhile
> trade-off.

Rich, UTF-8 allows for character operation without converting for and 
back the individual sequences. I never use mbtowc/wctomb, or even any other 
wide character function. There are only a few operations which really 
need to work at Unicode code point values (not characters).

> musl's "C" locale is _not_ ASCII but is UTF-8. Supporting both
> byte-at-a-time operation in the C locale and proper UTF-8 handling
> otherwise is the topic of this thread, and it very mildly increases
> code size.

So this is why I jumped in on that topic. You approximately get near 
what I like to get when you read my request as: "Create a lib with that 
single C locale handling, without any other locale or char set stuff".

... but I think you still do some UTF-8 operations too complicated. It 
looks to me you are fixating on that wide character and multi byte 
parts, which I consider not to be required. UTF-8 is a sequence of 
bytes, so keep them just as this sequence. Most string operations allow 
for clever optimization.

> This probably could be a configurable option at some point, but it's
> not going to save you any meaningful amount of space. Best case would
> be saving about 4k.

... another 4k of unnecessary code.

> 119k of the 128k for iconv is legacy CJK character set tables. The
> code for them is only 1k or so. Legacy 8bit codepages are another 6k.
> The easy way to make things optional here would be just dropping
> tables for configured-out charsets.

Kick them all off, just ASCII, UTF-8, and 1:1 copy operation.

> "Don't break" means handling UTF-8.

"Don't break" means clever handling of embedded UTF-8 sequences, without 
conversion of every sequence to wide char values, or similar.

> Is this a practical need or an ideological one?

I consider this practical need, as I like to setup highly specialized 
chroot environments (isolating the applications to their controlled set 
of accessible data, like virtual hosts).

> Your goal keeps shifting back between supporting UTF-8 and "passing
> through sequences" which is _NOT_ supporting UTF-8. If I accidentally
> type a non-ASCII character on a command line then press backspace and
> the character disappears but there are still two hidden junk bytes on
> the command line, that is a broken system.

That would be a broken input system or read line system. In cooked mode 
you won't get that either, and in raw mode you need and can handle that 
with some simple checks. So you don't need that full multi byte and wide 
character handling to handle UTF-8.

> by with just treating text as abstract byte strings that can pass
> through anything, but for any kind of visual presentation (even on a
> terminal) or entry/editing, you need to know character identity.

Do you? I consider no. The terminal program (or font handling part) 
may need this. Even on a simple textual Linux console, the kernel 
knows how to display the character. The only thing to know, is the 
correct number of bytes to send for a specific character.

> Fortunately, as I've been trying to say, that's extremely cheap. You
> could save more space by making the string functions or qsort or
> something naive and slow...

Do they need to be slow? UTF-8 can be handled without that full wide 
character and multi byte stuff you are throwing in, and I want to get 
bare ASCII operation plus this simple UTF-8 handling. Mostly musl does 
this, but adds in some other char set and locale handling stuff, which I 
like to opt out. On statical linking it is easy, but I like to get a 
shared library without all that extra stuff.

--
Harald

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: Build option to disable locale [was: Byte-based C locale, draft 1]
  2015-06-08  3:35                 ` Harald Becker
@ 2015-06-08  3:51                   ` Josiah Worcester
  0 siblings, 0 replies; 21+ messages in thread
From: Josiah Worcester @ 2015-06-08  3:51 UTC (permalink / raw)
  To: musl

On Sun, Jun 7, 2015 at 10:35 PM, Harald Becker <ralda@gmx.de> wrote:
> On 08.06.2015 04:36, Rich Felker wrote:
>>
>> Do you have an application in mind where saving ~13k in libc.so would
>> make the difference between being able to use it or not?
>
>
> I think I answered this already in the other message.
>
>> Maintaining useless configuration knobs and having to test them all is
>> also too much work, and the main reason why uClibc is dying.
>
>
> Too many knobs make it difficult to maintain, this is always a problem, but
> without controllability you get a monolithic hunk which may not fit all
> needs. That is why I look for a knob to remove all the char set and locale
> handling, dropping to a C locale only ... with some clever optimization for
> dedicated UTF-8 handling.
>

The point is "all the charset and locale handling" is minuscule,
spread-out, and not easily removed. It's also only UTF-8 handling.

>> That's how it always has been. You could actually make it smaller by
>> writing the string functions as trivial character-at-a-time loops on
>> top of mbtowc/wctomb, but you'd be sacrificing performance to save a
>> few hundred bytes at most. This does not seem like a worthwhile
>> trade-off.
>
>
> Rich, UTF-8 allows for character operation without converting for and back
> the individual sequences. I never use mbtowc/wctomb, or even any other wide
> character function. There are only a few operations which really need to
> work at Unicode code point values (not characters).

You need to be incredibly careful and essentially parse straight to
codepoints in order to not accept invalid UTF-8. Most notably,
over-long sequences and surrogate pairs should not be accepted and to
understand those you have to parse it.

>> musl's "C" locale is _not_ ASCII but is UTF-8. Supporting both
>> byte-at-a-time operation in the C locale and proper UTF-8 handling
>> otherwise is the topic of this thread, and it very mildly increases
>> code size.
>
>
> So this is why I jumped in on that topic. You approximately get near what I
> like to get when you read my request as: "Create a lib with that single C
> locale handling, without any other locale or char set stuff".
>
> ... but I think you still do some UTF-8 operations too complicated. It looks
> to me you are fixating on that wide character and multi byte parts, which I
> consider not to be required. UTF-8 is a sequence of bytes, so keep them just
> as this sequence. Most string operations allow for clever optimization.

When it's reasonable musl does operate right on byte sequences. But
there are operations that only make sense on units of codepoints.

>> This probably could be a configurable option at some point, but it's
>> not going to save you any meaningful amount of space. Best case would
>> be saving about 4k.
>
> ... another 4k of unnecessary code.

4k is not exactly worth optimizing unless there's a really compelling
reason, especially when it provides this much benefit.

>> 119k of the 128k for iconv is legacy CJK character set tables. The
>> code for them is only 1k or so. Legacy 8bit codepages are another 6k.
>> The easy way to make things optional here would be just dropping
>> tables for configured-out charsets.
>
>
> Kick them all off, just ASCII, UTF-8, and 1:1 copy operation.
>
>> "Don't break" means handling UTF-8.
>
>
> "Don't break" means clever handling of embedded UTF-8 sequences, without
> conversion of every sequence to wide char values, or similar.

If you don't parse UTF-8 you accept invalid UTF-8 which is itself a
terrible bug.

>> Is this a practical need or an ideological one?
>
>
> I consider this practical need, as I like to setup highly specialized chroot
> environments (isolating the applications to their controlled set of
> accessible data, like virtual hosts).

We are arguing over 135 kilobytes maximum. Even if (*if*) you could
strip some space from libc here, I guarantee there's much more
interesting things to optimize. For instance, your kernel is several
megabytes, your applications are incredibly unlikely to be reasonably
efficient with space, etc.

As such this is vastly more ideological (and silly) than practical.
Especially as those kilobytes (kilobytes!) are very useful kilobytes.

>
>> Your goal keeps shifting back between supporting UTF-8 and "passing
>> through sequences" which is _NOT_ supporting UTF-8. If I accidentally
>> type a non-ASCII character on a command line then press backspace and
>> the character disappears but there are still two hidden junk bytes on
>> the command line, that is a broken system.
>
>
> That would be a broken input system or read line system. In cooked mode you
> won't get that either, and in raw mode you need and can handle that with
> some simple checks. So you don't need that full multi byte and wide
> character handling to handle UTF-8.

You need to know the width in units of terminal cells of characters if
you're doing anything involving moving the cursor or editing the
screen though...
And again, you need to parse UTF-8 to not accept invalid UTF-8.

>> by with just treating text as abstract byte strings that can pass
>> through anything, but for any kind of visual presentation (even on a
>> terminal) or entry/editing, you need to know character identity.
>
>
> Do you? I consider no. The terminal program (or font handling part) may
> need this. Even on a simple textual Linux console, the kernel knows how
> to display the character. The only thing to know, is the correct number of
> bytes to send for a specific character.

And the width of the character in terminal cells.

>> Fortunately, as I've been trying to say, that's extremely cheap. You
>> could save more space by making the string functions or qsort or
>> something naive and slow...
>
>
> Do they need to be slow? UTF-8 can be handled without that full wide
> character and multi byte stuff you are throwing in, and I want to get bare
> ASCII operation plus this simple UTF-8 handling. Mostly musl does this, but
> adds in some other char set and locale handling stuff, which I like to opt
> out. On statical linking it is easy, but I like to get a shared library
> without all that extra stuff.

"Simple UTF-8 handling" is what we have. Have you looked at the wide
character stuff? If you start taking stuff off you literally stop
doing UTF-8 right.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: Build option to disable locale [was: Byte-based C locale, draft 1]
  2015-06-08  2:46             ` Harald Becker
@ 2015-06-08  4:06               ` Rich Felker
  2015-06-09  3:20               ` Isaac Dunham
  1 sibling, 0 replies; 21+ messages in thread
From: Rich Felker @ 2015-06-08  4:06 UTC (permalink / raw)
  To: musl

On Mon, Jun 08, 2015 at 04:46:42AM +0200, Harald Becker wrote:
> On 08.06.2015 02:33, Rich Felker wrote:
> >On Mon, Jun 08, 2015 at 01:59:35AM +0200, Harald Becker wrote:
> >>On 07.06.2015 02:24, Rich Felker wrote:
> >>>It's somewhat more clear what you're talking about, but I'm still not
> >>>sure what specific pieces of code you would want to omit from libc.so.
> >>>Which of the following would you want to remove or keep?
> >>
> >>I did not look into all the details ...
> >>
> >>In general: Keep the API, but add stubs with minimal operation or
> >>fail for none C locale (etc.).
> >>
> >>>- UTF-8 encoding and decoding
> >>
> >>May be of use to keep, if on bare minimum.
> >
> >This is roughly 3k of code, and is mandatory if you want to say you
> >"support UTF-8" at all. I'll note the other parts that fundamentally
> >depend on it.
> 
> 3k ? Which functions do you add to this? ... and I don't see it is
> so mandatory for pure C locale. UTF-8 shall only pass through and
> not break the base operation.

Some of that is just the sheer number of interfaces mandated by the
language; each one is going to be at least 50 bytes or so even if it
basically does nothing. But most of it is about having non-pitiful
performance for string ops. The core UTF-8 encode/decode is under 800
bytes total and could be reduced to around 500 at a moderate
performance cost.

> >>>- Character properties
> >>>- Case mappings
> >>
> >>Keep ASCII, map all none ASCII to a single value.
> >
> >I assume by "map to a single value" you mean uniform properties for
> >all non-ASCII Unicode characters, e.g. just printable but nothing
> >else. Case-mapping everything down to one character would not be a
> >good idea. :-)
> 
> I mean any none ASCII say "NOT ASCII" (or may be "UTF-8"). Don't do
> any case mapping for none "C" locale, etc.

Whatever you do, case mappings are one of the cheapest parts, at only
1k. I don't see any benefit to be had here. You'd probably spend that
much in _code_ size complexifying the code to deal with the
possibility of variable case-mapping state.

> >Character properties are roughly 11k of code. Case mappings are 1k of
> >code.
> 
> When narrowing things to pure ASCII / C locale, are there still no
> chance to cut this down?

In that case you have a broken system that does not support UTF-8.
This is not something musl will support. One of the core principles of
musl is that there are not two tiers of characters/languages.
Everything is UTF-8 and all characters are usable anywhere.

> >Note that while some of the properties are arguably not very useful
> >(the wctype system does not give you enough information to do serious
> >text processing with them), without the wcwidth property, you cannot
> >properly display non-ASCII text on a terminal. So at least this one,
> >which takes 3k, is pretty critical to "UTF-8 support".
> 
> There are not so many applications which require full text
> processing. And the resulting lib shall work correct for any text in
> the base C locale, but shall allow to embed UTF-8 sequences. If an
> application needs to handle those sequences special it has to be
> done in the application.
> 
> Again: This not to build a lib for a general purpose desktop system.
> It is for an optional stripped down version.

For what purpose? A size reduction of 1-2% is not going to make the
difference between libc.so fitting somebody's needs or not fitting
them. And musl is pretty close to being as small as you can make a
full POSIX libc without resorting to abysmal performance (e.g.
quadratic qsort and strstr, all byte-at-a-time string functions,
unbuffered stdio, etc.) so I don't think there's even a theoretical
"these savings might be more than 2% if other parts of libc were more
size-optimized".

> >>>- Internal message translation (nl_langinfo strings, errors, etc.)
> >>>- Message translation API (gettext)
> >>
> >>No translation at all, keep the English messages (as short as possible).
> >
> >The internal translation support is about 2k. The gettext system is
> >roughly another 2k on top of that (and depends on the former).
> 
> Strip down to nearly nothing. Return the key string as result of
> translation, just as if there is no translation available.

Yes, that's doable but not very beneficial.

> >iconv is big. About 128k. The ability to selectively omit some or all
> >legacy charsets from iconv is a long-term goal.
> 
> This is why I like to cut that down. With dropping everything except
> ASCII / C locale support and may be a base set of UTF-8 operation,
> it should be possible to do heavy optimization.

This is the one item in the list that is a strong candidate for
configurability. It's very large (roughly 25% the size of libc.so) and
omitting it has no impact on C/POSIX correctness or on first-class
status of all characters. It's just optional, implementation-defined
functionality (albeit very useful functionality).

> >Of course if you have an actual need for character set conversion,
> >e.g. reading email in mutt, then your alternative to musl's 128k iconv
> >is GNU libiconv weighing in at several MB...
> 
> If you really have a need for conversion, this option is not for
> you, or you need to do the conversion in the application ... there
> are not so many applications which require such full and flexible
> character set conversions, and even those work with the stripped
> down version as long as you stay at pure ASCII text. For many
> dedicated and small systems a pure ASCII operation may be all
> required (e.g. emulator sets, container, etc.).
> 
> >For both fnmatch and regex, the single-character-match (? or .
> >respectively) matches characters, not bytes. Likewise bracket
> >expressions match characters. In order for this to work at all, you
> >need UTF-8 decoding (see above).
> 
> No! When optimizing for pure UTF-8 support, you can do clever
> optimizations, eliminating that for and back of UTF-8 character
> conversion completely. That is what I like to get.

I would like to do UTF-8-specific optimizations in regex, but what
they buy you is performance, not code size. The code is moderately
larger because generating a byte-based NFA for matching dot, bracket,
etc. of UTF-8 characters is much more complex than a character-based
NFA.

> e.g. it is simple to match characters not bytes without decoding the
> UTF-8 sequences by just skipping the extension bytes (0x80..0xBF)
> where required.

No, that doesn't work because it also matches junk, including illegal
sequences which are highly unsafe to match. For a UTF-8-specific regex
implementation that works in bytes, "." has to compile as the pattern
represented by the ABNF in RFC 3629.

> >So aside from iconv, the above seem to total around 19k, and at least
> >6k of that is mandatory if you want to be able to claim to support
> >UTF-8. So the topic at hand seems to be whether you can save <13k of
> >libc.so size by hacking out character handling/locale related features
> >that are non-essential to basic UTF-8 support...
> 
> I like to get a stripped down version, which eliminate all the
> unnecessary char set handling code used in dedicated systems, but
> stripping that on every release is too much work to do.
> 
> The benefit may be for:
> 
> - embedded systems
> - small initramfs based systems
> - container systems
> - minimal chroot environments

"May be"? I would like at least one citation for an instance where
stripping 1-2% of the size from libc.so makes a difference for any of
these. As stated before I'm open to (and in fact aiming for, in the
long term) making it possible to reduce iconv coverage, but I haven't
seen anything remotely convincing as an argument to break things all
over libc and break the property that the entire UTF-8 character space
has first-class status for the sake of saving a couple kb.

> It's intention is not as a lib for general purpose desktop systems,
> but musl has reached a state where it's standard compliance boosts
> it over other small libs. The caveat is every new functionality make
> it completely into the shared library. So I'm looking for a
> possibility to drop all that char set and locale handling stuff,
> without losing API compatibility for base C locale operation, and
> passing through or doing some clever handling on UTF-8 sequences.

Doing UTF-8 right is hard because it's a balance of lots of
constraints. Most people get them wrong. Pretending UTF-8 is just
another 8-bit encoding does not work for the vast majority of cases,
and while there certainly ARE cases where that is possible, and there
ARE people who are capable of finding them and taking advantage of
them, if nothing else this thread has served to demonstrate that
strong desire to take these shortcuts is not highly correlated with
being qualified to know when they can be taken...

Rich

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: Build option to disable locale [was: Byte-based C locale, draft 1]
  2015-06-08  2:46             ` Harald Becker
  2015-06-08  4:06               ` Rich Felker
@ 2015-06-09  3:20               ` Isaac Dunham
  2015-06-09  4:27                 ` Rich Felker
  1 sibling, 1 reply; 21+ messages in thread
From: Isaac Dunham @ 2015-06-09  3:20 UTC (permalink / raw)
  To: musl

On Mon, Jun 08, 2015 at 04:46:42AM +0200, Harald Becker wrote:
> On 08.06.2015 02:33, Rich Felker wrote:
> >So aside from iconv, the above seem to total around 19k, and at least
> >6k of that is mandatory if you want to be able to claim to support
> >UTF-8. So the topic at hand seems to be whether you can save <13k of
> >libc.so size by hacking out character handling/locale related features
> >that are non-essential to basic UTF-8 support...
> 
> I like to get a stripped down version, which eliminate all the unnecessary
> char set handling code used in dedicated systems, but stripping that on
> every release is too much work to do.
> 
> The benefit may be for:
> 
> - embedded systems
> - small initramfs based systems
> - container systems
> - minimal chroot environments

Somehow it sounds like you may not have gotten what Rich was asking.

IIRC, the goals of musl include full native support for UTF-8; keeping 
the time complexity to a minimum; and clean, correct code.

Dropping out 'legacy' charsets doesn't really sacrifice those goals.
But the other changes have a much bigger impact on them.
So you're probably going to have to convince Rich that there *is* a
major benefit ('is' != 'could be').

For container systems or minimal chroot environments, you're dealing
with something that doesn't have a hard size limit, and if a chroot
or container runs ~6 MB ordinarily, you might be able to run 0.3% more
on the same hardware. That's probably not enough of a case.
For initramfs-based systems, you've got a similar situation but no
chance to multiply the effect, unless you're using a VM or hypervisor.

Now, since embedded systems have hard limits on size, you might be
able to make a case there. But you will need to come up with something
more specific, such as "I have a system where I could upgrade the kernel
to 2.6.xx *if* musl were ~20k smaller than building with a minimal
iconv" or "If we did this, there would be enough space to switch XYZ
router firmware from telnetd to dropbear".

HTH,
Isaac Dunham

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: Build option to disable locale [was: Byte-based C locale, draft 1]
  2015-06-09  3:20               ` Isaac Dunham
@ 2015-06-09  4:27                 ` Rich Felker
  0 siblings, 0 replies; 21+ messages in thread
From: Rich Felker @ 2015-06-09  4:27 UTC (permalink / raw)
  To: musl

On Mon, Jun 08, 2015 at 08:20:26PM -0700, Isaac Dunham wrote:
> On Mon, Jun 08, 2015 at 04:46:42AM +0200, Harald Becker wrote:
> > On 08.06.2015 02:33, Rich Felker wrote:
> > >So aside from iconv, the above seem to total around 19k, and at least
> > >6k of that is mandatory if you want to be able to claim to support
> > >UTF-8. So the topic at hand seems to be whether you can save <13k of
> > >libc.so size by hacking out character handling/locale related features
> > >that are non-essential to basic UTF-8 support...
> > 
> > I like to get a stripped down version, which eliminate all the unnecessary
> > char set handling code used in dedicated systems, but stripping that on
> > every release is too much work to do.
> > 
> > The benefit may be for:
> > 
> > - embedded systems
> > - small initramfs based systems
> > - container systems
> > - minimal chroot environments
> 
> Somehow it sounds like you may not have gotten what Rich was asking.
> 
> IIRC, the goals of musl include full native support for UTF-8; keeping 
> the time complexity to a minimum; and clean, correct code.
> 
> Dropping out 'legacy' charsets doesn't really sacrifice those goals.
> But the other changes have a much bigger impact on them.
> So you're probably going to have to convince Rich that there *is* a
> major benefit ('is' != 'could be').
> 
> For container systems or minimal chroot environments, you're dealing
> with something that doesn't have a hard size limit, and if a chroot
> or container runs ~6 MB ordinarily, you might be able to run 0.3% more
> on the same hardware. That's probably not enough of a case.
> For initramfs-based systems, you've got a similar situation but no
> chance to multiply the effect, unless you're using a VM or hypervisor.
> 
> Now, since embedded systems have hard limits on size, you might be
> able to make a case there. But you will need to come up with something
> more specific, such as "I have a system where I could upgrade the kernel
> to 2.6.xx *if* musl were ~20k smaller than building with a minimal
> iconv" or "If we did this, there would be enough space to switch XYZ
> router firmware from telnetd to dropbear".

Yes, this is roughly what I was saying. Thank you for expressing it
better than I could.

And along those lines, if you really need to minimize libc.so for such
a special case, the solution is not manually maintaining extra knobs
and #ifdefs, but changing the way libc.so is generated. Instead of
linking all the object files directly, put them in a .a file first,
then link with something like:

$CC -shared -o libc.so -Wl,-u,sym1 -Wl,-u,sym2 ... libc_so.a

where the list sym1, sym2, ... is generated from 'nm' output for all
the binaries you need to run, plus a few mandatory libc-internal
symbols that need to be linked. This will produce the minimal libc.so
needed for your exact set of programs.

In the specific case of UTF-8 and locale-related code, I believe that
if none of your programs call setlocale or use any of the wchar
functions, regex/fnmatch/glob, or iconv explicitly, the only code that
we discussed that would get linked into libc.so is mbtowc.c and
wcrtomb.c, for a total of about 550 bytes. Even these would be omitted
if you don't use printf or scanf (printf needs wcrtomb; scanf needs
mbtowc). Using fnmatch/glob/regex would pull in another ~9k for the
character class and case mapping functions.

Rich

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH] Byte-based C locale, draft 2
  2015-06-07  2:50 ` Rich Felker
@ 2015-06-13  7:06   ` Rich Felker
  2015-06-16  4:26     ` Rich Felker
  0 siblings, 1 reply; 21+ messages in thread
From: Rich Felker @ 2015-06-13  7:06 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 2584 bytes --]

On Sat, Jun 06, 2015 at 10:50:25PM -0400, Rich Felker wrote:
> On Sat, Jun 06, 2015 at 05:40:07PM -0400, Rich Felker wrote:
> > Attached is the first draft of a proposed byte-based C locale. The
> > patch is about 400 lines but most of it is context, because it's
> > basically a lot of tiny changes spread out over lots of files.
> > [...]
> 
> If we go forward with this, I think I can factor it into 3 parts:
> 
> 1. Add checks for MB_CUR_MAX==1 and the bytelocale support they would
>    activate, and the CODEUNIT/IS_CODEUNIT macros needed for these code
>    paths. This patch would be a complete nop and would not even affect
>    codegen with a decent compiler since MB_CUR_MAX==4 is a constant
>    right now.
> 
> 2. Introduce stdio saving of active LC_CTYPE at the time of stream
>    orientation (fwide) and save/restore of current locale around stdio
>    ops that need it (fputwc, fgetwc, ungetwc) and iconv usage of
>    multibyte functions. This patch would increase code size in a few
>    places but would not change behavior.
> 
> 3. Replace the constant MB_CUR_MAX macro with a runtime-variable value
>    dependent on CURRENT_LOCALE->cat[LC_CTYPE]. This would actually
>    activate the byte-based C locale support. locale_impl.h is actually
>    already doing this, so I think I should remove that definition
>    before making any changes and only bring it back if/when stage 3
>    here is committed.
> 
> In principle stages 1 and 2 could be committed in either order;
> they're independent. Stage 3 is also independent in what it touches,
> but if it's already committed before stage 1/2, then committing stage
> 1 without stage 2 is a functional regression (stdio functions no
> longer behave according to spec; iconv stops working in C locale).

Attached is the 3-part factorization described above, as patches
against commit 536c6d5a4205e2a3f161f2983ce1e0ac3082187d.

As predicted, part 1 does not change the generated code at all, at
least for my toolchain.

If nobody has further comments/discussion, I'll probably begin
committing this soon, starting with part 1, and the rest as I test it
more. While in the past there were ideological objections, including
by myself, all the feedback this time has been from people who want
this feature for compatibility (and future standards conformance), and
I think I've managed to do it in a way that's basically cost-free and
does not compromise musl's principle that UTF-8 is first-class, but
instead just gives you a way (only if/when you want it) to process
UTF-8 as code units instead of codepoints.

Rich

[-- Attachment #2: bytelocale-part1.diff --]
[-- Type: text/plain, Size: 5495 bytes --]

diff --git a/src/ctype/__ctype_get_mb_cur_max.c b/src/ctype/__ctype_get_mb_cur_max.c
index d235f4d..8e946fc 100644
--- a/src/ctype/__ctype_get_mb_cur_max.c
+++ b/src/ctype/__ctype_get_mb_cur_max.c
@@ -1,6 +1,7 @@
-#include <stddef.h>
+#include <stdlib.h>
+#include "locale_impl.h"
 
 size_t __ctype_get_mb_cur_max()
 {
-	return 4;
+	return MB_CUR_MAX;
 }
diff --git a/src/locale/langinfo.c b/src/locale/langinfo.c
index a1ada24..776b447 100644
--- a/src/locale/langinfo.c
+++ b/src/locale/langinfo.c
@@ -33,7 +33,8 @@ char *__nl_langinfo_l(nl_item item, locale_t loc)
 	int idx = item & 65535;
 	const char *str;
 
-	if (item == CODESET) return "UTF-8";
+	if (item == CODESET)
+		return MB_CUR_MAX==1 ? "UTF-8-CODE-UNITS" : "UTF-8";
 	
 	switch (cat) {
 	case LC_NUMERIC:
diff --git a/src/multibyte/btowc.c b/src/multibyte/btowc.c
index 9d2c3b1..8de060f 100644
--- a/src/multibyte/btowc.c
+++ b/src/multibyte/btowc.c
@@ -1,7 +1,10 @@
-#include <stdio.h>
 #include <wchar.h>
+#include <stdlib.h>
+#include "internal.h"
 
 wint_t btowc(int c)
 {
-	return c<128U ? c : EOF;
+	if (c < 128U) return c;
+	if (MB_CUR_MAX==1) return CODEUNIT(c);
+	return WEOF;
 }
diff --git a/src/multibyte/internal.h b/src/multibyte/internal.h
index cc017fa..53d62ed 100644
--- a/src/multibyte/internal.h
+++ b/src/multibyte/internal.h
@@ -23,3 +23,10 @@ extern const uint32_t bittab[];
 
 #define SA 0xc2u
 #define SB 0xf4u
+
+/* Arbitrary encoding for representing code units instead of characters. */
+#define CODEUNIT(c) (0xdfff & (signed char)(c))
+#define IS_CODEUNIT(c) ((unsigned)(c)-0xdf80 < 0x80)
+
+/* Get inline definition of MB_CUR_MAX. */
+#include "locale_impl.h"
diff --git a/src/multibyte/mbrtowc.c b/src/multibyte/mbrtowc.c
index e7b3654..ca7da70 100644
--- a/src/multibyte/mbrtowc.c
+++ b/src/multibyte/mbrtowc.c
@@ -4,6 +4,7 @@
  * unnecessary.
  */
 
+#include <stdlib.h>
 #include <wchar.h>
 #include <errno.h>
 #include "internal.h"
@@ -27,6 +28,7 @@ size_t mbrtowc(wchar_t *restrict wc, const char *restrict src, size_t n, mbstate
 	if (!n) return -2;
 	if (!c) {
 		if (*s < 0x80) return !!(*wc = *s);
+		if (MB_CUR_MAX==1) return (*wc = CODEUNIT(*s)), 1;
 		if (*s-SA > SB-SA) goto ilseq;
 		c = bittab[*s++-SA]; n--;
 	}
diff --git a/src/multibyte/mbsrtowcs.c b/src/multibyte/mbsrtowcs.c
index 3c1343a..e23083d 100644
--- a/src/multibyte/mbsrtowcs.c
+++ b/src/multibyte/mbsrtowcs.c
@@ -7,6 +7,8 @@
 #include <stdint.h>
 #include <wchar.h>
 #include <errno.h>
+#include <string.h>
+#include <stdlib.h>
 #include "internal.h"
 
 size_t mbsrtowcs(wchar_t *restrict ws, const char **restrict src, size_t wn, mbstate_t *restrict st)
@@ -24,6 +26,23 @@ size_t mbsrtowcs(wchar_t *restrict ws, const char **restrict src, size_t wn, mbs
 		}
 	}
 
+	if (MB_CUR_MAX==1) {
+		if (!ws) return strlen((const char *)s);
+		for (;;) {
+			if (!wn) {
+				*src = (const void *)s;
+				return wn0;
+			}
+			if (!*s) break;
+			c = *s++;
+			*ws++ = CODEUNIT(c);
+			wn--;
+		}
+		*ws = 0;
+		*src = 0;
+		return wn0-wn;
+	}
+
 	if (!ws) for (;;) {
 		if (*s-1u < 0x7f && (uintptr_t)s%4 == 0) {
 			while (!(( *(uint32_t*)s | *(uint32_t*)s-0x01010101) & 0x80808080)) {
diff --git a/src/multibyte/mbtowc.c b/src/multibyte/mbtowc.c
index 803d221..71a9506 100644
--- a/src/multibyte/mbtowc.c
+++ b/src/multibyte/mbtowc.c
@@ -4,6 +4,7 @@
  * unnecessary.
  */
 
+#include <stdlib.h>
 #include <wchar.h>
 #include <errno.h>
 #include "internal.h"
@@ -19,6 +20,7 @@ int mbtowc(wchar_t *restrict wc, const char *restrict src, size_t n)
 	if (!wc) wc = &dummy;
 
 	if (*s < 0x80) return !!(*wc = *s);
+	if (MB_CUR_MAX==1) return (*wc = CODEUNIT(*s)), 1;
 	if (*s-SA > SB-SA) goto ilseq;
 	c = bittab[*s++-SA];
 
diff --git a/src/multibyte/wcrtomb.c b/src/multibyte/wcrtomb.c
index 59f733d..ddc37a5 100644
--- a/src/multibyte/wcrtomb.c
+++ b/src/multibyte/wcrtomb.c
@@ -4,8 +4,10 @@
  * unnecessary.
  */
 
+#include <stdlib.h>
 #include <wchar.h>
 #include <errno.h>
+#include "internal.h"
 
 size_t wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict st)
 {
@@ -13,6 +15,13 @@ size_t wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict st)
 	if ((unsigned)wc < 0x80) {
 		*s = wc;
 		return 1;
+	} else if (MB_CUR_MAX == 1) {
+		if (!IS_CODEUNIT(wc)) {
+			errno = EILSEQ;
+			return -1;
+		}
+		*s = wc;
+		return 1;
 	} else if ((unsigned)wc < 0x800) {
 		*s++ = 0xc0 | (wc>>6);
 		*s = 0x80 | (wc&0x3f);
diff --git a/src/multibyte/wctob.c b/src/multibyte/wctob.c
index d6353ee..4aeda6a 100644
--- a/src/multibyte/wctob.c
+++ b/src/multibyte/wctob.c
@@ -1,8 +1,10 @@
-#include <stdio.h>
 #include <wchar.h>
+#include <stdlib.h>
+#include "internal.h"
 
 int wctob(wint_t c)
 {
 	if (c < 128U) return c;
+	if (MB_CUR_MAX==1 && IS_CODEUNIT(c)) return (unsigned char)c;
 	return EOF;
 }
diff --git a/src/regex/fnmatch.c b/src/regex/fnmatch.c
index 7f6b65f..978fff8 100644
--- a/src/regex/fnmatch.c
+++ b/src/regex/fnmatch.c
@@ -18,6 +18,7 @@
 #include <stdlib.h>
 #include <wchar.h>
 #include <wctype.h>
+#include "locale_impl.h"
 
 #define END 0
 #define UNMATCHABLE -2
@@ -229,7 +230,7 @@ static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n
 	 * On illegal sequences we may get it wrong, but in that case
 	 * we necessarily have a matching failure anyway. */
 	for (s=endstr; s>str && tailcnt; tailcnt--) {
-		if (s[-1] < 128U) s--;
+		if (s[-1] < 128U || MB_CUR_MAX==1) s--;
 		else while ((unsigned char)*--s-0x80U<0x40 && s>str);
 	}
 	if (tailcnt) return FNM_NOMATCH;

[-- Attachment #3: bytelocale-part2.diff --]
[-- Type: text/plain, Size: 5126 bytes --]

diff --git a/src/internal/stdio_impl.h b/src/internal/stdio_impl.h
index e1325fe..72c5519 100644
--- a/src/internal/stdio_impl.h
+++ b/src/internal/stdio_impl.h
@@ -47,6 +47,7 @@ struct _IO_FILE {
 	unsigned char *shend;
 	off_t shlim, shcnt;
 	FILE *prev_locked, *next_locked;
+	struct __locale_struct *locale;
 };
 
 size_t __stdio_read(FILE *, unsigned char *, size_t);
diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index e6121ae..1eeea94 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -5,6 +5,7 @@
 #include <stdlib.h>
 #include <limits.h>
 #include <stdint.h>
+#include "locale_impl.h"
 
 #define UTF_32BE    0300
 #define UTF_16LE    0301
@@ -165,9 +166,12 @@ size_t iconv(iconv_t cd0, char **restrict in, size_t *restrict inb, char **restr
 	int err;
 	unsigned char type = map[-1];
 	unsigned char totype = tomap[-1];
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
 
 	if (!in || !*in || !*inb) return 0;
 
+	*ploc = UTF8_LOCALE;
+
 	for (; *inb; *in+=l, *inb-=l) {
 		c = *(unsigned char *)*in;
 		l = 1;
@@ -431,6 +435,7 @@ size_t iconv(iconv_t cd0, char **restrict in, size_t *restrict inb, char **restr
 			break;
 		}
 	}
+	*ploc = loc;
 	return x;
 ilseq:
 	err = EILSEQ;
@@ -445,5 +450,6 @@ starved:
 	x = -1;
 end:
 	errno = err;
+	*ploc = loc;
 	return x;
 }
diff --git a/src/stdio/fgetwc.c b/src/stdio/fgetwc.c
index b261b44..e455cfe 100644
--- a/src/stdio/fgetwc.c
+++ b/src/stdio/fgetwc.c
@@ -1,8 +1,9 @@
 #include "stdio_impl.h"
+#include "locale_impl.h"
 #include <wchar.h>
 #include <errno.h>
 
-wint_t __fgetwc_unlocked(FILE *f)
+static wint_t __fgetwc_unlocked_internal(FILE *f)
 {
 	mbstate_t st = { 0 };
 	wchar_t wc;
@@ -10,8 +11,6 @@ wint_t __fgetwc_unlocked(FILE *f)
 	unsigned char b;
 	size_t l;
 
-	if (f->mode <= 0) fwide(f, 1);
-
 	/* Convert character from buffer if possible */
 	if (f->rpos < f->rend) {
 		l = mbrtowc(&wc, (void *)f->rpos, f->rend - f->rpos, &st);
@@ -39,6 +38,16 @@ wint_t __fgetwc_unlocked(FILE *f)
 	return wc;
 }
 
+wint_t __fgetwc_unlocked(FILE *f)
+{
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
+	if (f->mode <= 0) fwide(f, 1);
+	*ploc = f->locale;
+	wchar_t wc = __fgetwc_unlocked_internal(f);
+	*ploc = loc;
+	return wc;
+}
+
 wint_t fgetwc(FILE *f)
 {
 	wint_t c;
diff --git a/src/stdio/fputwc.c b/src/stdio/fputwc.c
index 1bf165b..0be5666 100644
--- a/src/stdio/fputwc.c
+++ b/src/stdio/fputwc.c
@@ -1,4 +1,5 @@
 #include "stdio_impl.h"
+#include "locale_impl.h"
 #include <wchar.h>
 #include <limits.h>
 #include <ctype.h>
@@ -7,8 +8,10 @@ wint_t __fputwc_unlocked(wchar_t c, FILE *f)
 {
 	char mbc[MB_LEN_MAX];
 	int l;
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
 
 	if (f->mode <= 0) fwide(f, 1);
+	*ploc = f->locale;
 
 	if (isascii(c)) {
 		c = putc_unlocked(c, f);
@@ -18,8 +21,11 @@ wint_t __fputwc_unlocked(wchar_t c, FILE *f)
 		else f->wpos += l;
 	} else {
 		l = wctomb(mbc, c);
-		if (l < 0 || __fwritex((void *)mbc, l, f) < l) c = WEOF;
+		if (l < 0 || __fwritex((void *)mbc, l, f) < l)
+			c = WEOF;
 	}
+	if (c==WEOF) f->flags |= F_ERR;
+	*ploc = loc;
 	return c;
 }
 
diff --git a/src/stdio/fputws.c b/src/stdio/fputws.c
index 317d65f..0ed02f1 100644
--- a/src/stdio/fputws.c
+++ b/src/stdio/fputws.c
@@ -1,23 +1,28 @@
 #include "stdio_impl.h"
+#include "locale_impl.h"
 #include <wchar.h>
 
 int fputws(const wchar_t *restrict ws, FILE *restrict f)
 {
 	unsigned char buf[BUFSIZ];
 	size_t l=0;
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
 
 	FLOCK(f);
 
 	fwide(f, 1);
+	*ploc = f->locale;
 
 	while (ws && (l = wcsrtombs((void *)buf, (void*)&ws, sizeof buf, 0))+1 > 1)
 		if (__fwritex(buf, l, f) < l) {
 			FUNLOCK(f);
+			*ploc = loc;
 			return -1;
 		}
 
 	FUNLOCK(f);
 
+	*ploc = loc;
 	return l; /* 0 or -1 */
 }
 
diff --git a/src/stdio/fwide.c b/src/stdio/fwide.c
index 8088e7a..8410b15 100644
--- a/src/stdio/fwide.c
+++ b/src/stdio/fwide.c
@@ -1,13 +1,14 @@
-#include <wchar.h>
 #include "stdio_impl.h"
-
-#define SH (8*sizeof(int)-1)
-#define NORMALIZE(x) ((x)>>SH | -((-(x))>>SH))
+#include "locale_impl.h"
 
 int fwide(FILE *f, int mode)
 {
 	FLOCK(f);
-	if (!f->mode) f->mode = NORMALIZE(mode);
+	if (mode) {
+		if (!f->locale) f->locale = MB_CUR_MAX==1
+			? C_LOCALE : UTF8_LOCALE;
+		if (!f->mode) f->mode = mode>0 ? 1 : -1;
+	}
 	mode = f->mode;
 	FUNLOCK(f);
 	return mode;
diff --git a/src/stdio/ungetwc.c b/src/stdio/ungetwc.c
index d4c7de3..80d6e20 100644
--- a/src/stdio/ungetwc.c
+++ b/src/stdio/ungetwc.c
@@ -1,4 +1,5 @@
 #include "stdio_impl.h"
+#include "locale_impl.h"
 #include <wchar.h>
 #include <limits.h>
 #include <ctype.h>
@@ -8,15 +9,18 @@ wint_t ungetwc(wint_t c, FILE *f)
 {
 	unsigned char mbc[MB_LEN_MAX];
 	int l=1;
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
 
 	FLOCK(f);
 
 	if (f->mode <= 0) fwide(f, 1);
+	*ploc = f->locale;
 
 	if (!f->rpos) __toread(f);
 	if (!f->rpos || f->rpos < f->buf - UNGET + l || c == WEOF ||
 	    (!isascii(c) && (l = wctomb((void *)mbc, c)) < 0)) {
 		FUNLOCK(f);
+		*ploc = loc;
 		return WEOF;
 	}
 
@@ -26,5 +30,6 @@ wint_t ungetwc(wint_t c, FILE *f)
 	f->flags &= ~F_EOF;
 
 	FUNLOCK(f);
+	*ploc = loc;
 	return c;
 }

[-- Attachment #4: bytelocale-part3.diff --]
[-- Type: text/plain, Size: 783 bytes --]

diff --git a/include/stdlib.h b/include/stdlib.h
index 97ce5a7..d2c911f 100644
--- a/include/stdlib.h
+++ b/include/stdlib.h
@@ -76,7 +76,8 @@ size_t wcstombs (char *__restrict, const wchar_t *__restrict, size_t);
 #define EXIT_FAILURE 1
 #define EXIT_SUCCESS 0
 
-#define MB_CUR_MAX ((size_t)+4)
+size_t __ctype_get_mb_cur_max(void);
+#define MB_CUR_MAX (__ctype_get_mb_cur_max())
 
 #define RAND_MAX (0x7fffffff)
 
diff --git a/src/internal/locale_impl.h b/src/internal/locale_impl.h
index 85db793..f5e4d9b 100644
--- a/src/internal/locale_impl.h
+++ b/src/internal/locale_impl.h
@@ -34,4 +34,7 @@ const char *__lctrans_cur(const char *);
 
 #define CURRENT_UTF8 (!!__pthread_self()->locale->cat[LC_CTYPE])
 
+#undef MB_CUR_MAX
+#define MB_CUR_MAX (CURRENT_UTF8 ? 4 : 1)
+
 #endif

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Byte-based C locale, draft 2
  2015-06-13  7:06   ` [PATCH] Byte-based C locale, draft 2 Rich Felker
@ 2015-06-16  4:26     ` Rich Felker
  2015-06-16  4:35       ` Rich Felker
  0 siblings, 1 reply; 21+ messages in thread
From: Rich Felker @ 2015-06-16  4:26 UTC (permalink / raw)
  To: musl

On Sat, Jun 13, 2015 at 03:06:55AM -0400, Rich Felker wrote:
> diff --git a/src/multibyte/btowc.c b/src/multibyte/btowc.c
> index 9d2c3b1..8de060f 100644
> --- a/src/multibyte/btowc.c
> +++ b/src/multibyte/btowc.c
> @@ -1,7 +1,10 @@
> -#include <stdio.h>
>  #include <wchar.h>
> +#include <stdlib.h>
> +#include "internal.h"
>  
>  wint_t btowc(int c)
>  {
> -	return c<128U ? c : EOF;
> +	if (c < 128U) return c;
> +	if (MB_CUR_MAX==1) return CODEUNIT(c);
> +	return WEOF;
>  }

This was mildly buggy before the patch, and worse with it -- c==EOF
will no longer produce WEOF. I fixed the old bug and am updating the patch.

Rich


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Byte-based C locale, draft 2
  2015-06-16  4:26     ` Rich Felker
@ 2015-06-16  4:35       ` Rich Felker
  0 siblings, 0 replies; 21+ messages in thread
From: Rich Felker @ 2015-06-16  4:35 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 834 bytes --]

On Tue, Jun 16, 2015 at 12:26:39AM -0400, Rich Felker wrote:
> On Sat, Jun 13, 2015 at 03:06:55AM -0400, Rich Felker wrote:
> > diff --git a/src/multibyte/btowc.c b/src/multibyte/btowc.c
> > index 9d2c3b1..8de060f 100644
> > --- a/src/multibyte/btowc.c
> > +++ b/src/multibyte/btowc.c
> > @@ -1,7 +1,10 @@
> > -#include <stdio.h>
> >  #include <wchar.h>
> > +#include <stdlib.h>
> > +#include "internal.h"
> >  
> >  wint_t btowc(int c)
> >  {
> > -	return c<128U ? c : EOF;
> > +	if (c < 128U) return c;
> > +	if (MB_CUR_MAX==1) return CODEUNIT(c);
> > +	return WEOF;
> >  }
> 
> This was mildly buggy before the patch, and worse with it -- c==EOF
> will no longer produce WEOF. Fixed the old bug and updating the patch.

Updated version of this file's patch (against the fixed old code which
I already committed) is attached.

Rich

[-- Attachment #2: bytelocale_new_btowc.diff --]
[-- Type: text/plain, Size: 447 bytes --]

diff --git a/src/multibyte/btowc.c b/src/multibyte/btowc.c
index 29cb798..a7369a1 100644
--- a/src/multibyte/btowc.c
+++ b/src/multibyte/btowc.c
@@ -1,8 +1,11 @@
 #include <stdio.h>
 #include <wchar.h>
+#include <stdlib.h>
+#include "internal.h"
 
 wint_t btowc(int c)
 {
-	c = (unsigned char)c;
-	return c<128U ? c : EOF;
+	if ((unsigned char)c < 128) return (unsigned char)c;
+	if (MB_CUR_MAX==1 && c!=EOF) return CODEUNIT(c);
+	return WEOF;
 }

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2015-06-16  4:35 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-06 21:40 [PATCH] Byte-based C locale, draft 1 Rich Felker
2015-06-06 22:39 ` Harald Becker
2015-06-06 23:10   ` Rich Felker
2015-06-06 23:59     ` Harald Becker
2015-06-07  0:24       ` Rich Felker
2015-06-07 23:59         ` Build option to disable locale [was: Byte-based C locale, draft 1] Harald Becker
2015-06-08  0:28           ` Josiah Worcester
2015-06-08  1:57             ` Harald Becker
2015-06-08  2:36               ` Rich Felker
2015-06-08  3:35                 ` Harald Becker
2015-06-08  3:51                   ` Josiah Worcester
2015-06-08  0:33           ` Rich Felker
2015-06-08  2:46             ` Harald Becker
2015-06-08  4:06               ` Rich Felker
2015-06-09  3:20               ` Isaac Dunham
2015-06-09  4:27                 ` Rich Felker
2015-06-07  1:17 ` [PATCH] Byte-based C locale, draft 1 Rich Felker
2015-06-07  2:50 ` Rich Felker
2015-06-13  7:06   ` [PATCH] Byte-based C locale, draft 2 Rich Felker
2015-06-16  4:26     ` Rich Felker
2015-06-16  4:35       ` Rich Felker

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).