mailing list of musl libc
 help / color / mirror / code / Atom feed
* Locale framework, part 1
@ 2014-07-02 17:18 Rich Felker
  0 siblings, 0 replies; only message in thread
From: Rich Felker @ 2014-07-02 17:18 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 1704 bytes --]

Despite it being a good bit more work, I've taken the time to factor
the locale changes into two parts to make it more clear what's going
on. I plan to commit this very soon, with possible changes after it's
committed, but I'm posting it here for some immediate discussion
taking place on IRC.

Part 1 adds the framework for setlocale: a real locale data structure,
an operation to change the individual category settings, tracking the
active locale per thread, etc. It also has tracking, as described in
the previous thread, of byte-based versus utf8-based status, but this
status is not used anywhere. So, while separate "C" and "C.UTF-8"
locale states exist after part 1 is applied, they both still have
nl_langinfo(CODESET) being "UTF-8".

Part 2 will add the actual functionality for byte-based C locale,
which requires trivial changes in the existing multibyte functions
which operate based on the current locale, and moderately invasive
changes in the wide stdio functions which have to bind to the locale
that was in effect when the stream became wide-oriented.

Part 3 has not yet been started, and will add actual positive features
related to the locale setting, such as tiny working catgets/gettext,
locale-based time formatting, etc. If I'm not mistaken, this can all
be implemented on top of .mo files and a trivial gettext lookup
function (e.g. LC_TIME can be gettext lookups for the corresponding
C-locale time-formatting string) or catgets (using the nl_langinfo
item indexes as integer catgets keys rather than string-based keys).
This will probably be a project for a subsequent release cycle, as
some more discussion should go into design before it happens.

Attached is part 1.

Rich

[-- Attachment #2: locale_part1.diff --]
[-- Type: text/plain, Size: 10920 bytes --]

diff --git a/src/env/__init_tls.c b/src/env/__init_tls.c
index 13cf2ee..efa0728 100644
--- a/src/env/__init_tls.c
+++ b/src/env/__init_tls.c
@@ -16,6 +16,7 @@ int __init_tp(void *p)
 	if (!r) libc.can_do_threads = 1;
 	libc.has_thread_pointer = 1;
 	td->tid = td->pid = __syscall(SYS_set_tid_address, &td->tid);
+	td->locale = &libc.global_locale;
 	return 0;
 }
 
diff --git a/src/internal/libc.h b/src/internal/libc.h
index fb4d9bc..037d16b 100644
--- a/src/internal/libc.h
+++ b/src/internal/libc.h
@@ -5,6 +5,11 @@
 #include <stdio.h>
 #include <limits.h>
 
+struct __locale_struct {
+	int ctype_utf8; /* nonzero when LC_CTYPE is UTF-8 based, zero for the byte-based "C"/"POSIX" setting */
+	char *messages_name; /* LC_MESSAGES locale name, at most LOCALE_NAME_MAX bytes plus NUL; empty for the builtin locales */
+};
+
 struct __libc {
 	int has_thread_pointer;
 	int can_do_threads;
@@ -16,6 +21,9 @@ struct __libc {
 	int ofl_lock[2];
 	size_t tls_size;
 	size_t page_size;
+	volatile int uselocale_cnt;
+	volatile int bytelocale_cnt_minus_1;
+	struct __locale_struct global_locale;
 };
 
 extern size_t __hwcap;
diff --git a/src/internal/locale_impl.h b/src/internal/locale_impl.h
index f41c6f2..2747b85 100644
--- a/src/internal/locale_impl.h
+++ b/src/internal/locale_impl.h
@@ -1,5 +1,17 @@
 #include <locale.h>
+#include <stdlib.h>
+#include "libc.h"
+#include "pthread_impl.h"
 
-struct __locale_struct {
-	int dummy;
-};
+#define LOCALE_NAME_MAX 15 /* longest locale name kept, not counting the terminating NUL */
+
+int __setlocalecat(locale_t, int, const char *); /* set one category of a locale from a name; "" means consult the environment */
+
+#define CURRENT_LOCALE \
+	(libc.uselocale_cnt ? __pthread_self()->locale : &libc.global_locale) /* the per-thread locale is read only while uselocale_cnt is nonzero */
+
+#define CURRENT_UTF8 \
+	(libc.bytelocale_cnt_minus_1<0 || __pthread_self()->locale->ctype_utf8) /* a negative counter answers "UTF-8" without reading the thread's locale */
+
+#undef MB_CUR_MAX
+#define MB_CUR_MAX (CURRENT_UTF8 ? 4 : 1) /* 4 bytes per character under UTF-8, otherwise 1 */
diff --git a/src/locale/__setlocalecat.c b/src/locale/__setlocalecat.c
new file mode 100644
index 0000000..f1e4bf0
--- /dev/null
+++ b/src/locale/__setlocalecat.c
@@ -0,0 +1,49 @@
+#include <locale.h>
+#include <string.h>
+#include "locale_impl.h"
+#include "libc.h"
+#include "atomic.h"
+
+/* Environment variable consulted for each category; the row index is
+ * the category number passed to __setlocalecat. */
+static const char envvars[][12] = {
+	"LC_CTYPE",
+	"LC_NUMERIC",
+	"LC_TIME",
+	"LC_COLLATE",
+	"LC_MONETARY",
+	"LC_MESSAGES",
+};
+
+/* Set category cat of loc from the locale name val.
+ *
+ * An empty name is resolved from the environment: LC_ALL, then the
+ * category's own variable, then LANG, and finally "C.UTF-8". Only
+ * LC_CTYPE and LC_MESSAGES store anything; other categories are
+ * accepted without effect. Always returns 0. */
+int __setlocalecat(locale_t loc, int cat, const char *val)
+{
+	if (val[0] == 0) {
+		val = getenv("LC_ALL");
+		if (!val) val = getenv(envvars[cat]);
+		if (!val) val = getenv("LANG");
+		if (!val) val = "C.UTF-8";
+	}
+
+	size_t len = strnlen(val, LOCALE_NAME_MAX);
+	int is_builtin = !strcmp(val, "C")
+		|| !strcmp(val, "C.UTF-8")
+		|| !strcmp(val, "POSIX");
+
+	if (cat == LC_CTYPE) {
+		/* Of the builtin names only "C.UTF-8" has '.' as its second
+		 * byte; every non-builtin name is treated as UTF-8. */
+		a_store(&loc->ctype_utf8, !is_builtin || val[1]=='.');
+	} else if (cat == LC_MESSAGES) {
+		/* Builtin locales are recorded as an empty name; any other
+		 * name is stored truncated to LOCALE_NAME_MAX bytes. */
+		if (!is_builtin) memcpy(loc->messages_name, val, len);
+		loc->messages_name[is_builtin ? 0 : len] = 0;
+	}
+	return 0;
+}
diff --git a/src/locale/duplocale.c b/src/locale/duplocale.c
index f9fc1ff..1336870 100644
--- a/src/locale/duplocale.c
+++ b/src/locale/duplocale.c
@@ -3,12 +3,24 @@
 #include "locale_impl.h"
 #include "libc.h"
 
-locale_t duplocale(locale_t old)
+/* Return a newly allocated copy of old, or 0 if allocation fails.
+ * LC_GLOBAL_LOCALE is accepted and copies the global locale. The name
+ * buffer lives in the same allocation, directly after the struct, so
+ * the copy never shares storage with old. */
+locale_t __duplocale(locale_t old)
 {
-	locale_t new;
-	new = calloc(1, sizeof *new);
+	locale_t new = calloc(1, sizeof *new + LOCALE_NAME_MAX + 1);
+	if (!new) return 0;
+	new->messages_name = (void *)(new+1);
+
+	if (old == LC_GLOBAL_LOCALE) old = &libc.global_locale;
+	/* Copy field by field: a whole-struct memcpy would replace
+	 * messages_name with old's pointer. */
+	new->ctype_utf8 = old->ctype_utf8;
+	if (old->messages_name)
+		strcpy(new->messages_name, old->messages_name);
+
-	if (new && old != LC_GLOBAL_LOCALE) memcpy(new, old, sizeof *new);
 	return new;
 }
 
-weak_alias(duplocale, __duplocale);
+weak_alias(__duplocale, duplocale);
diff --git a/src/locale/langinfo.c b/src/locale/langinfo.c
index 7bb56ee..13abf45 100644
--- a/src/locale/langinfo.c
+++ b/src/locale/langinfo.c
@@ -1,5 +1,6 @@
 #include <locale.h>
 #include <langinfo.h>
+#include "locale_impl.h"
 #include "libc.h"
 
 static const char c_time[] =
@@ -60,7 +61,7 @@ char *__nl_langinfo_l(nl_item item, locale_t loc)
 
 char *__nl_langinfo(nl_item item)
 {
-	return __nl_langinfo_l(item, 0);
+	return __nl_langinfo_l(item, CURRENT_LOCALE); /* pass the caller's active locale instead of a null locale_t */
 }
 
 weak_alias(__nl_langinfo, nl_langinfo);
diff --git a/src/locale/newlocale.c b/src/locale/newlocale.c
index 447c8fc..39501d0 100644
--- a/src/locale/newlocale.c
+++ b/src/locale/newlocale.c
@@ -3,12 +3,35 @@
 #include "locale_impl.h"
 #include "libc.h"
 
-locale_t newlocale(int mask, const char *name, locale_t base)
+/* Apply name to every category of loc whose membership in mask equals
+ * selected (1: categories in the mask; 0: categories outside it). */
+static void set_categories(locale_t loc, int mask, int selected, const char *name)
+{
+	int cat;
+	for (cat=0; cat<LC_ALL; cat++) {
+		int in_mask = !!(mask & (1<<cat));
+		if (in_mask == selected)
+			__setlocalecat(loc, cat, name);
+	}
+}
+
+/* Create a locale object, or modify loc in place when it is non-null.
+ * Categories selected by mask are set from name. For a newly created
+ * object the remaining categories are first set from the empty name,
+ * which __setlocalecat resolves from the environment. Returns the
+ * locale, or 0 if allocation fails. */
+locale_t __newlocale(int mask, const char *name, locale_t loc)
 {
-	if (*name && strcmp(name, "C") && strcmp(name, "POSIX"))
-		return 0;
-	if (!base) base = calloc(1, sizeof *base);
-	return base;
+	if (!loc) {
+		loc = calloc(1, sizeof *loc + LOCALE_NAME_MAX + 1);
+		if (!loc) return 0;
+		/* the name buffer sits directly after the struct */
+		loc->messages_name = (void *)(loc+1);
+		set_categories(loc, mask, 0, "");
+	}
+
+	set_categories(loc, mask, 1, name);
+	return loc;
 }
 
-weak_alias(newlocale, __newlocale);
+weak_alias(__newlocale, newlocale);
diff --git a/src/locale/setlocale.c b/src/locale/setlocale.c
index 28f29b8..cbc0b55 100644
--- a/src/locale/setlocale.c
+++ b/src/locale/setlocale.c
@@ -1,9 +1,89 @@
 #include <locale.h>
+#include <stdlib.h>
+#include <string.h>
+#include "locale_impl.h"
+#include "libc.h"
+#include "atomic.h"
 
-char *setlocale(int category, const char *locale)
+/* Backing store for the LC_ALL result: a fixed-width prefix of
+ * 2 + 3*(LOCALE_NAME_MAX+1) bytes followed by the global locale's
+ * messages name. */
+static char buf[2+4*(LOCALE_NAME_MAX+1)];
+
+/* Set and/or query the global locale.
+ *
+ * cat is one of the LC_* categories or LC_ALL; any other value
+ * returns 0. A null name only queries. The return value is a string
+ * describing the resulting setting of cat; for LC_ALL it points into
+ * a static buffer that the next LC_ALL call overwrites. */
+char *setlocale(int cat, const char *name)
 {
-	/* Note: plain "C" would be better, but puts some broken
-	 * software into legacy 8-bit-codepage mode, ignoring
-	 * the standard library's multibyte encoding */
-	return "C.UTF-8";
+	/* Point the global messages name at the tail of buf on first use. */
+	if (!libc.global_locale.messages_name) {
+		libc.global_locale.messages_name =
+			buf + 2 + 3*(LOCALE_NAME_MAX+1);
+	}
+
+	if ((unsigned)cat > LC_ALL) return 0;
+
+	/* For LC_ALL, setlocale is required to return a string which
+	 * encodes the current setting for all categories. The format of
+	 * this string is unspecified, and only the following code, which
+	 * performs both the serialization and deserialization, depends
+	 * on the format, so it can easily be changed if needed. */
+	if (cat == LC_ALL) {
+		if (name) {
+			char part[LOCALE_NAME_MAX+1];
+			int i, j;
+			/* The serialized form is never shorter than its
+			 * fixed-width prefix, and is exactly that long when the
+			 * messages name is empty, so the length test is inclusive;
+			 * otherwise that string would not restore the locale. */
+			if (name[0] && name[1]==';'
+			    && strlen(name) >= 2 + 3*(LOCALE_NAME_MAX+1)) {
+				const char *msgs = name + 2 + 3*(LOCALE_NAME_MAX+1);
+				part[0] = name[0];
+				part[1] = 0;
+				setlocale(LC_CTYPE, part);
+				part[LOCALE_NAME_MAX] = 0;
+				for (i=LC_TIME; i<LC_MESSAGES; i++) {
+					memcpy(part, name + 2 + (i-2)*(LOCALE_NAME_MAX+1), LOCALE_NAME_MAX);
+					for (j=LOCALE_NAME_MAX-1; j && part[j]==';'; j--)
+						part[j] = 0;
+					setlocale(i, part);
+				}
+				/* An empty tail encodes the builtin messages locale.
+				 * Pass "C" for it: "" would instead select the locale
+				 * from the environment. */
+				setlocale(LC_MESSAGES, *msgs ? msgs : "C");
+			} else {
+				for (i=0; i<LC_ALL; i++)
+					setlocale(i, name);
+			}
+		}
+		/* Serialize: 'U' or 'C' for LC_CTYPE, ';' padding, then the
+		 * messages name already stored after the prefix. */
+		memset(buf, ';', 2 + 3*(LOCALE_NAME_MAX+1));
+		buf[0] = libc.global_locale.ctype_utf8 ? 'U' : 'C';
+		return buf;
+	}
+
+	if (name) {
+		/* Keep the byte-locale counter in step with any change to the
+		 * global locale's ctype_utf8 flag. */
+		int adj = libc.global_locale.ctype_utf8;
+		__setlocalecat(&libc.global_locale, cat, name);
+		adj -= libc.global_locale.ctype_utf8;
+		if (adj) a_fetch_add(&libc.bytelocale_cnt_minus_1, adj);
+	}
+
+	switch (cat) {
+	case LC_CTYPE:
+		return libc.global_locale.ctype_utf8 ? "C.UTF-8" : "C";
+	case LC_MESSAGES:
+		return libc.global_locale.messages_name[0]
+			? libc.global_locale.messages_name : "C";
+	default:
+		return "C";
+	}
 }
diff --git a/src/locale/strfmon.c b/src/locale/strfmon.c
index e25db97..7cf2136 100644
--- a/src/locale/strfmon.c
+++ b/src/locale/strfmon.c
@@ -3,6 +3,7 @@
 #include <stdarg.h>
 #include <monetary.h>
 #include <errno.h>
+#include "locale_impl.h"
 
 static ssize_t vstrfmon_l(char *s, size_t n, locale_t loc, const char *fmt, va_list ap)
 {
@@ -93,7 +94,7 @@ ssize_t strfmon(char *restrict s, size_t n, const char *restrict fmt, ...)
 	ssize_t ret;
 
 	va_start(ap, fmt);
-	ret = vstrfmon_l(s, n, 0, fmt, ap);
+	ret = vstrfmon_l(s, n, CURRENT_LOCALE, fmt, ap);
 	va_end(ap);
 
 	return ret;
diff --git a/src/locale/uselocale.c b/src/locale/uselocale.c
index 4fc5c64..5106795 100644
--- a/src/locale/uselocale.c
+++ b/src/locale/uselocale.c
@@ -2,12 +2,38 @@
 #include "pthread_impl.h"
 #include "libc.h"
 
-locale_t uselocale(locale_t l)
+/* Install new as the calling thread's locale and return the previous
+ * one. A null argument only queries. The global locale is passed and
+ * returned as LC_GLOBAL_LOCALE. */
+locale_t __uselocale(locale_t new)
 {
 	pthread_t self = __pthread_self();
 	locale_t old = self->locale;
-	if (l) self->locale = l;
-	return old;
+	locale_t const global_loc = &libc.global_locale;
+	locale_t target = new == LC_GLOBAL_LOCALE ? global_loc : new;
+
+	if (target && target != old) {
+		/* Net change in the number of threads whose own (non-global)
+		 * locale is byte-based. */
+		int byte_delta = 0;
+
+		/* uselocale_cnt counts threads not using the global locale. */
+		if (target == global_loc) {
+			a_dec(&libc.uselocale_cnt);
+		} else {
+			byte_delta += !target->ctype_utf8;
+		}
+		if (old == global_loc) {
+			a_inc(&libc.uselocale_cnt);
+		} else {
+			byte_delta -= !old->ctype_utf8;
+		}
+		a_fetch_add(&libc.bytelocale_cnt_minus_1, byte_delta);
+		self->locale = target;
+	}
+
+	if (old == global_loc) return LC_GLOBAL_LOCALE;
+	return old;
 }
 
-weak_alias(uselocale, __uselocale);
+weak_alias(__uselocale, uselocale);
diff --git a/src/thread/pthread_create.c b/src/thread/pthread_create.c
index e9c8160..a7493c1 100644
--- a/src/thread/pthread_create.c
+++ b/src/thread/pthread_create.c
@@ -57,6 +57,12 @@ _Noreturn void pthread_exit(void *result)
 		exit(0);
 	}
 
+	if (self->locale != &libc.global_locale) {
+		a_dec(&libc.uselocale_cnt);
+		if (self->locale->ctype_utf8)
+			a_dec(&libc.bytelocale_cnt_minus_1);
+	}
+
 	if (self->detached && self->map_base) {
 		/* Detached threads must avoid the kernel clear_child_tid
 		 * feature, since the virtual address will have been
@@ -205,6 +211,7 @@ int pthread_create(pthread_t *restrict res, const pthread_attr_t *restrict attrp
 	new->start_arg = arg;
 	new->self = new;
 	new->tsd = (void *)tsd;
+	new->locale = &libc.global_locale;
 	if (attr._a_detach) {
 		new->detached = 1;
 		flags -= CLONE_CHILD_CLEARTID;
diff --git a/src/time/strftime.c b/src/time/strftime.c
index 75ebca6..794fbe1 100644
--- a/src/time/strftime.c
+++ b/src/time/strftime.c
@@ -5,6 +5,7 @@
 #include <locale.h>
 #include <time.h>
 #include <limits.h>
+#include "locale_impl.h"
 #include "libc.h"
 #include "time_impl.h"
 
@@ -263,7 +264,7 @@ size_t __strftime_l(char *restrict s, size_t n, const char *restrict f, const st
 
 size_t strftime(char *restrict s, size_t n, const char *restrict f, const struct tm *restrict tm)
 {
-	return __strftime_l(s, n, f, tm, 0);
+	return __strftime_l(s, n, f, tm, CURRENT_LOCALE); /* pass the caller's active locale instead of a null locale_t */
 }
 
 weak_alias(__strftime_l, strftime_l);
diff --git a/src/time/wcsftime.c b/src/time/wcsftime.c
index 8d2a2eb..638e64f 100644
--- a/src/time/wcsftime.c
+++ b/src/time/wcsftime.c
@@ -1,6 +1,7 @@
 #include <wchar.h>
 #include <time.h>
 #include <locale.h>
+#include "locale_impl.h"
 #include "libc.h"
 
 const char *__strftime_fmt_1(char (*s)[100], size_t *l, int f, const struct tm *tm, locale_t loc);
@@ -64,7 +65,7 @@ size_t __wcsftime_l(wchar_t *restrict s, size_t n, const wchar_t *restrict f, co
 
 size_t wcsftime(wchar_t *restrict wcs, size_t n, const wchar_t *restrict f, const struct tm *restrict tm)
 {
-	return __wcsftime_l(wcs, n, f, tm, 0);
+	return __wcsftime_l(wcs, n, f, tm, CURRENT_LOCALE); /* pass the caller's active locale instead of a null locale_t */
 }
 
 weak_alias(__wcsftime_l, wcsftime_l);

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2014-07-02 17:18 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-07-02 17:18 Locale framework, part 1 Rich Felker

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).