IDNA support in name lookups

mailing list of musl libc
 help / color / mirror / code / Atom feed

* IDNA support in name lookups
@ 2017-03-29 11:26 Joakim Sindholt
  2017-04-02  7:30 ` [PATCH v2] " Joakim Sindholt
  0 siblings, 1 reply; 7+ messages in thread
From: Joakim Sindholt @ 2017-03-29 11:26 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 167 bytes --]

Here's a first draft patch for internationalized domain name support.
I implemented it based on the pseudocode in RFC3492[1].

[1] https://tools.ietf.org/html/rfc3492

[-- Attachment #2: 0001-add-IDNA-support-to-name-lookups.patch --]
[-- Type: text/x-diff, Size: 6835 bytes --]

From 7542dfe05b33b200360f982caf1631615cde30fb Mon Sep 17 00:00:00 2001
From: Joakim Sindholt <opensource@zhasha.com>
Date: Wed, 29 Mar 2017 11:51:02 +0200
Subject: [PATCH] add IDNA support to name lookups

---
 src/network/lookup_name.c | 202 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 194 insertions(+), 8 deletions(-)

diff --git a/src/network/lookup_name.c b/src/network/lookup_name.c
index fb7303a..3590cb1 100644
--- a/src/network/lookup_name.c
+++ b/src/network/lookup_name.c
@@ -10,9 +10,21 @@
 #include <unistd.h>
 #include <pthread.h>
 #include <errno.h>
+#include <wchar.h>
 #include "lookup.h"
 #include "stdio_impl.h"
 #include "syscall.h"
+#include "locale_impl.h"
+
+enum {
+	base         = 36,
+	tmin         = 1,
+	tmax         = 26,
+	skew         = 38,
+	damp         = 700,
+	initial_bias = 72,
+	initial_n    = 128,
+};
 
 static int is_valid_hostname(const char *host)
 {
@@ -22,6 +34,162 @@ static int is_valid_hostname(const char *host)
 	return !*s;
 }
 
+static unsigned int adapt(unsigned int delta, unsigned int numpoints, int firsttime)
+{
+	unsigned int k = 0;
+	delta /= firsttime ? damp : 2;
+	delta += delta / numpoints;
+	while (delta > ((base - tmin) * tmax) / 2) {
+		delta /= base - tmin;
+		k += base;
+	}
+	return k + ((base - tmin + 1) * delta) / (delta + skew);
+}
+
+static ssize_t punyenc(char *dst, const char *src, size_t len, size_t max)
+{
+	static const char *const tbl = "abcdefghijklmnopqrstuvwxyz0123456789";
+	const unsigned char *usrc = (void *)src;
+	unsigned int codepoints = 0;
+	unsigned int dlen = 0;
+	unsigned int si, mi;
+	unsigned int n = initial_n;
+	unsigned int delta = 0;
+	unsigned int bias = initial_bias;
+	unsigned int h, b;
+	for (si = 0; si < len; ++si) {
+		if (usrc[si] < 0x80) {
+			if (dlen == max)
+				return -1;
+			dst[dlen++] = src[si];
+		} else if ((usrc[si] & 0xC0) == 0xC0) {
+			++codepoints;
+		}
+	}
+	codepoints += dlen;
+	h = b = dlen;
+	if (dlen) {
+		if (dlen == max)
+			return -1;
+		dst[dlen++] = '-';
+	}
+	while (h < codepoints) {
+		unsigned int m = (unsigned int)-1;
+		unsigned int c;
+		wchar_t wc;
+		for (mi = 0; mi < len; ) {
+			mi += mbtowc(&wc, src + mi, len - mi);
+			c = (unsigned int)wc;
+			if (c >= n && c < m)
+				m = c;
+		}
+		if (((unsigned int)-1 - delta) / (h + 1) < m - n)
+			return -1;
+		delta += (m - n) * (h + 1);
+		n = m;
+
+		for (mi = 0; mi < len; ) {
+			mi += mbtowc(&wc, src + mi, len - mi);
+			c = (unsigned int)wc;
+			if (c < n /* || c < 0x80 not necessary*/)
+				if (++delta == 0)
+					return -1;
+			if (c == n) {
+				unsigned int q = delta;
+				unsigned int k;
+				for (k = base; ; k += base) {
+					unsigned int t;
+					if (k <= bias + tmin) {
+						t = tmin;
+					} else if (k >= bias + tmax) {
+						t = tmax;
+					} else {
+						t = k - bias;
+					}
+					if (q < t)
+						break;
+					if (dlen == max)
+						return -1;
+					dst[dlen++] = tbl[t + ((q - t) % (base - t))];
+					q = (q - t) / (base - t);
+				}
+				if (dlen == max)
+					return -1;
+				dst[dlen++] = tbl[q];
+				bias = adapt(delta, h + 1, h == b);
+				delta = 0;
+				++h;
+			}
+		}
+		++delta;
+		++n;
+	}
+	return dlen;
+}
+
+static ssize_t idnaenc(char dst[static 256], const char *src)
+{
+	size_t left = strlen(src);
+	size_t olen = 0;
+
+	while (left) {
+		const char *dot;
+		size_t len, i;
+		int basic = 1;
+
+		dot = memchr(src, '.', left);
+		if (!dot) { dot = src + left; }
+		len = dot - src;
+		if (len == 0) { return -1; }
+		left -= len + !!*dot;
+
+		for (i = 0; i < len; ) {
+			unsigned int c;
+			wchar_t wc;
+			int n = mbtowc(&wc, src + i, len - i);
+			c = (n <= 0) ? 0 : (unsigned int)wc;
+			if (c < 0x80) {
+				if (!isalnum(c) && !(i > 0 && c == '-'))
+					return -1;
+			} else {
+				if (c >= 0x7F && c <= 0x9F)
+					return -1;
+				basic = 0;
+			}
+			i += n;
+		}
+		if (basic) {
+			if (len > 63 || len > 254 - olen)
+				return -1;
+			for (i = 0; i < len; ++i)
+				dst[olen + i] = tolower(src[i]);
+			olen += len;
+		} else {
+			ssize_t r;
+			size_t max;
+			if (olen >= 254 - 4)
+				return -1;
+			max = 254 - 4 - olen;
+			if (max > 63 - 4)
+				max = 63 - 4;
+			memcpy(dst + olen, "xn--", 4);
+			r = punyenc(dst + olen + 4, src, len, max);
+			if (r <= 0)
+				return -1;
+			olen += r + 4;
+		}
+		if (olen == 255 || !*dot && olen == 254)
+			return -1;
+		if (*dot)
+			dst[olen++] = *dot;
+		src = dot + !!*dot;
+	}
+	if (olen == 0)
+		return -1;
+	dst[olen] = 0;
+	return olen;
+}
+
 static int name_from_null(struct address buf[static 2], const char *name, int family, int flags)
 {
 	int cnt = 0;
@@ -61,12 +229,25 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
 		return EAI_SYSTEM;
 	}
 	while (fgets(line, sizeof line, f) && cnt < MAXADDRS) {
-		char *p, *z;
+		char idna[256];
+		ssize_t r;
+		char *p, *z, c;
 
 		if ((p=strchr(line, '#'))) *p++='\n', *p=0;
-		for(p=line+1; (p=strstr(p, name)) &&
-			(!isspace(p[-1]) || !isspace(p[l])); p++);
-		if (!p) continue;
+		/* skip ip address and canonicalize names */
+		for (p=line; *p && !isspace(*p); p++);
+		while (*p) {
+			for (; *p && isspace(*p); p++);
+			for (z=p; *z && !isspace(*z); z++);
+			c = *z;
+			*z = 0;
+			r = idnaenc(idna, p);
+			*z = c;
+			if (r == l && memcmp(idna, name, l) == 0)
+				break;
+			p = z;
+		}
+		if (!*p) continue;
 
 		/* Isolate IP address to parse */
 		for (p=line; *p && !isspace(*p); p++);
@@ -86,7 +267,7 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
 		for (; *p && isspace(*p); p++);
 		for (z=p; *z && !isspace(*z); z++);
 		*z = 0;
-		if (is_valid_hostname(p)) memcpy(canon, p, z-p+1);
+		if ((r = idnaenc(idna, p)) > 0) memcpy(canon, idna, r);
 	}
 	__fclose_ca(f);
 	return cnt ? cnt : badfam;
@@ -285,15 +466,19 @@ static int addrcmp(const void *_a, const void *_b)
 
 int __lookup_name(struct address buf[static MAXADDRS], char canon[static 256], const char *name, int family, int flags)
 {
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
+	char _name[256];
 	int cnt = 0, i, j;
 
+	*ploc = UTF8_LOCALE;
 	*canon = 0;
 	if (name) {
 		/* reject empty name and check len so it fits into temp bufs */
-		size_t l = strnlen(name, 255);
-		if (l-1 >= 254)
+		ssize_t l;
+		if ((l = idnaenc(_name, name)) <= 0)
 			return EAI_NONAME;
-		memcpy(canon, name, l+1);
+		memcpy(canon, _name, l+1);
+		name = _name;
 	}
 
 	/* Procedurally, a request for v6 addresses with the v4-mapped
@@ -311,6 +496,7 @@ int __lookup_name(struct address buf[static MAXADDRS], char canon[static 256], c
 		cnt = name_from_hosts(buf, canon, name, family);
 		if (!cnt) cnt = name_from_dns_search(buf, canon, name, family);
 	}
+	*ploc = loc;
 	if (cnt<=0) return cnt ? cnt : EAI_NONAME;
 
 	/* Filter/transform results for v4-mapped lookup, if requested. */
-- 
2.10.2


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v2] IDNA support in name lookups
  2017-03-29 11:26 IDNA support in name lookups Joakim Sindholt
@ 2017-04-02  7:30 ` Joakim Sindholt
  2017-04-23  1:01   ` Rich Felker
  0 siblings, 1 reply; 7+ messages in thread
From: Joakim Sindholt @ 2017-04-02  7:30 UTC (permalink / raw)
  To: musl

[-- Attachment #1: Type: text/plain, Size: 342 bytes --]

Changes since v1:
* Reject UTF-16 surrogate range runes
* Remove locale override

This is from some discussion on IRC and while I agree that it's more
"correct" in POSIX terms, I'm not particularly happy about having to
explicitly enable UTF-8 support with setlocale.

There might still be bugs and character ranges that need to be rejected.

[-- Attachment #2: 0001-add-IDNA-support-to-name-lookups.patch --]
[-- Type: text/x-diff, Size: 6507 bytes --]

From 54d5caf36cdce4e5008aecfcc2b02580fb52d0cb Mon Sep 17 00:00:00 2001
From: Joakim Sindholt <opensource@zhasha.com>
Date: Wed, 29 Mar 2017 11:51:02 +0200
Subject: [PATCH] add IDNA support to name lookups

---
 src/network/lookup_name.c | 202 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 193 insertions(+), 9 deletions(-)

diff --git a/src/network/lookup_name.c b/src/network/lookup_name.c
index fb7303a..fd4275c 100644
--- a/src/network/lookup_name.c
+++ b/src/network/lookup_name.c
@@ -10,9 +10,21 @@
 #include <unistd.h>
 #include <pthread.h>
 #include <errno.h>
+#include <wchar.h>
 #include "lookup.h"
 #include "stdio_impl.h"
 #include "syscall.h"
+#include "locale_impl.h"
+
+enum {
+	base         = 36,
+	tmin         = 1,
+	tmax         = 26,
+	skew         = 38,
+	damp         = 700,
+	initial_bias = 72,
+	initial_n    = 128,
+};
 
 static int is_valid_hostname(const char *host)
 {
@@ -22,6 +34,163 @@ static int is_valid_hostname(const char *host)
 	return !*s;
 }
 
+static unsigned int adapt(unsigned int delta, unsigned int numpoints, int firsttime)
+{
+	unsigned int k = 0;
+	delta /= firsttime ? damp : 2;
+	delta += delta / numpoints;
+	while (delta > ((base - tmin) * tmax) / 2) {
+		delta /= base - tmin;
+		k += base;
+	}
+	return k + ((base - tmin + 1) * delta) / (delta + skew);
+}
+
+static ssize_t punyenc(char *dst, const char *src, size_t len, size_t max)
+{
+	static const char *const tbl = "abcdefghijklmnopqrstuvwxyz0123456789";
+	const unsigned char *usrc = (void *)src;
+	unsigned int codepoints = 0;
+	unsigned int dlen = 0;
+	unsigned int si, mi;
+	unsigned int n = initial_n;
+	unsigned int delta = 0;
+	unsigned int bias = initial_bias;
+	unsigned int h, b;
+	for (si = 0; si < len; ++si) {
+		if (usrc[si] < 0x80) {
+			if (dlen == max)
+				return -1;
+			dst[dlen++] = src[si];
+		} else if ((usrc[si] & 0xC0) == 0xC0) {
+			++codepoints;
+		}
+	}
+	codepoints += dlen;
+	h = b = dlen;
+	if (dlen) {
+		if (dlen == max)
+			return -1;
+		dst[dlen++] = '-';
+	}
+	while (h < codepoints) {
+		unsigned int m = (unsigned int)-1;
+		unsigned int c;
+		wchar_t wc;
+		for (mi = 0; mi < len; ) {
+			mi += mbtowc(&wc, src + mi, len - mi);
+			c = (unsigned int)wc;
+			if (c >= n && c < m)
+				m = c;
+		}
+		if (((unsigned int)-1 - delta) / (h + 1) < m - n)
+			return -1;
+		delta += (m - n) * (h + 1);
+		n = m;
+
+		for (mi = 0; mi < len; ) {
+			mi += mbtowc(&wc, src + mi, len - mi);
+			c = (unsigned int)wc;
+			if (c < n /* || c < 0x80 not necessary*/)
+				if (++delta == 0)
+					return -1;
+			if (c == n) {
+				unsigned int q = delta;
+				unsigned int k;
+				for (k = base; ; k += base) {
+					unsigned int t;
+					if (k <= bias + tmin) {
+						t = tmin;
+					} else if (k >= bias + tmax) {
+						t = tmax;
+					} else {
+						t = k - bias;
+					}
+					if (q < t)
+						break;
+					if (dlen == max)
+						return -1;
+					dst[dlen++] = tbl[t + ((q - t) % (base - t))];
+					q = (q - t) / (base - t);
+				}
+				if (dlen == max)
+					return -1;
+				dst[dlen++] = tbl[q];
+				bias = adapt(delta, h + 1, h == b);
+				delta = 0;
+				++h;
+			}
+		}
+		++delta;
+		++n;
+	}
+	return dlen;
+}
+
+static ssize_t idnaenc(char dst[static 256], const char *src)
+{
+	size_t left = strlen(src);
+	size_t olen = 0;
+
+	while (left) {
+		const char *dot;
+		size_t len, i;
+		int basic = 1;
+
+		dot = memchr(src, '.', left);
+		if (!dot) { dot = src + left; }
+		len = dot - src;
+		if (len == 0) { return -1; }
+		left -= len + !!*dot;
+
+		for (i = 0; i < len; ) {
+			unsigned int c;
+			wchar_t wc;
+			int n = mbtowc(&wc, src + i, len - i);
+			c = (n <= 0) ? 0 : (unsigned int)wc;
+			if (c < 0x80) {
+				if (!isalnum(c) && !(i > 0 && c == '-'))
+					return -1;
+			} else {
+				if ((c >=   0x7F && c <=   0x9F) ||
+				    (c >= 0xD800 && c <= 0xDFFF))
+					return -1;
+				basic = 0;
+			}
+			i += n;
+		}
+		if (basic) {
+			if (len > 63 || len > 254 - olen)
+				return -1;
+			for (i = 0; i < len; ++i)
+				dst[olen + i] = tolower(src[i]);
+			olen += len;
+		} else {
+			ssize_t r;
+			size_t max;
+			if (olen >= 254 - 4)
+				return -1;
+			max = 254 - 4 - olen;
+			if (max > 63 - 4)
+				max = 63 - 4;
+			memcpy(dst + olen, "xn--", 4);
+			r = punyenc(dst + olen + 4, src, len, max);
+			if (r <= 0)
+				return -1;
+			olen += r + 4;
+		}
+		if (olen == 255 || (!*dot && olen == 254))
+			return -1;
+		if (*dot)
+			dst[olen++] = *dot;
+		src = dot + !!*dot;
+	}
+	if (olen == 0)
+		return -1;
+	dst[olen] = 0;
+	return olen;
+}
+
 static int name_from_null(struct address buf[static 2], const char *name, int family, int flags)
 {
 	int cnt = 0;
@@ -61,12 +230,25 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
 		return EAI_SYSTEM;
 	}
 	while (fgets(line, sizeof line, f) && cnt < MAXADDRS) {
-		char *p, *z;
+		char idna[256];
+		ssize_t r;
+		char *p, *z, c;
 
 		if ((p=strchr(line, '#'))) *p++='\n', *p=0;
-		for(p=line+1; (p=strstr(p, name)) &&
-			(!isspace(p[-1]) || !isspace(p[l])); p++);
-		if (!p) continue;
+		/* skip ip address and canonicalize names */
+		for (p=line; *p && !isspace(*p); p++);
+		while (*p) {
+			for (; *p && isspace(*p); p++);
+			for (z=p; *z && !isspace(*z); z++);
+			c = *z;
+			*z = 0;
+			r = idnaenc(idna, p);
+			*z = c;
+			if (r == l && memcmp(idna, name, l) == 0)
+				break;
+			p = z;
+		}
+		if (!*p) continue;
 
 		/* Isolate IP address to parse */
 		for (p=line; *p && !isspace(*p); p++);
@@ -86,7 +268,7 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
 		for (; *p && isspace(*p); p++);
 		for (z=p; *z && !isspace(*z); z++);
 		*z = 0;
-		if (is_valid_hostname(p)) memcpy(canon, p, z-p+1);
+		if ((r = idnaenc(idna, p)) > 0) memcpy(canon, idna, r);
 	}
 	__fclose_ca(f);
 	return cnt ? cnt : badfam;
@@ -285,15 +467,17 @@ static int addrcmp(const void *_a, const void *_b)
 
 int __lookup_name(struct address buf[static MAXADDRS], char canon[static 256], const char *name, int family, int flags)
 {
+	char _name[256];
 	int cnt = 0, i, j;
 
 	*canon = 0;
 	if (name) {
-		/* reject empty name and check len so it fits into temp bufs */
-		size_t l = strnlen(name, 255);
-		if (l-1 >= 254)
+		/* convert unicode name to RFC3492 punycode */
+		ssize_t l;
+		if ((l = idnaenc(_name, name)) <= 0)
 			return EAI_NONAME;
-		memcpy(canon, name, l+1);
+		memcpy(canon, _name, l+1);
+		name = _name;
 	}
 
 	/* Procedurally, a request for v6 addresses with the v4-mapped
-- 
2.10.2


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] IDNA support in name lookups
  2017-04-02  7:30 ` [PATCH v2] " Joakim Sindholt
@ 2017-04-23  1:01   ` Rich Felker
  2017-04-23  8:14     ` Joakim Sindholt
  0 siblings, 1 reply; 7+ messages in thread
From: Rich Felker @ 2017-04-23  1:01 UTC (permalink / raw)
  To: musl

On Sun, Apr 02, 2017 at 09:30:26AM +0200, Joakim Sindholt wrote:
> Changes since v1:
> * Reject UTF-16 surrogate range runes
> * Remove locale override
> 
> This is from some discussion on IRC and while I agree that it's more
> "correct" in POSIX terms, I'm not particularly happy about having to
> explicitly enable UTF-8 support with setlocale.

Yes, I'm not really happy about the decision on the C locale either,
but I understand the reasons various parties wanted it that way and I
think trying to follow the closest-to-working consensus process we
have is better than just following it when we agree with the outcomes.

> There might still be bugs and character ranges that need to be rejected.

As far as I can tell, no normalization is done. This might be
problematic for strings where the natural way users would type it does
not match the normalized form required in IDN's, but it would also be
expensive to handle. I think it's okay to punt on this until it proves
to actually be a problem.

> >From 54d5caf36cdce4e5008aecfcc2b02580fb52d0cb Mon Sep 17 00:00:00 2001
> From: Joakim Sindholt <opensource@zhasha.com>
> Date: Wed, 29 Mar 2017 11:51:02 +0200
> Subject: [PATCH] add IDNA support to name lookups
> 
> ---
>  src/network/lookup_name.c | 202 +++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 193 insertions(+), 9 deletions(-)
> 
> diff --git a/src/network/lookup_name.c b/src/network/lookup_name.c
> index fb7303a..fd4275c 100644
> --- a/src/network/lookup_name.c
> +++ b/src/network/lookup_name.c

I think I'd rather put these functions in their own source file, just
because they're logically distinct, even if this is the only caller.

> @@ -10,9 +10,21 @@
>  #include <unistd.h>
>  #include <pthread.h>
>  #include <errno.h>
> +#include <wchar.h>
>  #include "lookup.h"
>  #include "stdio_impl.h"
>  #include "syscall.h"
> +#include "locale_impl.h"

Is locale_impl.h actually used now?

> +
> +enum {
> +	base         = 36,
> +	tmin         = 1,
> +	tmax         = 26,
> +	skew         = 38,
> +	damp         = 700,
> +	initial_bias = 72,
> +	initial_n    = 128,
> +};

Especially because these names being introduced at file-scope sounds
error-prone in a large existing file.

>  static int is_valid_hostname(const char *host)
>  {
> @@ -22,6 +34,163 @@ static int is_valid_hostname(const char *host)
>  	return !*s;
>  }
>  
> +static unsigned int adapt(unsigned int delta, unsigned int numpoints, int firsttime)
> +{
> +	unsigned int k = 0;
> +	delta /= firsttime ? damp : 2;
> +	delta += delta / numpoints;
> +	while (delta > ((base - tmin) * tmax) / 2) {
> +		delta /= base - tmin;
> +		k += base;
> +	}
> +	return k + ((base - tmin + 1) * delta) / (delta + skew);
> +}
> +
> +static ssize_t punyenc(char *dst, const char *src, size_t len, size_t max)
> +{
> +	static const char *const tbl = "abcdefghijklmnopqrstuvwxyz0123456789";
> +	const unsigned char *usrc = (void *)src;
> +	unsigned int codepoints = 0;
> +	unsigned int dlen = 0;
> +	unsigned int si, mi;
> +	unsigned int n = initial_n;
> +	unsigned int delta = 0;
> +	unsigned int bias = initial_bias;
> +	unsigned int h, b;
> +	for (si = 0; si < len; ++si) {
> +		if (usrc[si] < 0x80) {
> +			if (dlen == max)
> +				return -1;
> +			dst[dlen++] = src[si];
> +		} else if ((usrc[si] & 0xC0) == 0xC0) {
> +			++codepoints;
> +		}
> +	}
> +	codepoints += dlen;
> +	h = b = dlen;
> +	if (dlen) {
> +		if (dlen == max)
> +			return -1;
> +		dst[dlen++] = '-';
> +	}
> +	while (h < codepoints) {
> +		unsigned int m = (unsigned int)-1;
> +		unsigned int c;
> +		wchar_t wc;
> +		for (mi = 0; mi < len; ) {
> +			mi += mbtowc(&wc, src + mi, len - mi);
> +			c = (unsigned int)wc;
> +			if (c >= n && c < m)
> +				m = c;
> +		}
> +		if (((unsigned int)-1 - delta) / (h + 1) < m - n)
> +			return -1;
> +		delta += (m - n) * (h + 1);
> +		n = m;
> +
> +		for (mi = 0; mi < len; ) {
> +			mi += mbtowc(&wc, src + mi, len - mi);
> +			c = (unsigned int)wc;
> +			if (c < n /* || c < 0x80 not necessary*/)
> +				if (++delta == 0)
> +					return -1;
> +			if (c == n) {
> +				unsigned int q = delta;
> +				unsigned int k;
> +				for (k = base; ; k += base) {
> +					unsigned int t;
> +					if (k <= bias + tmin) {
> +						t = tmin;
> +					} else if (k >= bias + tmax) {
> +						t = tmax;
> +					} else {
> +						t = k - bias;
> +					}
> +					if (q < t)
> +						break;
> +					if (dlen == max)
> +						return -1;
> +					dst[dlen++] = tbl[t + ((q - t) % (base - t))];
> +					q = (q - t) / (base - t);
> +				}
> +				if (dlen == max)
> +					return -1;
> +				dst[dlen++] = tbl[q];
> +				bias = adapt(delta, h + 1, h == b);
> +				delta = 0;
> +				++h;
> +			}
> +		}
> +		++delta;
> +		++n;
> +	}
> +	return dlen;
> +}
> +
> +static ssize_t idnaenc(char dst[static 256], const char *src)
> +{
> +	size_t left = strlen(src);
> +	size_t olen = 0;
> +
> +	while (left) {
> +		const char *dot;
> +		size_t len, i;
> +		int basic = 1;
> +
> +		dot = memchr(src, '.', left);
> +		if (!dot) { dot = src + left; }
> +		len = dot - src;
> +		if (len == 0) { return -1; }
> +		left -= len + !!*dot;
> +
> +		for (i = 0; i < len; ) {
> +			unsigned int c;
> +			wchar_t wc;
> +			int n = mbtowc(&wc, src + i, len - i);
> +			c = (n <= 0) ? 0 : (unsigned int)wc;
> +			if (c < 0x80) {
> +				if (!isalnum(c) && !(i > 0 && c == '-'))
> +					return -1;
> +			} else {
> +				if ((c >=   0x7F && c <=   0x9F) ||
> +				    (c >= 0xD800 && c <= 0xDFFF))
> +					return -1;
> +				basic = 0;
> +			}
> +			i += n;
> +		}
> +		if (basic) {
> +			if (len > 63 || len > 254 - olen)
> +				return -1;
> +			for (i = 0; i < len; ++i)
> +				dst[olen + i] = tolower(src[i]);
> +			olen += len;
> +		} else {
> +			ssize_t r;
> +			size_t max;
> +			if (olen >= 254 - 4)
> +				return -1;
> +			max = 254 - 4 - olen;
> +			if (max > 63 - 4)
> +				max = 63 - 4;
> +			memcpy(dst + olen, "xn--", 4);
> +			r = punyenc(dst + olen + 4, src, len, max);
> +			if (r <= 0)
> +				return -1;
> +			olen += r + 4;
> +		}
> +		if (olen == 255 || (!*dot && olen == 254))
> +			return -1;
> +		if (*dot)
> +			dst[olen++] = *dot;
> +		src = dot + !!*dot;
> +	}
> +	if (olen == 0)
> +		return -1;
> +	dst[olen] = 0;
> +	return olen;
> +}
> +
>  static int name_from_null(struct address buf[static 2], const char *name, int family, int flags)
>  {
>  	int cnt = 0;
> @@ -61,12 +230,25 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
>  		return EAI_SYSTEM;
>  	}
>  	while (fgets(line, sizeof line, f) && cnt < MAXADDRS) {
> -		char *p, *z;
> +		char idna[256];
> +		ssize_t r;
> +		char *p, *z, c;
>  
>  		if ((p=strchr(line, '#'))) *p++='\n', *p=0;
> -		for(p=line+1; (p=strstr(p, name)) &&
> -			(!isspace(p[-1]) || !isspace(p[l])); p++);
> -		if (!p) continue;
> +		/* skip ip address and canonicalize names */
> +		for (p=line; *p && !isspace(*p); p++);
> +		while (*p) {
> +			for (; *p && isspace(*p); p++);
> +			for (z=p; *z && !isspace(*z); z++);
> +			c = *z;
> +			*z = 0;
> +			r = idnaenc(idna, p);
> +			*z = c;
> +			if (r == l && memcmp(idna, name, l) == 0)
> +				break;
> +			p = z;
> +		}
> +		if (!*p) continue;
>  
>  		/* Isolate IP address to parse */
>  		for (p=line; *p && !isspace(*p); p++);
> @@ -86,7 +268,7 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
>  		for (; *p && isspace(*p); p++);
>  		for (z=p; *z && !isspace(*z); z++);
>  		*z = 0;
> -		if (is_valid_hostname(p)) memcpy(canon, p, z-p+1);
> +		if ((r = idnaenc(idna, p)) > 0) memcpy(canon, idna, r);
>  	}
>  	__fclose_ca(f);
>  	return cnt ? cnt : badfam;
> @@ -285,15 +467,17 @@ static int addrcmp(const void *_a, const void *_b)

Is there any reason this needs to be done, or should be done, for
lookups from the hosts file? IDN/punycode is a hack for transporting
unicode names on top of DNS protocol. For hosts file you can just put
the proper unicode strings directly in the file.

>  int __lookup_name(struct address buf[static MAXADDRS], char canon[static 256], const char *name, int family, int flags)
>  {
> +	char _name[256];
>  	int cnt = 0, i, j;
>  
>  	*canon = 0;
>  	if (name) {
> -		/* reject empty name and check len so it fits into temp bufs */
> -		size_t l = strnlen(name, 255);
> -		if (l-1 >= 254)
> +		/* convert unicode name to RFC3492 punycode */
> +		ssize_t l;
> +		if ((l = idnaenc(_name, name)) <= 0)
>  			return EAI_NONAME;
> -		memcpy(canon, name, l+1);
> +		memcpy(canon, _name, l+1);
> +		name = _name;
>  	}

If it's not needed for hosts backend, this code probably belongs
localized to the dns lookup, rather than at the top of __lookup_name.

BTW there's perhaps also a need for the opposite-direction
translation, both for ai_canonname (when a CNAME points to IDN) and
for getnameinfo reverse lookups. But that can be added as a second
patch I think.

Rich


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] IDNA support in name lookups
  2017-04-23  1:01   ` Rich Felker
@ 2017-04-23  8:14     ` Joakim Sindholt
  2017-04-23 15:07       ` Rich Felker
  0 siblings, 1 reply; 7+ messages in thread
From: Joakim Sindholt @ 2017-04-23  8:14 UTC (permalink / raw)
  To: musl

On Sat, Apr 22, 2017 at 09:01:00PM -0400, Rich Felker wrote:
> On Sun, Apr 02, 2017 at 09:30:26AM +0200, Joakim Sindholt wrote:
> > Changes since v1:
> > * Reject UTF-16 surrogate range runes
> > * Remove locale override
> > 
> > This is from some discussion on IRC and while I agree that it's more
> > "correct" in POSIX terms, I'm not particularly happy about having to
> > explicitly enable UTF-8 support with setlocale.
> 
> Yes, I'm not really happy about the decision on the C locale either,
> but I understand the reasons various parties wanted it that way and I
> think trying to follow the closest-to-working consensus process we
> have is better than just following it when we agree with the outcomes.

Understood. I agree that we need to be consistent.
 
> > There might still be bugs and character ranges that need to be rejected.
> 
> As far as I can tell, no normalization is done. This might be
> problematic for strings where the natural way users would type it does
> not match the normalized form required in IDN's, but it would also be
> expensive to handle. I think it's okay to punt on this until it proves
> to actually be a problem.

It does ASCII normalization (tolower) but not unicode nomalization. I
did a quick search for it and came up with RFC5892[1], which specifies a
rather intricate set of characters to be allowed.
Maybe I'll go through it one day but it will most certainly be in
another patch.

> > >From 54d5caf36cdce4e5008aecfcc2b02580fb52d0cb Mon Sep 17 00:00:00 2001
> > From: Joakim Sindholt <opensource@zhasha.com>
> > Date: Wed, 29 Mar 2017 11:51:02 +0200
> > Subject: [PATCH] add IDNA support to name lookups
> > 
> > ---
> >  src/network/lookup_name.c | 202 +++++++++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 193 insertions(+), 9 deletions(-)
> > 
> > diff --git a/src/network/lookup_name.c b/src/network/lookup_name.c
> > index fb7303a..fd4275c 100644
> > --- a/src/network/lookup_name.c
> > +++ b/src/network/lookup_name.c
> 
> I think I'd rather put these functions in their own source file, just
> because they're logically distinct, even if this is the only caller.

No objections.

> > @@ -10,9 +10,21 @@
> >  #include <unistd.h>
> >  #include <pthread.h>
> >  #include <errno.h>
> > +#include <wchar.h>
> >  #include "lookup.h"
> >  #include "stdio_impl.h"
> >  #include "syscall.h"
> > +#include "locale_impl.h"
> 
> Is locale_impl.h actually used now?

No, I just forgot to remove it.

[ snip ]
> > @@ -61,12 +230,25 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
> >  		return EAI_SYSTEM;
> >  	}
> >  	while (fgets(line, sizeof line, f) && cnt < MAXADDRS) {
> > -		char *p, *z;
> > +		char idna[256];
> > +		ssize_t r;
> > +		char *p, *z, c;
> >  
> >  		if ((p=strchr(line, '#'))) *p++='\n', *p=0;
> > -		for(p=line+1; (p=strstr(p, name)) &&
> > -			(!isspace(p[-1]) || !isspace(p[l])); p++);
> > -		if (!p) continue;
> > +		/* skip ip address and canonicalize names */
> > +		for (p=line; *p && !isspace(*p); p++);
> > +		while (*p) {
> > +			for (; *p && isspace(*p); p++);
> > +			for (z=p; *z && !isspace(*z); z++);
> > +			c = *z;
> > +			*z = 0;
> > +			r = idnaenc(idna, p);
> > +			*z = c;
> > +			if (r == l && memcmp(idna, name, l) == 0)
> > +				break;
> > +			p = z;
> > +		}
> > +		if (!*p) continue;
> >  
> >  		/* Isolate IP address to parse */
> >  		for (p=line; *p && !isspace(*p); p++);
> > @@ -86,7 +268,7 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
> >  		for (; *p && isspace(*p); p++);
> >  		for (z=p; *z && !isspace(*z); z++);
> >  		*z = 0;
> > -		if (is_valid_hostname(p)) memcpy(canon, p, z-p+1);
> > +		if ((r = idnaenc(idna, p)) > 0) memcpy(canon, idna, r);
> >  	}
> >  	__fclose_ca(f);
> >  	return cnt ? cnt : badfam;
> > @@ -285,15 +467,17 @@ static int addrcmp(const void *_a, const void *_b)
> 
> Is there any reason this needs to be done, or should be done, for
> lookups from the hosts file? IDN/punycode is a hack for transporting
> unicode names on top of DNS protocol. For hosts file you can just put
> the proper unicode strings directly in the file.

My logic was that some people might have/want to have punycode in their
hosts file, and some might even (accidentally or otherwise) have mixed
punycode-unicode names written down. In any case I wanted it to Just
Work™ so decoding the host from punycode before comparing seemed to be
the easiest way to ensure it catches everything.
This was prompted by a paper wedding invitaion I received where the
couple had listed their gift registry in punycode form. This says to me
that people just dont know or care about this, and since the hosts file
is used extensively by non-developers as well I would personally prefer
if this worked regardless of what deranged things people might put in
there.
In fact I could imagine a lot of people shoving the punycode form in
there under the assumption that that would work better.
Also, suppose you have callers from all over the system dialing out to a
server but some of them call xn--foo-bar.com and others dial the unicode
version. Is it really reasonable that you should need to list this
domain twice in the hosts file for it to work?

> >  int __lookup_name(struct address buf[static MAXADDRS], char canon[static 256], const char *name, int family, int flags)
> >  {
> > +	char _name[256];
> >  	int cnt = 0, i, j;
> >  
> >  	*canon = 0;
> >  	if (name) {
> > -		/* reject empty name and check len so it fits into temp bufs */
> > -		size_t l = strnlen(name, 255);
> > -		if (l-1 >= 254)
> > +		/* convert unicode name to RFC3492 punycode */
> > +		ssize_t l;
> > +		if ((l = idnaenc(_name, name)) <= 0)
> >  			return EAI_NONAME;
> > -		memcpy(canon, name, l+1);
> > +		memcpy(canon, _name, l+1);
> > +		name = _name;
> >  	}
> 
> If it's not needed for hosts backend, this code probably belongs
> localized to the dns lookup, rather than at the top of __lookup_name.
> 
> BTW there's perhaps also a need for the opposite-direction
> translation, both for ai_canonname (when a CNAME points to IDN) and
> for getnameinfo reverse lookups. But that can be added as a second
> patch I think.

I have already written the code for decoding as well if need be :)

The only problem as I see it is that a unicode name can be a hair under
4 times larger (in bytes) than the punycode equivalent. Select any 4
byte UTF-8 character and make labels exclusively containing that. All
subsequent characters to the first will be encoded as an 'a'.

This, by the way, also means that we should probably mess with the
buffering when reading the hosts file.

[1] https://tools.ietf.org/html/rfc5892


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] IDNA support in name lookups
  2017-04-23  8:14     ` Joakim Sindholt
@ 2017-04-23 15:07       ` Rich Felker
  2017-04-23 16:38         ` Joakim Sindholt
  0 siblings, 1 reply; 7+ messages in thread
From: Rich Felker @ 2017-04-23 15:07 UTC (permalink / raw)
  To: musl

On Sun, Apr 23, 2017 at 10:14:24AM +0200, Joakim Sindholt wrote:
> > > There might still be bugs and character ranges that need to be rejected.
> > 
> > As far as I can tell, no normalization is done. This might be
> > problematic for strings where the natural way users would type it does
> > not match the normalized form required in IDN's, but it would also be
> > expensive to handle. I think it's okay to punt on this until it proves
> > to actually be a problem.
> 
> It does ASCII normalization (tolower) but not unicode nomalization. I
> did a quick search for it and came up with RFC5892[1], which specifies a
> rather intricate set of characters to be allowed.
> Maybe I'll go through it one day but it will most certainly be in
> another patch.

AFAIK, the allowed set is not relevant for us to check; it's up to
domain registrars to enforce it, and there's no need to enforce it on
subdomains if the owner of the higher-level domain doesn't care to.

On the other hand, normalization is about transformation to NFKC (as
well as case folding, IIRC), and is relevant if users might enter
domain names in a decomposed form, or (and this is nasty) for content
that has no composed form, in a form where the order of combining
characters is different that what NF[K]C specifies due to bugs in
Unicode's assignments of combining classes. For example if the normal
way to write X composed of base B and combining C1 and C2 is B C1 C2,
but combining classes are wrongly assigned such that NFC is B C2 C1,
the name entered in the way it's normally written will fail to
resolve.

I'm not sure if there are real-world cases where this happens (yet) so
we can probably just wait and see if it does...

> > > @@ -61,12 +230,25 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
> > >  		return EAI_SYSTEM;
> > >  	}
> > >  	while (fgets(line, sizeof line, f) && cnt < MAXADDRS) {
> > > -		char *p, *z;
> > > +		char idna[256];
> > > +		ssize_t r;
> > > +		char *p, *z, c;
> > >  
> > >  		if ((p=strchr(line, '#'))) *p++='\n', *p=0;
> > > -		for(p=line+1; (p=strstr(p, name)) &&
> > > -			(!isspace(p[-1]) || !isspace(p[l])); p++);
> > > -		if (!p) continue;
> > > +		/* skip ip address and canonicalize names */
> > > +		for (p=line; *p && !isspace(*p); p++);
> > > +		while (*p) {
> > > +			for (; *p && isspace(*p); p++);
> > > +			for (z=p; *z && !isspace(*z); z++);
> > > +			c = *z;
> > > +			*z = 0;
> > > +			r = idnaenc(idna, p);
> > > +			*z = c;
> > > +			if (r == l && memcmp(idna, name, l) == 0)
> > > +				break;
> > > +			p = z;
> > > +		}
> > > +		if (!*p) continue;
> > >  
> > >  		/* Isolate IP address to parse */
> > >  		for (p=line; *p && !isspace(*p); p++);
> > > @@ -86,7 +268,7 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
> > >  		for (; *p && isspace(*p); p++);
> > >  		for (z=p; *z && !isspace(*z); z++);
> > >  		*z = 0;
> > > -		if (is_valid_hostname(p)) memcpy(canon, p, z-p+1);
> > > +		if ((r = idnaenc(idna, p)) > 0) memcpy(canon, idna, r);
> > >  	}
> > >  	__fclose_ca(f);
> > >  	return cnt ? cnt : badfam;
> > > @@ -285,15 +467,17 @@ static int addrcmp(const void *_a, const void *_b)
> > 
> > Is there any reason this needs to be done, or should be done, for
> > lookups from the hosts file? IDN/punycode is a hack for transporting
> > unicode names on top of DNS protocol. For hosts file you can just put
> > the proper unicode strings directly in the file.
> 
> My logic was that some people might have/want to have punycode in their
> hosts file, and some might even (accidentally or otherwise) have mixed
> punycode-unicode names written down. In any case I wanted it to Just
> Work™ so decoding the host from punycode before comparing seemed to be
> the easiest way to ensure it catches everything.
> This was prompted by a paper wedding invitaion I received where the
> couple had listed their gift registry in punycode form. This says to me
> that people just dont know or care about this, and since the hosts file
> is used extensively by non-developers as well I would personally prefer
> if this worked regardless of what deranged things people might put in
> there.
> In fact I could imagine a lot of people shoving the punycode form in
> there under the assumption that that would work better.
> Also, suppose you have callers from all over the system dialing out to a
> server but some of them call xn--foo-bar.com and others dial the unicode
> version. Is it really reasonable that you should need to list this
> domain twice in the hosts file for it to work?

If you want the punycode to resolve to the unicode name in hosts file,
I would do the opposite: decode it to proper utf-8 and match that.
Nobody should have to deal with wacky punycode encodings to make
non-latin hostnames work. They should just work transparently and
punycode should be an implementation detail inside layers that
users/programmers don't see.

If I'm not mistaken, your patch as-is actually _breaks_ support for
literal utf-8 hostnames in the hosts file.

> > >  int __lookup_name(struct address buf[static MAXADDRS], char canon[static 256], const char *name, int family, int flags)
> > >  {
> > > +	char _name[256];
> > >  	int cnt = 0, i, j;
> > >  
> > >  	*canon = 0;
> > >  	if (name) {
> > > -		/* reject empty name and check len so it fits into temp bufs */
> > > -		size_t l = strnlen(name, 255);
> > > -		if (l-1 >= 254)
> > > +		/* convert unicode name to RFC3492 punycode */
> > > +		ssize_t l;
> > > +		if ((l = idnaenc(_name, name)) <= 0)
> > >  			return EAI_NONAME;
> > > -		memcpy(canon, name, l+1);
> > > +		memcpy(canon, _name, l+1);
> > > +		name = _name;
> > >  	}
> > 
> > If it's not needed for hosts backend, this code probably belongs
> > localized to the dns lookup, rather than at the top of __lookup_name.
> > 
> > BTW there's perhaps also a need for the opposite-direction
> > translation, both for ai_canonname (when a CNAME points to IDN) and
> > for getnameinfo reverse lookups. But that can be added as a second
> > patch I think.
> 
> I have already written the code for decoding as well if need be :)

Yay!

> The only problem as I see it is that a unicode name can be a hair under
> 4 times larger (in bytes) than the punycode equivalent. Select any 4
> byte UTF-8 character and make labels exclusively containing that. All
> subsequent characters to the first will be encoded as an 'a'.
> 
> This, by the way, also means that we should probably mess with the
> buffering when reading the hosts file.

You mean increase it? Since HOST_NAME_MAX/NI_MAXHOST is 255 and this
is ABI, I don't think we can return names longer than 255 bytes. Such
names probably just need to be left as punycode when coming from dns,
and not supported in hosts files.

Rich


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] IDNA support in name lookups
  2017-04-23 15:07       ` Rich Felker
@ 2017-04-23 16:38         ` Joakim Sindholt
  2017-04-23 16:56           ` Rich Felker
  0 siblings, 1 reply; 7+ messages in thread
From: Joakim Sindholt @ 2017-04-23 16:38 UTC (permalink / raw)
  To: musl

On Sun, Apr 23, 2017 at 11:07:47AM -0400, Rich Felker wrote:
> > > > @@ -61,12 +230,25 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
> > > >  		return EAI_SYSTEM;
> > > >  	}
> > > >  	while (fgets(line, sizeof line, f) && cnt < MAXADDRS) {
> > > > -		char *p, *z;
> > > > +		char idna[256];
> > > > +		ssize_t r;
> > > > +		char *p, *z, c;
> > > >  
> > > >  		if ((p=strchr(line, '#'))) *p++='\n', *p=0;
> > > > -		for(p=line+1; (p=strstr(p, name)) &&
> > > > -			(!isspace(p[-1]) || !isspace(p[l])); p++);
> > > > -		if (!p) continue;
> > > > +		/* skip ip address and canonicalize names */
> > > > +		for (p=line; *p && !isspace(*p); p++);
> > > > +		while (*p) {
> > > > +			for (; *p && isspace(*p); p++);
> > > > +			for (z=p; *z && !isspace(*z); z++);
> > > > +			c = *z;
> > > > +			*z = 0;
> > > > +			r = idnaenc(idna, p);
> > > > +			*z = c;
> > > > +			if (r == l && memcmp(idna, name, l) == 0)
> > > > +				break;
> > > > +			p = z;
> > > > +		}
> > > > +		if (!*p) continue;
> > > >  
> > > >  		/* Isolate IP address to parse */
> > > >  		for (p=line; *p && !isspace(*p); p++);
> > > > @@ -86,7 +268,7 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
> > > >  		for (; *p && isspace(*p); p++);
> > > >  		for (z=p; *z && !isspace(*z); z++);
> > > >  		*z = 0;
> > > > -		if (is_valid_hostname(p)) memcpy(canon, p, z-p+1);
> > > > +		if ((r = idnaenc(idna, p)) > 0) memcpy(canon, idna, r);
> > > >  	}
> > > >  	__fclose_ca(f);
> > > >  	return cnt ? cnt : badfam;
> > > > @@ -285,15 +467,17 @@ static int addrcmp(const void *_a, const void *_b)
> > > 
> > > Is there any reason this needs to be done, or should be done, for
> > > lookups from the hosts file? IDN/punycode is a hack for transporting
> > > unicode names on top of DNS protocol. For hosts file you can just put
> > > the proper unicode strings directly in the file.
> > 
> > My logic was that some people might have/want to have punycode in their
> > hosts file, and some might even (accidentally or otherwise) have mixed
> > punycode-unicode names written down. In any case I wanted it to Just
> > Work™ so decoding the host from punycode before comparing seemed to be
> > the easiest way to ensure it catches everything.
> > This was prompted by a paper wedding invitaion I received where the
> > couple had listed their gift registry in punycode form. This says to me
> > that people just dont know or care about this, and since the hosts file
> > is used extensively by non-developers as well I would personally prefer
> > if this worked regardless of what deranged things people might put in
> > there.
> > In fact I could imagine a lot of people shoving the punycode form in
> > there under the assumption that that would work better.
> > Also, suppose you have callers from all over the system dialing out to a
> > server but some of them call xn--foo-bar.com and others dial the unicode
> > version. Is it really reasonable that you should need to list this
> > domain twice in the hosts file for it to work?
> 
> If you want the punycode to resolve to the unicode name in hosts file,
> I would do the opposite: decode it to proper utf-8 and match that.
> Nobody should have to deal with wacky punycode encodings to make
> non-latin hostnames work. They should just work transparently and
> punycode should be an implementation detail inside layers that
> users/programmers don't see.
> 
> If I'm not mistaken, your patch as-is actually _breaks_ support for
> literal utf-8 hostnames in the hosts file.

No, it supports all combinations. It encodes every hostname found to the
punycode equivalent and compares only punycode. The encoding is done on
a per-label basis so ønskeskyen.dk is "xn--nskeskyen-k8a" and "dk" treated
entirely separately.
Suppose they had a subdomain, ønsker.ønskeskyen.dk, then this approach
would work even if someone put "ønsker.xn--nskeskyen-k8a.dk" in their
hosts file, even if you looked it up as xn--nsker-uua.ønskeskyen.dk.

There are a couple of reasons that I chose to do it this way, with the
primary being that it meant I only had to include the encoding function
and the secondary being that the encoded name will always fit in 254
characters, whereas the decoded equivalent might be larger (see below).

I don't have any ideological opposition to doing the comparison entirely
in unicode though. This was just the more practical solution for the
time being.
 
> > > >  int __lookup_name(struct address buf[static MAXADDRS], char canon[static 256], const char *name, int family, int flags)
> > > >  {
> > > > +	char _name[256];
> > > >  	int cnt = 0, i, j;
> > > >  
> > > >  	*canon = 0;
> > > >  	if (name) {
> > > > -		/* reject empty name and check len so it fits into temp bufs */
> > > > -		size_t l = strnlen(name, 255);
> > > > -		if (l-1 >= 254)
> > > > +		/* convert unicode name to RFC3492 punycode */
> > > > +		ssize_t l;
> > > > +		if ((l = idnaenc(_name, name)) <= 0)
> > > >  			return EAI_NONAME;
> > > > -		memcpy(canon, name, l+1);
> > > > +		memcpy(canon, _name, l+1);
> > > > +		name = _name;
> > > >  	}
> > > 
> > > If it's not needed for hosts backend, this code probably belongs
> > > localized to the dns lookup, rather than at the top of __lookup_name.
> > > 
> > > BTW there's perhaps also a need for the opposite-direction
> > > translation, both for ai_canonname (when a CNAME points to IDN) and
> > > for getnameinfo reverse lookups. But that can be added as a second
> > > patch I think.
> > 
> > I have already written the code for decoding as well if need be :)
> 
> Yay!
> 
> > The only problem as I see it is that a unicode name can be a hair under
> > 4 times larger (in bytes) than the punycode equivalent. Select any 4
> > byte UTF-8 character and make labels exclusively containing that. All
> > subsequent characters to the first will be encoded as an 'a'.
> > 
> > This, by the way, also means that we should probably mess with the
> > buffering when reading the hosts file.
> 
> You mean increase it? Since HOST_NAME_MAX/NI_MAXHOST is 255 and this
> is ABI, I don't think we can return names longer than 255 bytes. Such
> names probably just need to be left as punycode when coming from dns,
> and not supported in hosts files.

I'm not sure that's the correct interpretation. For example when doing
HTTP requests the browser will fill in the HTTP Host header with the
punycode name. I think the intent is to do everything with the punycoded
name and ditch utf8 permanently :(

What I meant here though was when reading from /etc/hosts the line
buffer used is 512 bytes which isn't necessarily enough for one full
domain in utf8. Take any cuneiform character of your choice, repeat it
56 times and you get a label of 63 chars that looks something like
xn--4c4daaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
but has a utf8 size of 224 bytes. You can have almost 4 of those in an
otherwise completely valid FQDN. The last one would have to lose 2 'a's
(8 bytes) making the total 63+1+63+1+63+1+61+1=254, decoding to a total
of 224+1+224+1+224+1+216+1=892 bytes.

Now I will freely admit that I don't know for sure if this is legal but
I believe that it is, and that the maximum length of 254 applies solely
to the encoded name.
I can at least say that dig will happily accept that massive domain
name and complain on a domain with one more cuneiform in the last label.
It will also complain if any one label exceeds 63 bytes post-encoding.

For the record, glibc has #define NI_MAXHOST 1025, if it matters at all.


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] IDNA support in name lookups
  2017-04-23 16:38         ` Joakim Sindholt
@ 2017-04-23 16:56           ` Rich Felker
  0 siblings, 0 replies; 7+ messages in thread
From: Rich Felker @ 2017-04-23 16:56 UTC (permalink / raw)
  To: musl

On Sun, Apr 23, 2017 at 06:38:24PM +0200, Joakim Sindholt wrote:
> > If I'm not mistaken, your patch as-is actually _breaks_ support for
> > literal utf-8 hostnames in the hosts file.
> 
> No, it supports all combinations. It encodes every hostname found to the
> punycode equivalent and compares only punycode. The encoding is done on
> a per-label basis so ønskeskyen.dk is "xn--nskeskyen-k8a" and "dk" treated
> entirely separately.
> Suppose they had a subdomain, ønsker.ønskeskyen.dk, then this approach
> would work even if someone put "ønsker.xn--nskeskyen-k8a.dk" in their
> hosts file, even if you looked it up as xn--nsker-uua.ønskeskyen.dk.

OK, that makes more sense and it's non-harmful.

> There are a couple of reasons that I chose to do it this way, with the
> primary being that it meant I only had to include the encoding function
> and the secondary being that the encoded name will always fit in 254
> characters, whereas the decoded equivalent might be larger (see below).
> 
> I don't have any ideological opposition to doing the comparison entirely
> in unicode though. This was just the more practical solution for the
> time being.

It's probably okay the way you're doing it, then. I failed to
understand that it accepts either. Main other consideration might be
whether it's noticably slow on large hosts files, but I don't think
the existing code was terribly fast anyway.

> > > > >  int __lookup_name(struct address buf[static MAXADDRS], char canon[static 256], const char *name, int family, int flags)
> > > > >  {
> > > > > +	char _name[256];
> > > > >  	int cnt = 0, i, j;
> > > > >  
> > > > >  	*canon = 0;
> > > > >  	if (name) {
> > > > > -		/* reject empty name and check len so it fits into temp bufs */
> > > > > -		size_t l = strnlen(name, 255);
> > > > > -		if (l-1 >= 254)
> > > > > +		/* convert unicode name to RFC3492 punycode */
> > > > > +		ssize_t l;
> > > > > +		if ((l = idnaenc(_name, name)) <= 0)
> > > > >  			return EAI_NONAME;
> > > > > -		memcpy(canon, name, l+1);
> > > > > +		memcpy(canon, _name, l+1);
> > > > > +		name = _name;
> > > > >  	}
> > > > 
> > > > If it's not needed for hosts backend, this code probably belongs
> > > > localized to the dns lookup, rather than at the top of __lookup_name.
> > > > 
> > > > BTW there's perhaps also a need for the opposite-direction
> > > > translation, both for ai_canonname (when a CNAME points to IDN) and
> > > > for getnameinfo reverse lookups. But that can be added as a second
> > > > patch I think.
> > > 
> > > I have already written the code for decoding as well if need be :)
> > 
> > Yay!
> > 
> > > The only problem as I see it is that a unicode name can be a hair under
> > > 4 times larger (in bytes) than the punycode equivalent. Select any 4
> > > byte UTF-8 character and make labels exclusively containing that. All
> > > subsequent characters to the first will be encoded as an 'a'.
> > > 
> > > This, by the way, also means that we should probably mess with the
> > > buffering when reading the hosts file.
> > 
> > You mean increase it? Since HOST_NAME_MAX/NI_MAXHOST is 255 and this
> > is ABI, I don't think we can return names longer than 255 bytes. Such
> > names probably just need to be left as punycode when coming from dns,
> > and not supported in hosts files.
> 
> I'm not sure that's the correct interpretation. For example when doing
> HTTP requests the browser will fill in the HTTP Host header with the
> punycode name. I think the intent is to do everything with the punycoded
> name and ditch utf8 permanently :(

That's ugly and backwards but fits with the header encoding being
ASCII. However it's not universal. The most important other place
domain names are used, in certificates, they're proper unicode;
punycode is not used there.

> What I meant here though was when reading from /etc/hosts the line
> buffer used is 512 bytes which isn't necessarily enough for one full
> domain in utf8.

But it is enough assuming we honor HOST_NAME_MAX.

> Take any cuneiform character of your choice, repeat it
> 56 times and you get a label of 63 chars that looks something like
> xn--4c4daaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
> but has a utf8 size of 224 bytes. You can have almost 4 of those in an
> otherwise completely valid FQDN. The last one would have to lose 2 'a's
> (8 bytes) making the total 63+1+63+1+63+1+61+1=254, decoding to a total
> of 224+1+224+1+224+1+216+1=892 bytes.
> 
> Now I will freely admit that I don't know for sure if this is legal but
> I believe that it is, and that the maximum length of 254 applies solely
> to the encoded name.
> I can at least say that dig will happily accept that massive domain
> name and complain on a domain with one more cuneiform in the last label.
> It will also complain if any one label exceeds 63 bytes post-encoding.
> 
> For the record, glibc has #define NI_MAXHOST 1025, if it matters at all.

Yeah, so far I've considered that a mistake. For non-DNS names it's
rather up to the implementation what they support, but I don't think
there's significant merit in supporting gratuitously/maliciously long
hostnames.

Rich


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2017-04-23 16:56 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-03-29 11:26 IDNA support in name lookups Joakim Sindholt
2017-04-02  7:30 ` [PATCH v2] " Joakim Sindholt
2017-04-23  1:01   ` Rich Felker
2017-04-23  8:14     ` Joakim Sindholt
2017-04-23 15:07       ` Rich Felker
2017-04-23 16:38         ` Joakim Sindholt
2017-04-23 16:56           ` Rich Felker

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).