mailing list of musl libc
 help / color / mirror / code / Atom feed
From: Joakim Sindholt <opensource@zhasha.com>
To: musl@lists.openwall.com
Subject: IDNA support in name lookups
Date: Wed, 29 Mar 2017 13:26:29 +0200	[thread overview]
Message-ID: <20170329112629.GA3506324@wirbelwind> (raw)

[-- Attachment #1: Type: text/plain, Size: 167 bytes --]

Here's a first draft patch for internationalized domain name support.
I implemented it based on the pseudocode in RFC3492[1].

[1] https://tools.ietf.org/html/rfc3492

[-- Attachment #2: 0001-add-IDNA-support-to-name-lookups.patch --]
[-- Type: text/x-diff, Size: 6835 bytes --]

From 7542dfe05b33b200360f982caf1631615cde30fb Mon Sep 17 00:00:00 2001
From: Joakim Sindholt <opensource@zhasha.com>
Date: Wed, 29 Mar 2017 11:51:02 +0200
Subject: [PATCH] add IDNA support to name lookups

---
 src/network/lookup_name.c | 202 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 194 insertions(+), 8 deletions(-)

diff --git a/src/network/lookup_name.c b/src/network/lookup_name.c
index fb7303a..3590cb1 100644
--- a/src/network/lookup_name.c
+++ b/src/network/lookup_name.c
@@ -10,9 +10,21 @@
 #include <unistd.h>
 #include <pthread.h>
 #include <errno.h>
+#include <wchar.h>
 #include "lookup.h"
 #include "stdio_impl.h"
 #include "syscall.h"
+#include "locale_impl.h"
+
+enum {	/* Punycode parameters used by adapt() and punyenc() */
+	base         = 36,	/* number of digits: tbl in punyenc holds a-z followed by 0-9 */
+	tmin         = 1,	/* lower clamp for the per-digit threshold t in punyenc */
+	tmax         = 26,	/* upper clamp for the per-digit threshold t in punyenc */
+	skew         = 38,	/* added to the divisor of adapt()'s final bias term */
+	damp         = 700,	/* divisor applied to delta when adapt() is called with firsttime set */
+	initial_bias = 72,	/* starting value of bias in punyenc */
+	initial_n    = 128,	/* starting value of n in punyenc (0x80, the first non-ASCII code point) */
+};
 
 static int is_valid_hostname(const char *host)
 {
@@ -22,6 +34,162 @@ static int is_valid_hostname(const char *host)
 	return !*s;
 }
 
+/* Recompute the Punycode bias from delta; firsttime selects the damp divisor instead of 2. */
+static unsigned int adapt(unsigned int delta, unsigned int numpoints, int firsttime)
+{
+	const unsigned int span = base - tmin;
+	unsigned int shift;
+	delta = firsttime ? delta / damp : delta / 2;
+	delta += delta / numpoints;
+	for (shift = 0; delta > (span * tmax) / 2; shift += base)
+		delta /= span;
+	return shift + ((span + 1) * delta) / (delta + skew);
+}
+
+static ssize_t punyenc(char *dst, const char *src, size_t len, size_t max)	/* Punycode-encode the len-byte multibyte label src into dst, writing at most max bytes and no NUL; returns bytes written, or -1 on overflow or lack of space */
+{
+	static const char *const tbl = "abcdefghijklmnopqrstuvwxyz0123456789";	/* digit value -> output character */
+	const unsigned char *usrc = (void *)src;
+	unsigned int codepoints = 0;	/* after the first loop plus the addition below: total characters in the label */
+	unsigned int dlen = 0;	/* bytes written to dst so far */
+	unsigned int si, mi;
+	unsigned int n = initial_n;	/* code points equal to n are emitted on the current pass */
+	unsigned int delta = 0;
+	unsigned int bias = initial_bias;
+	unsigned int h, b;	/* h: characters handled so far; b: how many of them were ASCII */
+	for (si = 0; si < len; ++si) {	/* pass 1: copy ASCII bytes verbatim, count multibyte lead bytes */
+		if (usrc[si] < 0x80) {
+			if (dlen == max)
+				return -1;
+			dst[dlen++] = src[si];
+		} else if ((usrc[si] & 0xC0) == 0xC0) {	/* top two bits set: counted as the start of one non-ASCII character */
+			++codepoints;
+		}
+	}
+	codepoints += dlen;
+	h = b = dlen;
+	if (dlen) {	/* a '-' separates the literal ASCII part from the encoded part */
+		if (dlen == max)
+			return -1;
+		dst[dlen++] = '-';
+	}
+	while (h < codepoints) {	/* one iteration per distinct remaining code point value */
+		unsigned int m = (unsigned int)-1;	/* becomes the smallest code point >= n present in src */
+		unsigned int c;
+		wchar_t wc;
+		for (mi = 0; mi < len; ) {
+			mi += mbtowc(&wc, src + mi, len - mi);	/* result is not checked; NOTE(review): appears to rely on the caller having validated src - confirm */
+			c = (unsigned int)wc;
+			if (c >= n && c < m)
+				m = c;
+		}
+		if (((unsigned int)-1 - delta) / (h + 1) < m - n)	/* reject if (m - n) * (h + 1) would overflow delta */
+			return -1;
+		delta += (m - n) * (h + 1);
+		n = m;
+
+		for (mi = 0; mi < len; ) {
+			mi += mbtowc(&wc, src + mi, len - mi);
+			c = (unsigned int)wc;
+			if (c < n /* || c < 0x80 not necessary*/)
+				if (++delta == 0)	/* wrapped to zero: treat as overflow */
+					return -1;
+			if (c == n) {
+				unsigned int q = delta;	/* value to emit as a variable-length integer */
+				unsigned int k;
+				for (k = base; ; k += base) {
+					unsigned int t;	/* threshold for this digit, clamped to [tmin, tmax] */
+					if (k <= bias + tmin) {
+						t = tmin;
+					} else if (k >= bias + tmax) {
+						t = tmax;
+					} else {
+						t = k - bias;
+					}
+					if (q < t)
+						break;
+					if (dlen == max)
+						return -1;
+					dst[dlen++] = tbl[t + ((q - t) % (base - t))];	/* emit one non-final digit */
+					q = (q - t) / (base - t);
+				}
+				if (dlen == max)
+					return -1;
+				dst[dlen++] = tbl[q];	/* final digit: q < t here */
+				bias = adapt(delta, h + 1, h == b);	/* h == b only for the first non-ASCII character */
+				delta = 0;
+				++h;
+			}
+		}
+		++delta;
+		++n;
+	}
+	return dlen;
+}
+
+static ssize_t idnaenc(char dst[static 256], const char *src)	/* Convert host name src to ASCII form in dst, one dot-separated label at a time; returns the length written (excluding the NUL) or -1 if the name is empty, malformed or too long */
+{	/* All-ASCII labels are copied lowercased; labels with non-ASCII characters become "xn--" followed by punyenc() output. */
+	size_t left = strlen(src);	/* bytes of src not yet consumed */
+	size_t olen = 0;	/* bytes written to dst so far */
+
+	while (left) {
+		const char *dot;
+		size_t len, i;
+		int basic = 1;	/* cleared when the label contains a non-ASCII character */
+
+		dot = memchr(src, '.', left);
+		if (!dot) { dot = src + left; }	/* last label: dot points at the terminating NUL */
+		len = dot - src;
+		if (len == 0) { return -1; }	/* empty label (leading or doubled dot) */
+		left -= len + !!*dot;
+
+		for (i = 0; i < len; ) {	/* validate the label and classify it as ASCII-only or not */
+			unsigned int c;
+			wchar_t wc;
+			int n = mbtowc(&wc, src + i, len - i);
+			c = (n <= 0) ? 0 : (unsigned int)wc;	/* a failed conversion becomes 0, which the ASCII check below rejects */
+			if (c < 0x80) {
+				if (!isalnum(c) && !(i > 0 && c == '-'))	/* ASCII: letters, digits, and '-' except as first character */
+					return -1;
+			} else {
+				if (c >= 0x7F && c <= 0x9F)	/* c >= 0x80 on this path, so this rejects 0x80..0x9F */
+					return -1;
+				basic = 0;
+			}
+			i += n;
+		}
+		if (basic) {
+			if (len > 63 || olen > 254 || len > 254 - olen)	/* olen can be 255 after a dot; without the olen > 254 test the size_t subtraction wraps and the copy below overruns dst */
+				return -1;
+			for (i = 0; i < len; ++i)
+				dst[olen + i] = tolower(src[i]);
+			olen += len;
+		} else {
+			ssize_t r;
+			size_t max;	/* space punyenc may use after the 4-byte "xn--" prefix */
+			if (olen >= 254 - 4)
+				return -1;
+			max = 254 - 4 - olen;
+			if (max > 63 - 4)	/* keep the whole label, prefix included, within 63 bytes */
+				max = 63 - 4;
+			memcpy(dst + olen, "xn--", 4);
+			r = punyenc(dst + olen + 4, src, len, max);
+			if (r <= 0)
+				return -1;
+			olen += r + 4;
+		}
+		if (olen == 255 || !*dot && olen == 254)
+			return -1;
+		if (*dot)
+			dst[olen++] = *dot;	/* copy the separating (or trailing) dot */
+		src = dot + !!*dot;
+	}
+	if (olen == 0)
+		return -1;
+	dst[olen] = 0;	/* olen <= 255 here, so the terminator stays inside dst[256] */
+	return olen;
+}
+
 static int name_from_null(struct address buf[static 2], const char *name, int family, int flags)
 {
 	int cnt = 0;
@@ -61,12 +229,25 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
 		return EAI_SYSTEM;
 	}
 	while (fgets(line, sizeof line, f) && cnt < MAXADDRS) {
-		char *p, *z;
+		char idna[256];
+		ssize_t r;
+		char *p, *z, c;
 
 		if ((p=strchr(line, '#'))) *p++='\n', *p=0;
-		for(p=line+1; (p=strstr(p, name)) &&
-			(!isspace(p[-1]) || !isspace(p[l])); p++);
-		if (!p) continue;
+		/* skip ip address and canonicalize names */
+		for (p=line; *p && !isspace(*p); p++);
+		while (*p) {
+			for (; *p && isspace(*p); p++);
+			for (z=p; *z && !isspace(*z); z++);
+			c = *z;
+			*z = 0;
+			r = idnaenc(idna, p);
+			*z = c;
+			if (r == l && memcmp(idna, name, l) == 0)
+				break;
+			p = z;
+		}
+		if (!*p) continue;
 
 		/* Isolate IP address to parse */
 		for (p=line; *p && !isspace(*p); p++);
@@ -86,7 +267,7 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
 		for (; *p && isspace(*p); p++);
 		for (z=p; *z && !isspace(*z); z++);
 		*z = 0;
-		if (is_valid_hostname(p)) memcpy(canon, p, z-p+1);
+		if ((r = idnaenc(idna, p)) > 0) memcpy(canon, idna, r);
 	}
 	__fclose_ca(f);
 	return cnt ? cnt : badfam;
@@ -285,15 +466,19 @@ static int addrcmp(const void *_a, const void *_b)
 
 int __lookup_name(struct address buf[static MAXADDRS], char canon[static 256], const char *name, int family, int flags)
 {
+	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
+	char _name[256];
 	int cnt = 0, i, j;
 
+	*ploc = UTF8_LOCALE;
 	*canon = 0;
 	if (name) {
 		/* reject empty name and check len so it fits into temp bufs */
-		size_t l = strnlen(name, 255);
-		if (l-1 >= 254)
+		ssize_t l;
+		if ((l = idnaenc(_name, name)) <= 0)
 			return EAI_NONAME;
-		memcpy(canon, name, l+1);
+		memcpy(canon, _name, l+1);
+		name = _name;
 	}
 
 	/* Procedurally, a request for v6 addresses with the v4-mapped
@@ -311,6 +496,7 @@ int __lookup_name(struct address buf[static MAXADDRS], char canon[static 256], c
 		cnt = name_from_hosts(buf, canon, name, family);
 		if (!cnt) cnt = name_from_dns_search(buf, canon, name, family);
 	}
+	*ploc = loc;
 	if (cnt<=0) return cnt ? cnt : EAI_NONAME;
 
 	/* Filter/transform results for v4-mapped lookup, if requested. */
-- 
2.10.2


             reply	other threads:[~2017-03-29 11:26 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-03-29 11:26 Joakim Sindholt [this message]
2017-04-02  7:30 ` [PATCH v2] " Joakim Sindholt
2017-04-23  1:01   ` Rich Felker
2017-04-23  8:14     ` Joakim Sindholt
2017-04-23 15:07       ` Rich Felker
2017-04-23 16:38         ` Joakim Sindholt
2017-04-23 16:56           ` Rich Felker

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170329112629.GA3506324@wirbelwind \
    --to=opensource@zhasha.com \
    --cc=musl@lists.openwall.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).