From mboxrd@z Thu Jan  1 00:00:00 1970
Message-ID: <982374feab1ff1d8ea2f176256d16934@plan9.bell-labs.com>
To: 9fans@9fans.net
Date: Thu, 28 Jan 2010 15:46:27 -0500
From: geoff@plan9.bell-labs.com
In-Reply-To: <4B61A280020000CC0001D4A1@wlgw07.wlu.ca>
MIME-Version: 1.0
Content-Type: text/plain; charset="US-ASCII"
Content-Transfer-Encoding: 7bit
Subject: Re: [9fans] Lex, Yacc, Unicode Plane 1
Topicbox-Message-UUID: c9c5536c-ead5-11e9-9d60-3106f5b1d025

I've extended old code using lex to accept utf by massaging the input
stream, before lex sees it, to parse utf and encode non-ascii Runes
into '\33' (escape) followed by 4 hex digits.  A simple lex rule then
decodes for the benefit of yacc.

This encodes:

/*
 * lex can't cope with character sets wider than 8 bits, so convert
 * s to runes and encode non-ascii runes as <esc><hex><hex><hex><hex>.
 * ascii bytes are copied through unchanged.
 *
 * a byte that is not valid UTF decodes (via chartorune) to the error
 * rune 0xFFFD with a length of 1; it is encoded as <esc>fffd rather
 * than being truncated to a stray 8-bit byte.  runes that do not fit
 * in 4 hex digits (possible where Rune is wider than 16 bits) are
 * also replaced by 0xFFFD, so every escape is exactly 5 bytes: that
 * is what pass 1 allocates and what a fixed-width decoder expects.
 *
 * NOTE(review): a literal <esc> byte in s is copied through as-is and
 * cannot be told apart from the start of an encoded rune.
 *
 * result is malloced; returns 0 if malloc fails.
 */
char *
utf2lex(char *s)
{
	int nb, bytes;
	Rune r;
	char *news, *p, *ds;

	/* pass 1: count bytes needed by the converted string; watch for UTF */
	for (p = s, nb = 0; *p != '\0'; p += bytes, nb++) {
		bytes = chartorune(&r, p);
		if (r >= 0x80)		/* non-ascii: <esc> plus 4 hex digits */
			nb += 4;
	}
	news = malloc(nb+1);
	if (news == 0)
		return 0;

	/* pass 2: convert s into new string */
	for (p = s, ds = news; *p != '\0'; p += bytes) {
		bytes = chartorune(&r, p);
		if (r < 0x80)
			*ds++ = r;
		else {
			/* keep the escape at exactly 4 hex digits */
			if ((unsigned long)r > 0xFFFF)
				r = 0xFFFD;
			ds += sprint(ds, "\33%.4ux", (int)r);
		}
	}
	*ds = '\0';
	return news;
}

and this lex code decodes:

%{
/*
 * declarations for decoding runes that were encoded, before lex saw
 * the input, as <esc> followed by 4 hex digits.
 */

/* decodes the hex digits at s into *rp; returns the rune as a UTF string */
char *lex2rune(Rune *rp, char *s);
/* NOTE(review): not defined here; presumably a strdup that checks for failure */
char *estrdup(char *);

/* the rune most recently decoded by the rule below */
static Rune inrune;
%}
E	\33
%%
{E}....			{
			/*
			 * <esc> followed by any four characters: skip the
			 * <esc> (yytext+1) and decode the rest.  a copy of
			 * the rune's UTF string is stored in yylval.charp
			 * and the rune itself is returned as the token.
			 */
			yylval.charp = estrdup(lex2rune(&inrune, yytext+1));
			return inrune;
			}
%%
/*
 * decode the hex digits at s (the text following <esc> in an encoded
 * rune) into *rp, and return the rune's UTF encoding as a
 * NUL-terminated string.
 *
 * the result lives in a static buffer that is overwritten by the next
 * call, so callers must copy it if they need to keep it.
 *
 * if s does not start with hex digits, or decodes to rune 0, *rp is
 * set to the error rune 0xFFFD instead: the lex rule returns *rp as
 * its token, and a token of 0 would tell the parser that input has
 * ended.
 */
char *
lex2rune(Rune *rp, char *s)
{
	static char utf[UTFmax+1];
	char *end;
	unsigned long v;

	v = strtoul(s, &end, 16);
	if (end == s || v == 0)
		v = 0xFFFD;	/* malformed escape: substitute the error rune */
	*rp = v;
	utf[runetochar(utf, rp)] = '\0';
	return utf;
}