Thanks, Geoff, and Erik. However... (with my 5 minute intro to Runes courtesy of Hello World doc...) we're still talking BMP, right? (I programmed in B back in the day... i.e. 1980-ish and due to a career shift have been out of things for a while, so forgive my potential obtuseness as I gradually reintegrate...!) This reminds me of what I read here: http://www.w3.org/2005/03/23-lex-U K Karljürgen G. Feuerherm, PhD Department of Archaeology and Classical Studies Wilfrid Laurier University 75 University Avenue West Waterloo, Ontario N2L 3C5 Tel. (519) 884-1970 x3193 Fax (519) 883-0991 (ATTN Arch. & Classics) >>> 28/01/2010 3:46:27 pm >>> I've extended old code using lex to accept utf by massaging the input stream, before lex sees it, to parse utf and encode non-ascii Runes into '\33' (escape) followed by 4 hex digits. A simple lex rule then decodes for the benefit of yacc. This encodes: /* * lex can't cope with character sets wider than 8 bits, so convert * s to runes and encode each multi-byte (non-ascii) rune as an escape * character ('\33') followed by 4 hex digits; a rune that occupies a * single byte of s is stored as a single byte. * result is malloced; 0 is returned if malloc fails. */ char * utf2lex(char *s) { int nb, bytes; Rune r; char *news, *p, *ds; /* pass 1: count bytes needed by the converted string: 1 per rune, plus 4 more for each multi-byte UTF sequence (escape + 4 hex digits) */ for (p = s, nb = 0; *p != '\0'; p += bytes, nb++) { bytes = chartorune(&r, p); if (bytes > 1) nb += 4; } news = malloc(nb+1); if (news != 0) { /* pass 2: convert s into new string; the terminator is placed first since the final length nb is already known */ news[nb] = '\0'; for (p = s, ds = news; *p != '\0'; p += bytes) { bytes = chartorune(&r, p); if (bytes == 1) /* NOTE(review): stores r, not *p; presumably identical for valid ascii -- confirm what chartorune yields for a malformed byte */ *ds++ = r; else /* NOTE(review): pass 1 reserved exactly 5 bytes for this rune; a value above 0xFFFF would presumably print more than 4 hex digits and overrun news -- confirm Rune width */ ds += sprint(ds, "\33%.4ux", (int)r); } } return news; } and this lex code decodes: %{ char *lex2rune(Rune *rp, char *s); char *estrdup(char *); static Rune inrune; %} E\33 %% {E}....{ yylval.charp = estrdup(lex2rune(&inrune, yytext+1)); return inrune; } %% /* * parse the hex digits at s into *rp and return the NUL-terminated * string that runetochar produces for that rune. * the result lives in a static buffer, so each call overwrites the * previous result. */ char * lex2rune(Rune *rp, char *s) { static char utf[UTFmax+1]; *rp = strtoul(s, 0, 16); utf[runetochar(utf, rp)] = '\0'; return utf; }