From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 2142 invoked from network); 13 May 2003 11:48:26 -0000 Received: from sunsite.dk (130.225.247.90) by ns1.primenet.com.au with SMTP; 13 May 2003 11:48:26 -0000 Received: (qmail 16336 invoked by alias); 13 May 2003 09:01:43 -0000 Mailing-List: contact zsh-workers-help@sunsite.dk; run by ezmlm Precedence: bulk X-No-Archive: yes X-Seq: 18525 Received: (qmail 16325 invoked from network); 13 May 2003 09:01:42 -0000 Received: from localhost (HELO sunsite.dk) (127.0.0.1) by localhost with SMTP; 13 May 2003 09:01:42 -0000 X-MessageWall-Score: 0 (sunsite.dk) Received: from [212.125.75.4] by sunsite.dk (MessageWall 1.0.8) with SMTP; 13 May 2003 9:1:42 -0000 Received: (qmail 13396 invoked from network); 13 May 2003 08:58:32 -0000 Received: from iris.logica.co.uk (158.234.9.163) by server-19.tower-1.messagelabs.com with SMTP; 13 May 2003 08:58:31 -0000 Received: from gmcs3.local ([158.234.142.61]) by iris.logica.co.uk (8.9.3/8.9.3/Debian 8.9.3-21) with ESMTP id JAA02539 for ; Tue, 13 May 2003 09:58:31 +0100 X-Authentication-Warning: iris.logica.co.uk: Host [158.234.142.61] claimed to be gmcs3.local Received: from gmcs3.local (localhost [127.0.0.1]) by gmcs3.local (8.11.6/8.11.6/SuSE Linux 0.5) with ESMTP id h4D8wod01620 for ; Tue, 13 May 2003 10:58:50 +0200 X-VirusChecked: Checked From: Oliver Kiddle To: Zsh workers Subject: PATCH: improvements to \u and \U Date: Tue, 13 May 2003 10:58:50 +0200 Message-ID: <1618.1052816330@gmcs3.local> This is an attempt to give the \u and \U code a better chance of working on a variety of platforms. I've added a manual Unicode (UCS-4) to UTF-8 conversion, which is used when nl_langinfo(CODESET) reports UTF-8. I've also changed the iconv fallback to convert to the codeset named by nl_langinfo(CODESET) rather than to WCHAR_T. I may yet change it to convert from UTF-8 instead of UCS-4. 
Oliver Index: Src/utils.c =================================================================== RCS file: /cvsroot/zsh/zsh/Src/utils.c,v retrieving revision 1.50 diff -u -r1.50 utils.c --- Src/utils.c 12 May 2003 11:45:30 -0000 1.50 +++ Src/utils.c 13 May 2003 08:51:40 -0000 @@ -30,13 +30,15 @@ #include "zsh.mdh" #include "utils.pro" -#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) -#include -# ifndef __STDC_ISO_10646__ -# if defined(HAVE_ICONV) || defined(HAVE_LIBICONV) -# include -# endif -# endif +#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined (__STDC_ISO_10646__) +# include +#else +# ifdef HAVE_LANGINFO_H +# include +# if defined(HAVE_ICONV) || defined(HAVE_LIBICONV) +# include +# endif +# endif #endif /* name of script being sourced */ @@ -3271,6 +3273,42 @@ } #endif +# if defined(HAVE_NL_LANGINFO) && defined(CODESET) && !defined(__STDC_ISO_10646__) +/* Convert a character from UCS4 encoding to UTF-8 */ + +size_t +ucs4toutf8(char *dest, unsigned int wval) +{ + size_t len; + + if (wval < 0x80) + len = 1; + else if (wval < 0x800) + len = 2; + else if (wval < 0x10000) + len = 3; + else if (wval < 0x200000) + len = 4; + else if (wval < 0x4000000) + len = 5; + else + len = 6; + + switch (len) { /* falls through except to the last case */ + case 6: dest[5] = (wval & 0x3f) | 0x80; wval >>= 6; + case 5: dest[4] = (wval & 0x3f) | 0x80; wval >>= 6; + case 4: dest[3] = (wval & 0x3f) | 0x80; wval >>= 6; + case 3: dest[2] = (wval & 0x3f) | 0x80; wval >>= 6; + case 2: dest[1] = (wval & 0x3f) | 0x80; wval >>= 6; + *dest = wval | (0xfc << (6 - len)) & 0xfc; + break; + case 1: *dest = wval; + } + + return len; +} +#endif + /* * Decode a key string, turning it into the literal characters. * The length is returned in len. 
@@ -3299,18 +3337,18 @@ char svchar = '\0'; int meta = 0, control = 0; int i; -#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) -# ifdef __STDC_ISO_10646__ +#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__) wint_t wval; -# elif defined(HAVE_ICONV) || defined(HAVE_LIBICONV) + size_t count; +#else unsigned int wval; +# if defined(HAVE_NL_LANGINFO) && defined(CODESET) && (defined(HAVE_ICONV) || defined(HAVE_LIBICONV)) iconv_t cd; char inbuf[4]; - wchar_t outbuf[1]; size_t inbytes, outbytes; - char *inptr, *outptr; -# endif + char *inptr; size_t count; +# endif #endif if (fromwhere == 6) @@ -3387,8 +3425,6 @@ *misc = 1; break; } -#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) -#if defined(__STDC_ISO_10646__) || defined(HAVE_ICONV) || defined(HAVE_LIBICONV) case 'u': case 'U': wval = 0; @@ -3407,21 +3443,10 @@ *misc = wval; return s+1; } -#ifdef __STDC_ISO_10646__ +#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__) count = wctomb(t, (wchar_t)wval); -#elif defined(HAVE_ICONV) || defined(HAVE_LIBICONV) - inbytes = outbytes = 4; - inptr = inbuf; - outptr = (char *)outbuf; - /* assume big endian convention for UCS-4 */ - for (i=3;i>=0;i--) { - inbuf[i] = wval & 0xff; - wval >>= 8; - } - - cd = iconv_open("WCHAR_T", "ISO-10646"); - if (cd == (iconv_t)-1) { - zerr("cannot do charset conversion", NULL, 0); + if (count == (size_t)-1) { + zerr("character not in range", NULL, 0); if (fromwhere == 4) { for (u = t; (*u++ = *++s);); return t; @@ -3430,24 +3455,58 @@ *len = t - buf; return buf; } - iconv(cd, (const char **)&inptr, &inbytes, &outptr, &outbytes); - iconv_close(cd); - count = wctomb(t, *outbuf); -#endif - if (count == (size_t)-1) { - zerr("character not in range", NULL, 0); - if (fromwhere == 4) { - for (u = t; (*u++ = *++s);); - return t; + t += count; + continue; +# else +# if defined(HAVE_NL_LANGINFO) && defined(CODESET) + if (!strcmp(nl_langinfo(CODESET), "UTF-8")) { + t += ucs4toutf8(t, wval); + 
continue; + } else { +# if defined(HAVE_ICONV) || defined(HAVE_LIBICONV) + inbytes = 4; + outbytes = 6; + inptr = inbuf; + /* assume big endian convention for UCS-4 */ + for (i=3;i>=0;i--) { + inbuf[i] = wval & 0xff; + wval >>= 8; } + + cd = iconv_open(nl_langinfo(CODESET), "ISO-10646"); + if (cd == (iconv_t)-1) { + zerr("cannot do charset conversion", NULL, 0); + if (fromwhere == 4) { + for (u = t; (*u++ = *++s);); + return t; + } + *t = '\0'; + *len = t - buf; + return buf; + } + count = iconv(cd, (char **)&inptr, &inbytes, &t, &outbytes); + iconv_close(cd); + if (count == (size_t)-1) { + zerr("cannot do charset conversion", NULL, 0); + *t = '\0'; + *len = t - buf; + return buf; + } + continue; +# else + zerr("cannot do charset conversion", NULL, 0); *t = '\0'; *len = t - buf; return buf; +# endif } - t += count; - continue; -#endif -#endif +# else + zerr("cannot do charset conversion", NULL, 0); + *t = '\0'; + *len = t - buf; + return buf; +# endif +# endif default: def: if ((idigit(*s) && *s < '8') || *s == 'x') {