From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 24195 invoked from network); 12 Mar 2003 09:35:02 -0000 Received: from sunsite.dk (130.225.247.90) by ns1.primenet.com.au with SMTP; 12 Mar 2003 09:35:02 -0000 Received: (qmail 13168 invoked by alias); 12 Mar 2003 09:34:56 -0000 Mailing-List: contact zsh-workers-help@sunsite.dk; run by ezmlm Precedence: bulk X-No-Archive: yes X-Seq: 18343 Received: (qmail 13160 invoked from network); 12 Mar 2003 09:34:55 -0000 Received: from localhost (HELO sunsite.dk) (127.0.0.1) by localhost with SMTP; 12 Mar 2003 09:34:55 -0000 X-MessageWall-Score: 0 (sunsite.dk) Received: from [212.125.75.4] by sunsite.dk (MessageWall 1.0.8) with SMTP; 12 Mar 2003 9:34:55 -0000 Received: (qmail 3697 invoked from network); 12 Mar 2003 09:34:49 -0000 Received: from iris.logica.co.uk (158.234.9.163) by server-23.tower-1.messagelabs.com with SMTP; 12 Mar 2003 09:34:49 -0000 Received: from finches.logica.co.uk ([158.234.142.11]) by iris.logica.co.uk (8.9.3/8.9.3/Debian 8.9.3-21) with ESMTP id JAA30588 for ; Wed, 12 Mar 2003 09:34:48 GMT X-Authentication-Warning: iris.logica.co.uk: Host [158.234.142.11] claimed to be finches.logica.co.uk Received: from finches.logica.co.uk (localhost [127.0.0.1]) by finches.logica.co.uk (8.11.6/8.11.6/SuSE Linux 0.5) with ESMTP id h2C9ce831420 for ; Wed, 12 Mar 2003 10:38:40 +0100 X-VirusChecked: Checked From: Oliver Kiddle To: Zsh workers Subject: PATCH: support \u and \U in echo/print/$'' Date: Wed, 12 Mar 2003 10:38:40 +0100 Message-ID: <31418.1047461920@finches.logica.co.uk> This implements the \u and \U escapes to specify characters by their unicode numbers. \u/\U exists in various other things such as C99, Python, Tcl, JavaScript, SGML and TeX and is also handled by the printf in GNU coreutils so it is pretty standard (only Perl uses something different). The characters are converted to the charset used by the current locale. 
The error handling should perhaps be done differently - printing the '\unnnn' instead of one or the other of the error messages. It should probably be consistent with \xx - currently \xZZ prints `ZZ'. I'd have thought `\xZZ' would be better. sh, ksh, bash and coreutils vary on this. Any thoughts? And is it a bug or a feature that you can do \x-3? Also, should this be disabled in sh emulation? Is it okay to just do that with an (emulation != EMULATE_SH) check (adding an option would seem a bit excessive for this)? Autoconf tests could probably be improved. Trying to use the iconv.m4 that comes with GNU gettext was too complicated for me. Oliver Index: zshconfig.ac =================================================================== RCS file: /cvsroot/zsh/zsh/zshconfig.ac,v retrieving revision 1.32 diff -u -r1.32 zshconfig.ac --- zshconfig.ac 26 Feb 2003 18:12:00 -0000 1.32 +++ zshconfig.ac 12 Mar 2003 09:23:46 -0000 @@ -494,7 +494,7 @@ limits.h fcntl.h libc.h sys/utsname.h sys/resource.h \ locale.h errno.h stdio.h stdlib.h unistd.h sys/capability.h \ utmp.h utmpx.h sys/types.h pwd.h grp.h poll.h sys/mman.h \ - netinet/in_systm.h pcre.h langinfo.h) + netinet/in_systm.h pcre.h langinfo.h wchar.h) if test $dynamic = yes; then AC_CHECK_HEADERS(dlfcn.h) AC_CHECK_HEADERS(dl.h) @@ -663,6 +663,8 @@ AC_CHECK_LIB(socket, socket) +AC_CHECK_LIB(iconv, iconv) + dnl pcre-config should probably be employed here AC_SEARCH_LIBS(pcre_compile, pcre) @@ -959,7 +961,8 @@ tgetent tigetflag tigetnum tigetstr setupterm \ pcre_compile pcre_study pcre_exec \ nl_langinfo \ - erand48 open_memstream) + erand48 open_memstream \ + wctomb iconv) AC_FUNC_STRCOLL dnl Check if tgetent accepts NULL (and will allocate its own termcap buffer) Index: Doc/Zsh/builtins.yo =================================================================== RCS file: /cvsroot/zsh/zsh/Doc/Zsh/builtins.yo,v retrieving revision 1.58 diff -u -r1.58 builtins.yo --- Doc/Zsh/builtins.yo 17 Feb 2003 10:08:03 -0000 1.58 +++ 
Doc/Zsh/builtins.yo 12 Mar 2003 09:23:46 -0000 @@ -278,6 +278,8 @@ sitem(tt(\\))(backslash) sitem(tt(\0)var(NNN))(character code in octal) sitem(tt(\x)var(NN))(character code in hexadecimal) +sitem(tt(\u)var(NNNN))(unicode character code in hexadecimal) +sitem(tt(\U)var(NNNNNNNN))(unicode character code in hexadecimal) endsitem() pindex(BSD_ECHO, use of) Index: Src/utils.c =================================================================== RCS file: /cvsroot/zsh/zsh/Src/utils.c,v retrieving revision 1.44 diff -u -r1.44 utils.c --- Src/utils.c 5 Feb 2003 11:57:09 -0000 1.44 +++ Src/utils.c 12 Mar 2003 09:23:46 -0000 @@ -30,6 +30,15 @@ #include "zsh.mdh" #include "utils.pro" +#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) +#include <wchar.h> +# ifndef __STDC_ISO_10646__ +# if defined(HAVE_ICONV) || defined(HAVE_LIBICONV) +# include <iconv.h> +# endif +# endif +#endif + /* name of script being sourced */ /**/ @@ -3274,7 +3283,8 @@ * for no newlines. * 3: As 1, but don't handle \c. * 4: Do $'...' quoting. Overwrites the existing string instead of - * zhalloc'ing + * zhalloc'ing. If \uNNNN ever generates multi-byte chars longer + * than 6 bytes, will need to adjust this to re-allocate memory. * 5: As 2, but \- is special. Expects misc to be defined. 
* 6: As 2, but parses only one character and returns end-pointer * and parsed character in *misc @@ -3288,11 +3298,28 @@ char *t, *u = NULL; char svchar = '\0'; int meta = 0, control = 0; + int i; +#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) +# ifdef __STDC_ISO_10646__ + wint_t wval; +# elif defined(HAVE_ICONV) || defined(HAVE_LIBICONV) + unsigned int wval; + iconv_t cd; + char inbuf[4]; + wchar_t outbuf[1]; + size_t inbytes, outbytes; + char *inptr, *outptr; +# endif + size_t count; + size_t buflen = MB_LEN_MAX * (strlen(s) / 6) + (strlen(s) % 6) + 1; +#else + size_t buflen = strlen(s) + 1; +#endif if (fromwhere == 6) t = buf = tmp; else if (fromwhere != 4) - t = buf = zhalloc(strlen(s) + 1); + t = buf = zhalloc(buflen); else { t = buf = s; s += 2; @@ -3363,6 +3390,73 @@ *misc = 1; break; } +#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) +#if defined(__STDC_ISO_10646__) || defined(HAVE_ICONV) || defined(HAVE_LIBICONV) + case 'u': + case 'U': + wval = 0; + for (i=(*s == 'u' ? 4 : 8); i>0; i--) { + if (*++s && idigit(*s)) + wval = wval * 16 + (*s - '0'); + else if (*s && (*s >= 'a' && *s <= 'f') || + (*s >= 'A' && *s <= 'F')) + wval = wval * 16 + (*s & 0x1f) + 9; + else { + zerr("expected hexadecimal digit", NULL, 0); + if (fromwhere == 4) { + for (u = t; (*u++ = *++s);); + return t; + } + *t = '\0'; + *len = t - buf; + return buf; + } + } + if (fromwhere == 6) { + *misc = wval; + return s+1; + } +#ifdef __STDC_ISO_10646__ + count = wctomb(t, (wchar_t)wval); +#elif defined(HAVE_ICONV) || defined(HAVE_LIBICONV) + inbytes = outbytes = 4; + inptr = inbuf; + outptr = (char *)outbuf; + /* assume big endian convention for UCS-4 */ + for (i=3;i>=0;i--) { + inbuf[i] = wval & 0xff; + wval >>= 8; + } + + cd = iconv_open("WCHAR_T", "ISO-10646"); + if (cd == (iconv_t)-1) { + zerr("cannot do charset conversion", NULL, 0); + if (fromwhere == 4) { + for (u = t; (*u++ = *++s);); + return t; + } + *t = '\0'; + *len = t - buf; + return buf; + } + iconv(cd, &inptr, &inbytes, 
&outptr, &outbytes); + iconv_close(cd); + count = wctomb(t, *outbuf); +#endif + if (count == (size_t)-1) { + zerr("character not in range", NULL, 0); + if (fromwhere == 4) { + for (u = t; (*u++ = *++s);); + return t; + } + *t = '\0'; + *len = t - buf; + return buf; + } + t += count; + continue; +#endif +#endif default: def: if ((idigit(*s) && *s < '8') || *s == 'x') {