From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <zsh-workers-return-18343-mason-zsh=primenet.com.au@sunsite.dk>
Received: (qmail 24195 invoked from network); 12 Mar 2003 09:35:02 -0000
Received: from sunsite.dk (130.225.247.90)
  by ns1.primenet.com.au with SMTP; 12 Mar 2003 09:35:02 -0000
Received: (qmail 13168 invoked by alias); 12 Mar 2003 09:34:56 -0000
Mailing-List: contact zsh-workers-help@sunsite.dk; run by ezmlm
Precedence: bulk
X-No-Archive: yes
X-Seq: 18343
Received: (qmail 13160 invoked from network); 12 Mar 2003 09:34:55 -0000
Received: from localhost (HELO sunsite.dk) (127.0.0.1)
  by localhost with SMTP; 12 Mar 2003 09:34:55 -0000
X-MessageWall-Score: 0 (sunsite.dk)
Received: from [212.125.75.4] by sunsite.dk (MessageWall 1.0.8) with SMTP; 12 Mar 2003 9:34:55 -0000
Received: (qmail 3697 invoked from network); 12 Mar 2003 09:34:49 -0000
Received: from iris.logica.co.uk (158.234.9.163)
  by server-23.tower-1.messagelabs.com with SMTP; 12 Mar 2003 09:34:49 -0000
Received: from finches.logica.co.uk ([158.234.142.11])
	by iris.logica.co.uk (8.9.3/8.9.3/Debian 8.9.3-21) with ESMTP id JAA30588
	for <zsh-workers@sunsite.dk>; Wed, 12 Mar 2003 09:34:48 GMT
X-Authentication-Warning: iris.logica.co.uk: Host [158.234.142.11] claimed to be finches.logica.co.uk
Received: from finches.logica.co.uk (localhost [127.0.0.1])
	by finches.logica.co.uk (8.11.6/8.11.6/SuSE Linux 0.5) with ESMTP id h2C9ce831420
	for <zsh-workers@sunsite.dk>; Wed, 12 Mar 2003 10:38:40 +0100
X-VirusChecked: Checked
From: Oliver Kiddle <okiddle@yahoo.co.uk>
To: Zsh workers <zsh-workers@sunsite.dk>
Subject: PATCH: support \u and \U in echo/print/$''
Date: Wed, 12 Mar 2003 10:38:40 +0100
Message-ID: <31418.1047461920@finches.logica.co.uk>

This implements the \u and \U escapes to specify characters by their
unicode numbers. \u/\U exists in various other things such as C99,
Python, Tcl, JavaScript, SGML and TeX and is also handled by the printf
in GNU coreutils so it is pretty standard (only Perl uses something
different). The characters are converted to the charset used by the
current locale.

The error handling should perhaps be done differently - printing the
'\unnnn' instead of one or other of the error messages. It should probably
be consistent with \xx - currently \xZZ prints `ZZ'. I'd have thought
`\xZZ' would be better. sh, ksh, bash and coreutils vary on this. Any
thoughts?
And is it a bug or a feature that you can do \x-3?

Also, should this be disabled in sh emulation? Is it okay to just do
that with an (emulation != EMULATE_SH) check (adding an option would
seem a bit excessive for this).

Autoconf tests could probably be improved. Trying to use the iconv.m4
that comes with GNU gettext was too complicated for me.

Oliver

Index: zshconfig.ac
===================================================================
RCS file: /cvsroot/zsh/zsh/zshconfig.ac,v
retrieving revision 1.32
diff -u -r1.32 zshconfig.ac
--- zshconfig.ac	26 Feb 2003 18:12:00 -0000	1.32
+++ zshconfig.ac	12 Mar 2003 09:23:46 -0000
@@ -494,7 +494,7 @@
 		 limits.h fcntl.h libc.h sys/utsname.h sys/resource.h \
 		 locale.h errno.h stdio.h stdlib.h unistd.h sys/capability.h \
 		 utmp.h utmpx.h sys/types.h pwd.h grp.h poll.h sys/mman.h \
-		 netinet/in_systm.h pcre.h langinfo.h)
+		 netinet/in_systm.h pcre.h langinfo.h wchar.h)
 if test $dynamic = yes; then
   AC_CHECK_HEADERS(dlfcn.h)
   AC_CHECK_HEADERS(dl.h)
@@ -663,6 +663,8 @@
 
 AC_CHECK_LIB(socket, socket)
 
+AC_CHECK_LIB(iconv, iconv)
+
 dnl pcre-config should probably be employed here
 AC_SEARCH_LIBS(pcre_compile, pcre)
 
@@ -959,7 +961,8 @@
 	       tgetent tigetflag tigetnum tigetstr setupterm \
 	       pcre_compile pcre_study pcre_exec \
 	       nl_langinfo \
-	       erand48 open_memstream)
+	       erand48 open_memstream \
+	       wctomb iconv)
 AC_FUNC_STRCOLL
 
 dnl  Check if tgetent accepts NULL (and will allocate its own termcap buffer)
Index: Doc/Zsh/builtins.yo
===================================================================
RCS file: /cvsroot/zsh/zsh/Doc/Zsh/builtins.yo,v
retrieving revision 1.58
diff -u -r1.58 builtins.yo
--- Doc/Zsh/builtins.yo	17 Feb 2003 10:08:03 -0000	1.58
+++ Doc/Zsh/builtins.yo	12 Mar 2003 09:23:46 -0000
@@ -278,6 +278,8 @@
 sitem(tt(\\))(backslash)
 sitem(tt(\0)var(NNN))(character code in octal)
 sitem(tt(\x)var(NN))(character code in hexadecimal)
+sitem(tt(\u)var(NNNN))(unicode character code in hexadecimal)
+sitem(tt(\U)var(NNNNNNNN))(unicode character code in hexadecimal)
 endsitem()
 
 pindex(BSD_ECHO, use of)
Index: Src/utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/utils.c,v
retrieving revision 1.44
diff -u -r1.44 utils.c
--- Src/utils.c	5 Feb 2003 11:57:09 -0000	1.44
+++ Src/utils.c	12 Mar 2003 09:23:46 -0000
@@ -30,6 +30,15 @@
 #include "zsh.mdh"
 #include "utils.pro"
 
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB)
+#include <wchar.h>
+#  ifndef __STDC_ISO_10646__
+#    if defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
+#      include <iconv.h>
+#    endif
+#  endif
+#endif
+
 /* name of script being sourced */
 
 /**/
@@ -3274,7 +3283,8 @@
  *       for no newlines.
  *   3:  As 1, but don't handle \c.
  *   4:  Do $'...' quoting.  Overwrites the existing string instead of
- *       zhalloc'ing 
+ *       zhalloc'ing. If \uNNNN ever generates multi-byte chars longer
+ *       than 6 bytes, will need to adjust this to re-allocate memory.
  *   5:  As 2, but \- is special.  Expects misc to be defined.
  *   6:  As 2, but parses only one character and returns end-pointer
  *       and parsed character in *misc
@@ -3288,11 +3298,28 @@
     char *t, *u = NULL;
     char svchar = '\0';
     int meta = 0, control = 0;
+    int i;
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB)
+#  ifdef __STDC_ISO_10646__
+    wint_t wval;
+#  elif defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
+    unsigned int wval;
+    iconv_t cd;
+    char inbuf[4];
+    wchar_t outbuf[1];
+    size_t inbytes, outbytes;
+    char *inptr, *outptr;
+#  endif
+    size_t count;
+    size_t buflen = MB_LEN_MAX * (strlen(s) / 6) + (strlen(s) % 6) + 1;
+#else
+    size_t buflen = strlen(s) + 1;
+#endif
 
     if (fromwhere == 6)
 	t = buf = tmp;
     else if (fromwhere != 4)
-	t = buf = zhalloc(strlen(s) + 1);
+	t = buf = zhalloc(buflen);
     else {
 	t = buf = s;
 	s += 2;
@@ -3363,6 +3390,73 @@
 		    *misc = 1;
 		    break;
 		}
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB)
+#if defined(__STDC_ISO_10646__) || defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
+	    case 'u':
+	    case 'U':
+	    	wval = 0;
+		for (i=(*s == 'u' ? 4 : 8); i>0; i--) {
+		    if (*++s && idigit(*s))
+		        wval = wval * 16 + (*s - '0');
+		    else if (*s && (*s >= 'a' && *s <= 'f') ||
+		            (*s >= 'A' && *s <= 'F'))
+		        wval = wval * 16 + (*s & 0x1f) + 9;
+		    else {
+		        zerr("expected hexadecimal digit", NULL, 0);
+			if (fromwhere == 4) {
+			    for (u = t; (*u++ = *++s););
+			    return t;
+			}
+			*t = '\0';
+			*len = t - buf;
+			return buf;
+		    }
+		}
+    	    	if (fromwhere == 6) {
+		    *misc = wval;
+		    return s+1;
+		}
+#ifdef __STDC_ISO_10646__
+		count = wctomb(t, (wchar_t)wval);
+#elif defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
+    	    	inbytes = outbytes = 4;
+    	    	inptr = inbuf;
+    	    	outptr = (char *)outbuf;
+		/* assume big endian convention for UCS-4 */
+		for (i=3;i>=0;i--) {
+		    inbuf[i] = wval & 0xff;
+		    wval >>= 8;
+		}
+    	    	
+    	    	cd = iconv_open("WCHAR_T", "ISO-10646");
+		if (cd == (iconv_t)-1) {
+		    zerr("cannot do charset conversion", NULL, 0);
+		    if (fromwhere == 4) {
+			for (u = t; (*u++ = *++s););
+			return t;
+		    }
+		    *t = '\0';
+		    *len = t - buf;
+		    return buf;
+		}
+                iconv(cd, &inptr, &inbytes, &outptr, &outbytes);
+		iconv_close(cd);
+		count = wctomb(t, *outbuf);
+#endif
+		if (count == (size_t)-1) {
+		    zerr("character not in range", NULL, 0);
+		    if (fromwhere == 4) {
+			for (u = t; (*u++ = *++s););
+			return t;
+		    }
+		    *t = '\0';
+		    *len = t - buf;
+		    return buf;
+		}
+		t += count;  
+		continue;
+#endif
+#endif
 	    default:
 	    def:
 		if ((idigit(*s) && *s < '8') || *s == 'x') {