PATCH: support \u and \U in echo/print/$''

zsh-workers
 help / color / mirror / code / Atom feed

* PATCH: support \u and \U in echo/print/$''
@ 2003-03-12  9:38 Oliver Kiddle
  2003-03-13  9:47 ` Oliver Kiddle
  0 siblings, 1 reply; 2+ messages in thread
From: Oliver Kiddle @ 2003-03-12  9:38 UTC (permalink / raw)
  To: Zsh workers

This implements the \u and \U escapes to specify characters by their
Unicode numbers. \u/\U exist in various other things such as C99,
Python, Tcl, JavaScript, SGML and TeX, and are also handled by the printf
in GNU coreutils, so they are pretty standard (only Perl uses something
different). The characters are converted to the charset used by the
current locale.

The error handling should perhaps be done differently - printing the
'\unnnn' instead of one or other of the error messages. It should probably
be consistent with \xx - currently \xZZ prints `ZZ'. I'd have thought
`\xZZ' would be better. sh, ksh, bash and coreutils vary on this. Any
thoughts?
And is it a bug or a feature that you can do \x-3?

Also, should this be disabled in sh emulation? Is it okay to just do
that with an (emulation != EMULATE_SH) check? (Adding an option would
seem a bit excessive for this.)

Autoconf tests could probably be improved. Trying to use the iconv.m4
that comes with GNU gettext was too complicated for me.

Oliver

Index: zshconfig.ac
===================================================================
RCS file: /cvsroot/zsh/zsh/zshconfig.ac,v
retrieving revision 1.32
diff -u -r1.32 zshconfig.ac
--- zshconfig.ac	26 Feb 2003 18:12:00 -0000	1.32
+++ zshconfig.ac	12 Mar 2003 09:23:46 -0000
@@ -494,7 +494,7 @@
 		 limits.h fcntl.h libc.h sys/utsname.h sys/resource.h \
 		 locale.h errno.h stdio.h stdlib.h unistd.h sys/capability.h \
 		 utmp.h utmpx.h sys/types.h pwd.h grp.h poll.h sys/mman.h \
-		 netinet/in_systm.h pcre.h langinfo.h)
+		 netinet/in_systm.h pcre.h langinfo.h wchar.h)
 if test $dynamic = yes; then
   AC_CHECK_HEADERS(dlfcn.h)
   AC_CHECK_HEADERS(dl.h)
@@ -663,6 +663,8 @@
 
 AC_CHECK_LIB(socket, socket)
 
+AC_CHECK_LIB(iconv, iconv)
+
 dnl pcre-config should probably be employed here
 AC_SEARCH_LIBS(pcre_compile, pcre)
 
@@ -959,7 +961,8 @@
 	       tgetent tigetflag tigetnum tigetstr setupterm \
 	       pcre_compile pcre_study pcre_exec \
 	       nl_langinfo \
-	       erand48 open_memstream)
+	       erand48 open_memstream \
+	       wctomb iconv)
 AC_FUNC_STRCOLL
 
 dnl  Check if tgetent accepts NULL (and will allocate its own termcap buffer)
Index: Doc/Zsh/builtins.yo
===================================================================
RCS file: /cvsroot/zsh/zsh/Doc/Zsh/builtins.yo,v
retrieving revision 1.58
diff -u -r1.58 builtins.yo
--- Doc/Zsh/builtins.yo	17 Feb 2003 10:08:03 -0000	1.58
+++ Doc/Zsh/builtins.yo	12 Mar 2003 09:23:46 -0000
@@ -278,6 +278,8 @@
 sitem(tt(\\))(backslash)
 sitem(tt(\0)var(NNN))(character code in octal)
 sitem(tt(\x)var(NN))(character code in hexadecimal)
+sitem(tt(\u)var(NNNN))(unicode character code in hexadecimal)
+sitem(tt(\U)var(NNNNNNNN))(unicode character code in hexadecimal)
 endsitem()
 
 pindex(BSD_ECHO, use of)
Index: Src/utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/utils.c,v
retrieving revision 1.44
diff -u -r1.44 utils.c
--- Src/utils.c	5 Feb 2003 11:57:09 -0000	1.44
+++ Src/utils.c	12 Mar 2003 09:23:46 -0000
@@ -30,6 +30,15 @@
 #include "zsh.mdh"
 #include "utils.pro"
 
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB)
+#include <wchar.h>
+#  ifndef __STDC_ISO_10646__
+#    if defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
+#      include <iconv.h>
+#    endif
+#  endif
+#endif
+
 /* name of script being sourced */
 
 /**/
@@ -3274,7 +3283,8 @@
  *       for no newlines.
  *   3:  As 1, but don't handle \c.
  *   4:  Do $'...' quoting.  Overwrites the existing string instead of
- *       zhalloc'ing 
+ *       zhalloc'ing. If \uNNNN ever generates multi-byte chars longer
+ *       than 6 bytes, will need to adjust this to re-allocate memory.
  *   5:  As 2, but \- is special.  Expects misc to be defined.
  *   6:  As 2, but parses only one character and returns end-pointer
  *       and parsed character in *misc
@@ -3288,11 +3298,28 @@
     char *t, *u = NULL;
     char svchar = '\0';
     int meta = 0, control = 0;
+    int i;
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB)
+#  ifdef __STDC_ISO_10646__
+    wint_t wval;
+#  elif defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
+    unsigned int wval;
+    iconv_t cd;
+    char inbuf[4];
+    wchar_t outbuf[1];
+    size_t inbytes, outbytes;
+    char *inptr, *outptr;
+#  endif
+    size_t count;
+    size_t buflen = MB_LEN_MAX * (strlen(s) / 6) + (strlen(s) % 6) + 1;
+#else
+    size_t buflen = strlen(s) + 1;
+#endif
 
     if (fromwhere == 6)
 	t = buf = tmp;
     else if (fromwhere != 4)
-	t = buf = zhalloc(strlen(s) + 1);
+	t = buf = zhalloc(buflen);
     else {
 	t = buf = s;
 	s += 2;
@@ -3363,6 +3390,73 @@
 		    *misc = 1;
 		    break;
 		}
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB)
+#if defined(__STDC_ISO_10646__) || defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
+	    case 'u':
+	    case 'U':
+	    	wval = 0;
+		for (i=(*s == 'u' ? 4 : 8); i>0; i--) {
+		    if (*++s && idigit(*s))
+		        wval = wval * 16 + (*s - '0');
+		    else if (*s && (*s >= 'a' && *s <= 'f') ||
+		            (*s >= 'A' && *s <= 'F'))
+		        wval = wval * 16 + (*s & 0x1f) + 9;
+		    else {
+		        zerr("expected hexadecimal digit", NULL, 0);
+			if (fromwhere == 4) {
+			    for (u = t; (*u++ = *++s););
+			    return t;
+			}
+			*t = '\0';
+			*len = t - buf;
+			return buf;
+		    }
+		}
+    	    	if (fromwhere == 6) {
+		    *misc = wval;
+		    return s+1;
+		}
+#ifdef __STDC_ISO_10646__
+		count = wctomb(t, (wchar_t)wval);
+#elif defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
+    	    	inbytes = outbytes = 4;
+    	    	inptr = inbuf;
+    	    	outptr = (char *)outbuf;
+		/* assume big endian convention for UCS-4 */
+		for (i=3;i>=0;i--) {
+		    inbuf[i] = wval & 0xff;
+		    wval >>= 8;
+		}
+    	    	
+    	    	cd = iconv_open("WCHAR_T", "ISO-10646");
+		if (cd == (iconv_t)-1) {
+		    zerr("cannot do charset conversion", NULL, 0);
+		    if (fromwhere == 4) {
+			for (u = t; (*u++ = *++s););
+			return t;
+		    }
+		    *t = '\0';
+		    *len = t - buf;
+		    return buf;
+		}
+                iconv(cd, &inptr, &inbytes, &outptr, &outbytes);
+		iconv_close(cd);
+		count = wctomb(t, *outbuf);
+#endif
+		if (count == (size_t)-1) {
+		    zerr("character not in range", NULL, 0);
+		    if (fromwhere == 4) {
+			for (u = t; (*u++ = *++s););
+			return t;
+		    }
+		    *t = '\0';
+		    *len = t - buf;
+		    return buf;
+		}
+		t += count;  
+		continue;
+#endif
+#endif
 	    default:
 	    def:
 		if ((idigit(*s) && *s < '8') || *s == 'x') {


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: PATCH: support \u and \U in echo/print/$''
  2003-03-12  9:38 PATCH: support \u and \U in echo/print/$'' Oliver Kiddle
@ 2003-03-13  9:47 ` Oliver Kiddle
  0 siblings, 0 replies; 2+ messages in thread
From: Oliver Kiddle @ 2003-03-13  9:47 UTC (permalink / raw)
  To: zsh-workers

I wrote:
> '\unnnn' instead of one or other of the error messages. It should probably
> be consistent with \xx - currently \xZZ prints `ZZ'. I'd have thought
> `\xZZ' would be better. sh, ksh, bash and coreutils vary on this. Any
> thoughts?

Apparently, \xZZ prints a null followed by `ZZ'. So it allows fewer than
two digits. This is nice and easy to implement, so the patch below, to
go on top of the previous one, makes \u consistent with \x.

The other two errors - where it is unable to convert the character - are
still there as messages.

> Also, should this be disabled in sh emulation? Is it okay to just do

It seems not - SUSv3 apparently leaves other sequences after a backslash
as "unspecified".

Oliver

--- utils.c	2003-03-12 20:09:08.000000000 +0100
+++ utils.c	2003-03-13 00:04:56.000000000 +0100
@@ -3402,14 +3402,8 @@
 		            (*s >= 'A' && *s <= 'F'))
 		        wval = wval * 16 + (*s & 0x1f) + 9;
 		    else {
-		        zerr("expected hexadecimal digit", NULL, 0);
-			if (fromwhere == 4) {
-			    for (u = t; (*u++ = *++s););
-			    return t;
-			}
-			*t = '\0';
-			*len = t - buf;
-			return buf;
+		    	s--;
+		        break;
 		    }
 		}
     	    	if (fromwhere == 6) {


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2003-03-13  9:44 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-03-12  9:38 PATCH: support \u and \U in echo/print/$'' Oliver Kiddle
2003-03-13  9:47 ` Oliver Kiddle

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).