zsh-workers
 help / color / mirror / code / Atom feed
* PATCH: multibyte odds and ends
@ 2006-08-01 20:44 Peter Stephenson
  2006-08-02  2:55 ` Bart Schaefer
  0 siblings, 1 reply; 3+ messages in thread
From: Peter Stephenson @ 2006-08-01 20:44 UTC (permalink / raw)
  To: Zsh hackers list

A few minor things:

- fix bslashquote().  The only issue was a test for printability when
  using $'...'.  (I don't know where I got the idea that this form of
  quoting had something to do with POSIX; it doesn't.)
- remove aliases iascii, iupper, iprint, ilower which simply invoked
  standard ctype macros and instead use ctype directly for clarity
- fix return value from getzlequery(); it was only ever used in yes/no
  mode so a status return makes more sense than a character.

The only other outstanding syntactic issue I noticed this time through
was with bangchar, hatchar and hashchar which can be redefined by
setting HISTCHARS.  These are always compared to single bytes, and this
happens at quite a low level of input where we *really* don't want to
start handling multibyte characters since it effectively means a rewrite
of the lexical analyser.  They are in any case quite nasty hacks which
it wouldn't be much fun to handle outside the portable character set.
My suggestion is that their use be limited in some way:

- Make sure they are ASCII characters?
- Allow non-ASCII characters but make sure each one is a complete
  character in a single byte (this will only work with single-byte
  extensions to ASCII, e.g. ISO-8859-1)?
- Keep the current code and simply document that a single byte will be
  compared, with possibly unexpected effects (and possibly issue a
  warning)?

Does anybody have any preferences?  Does anybody ever redefine HISTCHARS?

Index: Src/utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/utils.c,v
retrieving revision 1.129
diff -u -r1.129 utils.c
--- Src/utils.c	25 Jul 2006 09:25:27 -0000	1.129
+++ Src/utils.c	1 Aug 2006 20:28:47 -0000
@@ -2835,7 +2835,7 @@
     if (len == 0) {
 	/* NULL is special */
 	return zistype(0, itype);
-    } else if (len == 1 && iascii(*outstr)) {
+    } else if (len == 1 && isascii(*outstr)) {
 	return zistype(*outstr, itype);
     } else {
 	switch (itype) {
@@ -2897,7 +2897,7 @@
 		/* in this case non-ASCII characters can't match */
 		if (chr > 127 || !zistype(chr,itype))
 		    break;
-	    } else if (len == 1 && iascii(*ptr)) {
+	    } else if (len == 1 && isascii(*ptr)) {
 		/* ASCII: can't be metafied, use standard test */
 		if (!zistype(*ptr,itype))
 		    break;
@@ -4017,7 +4017,7 @@
  * The last argument should be zero if this is to be used outside a string, *
  * one if it is to be quoted for the inside of a single quoted string,      *
  * two if it is for the inside of a double quoted string, and               *
- * three if it is for the inside of a posix quoted string.                  *
+ * three if it is for the inside of a $'...' quoted string.                 *
  * The string may be metafied and contain tokens.                           */
 
 /**/
@@ -4031,127 +4031,153 @@
 
     tt = v = buf;
     u = s;
-    for (; *u; u++) {
-	if (e && *e == u)
-	    *e = v, sf = 1;
-	if (instring == 3) {
-	  int c = *u;
-	  if (c == Meta) {
-	    c = *++u ^ 32;
-	  }
-	  c &= 0xff;
-	  if(isprint(c)) {
-	    switch (c) {
-	    case '\\':
-	    case '\'':
-	      *v++ = '\\';
-	      *v++ = c;
-	      break;
+    if (instring == 3) {
+	/*
+	 * As we test for printability here we need to be able
+	 * to look for multibyte characters.
+	 */
+	convchar_t cc;
+	MB_METACHARINIT();
+	while (*u) {
+	    const char *uend = u + MB_METACHARLENCONV(u, &cc);
+
+	    if (e && !sf && *e <= u) {
+		*e = v;
+		sf = 1;
+	    }
+	    if (
+#ifdef MULTIBYTE_SUPPORT
+		cc != WEOF && 
+#endif
+		MB_ISPRINT(cc)) {
+		switch (cc) {
+		case ZWC('\\'):
+		case ZWC('\''):
+		    *v++ = '\\';
+		    break;
 
-	    default:
-	      if(imeta(c)) {
-		*v++ = Meta;
-		*v++ = c ^ 32;
-	      }
-	      else {
-		if (isset(BANGHIST) && c == bangchar) {
-		  *v++ = '\\';
+		default:
+		    if (isset(BANGHIST) && cc == (wchar_t)bangchar)
+			*v++ = '\\';
+		    break;
 		}
-		*v++ = c;
-	      }
-	      break;
-	    }
-	  }
-	  else {
-	    switch (c) {
-	    case '\0':
-	      *v++ = '\\';
-	      *v++ = '0';
-	      if ('0' <= u[1] && u[1] <= '7') {
-		*v++ = '0';
-		*v++ = '0';
-	      }
-	      break;
-
-	    case '\007': *v++ = '\\'; *v++ = 'a'; break;
-	    case '\b': *v++ = '\\'; *v++ = 'b'; break;
-	    case '\f': *v++ = '\\'; *v++ = 'f'; break;
-	    case '\n': *v++ = '\\'; *v++ = 'n'; break;
-	    case '\r': *v++ = '\\'; *v++ = 'r'; break;
-	    case '\t': *v++ = '\\'; *v++ = 't'; break;
-	    case '\v': *v++ = '\\'; *v++ = 'v'; break;
+		while (u < uend)
+		    *v++ = *u++;
+	    } else {
+		/* Not printable */
+		for (; u < uend; u++) {
+		    /*
+		     * Just do this byte by byte; there's no great
+		     * advantage in being clever with multibyte
+		     * characters if we don't think they're printable.
+		     */
+		    int c;
+		    if (*u == Meta)
+			c = STOUC(*++u ^ 32);
+		    else
+			c = STOUC(*u);
+		    switch (c) {
+		    case '\0':
+			*v++ = '\\';
+			*v++ = '0';
+			if ('0' <= u[1] && u[1] <= '7') {
+			    *v++ = '0';
+			    *v++ = '0';
+			}
+			break;
 
-	    default:
-	      *v++ = '\\';
-	      *v++ = '0' + ((c >> 6) & 7);
-	      *v++ = '0' + ((c >> 3) & 7);
-	      *v++ = '0' + (c & 7);
-	      break;
+		    case '\007': *v++ = '\\'; *v++ = 'a'; break;
+		    case '\b': *v++ = '\\'; *v++ = 'b'; break;
+		    case '\f': *v++ = '\\'; *v++ = 'f'; break;
+		    case '\n': *v++ = '\\'; *v++ = 'n'; break;
+		    case '\r': *v++ = '\\'; *v++ = 'r'; break;
+		    case '\t': *v++ = '\\'; *v++ = 't'; break;
+		    case '\v': *v++ = '\\'; *v++ = 'v'; break;
+
+		    default:
+			*v++ = '\\';
+			*v++ = '0' + ((c >> 6) & 7);
+			*v++ = '0' + ((c >> 3) & 7);
+			*v++ = '0' + (c & 7);
+			break;
+		    }
+		}
 	    }
-	  }
-	  continue;
 	}
-	else if (*u == Tick || *u == Qtick) {
-	    char c = *u++;
+    }
+    else
+    {
+	/*
+	 * Here the only special characters are syntactic, so
+	 * we can go through bytewise.
+	 */
+	for (; *u; u++) {
+	    if (e && *e == u)
+		*e = v, sf = 1;
+	    if (*u == Tick || *u == Qtick) {
+		char c = *u++;
+
+		*v++ = c;
+		while (*u && *u != c)
+		    *v++ = *u++;
+		*v++ = c;
+		if (!*u)
+		    u--;
+		continue;
+	    }
+	    else if ((*u == String || *u == Qstring) &&
+		     (u[1] == Inpar || u[1] == Inbrack || u[1] == Inbrace)) {
+		char c = (u[1] == Inpar ? Outpar : (u[1] == Inbrace ?
+						    Outbrace : Outbrack));
+		char beg = *u;
+		int level = 0;
 
-	    *v++ = c;
-	    while (*u && *u != c)
 		*v++ = *u++;
-	    *v++ = c;
-	    if (!*u)
-		u--;
-	    continue;
-	}
-	else if ((*u == String || *u == Qstring) &&
-		 (u[1] == Inpar || u[1] == Inbrack || u[1] == Inbrace)) {
-	    char c = (u[1] == Inpar ? Outpar : (u[1] == Inbrace ?
-						Outbrace : Outbrack));
-	    char beg = *u;
-	    int level = 0;
-
-	    *v++ = *u++;
-	    *v++ = *u++;
-	    while (*u && (*u != c || level)) {
-		if (*u == beg)
-		    level++;
-		else if (*u == c)
-		    level--;
 		*v++ = *u++;
-	    }
-	    if (*u)
-		*v++ = *u;
-	    else
-		u--;
-	    continue;
-	}
-	else if (ispecial(*u) &&
-		 ((*u != '=' && *u != '~') ||
-		  u == s ||
-		  (isset(MAGICEQUALSUBST) && (u[-1] == '=' || u[-1] == ':')) ||
-		  (*u == '~' && isset(EXTENDEDGLOB))) &&
-	    (!instring ||
-	     (isset(BANGHIST) && *u == (char)bangchar && instring != 1) ||
-	     (instring == 2 &&
-	      (*u == '$' || *u == '`' || *u == '\"' || *u == '\\')) ||
-	     (instring == 1 && *u == '\''))) {
-	    if (*u == '\n' || (instring == 1 && *u == '\'')) {
-		if (unset(RCQUOTES)) {
-		    *v++ = '\'';
-		    if (*u == '\'')
-			*v++ = '\\';
+		while (*u && (*u != c || level)) {
+		    if (*u == beg)
+			level++;
+		    else if (*u == c)
+			level--;
+		    *v++ = *u++;
+		}
+		if (*u)
 		    *v++ = *u;
-		    *v++ = '\'';
-		} else if (*u == '\n')
-		    *v++ = '"', *v++ = '\n', *v++ = '"';
 		else
-		    *v++ = '\'', *v++ = '\'';
+		    u--;
 		continue;
-	    } else
-		*v++ = '\\';
+	    }
+	    else if (ispecial(*u) &&
+		     ((*u != '=' && *u != '~') ||
+		      u == s ||
+		      (isset(MAGICEQUALSUBST) &&
+		       (u[-1] == '=' || u[-1] == ':')) ||
+		      (*u == '~' && isset(EXTENDEDGLOB))) &&
+		     (!instring ||
+		      (isset(BANGHIST) && *u == (char)bangchar &&
+		       instring != 1) ||
+		      (instring == 2 &&
+		       (*u == '$' || *u == '`' || *u == '\"' || *u == '\\')) ||
+		      (instring == 1 && *u == '\''))) {
+		if (*u == '\n' || (instring == 1 && *u == '\'')) {
+		    if (unset(RCQUOTES)) {
+			*v++ = '\'';
+			if (*u == '\'')
+			    *v++ = '\\';
+			*v++ = *u;
+			*v++ = '\'';
+		    } else if (*u == '\n')
+			*v++ = '"', *v++ = '\n', *v++ = '"';
+		    else
+			*v++ = '\'', *v++ = '\'';
+		    continue;
+		} else
+		    *v++ = '\\';
+	    }
+	    if(*u == Meta)
+		*v++ = *u++;
+	    *v++ = *u;
 	}
-	if(*u == Meta)
-	    *v++ = *u++;
-	*v++ = *u;
     }
     *v = '\0';
 
Index: Src/ztype.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/ztype.h,v
retrieving revision 1.5
diff -u -r1.5 ztype.h
--- Src/ztype.h	24 Jul 2006 22:00:21 -0000	1.5
+++ Src/ztype.h	1 Aug 2006 20:28:47 -0000
@@ -61,11 +61,8 @@
 
 #ifdef MULTIBYTE_SUPPORT
 #define MB_ZISTYPE(X,Y) wcsitype((X),(Y))
+#define MB_ISPRINT(X)	iswprint(X)
 #else
 #define MB_ZISTYPE(X,Y)	zistype((X),(Y))
+#define MB_ISPRINT(X)	isprint(X)
 #endif
-
-#define iascii(X) isascii(STOUC(X))
-#define ilower(X) islower(STOUC(X))
-#define iprint(X) isprint(STOUC(X))
-#define iupper(X) isupper(STOUC(X))
Index: Src/Zle/compresult.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/compresult.c,v
retrieving revision 1.62
diff -u -r1.62 compresult.c
--- Src/Zle/compresult.c	7 Mar 2006 21:31:43 -0000	1.62
+++ Src/Zle/compresult.c	1 Aug 2006 20:28:48 -0000
@@ -1861,7 +1861,7 @@
 		     listdat.nlines));
 	qup = ((l + columns - 1) / columns) - 1;
 	fflush(shout);
-	if (getzlequery(1) != 'y') {
+	if (!getzlequery()) {
 	    if (clearflag) {
 		putc('\r', shout);
 		tcmultout(TCUP, TCMULTUP, qup);
Index: Src/Zle/zle.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle.h,v
retrieving revision 1.31
diff -u -r1.31 zle.h
--- Src/Zle/zle.h	24 Jul 2006 22:00:21 -0000	1.31
+++ Src/Zle/zle.h	1 Aug 2006 20:28:48 -0000
@@ -125,9 +125,9 @@
 #define ZC_icntrl icntrl
 #define ZC_idigit idigit
 #define ZC_iident iident
-#define ZC_ilower ilower
+#define ZC_ilower islower
 #define ZC_inblank inblank
-#define ZC_iupper iupper
+#define ZC_iupper isupper
 #define ZC_iword iword
 
 #define ZC_tolower tulower
Index: Src/Zle/zle_tricky.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_tricky.c,v
retrieving revision 1.68
diff -u -r1.68 zle_tricky.c
--- Src/Zle/zle_tricky.c	10 Jul 2006 13:08:24 -0000	1.68
+++ Src/Zle/zle_tricky.c	1 Aug 2006 20:28:50 -0000
@@ -2298,7 +2298,7 @@
 	     fprintf(shout, "zsh: do you wish to see all %d lines? ", nlines));
 	qup = ((l + columns - 1) / columns) - 1;
 	fflush(shout);
-	if (getzlequery(1) != 'y') {
+	if (!getzlequery()) {
 	    if (clearflag) {
 		putc('\r', shout);
 		tcmultout(TCUP, TCMULTUP, qup);
Index: Src/Zle/zle_utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_utils.c,v
retrieving revision 1.37
diff -u -r1.37 zle_utils.c
--- Src/Zle/zle_utils.c	13 Jan 2006 17:13:51 -0000	1.37
+++ Src/Zle/zle_utils.c	1 Aug 2006 20:28:50 -0000
@@ -653,50 +653,42 @@
 }
 
 /*
- * Query the user, and return a single character response.  The question
- * is assumed to have been printed already, and the cursor is left
- * immediately after the response echoed.  (Might cause a problem if
- * this takes it onto the next line.)  If yesno is non-zero: <Tab> is
- * interpreted as 'y'; any other control character is interpreted as
- * 'n'.  If there are any characters in the buffer, this is taken as a
- * negative response, and no characters are read.  Case is folded.
- *
- * TBD: this may need extending to return a wchar_t or possibly
- * a wint_t.
+ * Query the user, and return 1 for yes, 0 for no.  The question is assumed to
+ * have been printed already, and the cursor is left immediately after the
+ * response echoed.  (Might cause a problem if this takes it onto the next
+ * line.)  <Tab> is interpreted as 'y'; any other control character is
+ * interpreted as 'n'.  If there are any characters in the buffer, this is
+ * taken as a negative response, and no characters are read.  Case is folded.
  */
 
 /**/
 mod_export int
-getzlequery(int yesno)
+getzlequery(void)
 {
     ZLE_INT_T c;
 #ifdef FIONREAD
     int val;
 
-    if (yesno) {
-	/* check for typeahead, which is treated as a negative response */
-	ioctl(SHTTY, FIONREAD, (char *)&val);
-	if (val) {
-	    putc('n', shout);
-	    return 'n';
-	}
+    /* check for typeahead, which is treated as a negative response */
+    ioctl(SHTTY, FIONREAD, (char *)&val);
+    if (val) {
+	putc('n', shout);
+	return 0;
     }
 #endif
 
     /* get a character from the tty and interpret it */
     c = getfullchar(0);
-    if (yesno) {
-	if (c == ZWC('\t'))
-	    c = ZWC('y');
-	else if (ZC_icntrl(c) || c == ZLEEOF)
-	    c = ZWC('n');
-	else
-	    c = ZC_tolower(c);
-    }
+    if (c == ZWC('\t'))
+	c = ZWC('y');
+    else if (ZC_icntrl(c) || c == ZLEEOF)
+	c = ZWC('n');
+    else
+	c = ZC_tolower(c);
     /* echo response and return */
     if (c != ZWC('\n'))
 	zwcputc(c);
-    return c;
+    return c == ZWC('y');
 }
 
 /* Format a string, keybinding style. */

-- 
Peter Stephenson <p.w.stephenson@ntlworld.com>
Web page now at http://homepage.ntlworld.com/p.w.stephenson/


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: PATCH: multibyte odds and ends
  2006-08-01 20:44 PATCH: multibyte odds and ends Peter Stephenson
@ 2006-08-02  2:55 ` Bart Schaefer
  2006-08-02 17:12   ` Peter Stephenson
  0 siblings, 1 reply; 3+ messages in thread
From: Bart Schaefer @ 2006-08-02  2:55 UTC (permalink / raw)
  To: Zsh hackers list

On Aug 1,  9:44pm, Peter Stephenson wrote:
}
} Does anybody have any preferences?  Does anybody ever redefine HISTCHARS?

I've been redefining HISTCHARS/histchars to replace ^ with = for longer
than I've been using zsh (dating from using csh way back when some tty
terminals did not have | and the Bourne shell interpreted ^ as a pipe).

I'd be fine with restricting HISTCHARS to ASCII, or even to non-alpha-
numeric ASCII.


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: PATCH: multibyte odds and ends
  2006-08-02  2:55 ` Bart Schaefer
@ 2006-08-02 17:12   ` Peter Stephenson
  0 siblings, 0 replies; 3+ messages in thread
From: Peter Stephenson @ 2006-08-02 17:12 UTC (permalink / raw)
  To: Zsh hackers list

Bart Schaefer wrote:
> On Aug 1,  9:44pm, Peter Stephenson wrote:
> } Does anybody have any preferences?  Does anybody ever redefine HISTCHARS?
> 
> I've been redefining HISTCHARS/histchars to replace ^ with = for longer
> than I've been using zsh (dating from using csh way back when some tty
> terminals did not have | and the Bourne shell interpreted ^ as a pipe).
> 
> I'd be fine with restricting HISTCHARS to ASCII, or even to non-alpha-
> numeric ASCII.

I presume this is the only response I'm likely to get.

The shell will now issue a warning and refuse to set histchars/HISTCHARS
if either contains non-ASCII characters.  I thought about an error, but
the most likely place to set HISTCHARS is in .zshrc, and aborting all
processing if setting the variable fails is not likely to be the right
thing to do.

The patch also fixes a hard-to-find bug with metafication: the value is
now unmetafied before the individual characters are read from it.

By the way, I've no intention of removing HISTCHARS so the note that
it's "deprecated" isn't really true, particularly since upper case is
the natural form for scalars in zsh.

Index: README
===================================================================
RCS file: /cvsroot/zsh/zsh/README,v
retrieving revision 1.34
diff -u -r1.34 README
--- README	10 Jul 2006 13:08:22 -0000	1.34
+++ README	2 Aug 2006 17:05:51 -0000
@@ -81,6 +81,11 @@
 on some fairly common PC configurations.  This change is only likely to
 affect some highly specialised uses of the shell.
 
+The variables HISTCHARS and histchars now reject any attempt to
+set non-ASCII characters for history or comments.  Multibyte characters
+have never worked and the most consistent change was to restrict the
+set to portable characters only.
+
 Documentation
 -------------
 
Index: Doc/Zsh/params.yo
===================================================================
RCS file: /cvsroot/zsh/zsh/Doc/Zsh/params.yo,v
retrieving revision 1.32
diff -u -r1.32 params.yo
--- Doc/Zsh/params.yo	2 Aug 2006 09:59:23 -0000	1.32
+++ Doc/Zsh/params.yo	2 Aug 2006 17:05:51 -0000
@@ -803,6 +803,10 @@
 expansion (default `tt(!)').  The second character signals the
 start of a quick history substitution (default `tt(^)').  The third
 character is the comment character (default `tt(#)').
+
+The characters must be in the ASCII character set; any attempt to set
+tt(histchars) to characters with a locale-dependent meaning will be
+rejected with an error message.
 )
 vindex(HISTCHARS)
 item(tt(HISTCHARS) <S> <Z>)(
Index: Src/params.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/params.c,v
retrieving revision 1.117
diff -u -r1.117 params.c
--- Src/params.c	10 Jul 2006 13:08:23 -0000	1.117
+++ Src/params.c	2 Aug 2006 17:05:52 -0000
@@ -3548,10 +3548,21 @@
 histcharssetfn(UNUSED(Param pm), char *x)
 {
     if (x) {
-	bangchar = x[0];
-	hatchar = (bangchar) ? x[1] : '\0';
-	hashchar = (hatchar) ? x[2] : '\0';
-	zsfree(x);
+	int len, i;
+
+	unmetafy(x, &len);
+	if (len > 3)
+	    len = 3;
+	for (i = 0; i < len; i++) {
+	    if (!isascii(STOUC(x[i]))) {
+		zwarn("HISTCHARS can only contain ASCII characters");
+		return;
+	    }
+	}
+	bangchar = len ? STOUC(x[0]) : '\0';
+	hatchar =  len > 1 ? STOUC(x[1]) : '\0';
+	hashchar = len > 2 ? STOUC(x[2]) : '\0';
+	free(x);
     } else {
 	bangchar = '!';
 	hashchar = '#';

-- 
Peter Stephenson <pws@csr.com>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


To access the latest news from CSR copy this link into a web browser:  http://www.csr.com/email_sig.php


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2006-08-02 17:13 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-08-01 20:44 PATCH: multibyte odds and ends Peter Stephenson
2006-08-02  2:55 ` Bart Schaefer
2006-08-02 17:12   ` Peter Stephenson

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).