Re: $'\uXXXX' - Peter Stephenson

zsh-workers
 help / color / mirror / code / Atom feed

From: Peter Stephenson <p.w.stephenson@ntlworld.com>
To: Zsh hackers list <zsh-workers@sunsite.dk>
Subject: Re: $'\uXXXX'
Date: Tue, 6 Nov 2007 20:38:45 +0000	[thread overview]
Message-ID: <20071106203845.54f4c474.p.w.stephenson@ntlworld.com> (raw)
In-Reply-To: <200711061005.lA6A56Y0024628@news01.csr.com>

On Tue, 06 Nov 2007 10:05:06 +0000
Peter Stephenson <pws@csr.com> wrote:
> Stephane Chazelas wrote:
> > ~$ print '<\u0041>'
> > <A>
> > ~$ printf '%s\n' $'<\u0041>'
> > <>
> > 
> > ~$ LC_ALL=C
> > ~$ print $'\u00e9'
> > zsh: character not in range
> > zsh: segmentation fault  zsh-beta
> 
> I was aware of various problems with multibyte in the getkeystring()
> function that handles this and I was in the middle of a patch.  I think
> that part was mostly complete.  I'll look out the appropriate bit, if I
> can still find it, and see what's left.

This is it, stripped of the completion quoting bit that was causing me
to tear my hair out.  It does seem to improve things.

The tests I've added are repeated to try to tease out any memory errors.

Index: Src/utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/utils.c,v
retrieving revision 1.169
diff -u -r1.169 utils.c
--- Src/utils.c	31 Oct 2007 06:22:57 -0000	1.169
+++ Src/utils.c	6 Nov 2007 20:36:46 -0000
@@ -4578,6 +4578,31 @@
 }
 #endif
 
+
+/*
+ * The following only occurs once or twice in the code, but in different
+ * places depending how character set conversion is implemented.
+ */
+#define CHARSET_FAILED()		      \
+    if (how & GETKEY_DOLLAR_QUOTE) {	      \
+	while ((*tdest++ = *++s)) {	      \
+	    if (how & GETKEY_UPDATE_OFFSET) { \
+		if (s - sstart > *misc)	      \
+		    (*misc)++;		      \
+	    }				      \
+	    if (*s == Snull) {		      \
+		*len = (s - sstart) + 1;      \
+		*tdest = '\0';		      \
+		return buf;		      \
+	    }				      \
+	}				      \
+	*len = tdest - buf;		      \
+	return buf;			      \
+    }					      \
+    *t = '\0';				      \
+    *len = t - buf;			      \
+    return buf
+
 /*
  * Decode a key string, turning it into the literal characters.
  * The value returned is a newly allocated string from the heap.
@@ -4622,7 +4647,7 @@
 getkeystring(char *s, int *len, int how, int *misc)
 {
     char *buf, tmp[1];
-    char *t, *tdest = NULL, *u = NULL, *sstart = s;
+    char *t, *tdest = NULL, *u = NULL, *sstart = s, *tbuf;
     char svchar = '\0';
     int meta = 0, control = 0;
     int i;
@@ -4642,38 +4667,69 @@
 #endif
 
     DPUTS((how & GETKEY_UPDATE_OFFSET) &&
-	  (how & ~(GETKEY_DOLLAR_QUOTE|GETKEY_UPDATE_OFFSET)),
+	  (how & ~(GETKEYS_DOLLARS_QUOTE|GETKEY_UPDATE_OFFSET)),
 	  "BUG: offset updating in getkeystring only supported with $'.");
+    DPUTS((how & (GETKEY_DOLLAR_QUOTE|GETKEY_SINGLE_CHAR)) ==
+	  (GETKEY_DOLLAR_QUOTE|GETKEY_SINGLE_CHAR),
+	  "BUG: incompatible options in getkeystring");
 
     if (how & GETKEY_SINGLE_CHAR)
 	t = buf = tmp;
-    else
-	t = buf = zhalloc(strlen(s) + 1);
-    if (how & GETKEY_DOLLAR_QUOTE) {
+    else {
+	/* Length including terminating NULL */
+	int maxlen = 1;
 	/*
-	 * TODO: we're not necessarily guaranteed the output string will
+	 * We're not necessarily guaranteed the output string will
 	 * be no longer than the input with \u and \U when output
-	 * characters need to be metafied: should check the maximum
-	 * length.
-	 *
-	 * We're going to unmetafy into the original string, but
-	 * to get a proper metafied input we're going to metafy
-	 * into an allocated buffer.  This is necessary if we have
-	 * \u and \U's with multiple metafied bytes.  We can't
-	 * simply remetafy the entire string because there may
-	 * be tokens (indeed, we know there are lexical nulls floating
-	 * around), so we have to be aware character by character
-	 * what we are converting.
+	 * characters need to be metafied.  As this is the only
+	 * case where the string can get longer (?I think),
+	 * include it in the allocation length here but don't
+	 * bother taking account of other factors.
 	 */
-	tdest = t;
-	t = s;
+	for (t = s; *t; t++) {
+	    if (*t == '\\') {
+		if (!t[1]) {
+		    maxlen++;
+		    break;
+		}
+		if (t[1] == 'u' || t[1] == 'U')
+		    maxlen += MB_CUR_MAX * 2;
+		else
+		    maxlen += 2;
+		/* skip the backslash and the following character */
+		t++;
+	    } else
+		maxlen++;
+	}
+	if (how & GETKEY_DOLLAR_QUOTE) {
+	    /*
+	     * We're going to unmetafy into a new string, but
+	     * to get a proper metafied input we're going to metafy
+	     * into an intermediate buffer.  This is necessary if we have
+	     * \u and \U's with multiple metafied bytes.  We can't
+	     * simply remetafy the entire string because there may
+	     * be tokens (indeed, we know there are lexical nulls floating
+	     * around), so we have to be aware character by character
+	     * what we are converting.
+	     *
+	     * In this case, buf is the final buffer (as usual),
+	     * but t points into a temporary buffer that just has
+	     * to be long enough to hold the result of one escape
+	     * code transformation.  We count this is a full multibyte
+	     * character (MB_CUR_MAX) with every character metafied
+	     * (*2) plus a little bit of fuzz (for e.g. the odd backslash).
+	     */
+	    buf = tdest = zhalloc(maxlen);
+	    t = tbuf = zhalloc(MB_CUR_MAX * 3 + 1);
+	} else {
+	    t = buf = zhalloc(maxlen);
+	}
     }
     for (; *s; s++) {
-	char *torig = t;
 	if (*s == '\\' && s[1]) {
 	    int miscadded;
-	    if ((how & GETKEY_UPDATE_OFFSET) && s - sstart > *misc) {
-		(*misc)++;
+	    if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc) {
+		(*misc)--;
 		miscadded = 1;
 	    } else
 		miscadded = 0;
@@ -4707,7 +4763,7 @@
 		if (!(how & GETKEY_EMACS)) {
 		    *t++ = '\\', s--;
 		    if (miscadded)
-			(*misc)--;
+			(*misc)++;
 		    continue;
 		}
 		/* FALL THROUGH */
@@ -4715,30 +4771,32 @@
 		*t++ = '\033';
 		break;
 	    case 'M':
+		/* HERE: GETKEY_UPDATE_OFFSET */
 		if (how & GETKEY_EMACS) {
 		    if (s[1] == '-')
 			s++;
 		    meta = 1 + control;	/* preserve the order of ^ and meta */
 		} else {
 		    if (miscadded)
-			(*misc)--;
+			(*misc)++;
 		    *t++ = '\\', s--;
 		}
 		continue;
 	    case 'C':
+		/* HERE: GETKEY_UPDATE_OFFSET */
 		if (how & GETKEY_EMACS) {
 		    if (s[1] == '-')
 			s++;
 		    control = 1;
 		} else {
 		    if (miscadded)
-			(*misc)--;
+			(*misc)++;
 		    *t++ = '\\', s--;
 		}
 		continue;
 	    case Meta:
 		if (miscadded)
-		    (*misc)--;
+		    (*misc)++;
 		*t++ = '\\', s--;
 		break;
 	    case '-':
@@ -4755,15 +4813,16 @@
 		    return buf;
 		}
 		goto def;
-	    case 'u':
-		if ((how & GETKEY_UPDATE_OFFSET) && s - sstart > *misc)
-		    (*misc) += 4;
 	    case 'U':
-		if ((how & GETKEY_UPDATE_OFFSET) && s - sstart > *misc) {
-		    (*misc) += 6;
+		if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
+		    (*misc) -= 4;
+		/* FALLTHROUGH */
+	    case 'u':
+		if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc) {
+		    (*misc) -= 6; /* HERE don't really believe this */
 		    /*
 		     * We've now adjusted the offset for all the input
-		     * characters, so we need to subtract for each
+		     * characters, so we need to add for each
 		     * byte of output below.
 		     */
 		}
@@ -4787,31 +4846,18 @@
 		count = wctomb(t, (wchar_t)wval);
 		if (count == -1) {
 		    zerr("character not in range");
-		    if (how & GETKEY_DOLLAR_QUOTE) {
-			/* HERE new convention */
-			for (u = t; (*u++ = *++s);) {
-			    if ((how & GETKEY_UPDATE_OFFSET) &&
-				s - sstart > *misc)
-				(*misc)++;
-			}
-			return t;
-		    }
-		    *t = '\0';
-		    *len = t - buf;
-		    return buf;
+		    CHARSET_FAILED();
 		}
-		if ((how & GETKEY_UPDATE_OFFSET) && s - sstart > *misc)
+		if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
 		    (*misc) += count;
 		t += count;
-		continue;
 # else
 #  if defined(HAVE_NL_LANGINFO) && defined(CODESET)
 		if (!strcmp(nl_langinfo(CODESET), "UTF-8")) {
 		    count = ucs4toutf8(t, wval);
 		    t += count;
-		    if ((how & GETKEY_UPDATE_OFFSET) && s - sstart > *misc)
+		    if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
 			(*misc) += count;
-		    continue;
 		} else {
 #   ifdef HAVE_ICONV
 		    ICONV_CONST char *inptr = inbuf;
@@ -4826,46 +4872,55 @@
     	    	    cd = iconv_open(nl_langinfo(CODESET), "UCS-4BE");
 		    if (cd == (iconv_t)-1) {
 			zerr("cannot do charset conversion");
-			if (how & GETKEY_DOLLAR_QUOTE) {
-			    /* HERE: new convention */
-			    for (u = t; (*u++ = *++s);) {
-				if ((how & GETKEY_UPDATE_OFFSET) &&
-				    s - sstart > *misc)
-				    (*misc)++;
-			    }
-			    return t;
-			}
-			*t = '\0';
-			*len = t - buf;
-			return buf;
+			CHARSET_FAILED();
 		    }
                     count = iconv(cd, &inptr, &inbytes, &t, &outbytes);
 		    iconv_close(cd);
 		    if (count == (size_t)-1) {
                         zerr("character not in range");
-		        *t = '\0';
-			*len = t - buf;
-			return buf;
+			CHARSET_FAILED();
 		    }
-		    if ((how & GETKEY_UPDATE_OFFSET) && s - sstart > *misc)
+		    if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
 			(*misc) += count;
-		    continue;
 #   else
                     zerr("cannot do charset conversion");
-		    *t = '\0';
-		    *len = t - buf;
-		    return buf;
+		    CHARSET_FAILED();
 #   endif
 		}
 #  else
                 zerr("cannot do charset conversion");
-		*t = '\0';
-		*len = t - buf;
-		return buf;
+		CHARSET_FAILED();
 #  endif
 # endif
+		if (how & GETKEY_DOLLAR_QUOTE) {
+		    char *t2;
+		    for (t2 = tbuf; t2 < t; t2++) {
+			if (imeta(*t2)) {
+			    *tdest++ = Meta;
+			    *tdest++ = *t2 ^ 32;
+			} else
+			    *tdest++ = *t2;
+		    }
+		    /* reset temporary buffer after handling */
+		    t = tbuf;
+		}
+		continue;
+	    case '\'':
+	    case '\\':
+		if (how & GETKEY_DOLLAR_QUOTE) {
+		    /*
+		     * Usually \' and \\ will have the initial
+		     * \ turned into a Bnull, however that's not
+		     * necessarily the case when called from
+		     * completion.
+		     */
+		    *t++ = *s;
+		    break;
+		}
+		/* FALLTHROUGH */
 	    default:
 	    def:
+		/* HERE: GETKEY_UPDATE_OFFSET? */
 		if ((idigit(*s) && *s < '8') || *s == 'x') {
 		    if (!(how & GETKEY_OCTAL_ESC)) {
 			if (*s == '0')
@@ -4890,7 +4945,7 @@
 		} else {
 		    if (!(how & GETKEY_EMACS) && *s != '\\') {
 			if (miscadded)
-			    (*misc)--;
+			    (*misc)++;
 			*t++ = '\\';
 		    }
 		    *t++ = *s;
@@ -4961,6 +5016,8 @@
 			 */
 			*tdest++ = *++s;
 		    }
+		    /* reset temporary buffer, now handled */
+		    t = tbuf;
 		    continue;
 		} else
 		    *t++ = *s;
@@ -4984,13 +5041,17 @@
 	}
 	if (how & GETKEY_DOLLAR_QUOTE) {
 	    char *t2;
-	    for (t2 = torig; t2 < t; t2++) {
+	    for (t2 = tbuf; t2 < t; t2++) {
 		if (imeta(*t2)) {
 		    *tdest++ = Meta;
 		    *tdest++ = *t2 ^ 32;
 		} else
 		    *tdest++ = *t2;
 	    }
+	    /*
+	     * Reset use of temporary buffer.
+	     */
+	    t = tbuf;
 	}
 	if ((how & GETKEY_SINGLE_CHAR) && t != tmp) {
 	    *misc = STOUC(tmp[0]);
Index: Test/A03quoting.ztst
===================================================================
RCS file: /cvsroot/zsh/zsh/Test/A03quoting.ztst,v
retrieving revision 1.3
diff -u -r1.3 A03quoting.ztst
--- Test/A03quoting.ztst	27 Jan 2007 23:52:14 -0000	1.3
+++ Test/A03quoting.ztst	6 Nov 2007 20:36:46 -0000
@@ -42,3 +42,13 @@
   unsetopt rcquotes
 0:Yes RC_QUOTES with single quotes
 >'
+
+  print '<\u0041>'
+  printf '%s\n' $'<\u0042>'
+  print '<\u0043>'
+  printf '%s\n' $'<\u0044>'
+0:\u in both print and printf
+><A>
+><B>
+><C>
+><D>
Index: Test/D07multibyte.ztst
===================================================================
RCS file: /cvsroot/zsh/zsh/Test/D07multibyte.ztst,v
retrieving revision 1.20
diff -u -r1.20 D07multibyte.ztst
--- Test/D07multibyte.ztst	22 Aug 2007 10:46:42 -0000	1.20
+++ Test/D07multibyte.ztst	6 Nov 2007 20:36:46 -0000
@@ -384,3 +384,13 @@
   print -r ${(q)foo}
 0:Backslash-quoting of unprintable/invalid characters uses $'...'
 >X$'\300'Y$'\a'Z$'\177'T
+
+# This also isn't strictly multibyte and is here to reduce the
+# likelihood of a "can't do character set conversion" error.
+  testfn() { (LC_ALL=C; print $'\u00e9') }
+  repeat 4 testfn
+1:error handling in Unicode quoting
+?testfn: character not in range
+?testfn: character not in range
+?testfn: character not in range
+?testfn: character not in range

-- 
Peter Stephenson <p.w.stephenson@ntlworld.com>
Web page now at http://homepage.ntlworld.com/p.w.stephenson/

     prev parent reply	other threads:[~2007-11-06 20:39 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-11-06  7:44 $'\uXXXX' Stephane Chazelas
2007-11-06 10:05 ` $'\uXXXX' Peter Stephenson
2007-11-06 20:38   ` Peter Stephenson [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20071106203845.54f4c474.p.w.stephenson@ntlworld.com \
    --to=p.w.stephenson@ntlworld.com \
    --cc=zsh-workers@sunsite.dk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).