zsh-workers
 help / color / mirror / code / Atom feed
From: Jun T <takimoto-j@kba.biglobe.ne.jp>
To: zsh-workers@zsh.org
Subject: Re: [bug] busyloop upon $=var with NULs when $IFS contains both NUL and a byte > 0x7f
Date: Tue, 13 Dec 2022 18:51:30 +0900	[thread overview]
Message-ID: <1D770986-46C3-4A8C-A66A-5DA661AC5C27@kba.biglobe.ne.jp> (raw)
In-Reply-To: <AD3B3716-F348-4C93-981F-8B8DD624C37D@kba.biglobe.ne.jp>


> 2022/11/29 23:27, Jun. T <takimoto-j@kba.biglobe.ne.jp> wrote:
> 
> So the basic question is:
> What should we do if IFS contains invalid character(s)?
> 
> I think, at least if MULTIBYTE option is ON, it would be better to
> force reset IFS to the default, rather than leaving ifs_wide empty.

So currently this is the only simple solution I can think of.

In the patch below, if the MULTIBYTE option is ON and IFS contains
characters that are invalid, IFS is reset to the default. Is this OK?

Do we need to issue a warning when resetting IFS?


The patch includes the patch in workers/51087 (fix the behavior
when MULTIBYTE option is OFF).

When LC_CTYPE changes (directly or via LC_ALL or LANG), a
character that was valid may become invalid in the new locale.
So I added calls to inittyptab() in lcsetfn() etc.

A simple test is included. On macOS, in the C locale, any byte
is a valid character, so IFS is not reset by the test.


 Doc/Zsh/params.yo      |  7 +++++--
 Src/params.c           |  3 +++
 Src/utils.c            | 42 ++++++++++++++++++++++++++----------------
 Test/D04parameter.ztst | 12 ++++++++++++
 4 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/Doc/Zsh/params.yo b/Doc/Zsh/params.yo
index 2a30085a8..91201616a 100644
--- a/Doc/Zsh/params.yo
+++ b/Doc/Zsh/params.yo
@@ -1251,15 +1251,18 @@ Internal field separators (by default space, tab, newline and NUL), that
 are used to separate words which result from
 command or parameter expansion and words read by
 the tt(read) builtin.  Any characters from the set space, tab and
-newline that appear in the IFS are called em(IFS white space).
+newline that appear in the tt(IFS) are called em(IFS white space).
 One or more IFS white space characters or one non-IFS white space
 character together with any adjacent IFS white space character delimit
 a field.  If an IFS white space character appears twice consecutively
-in the IFS, this character is treated as if it were not an IFS white
+in the tt(IFS), this character is treated as if it were not an IFS white
 space character.
 
 If the parameter is unset, the default is used.  Note this has
 a different effect from setting the parameter to an empty string.
+
+If the tt(MULTIBYTE) option is on and tt(IFS) contains characters that are
+invalid in the current locale, tt(IFS) is reset to the default.
 )
 vindex(KEYBOARD_HACK)
 item(tt(KEYBOARD_HACK))(
diff --git a/Src/params.c b/Src/params.c
index f1fe38955..81f0e5015 100644
--- a/Src/params.c
+++ b/Src/params.c
@@ -4639,6 +4639,7 @@ setlang(char *x)
 	if ((x = getsparam_u(ln->name)) && *x)
 	    setlocale(ln->category, x);
     unqueue_signals();
+    inittyptab();
 }
 
 /**/
@@ -4662,6 +4663,7 @@ lc_allsetfn(Param pm, char *x)
     else {
 	setlocale(LC_ALL, unmeta(x));
 	clear_mbstate();
+	inittyptab();
     }
 }
 
@@ -4700,6 +4702,7 @@ lcsetfn(Param pm, char *x)
     }
     unqueue_signals();
     clear_mbstate();	/* LC_CTYPE may have changed */
+    inittyptab();
 }
 #endif /* USE_LOCALE */
 
diff --git a/Src/utils.c b/Src/utils.c
index edf5d3df7..a874851cc 100644
--- a/Src/utils.c
+++ b/Src/utils.c
@@ -74,9 +74,6 @@ set_widearray(char *mb_array, Widechar_array wca)
     }
     wca->len = 0;
 
-    if (!isset(MULTIBYTE))
-	return;
-
     if (mb_array) {
 	VARARR(wchar_t, tmpwcs, strlen(mb_array));
 	wchar_t *wcptr = tmpwcs;
@@ -87,8 +84,7 @@ set_widearray(char *mb_array, Widechar_array wca)
 	    int mblen;
 
 	    if (STOUC(*mb_array) <= 0x7f) {
-		mb_array++;
-		*wcptr++ = (wchar_t)*mb_array;
+		*wcptr++ = (wchar_t)*mb_array++;
 		continue;
 	    }
 
@@ -4118,8 +4114,9 @@ inittyptab(void)
      * having IIDENT here is a good idea at all, but this code
      * should disappear into history...
      */
-    for (t0 = 0240; t0 != 0400; t0++)
-	typtab[t0] = IALPHA | IALNUM | IIDENT | IUSER | IWORD;
+    if isset(MULTIBYTE)
+	for (t0 = 0240; t0 != 0400; t0++)
+	    typtab[t0] = IALPHA | IALNUM | IIDENT | IUSER | IWORD;
 #endif
     /* typtab['.'] |= IIDENT; */ /* Allow '.' in variable names - broken */
     typtab['_'] = IIDENT | IUSER;
@@ -4134,11 +4131,24 @@ inittyptab(void)
 	typtab[t0] |= ITOK | IMETA;
     for (t0 = (int)STOUC(Snull); t0 <= (int)STOUC(Nularg); t0++)
 	typtab[t0] |= ITOK | IMETA | INULL;
-    for (s = ifs ? ifs : EMULATION(EMULATE_KSH|EMULATE_SH) ?
-	DEFAULT_IFS_SH : DEFAULT_IFS; *s; s++) {
+    /* ifs */
+#define CURRENT_DEFAULT_IFS (EMULATION(EMULATE_KSH|EMULATE_SH) ? \
+			    DEFAULT_IFS_SH : DEFAULT_IFS)
+#ifdef MULTIBYTE_SUPPORT
+    if (isset(MULTIBYTE)) {
+	set_widearray(ifs ? ifs : CURRENT_DEFAULT_IFS, &ifs_wide);
+	if (ifs && !ifs_wide.chars) {
+	    /* IFS has invalid character(s). Reset it to default */
+	    zsfree(ifs);
+	    ifs = ztrdup(CURRENT_DEFAULT_IFS);
+	    set_widearray(ifs, &ifs_wide);
+	}
+    }
+#endif
+    for (s = ifs ? ifs : CURRENT_DEFAULT_IFS; *s; s++) {
 	int c = STOUC(*s == Meta ? *++s ^ 32 : *s);
 #ifdef MULTIBYTE_SUPPORT
-	if (!isascii(c)) {
+	if (isset(MULTIBYTE) && !isascii(c)) {
 	    /* see comment for wordchars below */
 	    continue;
 	}
@@ -4151,10 +4161,15 @@ inittyptab(void)
 	}
 	typtab[c] |= ISEP;
     }
+    /* wordchars */
+#ifdef MULTIBYTE_SUPPORT
+    if (isset(MULTIBYTE))
+	set_widearray(wordchars, &wordchars_wide);
+#endif
     for (s = wordchars ? wordchars : DEFAULT_WORDCHARS; *s; s++) {
 	int c = STOUC(*s == Meta ? *++s ^ 32 : *s);
 #ifdef MULTIBYTE_SUPPORT
-	if (!isascii(c)) {
+	if (isset(MULTIBYTE) && !isascii(c)) {
 	    /*
 	     * If we have support for multibyte characters, we don't
 	     * handle non-ASCII characters here; instead, we turn
@@ -4167,11 +4182,6 @@ inittyptab(void)
 #endif
 	typtab[c] |= IWORD;
     }
-#ifdef MULTIBYTE_SUPPORT
-    set_widearray(wordchars, &wordchars_wide);
-    set_widearray(ifs ? ifs : EMULATION(EMULATE_KSH|EMULATE_SH) ?
-	DEFAULT_IFS_SH : DEFAULT_IFS, &ifs_wide);
-#endif
     for (s = SPECCHARS; *s; s++)
 	typtab[STOUC(*s)] |= ISPECIAL;
     if (typtab_flags & ZTF_SP_COMMA)
diff --git a/Test/D04parameter.ztst b/Test/D04parameter.ztst
index 6bf55b4db..d9f81f66d 100644
--- a/Test/D04parameter.ztst
+++ b/Test/D04parameter.ztst
@@ -2275,6 +2275,18 @@ F:We do not care what $OLDPWD is, as long as it does not cause an error
 F:As of this writing, var=$@ and var="$@" with null IFS have unspecified
 F:behavior, see http://austingroupbugs.net/view.php?id=888
 
+  (
+  IFS=$'\x80'
+  if [[ $IFS = $' \t\n\0' ]]; then
+    echo OK     # if $'\x80' is illegal
+  else          # otherwise, it should work as a separator
+    s=$'foo\x80\bar'
+    [[ ${${=s}[1]} = foo ]] && echo OK
+  fi
+  )
+0:reset IFS to default if it contains illegal character
+>OK
+
   () {
     setopt localoptions extendedglob
     [[ $- = [[:alnum:]]## ]] || print Failed 1





  parent reply	other threads:[~2022-12-13  9:51 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-18 14:27 Stephane Chazelas
2022-11-29 14:27 ` Jun. T
2022-11-29 14:38   ` Peter Stephenson
2022-11-30  4:20     ` Bart Schaefer
2022-11-30  9:21       ` Peter Stephenson
2022-12-13  9:50         ` Jun T
2022-12-13  9:49     ` Jun T
2022-12-13 10:13       ` Peter Stephenson
2022-12-13 11:40         ` Jun T
2022-12-13 11:55           ` Peter Stephenson
2023-06-21  4:49             ` Jun T
2022-12-11 19:12   ` Stephane Chazelas
2022-12-13  9:51   ` Jun T [this message]
2022-11-30 14:56 ` Jun. T

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1D770986-46C3-4A8C-A66A-5DA661AC5C27@kba.biglobe.ne.jp \
    --to=takimoto-j@kba.biglobe.ne.jp \
    --cc=zsh-workers@zsh.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).