zsh-workers
 help / color / mirror / code / Atom feed
From: Peter Stephenson <pws@csr.com>
To: zsh-workers@sunsite.dk (Zsh hackers list)
Subject: PATCH: multibyte $WORDCHARS
Date: Tue, 20 Sep 2005 16:06:40 +0100	[thread overview]
Message-ID: <200509201506.j8KF6f9X020818@news01.csr.com> (raw)

This fully fixes up WORDCHARS to use multibyte characters, with the
single exception of the [[:WORD:]] test --- I don't want to mess around
with the pattern code until we start work properly on the main shell.

I've rewritten the existing code involving wordcharstrigger() and
zle_wordchars, which were unused --- in fact I'd forgotten they were there.
(For that reason I tend to find it's better not to add to the support
routines until we know what we're doing with them.)

I've also improved support for testing identifiers.  This isn't quite so
good: since the main shell has no idea about multibyte characters, there
didn't seem to be any point in allowing non-ASCII characters to be treated
as identifier characters, so only ASCII characters are allowed in zle, too.

You'll also see I've removed a chunk that made bytes from 160 to 255
behave as if they were alphabet characters when ZLE_UNICODE_SUPPORT is
defined.  This chunk obviously isn't appropriate if we're trying to do
things properly.

Index: Src/init.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/init.c,v
retrieving revision 1.58
diff -u -r1.58 init.c
--- Src/init.c	9 Sep 2005 16:06:48 -0000	1.58
+++ Src/init.c	20 Sep 2005 15:01:57 -0000
@@ -1180,9 +1180,6 @@
 #endif /* !LINKED_XMOD_zshQszle */
 
 /**/
-mod_export ZleVoidFn wordcharstriggerptr = noop_function;
-
-/**/
 unsigned char *
 autoload_zleread(char **lp, char **rp, int ha, int con)
 {
Index: Src/params.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/params.c,v
retrieving revision 1.103
diff -u -r1.103 params.c
--- Src/params.c	9 Sep 2005 16:06:48 -0000	1.103
+++ Src/params.c	20 Sep 2005 15:01:58 -0000
@@ -3346,7 +3346,6 @@
     zsfree(wordchars);
     wordchars = x;
     inittyptab();
-    wordcharstriggerptr();
 }
 
 /* Function to get value for special parameter `_' */
Index: Src/pattern.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/pattern.c,v
retrieving revision 1.27
diff -u -r1.27 pattern.c
--- Src/pattern.c	9 May 2005 10:46:15 -0000	1.27
+++ Src/pattern.c	20 Sep 2005 15:01:58 -0000
@@ -2749,6 +2749,10 @@
 		    return 1;
 		break;
 	    case PP_WORD:
+		/*
+		 * HERE: when we support multibyte characters,
+		 * this test needs to be wcsiword().
+		 */
 		if (iword(ch))
 		    return 1;
 		break;
Index: Src/utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/utils.c,v
retrieving revision 1.90
diff -u -r1.90 utils.c
--- Src/utils.c	17 Sep 2005 18:23:50 -0000	1.90
+++ Src/utils.c	20 Sep 2005 15:01:58 -0000
@@ -35,6 +35,16 @@
 /**/
 char *scriptname;
 
+#ifdef ZLE_UNICODE_SUPPORT
+/*
+ * The wordchars variable turned into a wide character array.
+ * This is much more convenient for testing.
+ */
+
+/**/
+mod_export wchar_t *wordchars_wide;
+#endif
+
 /* Print an error */
  
 /**/
@@ -2456,8 +2466,18 @@
 	typtab[t0] = IDIGIT | IALNUM | IWORD | IIDENT | IUSER;
     for (t0 = 'a'; t0 <= 'z'; t0++)
 	typtab[t0] = typtab[t0 - 'a' + 'A'] = IALPHA | IALNUM | IIDENT | IUSER | IWORD;
+#ifndef ZLE_UNICODE_SUPPORT
+    /*
+     * This really doesn't seem to me the right thing to do when
+     * we have multibyte character support...  it was a hack to assume
+     * eight bit characters `worked' for some values of work before
+     * we could test for them properly.  I'm not 100% convinced
+     * having IIDENT here is a good idea at all, but this code
+     * should disappear into history...
+     */
     for (t0 = 0240; t0 != 0400; t0++)
 	typtab[t0] = IALPHA | IALNUM | IIDENT | IUSER | IWORD;
+#endif
     typtab['_'] = IIDENT | IUSER;
     typtab['-'] = IUSER;
     typtab[' '] |= IBLANK | INBLANK;
@@ -2477,8 +2497,44 @@
 	}
 	typtab[STOUC(*s == Meta ? *++s ^ 32 : *s)] |= ISEP;
     }
-    for (s = wordchars ? wordchars : DEFAULT_WORDCHARS; *s; s++)
-	typtab[STOUC(*s == Meta ? *++s ^ 32 : *s)] |= IWORD;
+#ifdef ZLE_UNICODE_SUPPORT
+    if (wordchars) {
+	const char *wordchars_ptr = wordchars;
+	mbstate_t mbs;
+	int nchars;
+
+	memset(&mbs, 0, sizeof(mbs));
+	wordchars_wide = (wchar_t *)
+	    zrealloc(wordchars_wide, (strlen(wordchars)+1)*sizeof(wchar_t));
+	nchars = mbsrtowcs(wordchars_wide, &wordchars_ptr, strlen(wordchars),
+			   &mbs);
+	if (nchars == -1) {
+	    /* Conversion state is undefined: better just set to null */
+	    *wordchars_wide = L'\0';
+	} else {
+	    wordchars_wide[nchars] = L'\0';
+	}
+    } else {
+	wordchars_wide = zrealloc(wordchars_wide, sizeof(wchar_t));
+	*wordchars_wide = L'\0';
+    }
+#endif
+    for (s = wordchars ? wordchars : DEFAULT_WORDCHARS; *s; s++) {
+	int c = STOUC(*s == Meta ? *++s ^ 32 : *s);
+#ifdef ZLE_UNICODE_SUPPORT
+	if (!isascii(c)) {
+	    /*
+	     * If we have support for multibyte characters, we don't
+	     * handle non-ASCII characters here; instead, we turn
+	     * wordchars into a wide character array.
+	     * (We may actually have a single-byte 8-bit character set,
+	     * but it works the same way.)
+	     */
+	    continue;
+	}
+#endif
+	typtab[c] |= IWORD;
+    }
     for (s = SPECCHARS; *s; s++)
 	typtab[STOUC(*s)] |= ISPECIAL;
     if (isset(BANGHIST) && bangchar && interact && isset(SHINSTDIN))
@@ -2503,9 +2559,6 @@
      * produces an ASCII character.  If it does, use iword on that.
      * If it doesn't, use iswalnum on the original character.  This
      * is pretty good most of the time.
-     *
-     * TODO: extend WORDCHARS to handle multibyte chars by some kind
-     * of hierarchical list or hash table.
      */
     len = wctomb(outstr, c);
 
@@ -2515,7 +2568,40 @@
     } else if (len == 1 && isascii(*outstr)) {
 	return iword(*outstr);
     } else {
-	return iswalnum(c);
+	return iswalnum(c) || wcschr(wordchars_wide, c);
+    }
+}
+
+/*
+ * iident() macro extended to support wide characters.
+ *
+ * The macro is intended to test if a character is allowed in an
+ * internal zsh identifier.  Until the main shell handles multibyte
+ * characters it's not a good idea to allow characters other than
+ * ASCII characters; it would cause zle to allow characters that
+ * the main shell would reject.  Eventually we should be able
+ * to allow all alphanumerics.
+ *
+ * Otherwise similar to wcsiword.
+ */
+
+/**/
+mod_export int
+wcsiident(wchar_t c)
+{
+    int len;
+    VARARR(char, outstr, MB_CUR_MAX);
+
+    len = wctomb(outstr, c);
+
+    if (len == 0) {
+	/* NULL is special */
+	return 0;
+    } else if (len == 1 && isascii(*outstr)) {
+	return iident(*outstr);
+    } else {
+	/* not currently allowed, see above */
+	return 0;
     }
 }
 #endif
Index: Src/Zle/zle.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle.h,v
retrieving revision 1.18
diff -u -r1.18 zle.h
--- Src/Zle/zle.h	9 Sep 2005 20:34:43 -0000	1.18
+++ Src/Zle/zle.h	20 Sep 2005 15:01:58 -0000
@@ -66,12 +66,7 @@
 
 #define ZC_iblank iswspace
 #define ZC_icntrl iswcntrl
-/*
- * TODO: doesn't work on arguments with side effects.
- * Also YUK.  Not even sure this is guaranteed to work.
- * Should be easy to do along the lines of wcsiword.
- */
-#define ZC_iident(x)	(x < 256 && iident((int)x))
+#define ZC_iident wcsiident
 
 #define ZC_tolower towlower
 #define ZC_toupper towupper
Index: Src/Zle/zle_main.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_main.c,v
retrieving revision 1.75
diff -u -r1.75 zle_main.c
--- Src/Zle/zle_main.c	9 Sep 2005 16:55:21 -0000	1.75
+++ Src/Zle/zle_main.c	20 Sep 2005 15:01:58 -0000
@@ -106,11 +106,6 @@
 /**/
 mod_export int
 lastchar_wide_valid;
-
-/**/
-mod_export ZLE_STRING_T zle_wordchars;
-#else
-# define zle_wordchars wordchars;
 #endif
 
 /* the bindings for the previous and for this key */
@@ -1558,17 +1553,6 @@
 	kungetct = 0;
 }
 
-/**/
-mod_export void
-wordcharstrigger(void)
-{
-#ifdef ZLE_UNICODE_SUPPORT
-    zrealloc(zle_wordchars, strlen(wordchars)*MB_CUR_MAX);
-    mbsrtowcs(zle_wordchars, (const char **)&wordchars,
-	      strlen(wordchars), NULL);
-    /* TODO: error handling here */
-#endif
-}
 
 /* Hook functions. Used to allow access to zle parameters if zle is
  * active. */
@@ -1636,8 +1620,6 @@
     kungetbuf = (char *) zalloc(kungetsz = 32);
     comprecursive = 0;
     rdstrs = NULL;
-    wordcharstriggerptr = wordcharstrigger;
-    wordcharstrigger();
 
     /* initialise the keymap system */
     init_keymaps();
@@ -1712,7 +1694,6 @@
     zlegetlineptr = NULL;
     zlereadptr = fallback_zleread;
     zlesetkeymapptr= noop_function_int;
-    wordcharstriggerptr = noop_function;
 
     getkeyptr = NULL;
 


-- 
Peter Stephenson <pws@csr.com>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


**********************************************************************
This email and any files transmitted with it are confidential and
intended solely for the use of the individual or entity to whom they
are addressed. If you have received this email in error please notify
the system manager.

**********************************************************************


             reply	other threads:[~2005-09-20 15:06 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-09-20 15:06 Peter Stephenson [this message]
2005-09-20 15:17 ` Peter Stephenson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=200509201506.j8KF6f9X020818@news01.csr.com \
    --to=pws@csr.com \
    --cc=zsh-workers@sunsite.dk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).