zsh-workers
 help / color / mirror / code / Atom feed
* PATCH: zle_params.c
@ 2005-01-26 18:06 Peter Stephenson
  2005-01-26 18:35 ` Clint Adams
  2005-01-29  3:47 ` UTF-8 input [was Re: PATCH: zle_params.c] Clint Adams
  0 siblings, 2 replies; 13+ messages in thread
From: Peter Stephenson @ 2005-01-26 18:06 UTC (permalink / raw)
  To: Zsh hackers list

This fixes up access to ZLE parameters.  It seems not to be completely
broken.  It also fixes a typo in zle_params.c.

I've left last_isearch alone since it's not clear what is to become of it
yet.  Fixing doisearch isn't going to be great fun (240 lines, 2
comments).  It'll have to wait until we decide about input.

Index: Src/system.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/system.h,v
retrieving revision 1.24
diff -u -r1.24 system.h
--- Src/system.h	26 Jan 2005 13:39:51 -0000	1.24
+++ Src/system.h	26 Jan 2005 18:04:32 -0000
@@ -725,6 +725,7 @@
 #define ZLENL	L'\n'
 #define ZLENUL	L'\0'
 #define ZLETAB	L'\t'
+#define ZLENULSTR	L""
 #define ZS_memcpy wmemcpy
 #define ZC_icntrl iswcntrl
 #else
@@ -735,6 +736,7 @@
 #define ZLENL	'\n'
 #define ZLENUL	'\0'
 #define ZLETAB	'\t'
+#define ZLENULSTR	""
 #define ZS_memcpy memcpy
 #define ZC_icntrl icntrl
 #endif
Index: Src/Zle/zle.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle.h,v
retrieving revision 1.5
diff -u -r1.5 zle.h
--- Src/Zle/zle.h	25 Jan 2005 16:41:24 -0000	1.5
+++ Src/Zle/zle.h	26 Jan 2005 18:04:32 -0000
@@ -194,3 +194,9 @@
 /* Invalidate the completion list. */
 
 #define invalidatelist() runhookdef(INVALIDATELISTHOOK, NULL)
+
+/* Bit flags to setline */
+enum {
+    ZSL_COPY = 1,		/* Copy the argument, don't modify it */
+    ZSL_TOEND = 2,		/* Go to the end of the new line */
+};
Index: Src/Zle/zle_hist.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_hist.c,v
retrieving revision 1.16
diff -u -r1.16 zle_hist.c
--- Src/Zle/zle_hist.c	14 Jan 2005 13:05:23 -0000	1.16
+++ Src/Zle/zle_hist.c	26 Jan 2005 18:04:32 -0000
@@ -567,7 +567,7 @@
     remember_edits();
     mkundoent();
     histline = he->histnum;
-    setline(ZLETEXT(he));
+    setline(ZLETEXT(he), ZSL_COPY|ZSL_TOEND);
     setlastline();
     clearlist = 1;
 }
Index: Src/Zle/zle_main.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_main.c,v
retrieving revision 1.57
diff -u -r1.57 zle_main.c
--- Src/Zle/zle_main.c	25 Jan 2005 16:41:24 -0000	1.57
+++ Src/Zle/zle_main.c	26 Jan 2005 18:04:32 -0000
@@ -829,7 +829,7 @@
     selectlocalmap(NULL);
     fixsuffix();
     if ((s = (unsigned char *)getlinknode(bufstack))) {
-	setline((char *)s);
+	setline((char *)s, ZSL_TOEND);
 	zsfree((char *)s);
 	if (stackcs != -1) {
 	    zlecs = stackcs;
Index: Src/Zle/zle_params.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_params.c,v
retrieving revision 1.21
diff -u -r1.21 zle_params.c
--- Src/Zle/zle_params.c	25 Jan 2005 16:41:24 -0000	1.21
+++ Src/Zle/zle_params.c	26 Jan 2005 18:04:32 -0000
@@ -175,12 +175,8 @@
 set_buffer(UNUSED(Param pm), char *x)
 {
     if(x) {
-	unmetafy(x, &zlell);
-	sizeline(zlell);
-	strcpy((char *)zleline, x);
+	setline(x, 0);
 	zsfree(x);
-	if(zlecs > zlell)
-	    zlecs = zlell;
     } else
 	zlecs = zlell = 0;
     fixsuffix();
@@ -191,7 +187,7 @@
 static char *
 get_buffer(UNUSED(Param pm))
 {
-    return metafy((char *)zleline, zlell, META_HEAPDUP);
+    return (char *)zlelineasstring((char *)zleline, zlell, 0, NULL, NULL, 1);
 }
 
 /**/
@@ -238,19 +234,22 @@
 static void
 set_lbuffer(UNUSED(Param pm), char *x)
 {
-    char *y;
+    ZLE_STRING_T y;
     int len;
 
-    if(x)
-	unmetafy(y = x, &len);
+    if (x && *x != ZLENUL)
+	y = stringaszleline((unsigned char *)x, &len, NULL);
     else
-	y = "", len = 0;
+	y = ZLENULSTR, len = 0;
     sizeline(zlell - zlecs + len);
-    memmove(zleline + len, zleline + zlecs, zlell - zlecs);
-    memcpy(zleline, y, len);
+    memmove((char *)(zleline + len), (char *)(zleline + zlecs),
+	    (zlell - zlecs) * ZLE_CHAR_SIZE);
+    ZS_memcpy(zleline, y, len);
     zlell = zlell - zlecs + len;
     zlecs = len;
     zsfree(x);
+    if (len)
+	free(y);
     fixsuffix();
     menucmp = 0;
 }
@@ -259,7 +258,7 @@
 static char *
 get_lbuffer(UNUSED(Param pm))
 {
-    return metafy((char *)zleline, zlecs, META_HEAPDUP);
+    return (char *)zlelineasstring(zleline, zlecs, 0, NULL, NULL, 1);
 }
 
 /**/
@@ -269,13 +268,15 @@
     char *y;
     int len;
 
-    if(x)
-	unmetafy(y = x, &len);
+    if (x && *x != ZLENUL)
+	y = stringaszleline((unsigned char *)x, &len, NULL);
     else
-	y = "", len = 0;
+	y = ZLENULSTR, len = 0;
     sizeline(zlell = zlecs + len);
-    memcpy(zleline + zlecs, y, len);
+    ZS_memcpy(zleline + zlecs, y, len);
     zsfree(x);
+    if (len)
+	free(y);
     fixsuffix();
     menucmp = 0;
 }
@@ -284,7 +285,8 @@
 static char *
 get_rbuffer(UNUSED(Param pm))
 {
-    return metafy((char *)zleline + zlecs, zlell - zlecs, META_HEAPDUP);
+    return (char *)zlelineasstring(zleline + zlecs, zlell - zlecs,
+				   0, NULL, NULL, 1);
 }
 
 /**/
@@ -547,27 +549,23 @@
 }
 
 static void
-set_prepost(unsigned char **textvar, int *lenvar, char *x)
+set_prepost(ZLE_STRING_T *textvar, int *lenvar, char *x)
 {
     if (*lenvar) {
-	zfree(*textvar, *lenvar);
+	free(*textvar);
 	*textvar = NULL;
 	*lenvar = 0;
     }
     if (x) {
-	unmetafy(x, lenvar);
-	if (*lenvar) {
-	    *textvar = (unsigned char *)zalloc(*lenvar);
-	    memcpy((char *)*textvar, x, *lenvar);
-	}
+	*textvar = stringaszleline((unsigned char *)x, lenvar, NULL);
 	free(x);
     }
 }
 
 static char *
-get_prepost(unsigned char *text, int len)
+get_prepost(ZLE_STRING_T text, int len)
 {
-    return metafy((char *)text, len, META_HEAPDUP);
+    return (char *)zlelineasstring(text, len, 0, NULL, NULL, 1);
 }
 
 /**/
Index: Src/Zle/zle_refresh.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_refresh.c,v
retrieving revision 1.16
diff -u -r1.16 zle_refresh.c
--- Src/Zle/zle_refresh.c	26 Jan 2005 14:29:17 -0000	1.16
+++ Src/Zle/zle_refresh.c	26 Jan 2005 18:04:32 -0000
@@ -287,7 +287,7 @@
 	*sen,			/* pointer to end of the video buffer (eol)  */
 	*u;			/* pointer for status line stuff */
     ZLE_STRING_T t,		/* pointer into the real buffer		     */
-	*scs;			/* pointer to cursor position in real buffer */
+	scs;			/* pointer to cursor position in real buffer */
     char **qbuf;		/* tmp					     */
     ZLE_STRING_T tmpline;	/* line with added pre/post text */
     int tmpcs, tmpll;		/* ditto cursor position and line length */
Index: Src/Zle/zle_utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_utils.c,v
retrieving revision 1.16
diff -u -r1.16 zle_utils.c
--- Src/Zle/zle_utils.c	26 Jan 2005 12:34:48 -0000	1.16
+++ Src/Zle/zle_utils.c	26 Jan 2005 18:04:33 -0000
@@ -410,9 +410,14 @@
 
 /**/
 void
-setline(char const *s)
+setline(char *s, int flags)
 {
-    char *scp = ztrdup(s);
+    char *scp;
+
+    if (flags & ZSL_COPY)
+	scp = ztrdup(s);
+    else
+	scp = s;
     /*
      * TBD: we could make this more efficient by passing the existing
      * allocated line to stringaszleline.
@@ -421,10 +426,13 @@
 
     zleline = stringaszleline(scp, &zlell, &linesz);
 
-    if ((zlecs = zlell) && invicmdmode())
+    if ((flags & ZSL_TOEND) && (zlecs = zlell) && invicmdmode())
 	zlecs--;
+    else if (zlecs > zlell)
+	zlecs = zlell;
 
-    free(scp);
+    if (flags & ZSL_COPY)
+	free(scp);
 }
 
 /**/

-- 
Peter Stephenson <pws@csr.com>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


**********************************************************************
This email and any files transmitted with it are confidential and
intended solely for the use of the individual or entity to whom they
are addressed. If you have received this email in error please notify
the system manager.

This footnote also confirms that this email message has been swept by
MIMEsweeper for the presence of computer viruses.

www.mimesweeper.com
**********************************************************************


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: PATCH: zle_params.c
  2005-01-26 18:06 PATCH: zle_params.c Peter Stephenson
@ 2005-01-26 18:35 ` Clint Adams
  2005-01-29  3:47 ` UTF-8 input [was Re: PATCH: zle_params.c] Clint Adams
  1 sibling, 0 replies; 13+ messages in thread
From: Clint Adams @ 2005-01-26 18:35 UTC (permalink / raw)
  To: Peter Stephenson; +Cc: Zsh hackers list

> broken.  It also fixes a typo in zle_params.c.

Oops.

> -    memmove(zleline + len, zleline + zlecs, zlell - zlecs);
> -    memcpy(zleline, y, len);
> +    memmove((char *)(zleline + len), (char *)(zleline + zlecs),
> +	    (zlell - zlecs) * ZLE_CHAR_SIZE);
> +    ZS_memcpy(zleline, y, len);

Hmm.  For the sake of consistency..

Index: Src/system.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/system.h,v
retrieving revision 1.25
diff -u -r1.25 system.h
--- Src/system.h	26 Jan 2005 18:12:18 -0000	1.25
+++ Src/system.h	26 Jan 2005 18:34:04 -0000
@@ -727,6 +727,7 @@
 #define ZLETAB	L'\t'
 #define ZLENULSTR	L""
 #define ZS_memcpy wmemcpy
+#define ZS_memmove wmemmove
 #define ZC_icntrl iswcntrl
 #else
 typedef int ZLE_CHAR_T;
@@ -738,5 +739,6 @@
 #define ZLETAB	'\t'
 #define ZLENULSTR	""
 #define ZS_memcpy memcpy
+#define ZS_memmove memmove
 #define ZC_icntrl icntrl
 #endif
Index: Src/Zle/zle_params.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_params.c,v
retrieving revision 1.22
diff -u -r1.22 zle_params.c
--- Src/Zle/zle_params.c	26 Jan 2005 18:12:18 -0000	1.22
+++ Src/Zle/zle_params.c	26 Jan 2005 18:34:04 -0000
@@ -242,8 +242,7 @@
     else
 	y = ZLENULSTR, len = 0;
     sizeline(zlell - zlecs + len);
-    memmove((char *)(zleline + len), (char *)(zleline + zlecs),
-	    (zlell - zlecs) * ZLE_CHAR_SIZE);
+    ZS_memmove(zleline + len, zleline + zlecs, zlell - zlecs);
     ZS_memcpy(zleline, y, len);
     zlell = zlell - zlecs + len;
     zlecs = len;


^ permalink raw reply	[flat|nested] 13+ messages in thread

* UTF-8 input [was Re: PATCH: zle_params.c]
  2005-01-26 18:06 PATCH: zle_params.c Peter Stephenson
  2005-01-26 18:35 ` Clint Adams
@ 2005-01-29  3:47 ` Clint Adams
  2005-01-30  1:07   ` Peter Stephenson
  1 sibling, 1 reply; 13+ messages in thread
From: Clint Adams @ 2005-01-29  3:47 UTC (permalink / raw)
  To: Zsh hackers list

> I've left last_isearch since it's not clear what is to become of it
> yet.  Fixing doisearch isn't going to be great fun (240 lines, 2
> comments).  It'll have to wait until we decide about input.

What needs deciding?


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: UTF-8 input [was Re: PATCH: zle_params.c]
  2005-01-29  3:47 ` UTF-8 input [was Re: PATCH: zle_params.c] Clint Adams
@ 2005-01-30  1:07   ` Peter Stephenson
  2005-01-30  6:35     ` Bart Schaefer
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Stephenson @ 2005-01-30  1:07 UTC (permalink / raw)
  To: Zsh hackers list

Clint Adams wrote:
> > I've left last_isearch since it's not clear what is to become of it
> > yet.  Fixing doisearch isn't going to be great fun (240 lines, 2
> > comments).  It'll have to wait until we decide about input.
> 
> What needs deciding?

At what stage we turn a character from read() into a wide character.
I argued before that key bindings should still use ordinary character
strings to avoid breaking existing bindings.  Somewhere before we insert
a character in the line we need to accumulate bytes from multibyte
characters where necessary.

I thought of the following: self-insert could take a single character,
as at present, and then test if it was the initial part of a multibyte
character.  If it was, it could read the rest; we might need a timeout to
avoid an infinite hang on systems that didn't do multibyte input
properly, which is potentially quite a lot of them.  This would allow
you to bind all 8-bit characters with the top bit set to self-insert and
voila: input in any multibyte encoding whose 7-bit subset is ASCII (as
in UTF-8) is now completely handled, with the choice of whether to do
so or keep the old 8-bit bindings left to users.

This leaves other calls to getkey() and other low-level key handling
routines.  Some might need the same mechanism; isearch is an example,
because some keys are interpreted while some are inserted into the
search string.  A further complication is that when searching the
history we might well want to keep the history lines as multibyte
strings; then the search string remains in that format, too.  As this
example indicates I think each case will need considering on its merits.

In addition to getkey() and friends, there is the related matter of the
variable lastchar.  Currently this is a single character; I'm not yet
100% sure whether we can keep this, or promote it to a wchar_t, or
whether we might need both types.  I fear it may be the last.

-- 
Peter Stephenson <pws@pwstephenson.fsnet.co.uk>
Work: pws@csr.com
Web: http://www.pwstephenson.fsnet.co.uk


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: UTF-8 input [was Re: PATCH: zle_params.c]
  2005-01-30  1:07   ` Peter Stephenson
@ 2005-01-30  6:35     ` Bart Schaefer
  2005-01-31 11:46       ` Peter Stephenson
  2005-02-10 14:22       ` Peter Stephenson
  0 siblings, 2 replies; 13+ messages in thread
From: Bart Schaefer @ 2005-01-30  6:35 UTC (permalink / raw)
  To: Zsh hackers list

On Jan 30,  1:07am, Peter Stephenson wrote:
} Subject: Re: UTF-8 input [was Re: PATCH: zle_params.c]
}
} I thought of the following: self-insert could take a single character,
} as at present, and then test if it was the initial part of a multibyte
} character.  If it was, it could read the rest; we might need a timeout to
} avoid an infinite hang on systems that didn't do multibyte input
} properly

This would mean what, in terms of binding other functions to wide chars?
That they'd behave like escape sequences do now?  I would think you'd
want to decide whether the input is a wide char at a lower level than
that.  Otherwise don't you have issues if what the user really means to
bind to self-insert is a single-byte character that happens to have the
high bit set?

It seems to me that some stage of the input process has to be "told"
that the input stream is UTF-8 rather than e.g. iso-8859-something.  If
it's the widget level that's going to handle that [*], I think it'd be
most useful to create a self-insert-multibyte which does in fact wait
indefinitely (or at least, longer than the normal escape-sequence key
timeout) for the "rest" of a multibyte character after the first byte is
seen, and feep if it doesn't get something recognizable as the rest.

Then, probably, create a shortcut along the lines of bindkey -m that sets
up self-insert-multibyte on the appropriate prefixes.

[*] Is there a plan yet for UTF-8 shell scripts, by the way?  That
can't be handled at the ZLE level.  What about zcompile?

} In addition to getkey() and friends, there is the related matter of the
} variable lastchar.  Currently this is a single character; I'm not yet
} 100% sure whether we can keep this, or promote it to a wchar_t, or
} whether we might need both types.  I fear it may be the last.

Not just lastchar, but also the KEYS parameter.  If wide chars are dealt
with as sequences at the widget binding level, but BUFFER contains the
corresponding wchars instead, then various currently-working tricks that
involve inserting all or part of KEYS into BUFFER will fail.  At least,
it becomes harder to emulate self-insert(-multibyte) in widget funcs.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: UTF-8 input [was Re: PATCH: zle_params.c]
  2005-01-30  6:35     ` Bart Schaefer
@ 2005-01-31 11:46       ` Peter Stephenson
  2005-01-31 16:18         ` Bart Schaefer
  2005-02-10 14:22       ` Peter Stephenson
  1 sibling, 1 reply; 13+ messages in thread
From: Peter Stephenson @ 2005-01-31 11:46 UTC (permalink / raw)
  To: Zsh hackers list

Bart Schaefer wrote:
> On Jan 30,  1:07am, Peter Stephenson wrote:
> } Subject: Re: UTF-8 input [was Re: PATCH: zle_params.c]
> }
> } I thought of the following: self-insert could take a single character,
> } as at present, and then test if it was the initial part of a multibyte
> } character.  If it was, it could read the rest; we might need a timeout to
> } avoid an infinite hang on systems that didn't do multibyte input
> } properly
> 
> This would mean what, in terms of binding other functions to wide chars?
> That they'd behave like escape sequences do now?  I would think you'd
> want to decide whether the input is a wide char at a lower level than
> that.  Otherwise don't you have issues if what the user really means to
> bind to self-insert is a single-byte character that happens to have the
> high bit set?

Hmmm... you mean that on a system where mbrtowc() reports that a
single-byte character is incomplete, the user might nonetheless want to
insert a single-byte character onto the command line?  That's certainly
not something I'd thought of.  However, I'm not sure I see what this is
doing.  If mbrtowc() etc. are confused, which in this case they must be
(it's the only way the user's intention can disagree with what the
proposed mechanism is doing), how can we handle the later stages of
character processing successfully?  When outputting, do we ignore the
fact that wctomb() failed on this character (as it must), reset the
shift counter (for safety) and carry on?  In other words, are you
supposing this is some kind of fallback in case the locale isn't set
correctly, e.g. it's set to UTF-8 but on an xterm with character set
ISO-8859-1?

> It seems to me that some stage of the input process has to be "told"
> that the input stream is UTF-8 rather than e.g. iso-8859-something.  If
> it's the widget level that's going to handle that [*], I think it'd be
> most useful to create a self-insert-multibyte which does in fact wait
> indefinitely (or at least, longer than the normal escape-sequence key
> timeout) for the "rest" of a multibyte character after the first byte is
> seen, and feep if it doesn't get something recognizable as the rest.

That's perfectly workable, but the question above about self-insert
remains.

-- 
Peter Stephenson <pws@csr.com>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


**********************************************************************
This email and any files transmitted with it are confidential and
intended solely for the use of the individual or entity to whom they
are addressed. If you have received this email in error please notify
the system manager.

This footnote also confirms that this email message has been swept by
MIMEsweeper for the presence of computer viruses.

www.mimesweeper.com
**********************************************************************


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: UTF-8 input [was Re: PATCH: zle_params.c]
  2005-01-31 11:46       ` Peter Stephenson
@ 2005-01-31 16:18         ` Bart Schaefer
  2005-01-31 17:01           ` Peter Stephenson
  0 siblings, 1 reply; 13+ messages in thread
From: Bart Schaefer @ 2005-01-31 16:18 UTC (permalink / raw)
  To: Zsh hackers list

On Jan 31, 11:46am, Peter Stephenson wrote:
} Subject: Re: UTF-8 input [was Re: PATCH: zle_params.c]
}
} > Otherwise don't you have issues if what the user really means to
} > bind to self-insert is a single-byte character that happens to have
} > the high bit set?
}
} Hmmm... you mean that on a system where mbrtowc() reports that a
} single-byte character is incomplete, the user might nonetheless want to
} insert a single-byte character onto the command line?

No.  I mean, suppose the user uses the same .zshrc in both an iso-8859-*
and a UTF-8 locale, and has an explicit bindkey command which is intended
to work only in the iso-8859-* locale.  That bindkey happens to use a
character for which, in the UTF-8 locale, mbrtowc() reports incomplete.
This was in part why I added the footnote asking about plans for UTF-8
in shell scripts; is it even possible to have the same .zshrc in these
cases?

However, I wasn't thinking very clearly, since mbrtowc() won't report
incomplete for an iso-8859-* character if LC_CTYPE is set correctly.

I'm still worried about the case where that bindkey exists but is for a
function other than self-insert.  If multibyte translation is handled by
a widget at the same priority as all other widgets, that "stray" bindkey
can mess up the whole scheme.

} In other words, are you supposing this is some kind of fallback in
} case the locale isn't set correctly, e.g. it's set to UTF-8 but on an
} xterm with character set ISO-8859-1?

That was probably what was in my head, but on reflection it's not really
something that the shell can deal with.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: UTF-8 input [was Re: PATCH: zle_params.c]
  2005-01-31 16:18         ` Bart Schaefer
@ 2005-01-31 17:01           ` Peter Stephenson
  2005-01-31 18:29             ` Bart Schaefer
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Stephenson @ 2005-01-31 17:01 UTC (permalink / raw)
  To: Zsh hackers list

Bart Schaefer wrote:
> No.  I mean, suppose the user uses the same .zshrc in both an iso-8859-*
> and a UTF-8 locale, and has an explicit bindkey command which is intended
> to work only in the iso-8859-* locale.  That bindkey happens to use a
> character for which, in the UTF-8 locale, mbrtowc() reports incomplete.
> This was in part why I added the footnote asking about plans for UTF-8
> in shell scripts; is it even possible to have the same .zshrc in these
> cases?

UTF-8 should work fine to that extent: it gets passed straight through
from the main shell to zle (or anything else) intact by the usual Meta
mechanism.  (That's why I'm so keen on retaining the current string
representation in the main shell.)  If we keep metafied input strings as
the hash keys for the key binding lookups and they are simply string
arguments to bindkey, then there shouldn't be a problem.  I think.

The bit that doesn't work is when you try to examine individual
characters in the main shell; you will get single bytes, possibly with
the 8th bit set.  I can't think of a simple case where setting up key
bindings would need this to work, however.

> I'm still worried about the case where that bindkey exists but is for a
> function other than self-insert.  If multibyte translation is handled by
> a widget at the same priority as all other widgets, that "stray" bindkey
> can mess up the whole scheme.

You mean if the input is real UTF-8 and a widget grabs the first byte,
leaving garbage?  Yes, that's a real problem.  I was expecting that the
shell would either be set up to handle old-style input, or new style
input, not a combination, based on what the user (or administrator; this
should all be possible to automate relatively easily) knows about the
system.

To be explicit, either:

- Input system is not UTF-8 aware; "pass8" or equivalent allows 8-bit
  bindings; any zsh bindings for high-eighth-bit bytes are ordinary
  commands.

or:

- Input system is UTF-8 aware; by hypothesis, any high-eighth-bit
  character sent from the terminal is part of a multibyte character
  (this is beyond our control); any zsh bindings for such bytes reflect
  their use as part of a multibyte character.

The zsh bindings would need to be set by whoever decides which is the
case.  I don't see much more we can do within the shell without more
clairvoyance than usual and without breaking someone's setup.  Please
enlighten me.

-- 
Peter Stephenson <pws@csr.com>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


**********************************************************************
This email and any files transmitted with it are confidential and
intended solely for the use of the individual or entity to whom they
are addressed. If you have received this email in error please notify
the system manager.

This footnote also confirms that this email message has been swept by
MIMEsweeper for the presence of computer viruses.

www.mimesweeper.com
**********************************************************************


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: UTF-8 input [was Re: PATCH: zle_params.c]
  2005-01-31 17:01           ` Peter Stephenson
@ 2005-01-31 18:29             ` Bart Schaefer
  2005-02-01 10:37               ` Peter Stephenson
  0 siblings, 1 reply; 13+ messages in thread
From: Bart Schaefer @ 2005-01-31 18:29 UTC (permalink / raw)
  To: Zsh hackers list

On Jan 31,  5:01pm, Peter Stephenson wrote:
} Subject: Re: UTF-8 input [was Re: PATCH: zle_params.c]
}
} Bart Schaefer wrote:
} > No.  I mean, suppose the user uses the same .zshrc in both an iso-8859-*
} > and a UTF-8 locale, and has an explicit bindkey command which is intended
} > to work only in the iso-8859-* locale.
} 
} UTF-8 should work fine to that extent: it gets passed straight through
} from the main shell to zle (or anything else) intact by the usual Meta
} mechanism.

That doesn't answer the question.  When reading the .zshrc (or any other
script) and a byte for which mbrtowc() reports incomplete is found, what
decides whether it's part of a string intended for an iso-8859-* locale
or the introducer of a wide character for a UTF-8 locale?

Is the answer "the file just gets metafied as if it were a binary stream
and individual modules work it out later"?

} > If multibyte translation is handled by a widget at the same priority
} > as all other widgets, that "stray" bindkey can mess up the whole
} > scheme.
} 
} You mean if the input is real UTF-8 and a widget grabs the first byte,
} leaving garbage?  Yes, that's a real problem.  I was expecting that the
} shell would either be set up to handle old-style input, or new style
} input, not a combination

In other words, you assume that nobody will try to use the same .zshrc in 
two different locales, or at least not without wrapping bits of it in
tests of the value of LC_CTYPE or the like.

} I don't see much more we can do within the shell without more
} clairvoyance than usual and without breaking someone's setup.  Please
} enlighten me.

I don't (yet?) know what else we can do, either; I'm just pointing out
issues to make sure they've been considered.

A question that comes to mind is, how will the shell deal with UTF-8
input when ZLE is not enabled?


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: UTF-8 input [was Re: PATCH: zle_params.c]
  2005-01-31 18:29             ` Bart Schaefer
@ 2005-02-01 10:37               ` Peter Stephenson
  0 siblings, 0 replies; 13+ messages in thread
From: Peter Stephenson @ 2005-02-01 10:37 UTC (permalink / raw)
  To: Zsh hackers list

Bart Schaefer wrote:
> Is the answer "the file just gets metafied as if it were a binary stream
> and individual modules work it out later"?

Yes, that's about the only sensible way it can work, as far as I can
see.  If instead of a shell module it's an argument or output to an
external command, this is the way it needs to work.

> In other words, you assume that nobody will try to use the same .zshrc in 
> two different locales, or at least not without wrapping bits of it in
> tests of the value of LC_CTYPE or the like.

Yes, someone has to test at some point.  I don't think the shell's
internals have enough information.  We *could* bind
(multibyte-?)self-insert to high-eighth-bit characters in UTF-8 locales
by default, I suppose.

> A question that comes to mind is, how will the shell deal with UTF-8
> input when ZLE is not enabled?

I think this relates to the same issue as input from .zshrc, i.e. a
binary stream which has to be dealt with by whatever receives it.
Again, to the extent that the shell doesn't need to process the string
this ought to be OK.

-- 
Peter Stephenson <pws@csr.com>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


**********************************************************************
This email and any files transmitted with it are confidential and
intended solely for the use of the individual or entity to whom they
are addressed. If you have received this email in error please notify
the system manager.

This footnote also confirms that this email message has been swept by
MIMEsweeper for the presence of computer viruses.

www.mimesweeper.com
**********************************************************************


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: UTF-8 input [was Re: PATCH: zle_params.c]
  2005-01-30  6:35     ` Bart Schaefer
  2005-01-31 11:46       ` Peter Stephenson
@ 2005-02-10 14:22       ` Peter Stephenson
  2005-02-10 14:51         ` Bart Schaefer
  1 sibling, 1 reply; 13+ messages in thread
From: Peter Stephenson @ 2005-02-10 14:22 UTC (permalink / raw)
  To: Zsh hackers list

Bart Schaefer wrote:
> } In addition to getkey() and friends, there is the related matter of the
> } variable lastchar.  Currently this is a single character; I'm not yet
> } 100% sure whether we can keep this, or promote it to a wchar_t, or
> } whether we might need both types.  I fear it may be the last.
> 
> Not just lastchar, but also the KEYS parameter.  If wide chars are dealt
> with as sequences at the widget binding level, but BUFFER contains the
> corresponding wchars instead, then various currently-working tricks that
> involve inserting all or part of KEYS into BUFFER will fail.  At least,
> it becomes harder to emulate self-insert(-multibyte) in widget funcs.

I've been looking at all this rather slowly...

I think $KEYS is OK.  The current intention is for the input to key
bindings to remain multibyte strings (metafied where necessary).
Modifying BUFFER and other parameters converts from multibyte strings to
wide characters automatically; I've already written that bit (because it
was easy).  So if you feed back KEYS into the system from a function it
will pass through the mbtowc stuff at the appropriate level.

It does, of course, become trickier to decide how many characters there
are in $KEYS --- or, more critically, $BUFFER.  However, that's all
bound up with how we do ${#KEYS} etc. in the main shell, which is a
separate question.  I suspect we will want a flag such as ${(m)#KEYS} to
treat the string as a multibyte character string instead of raw bytes
and maybe a ${(M)#KEYS} not to.  The default is tricky; I'm sure you can
argue for it to handle multibyte strings by default in a suitable
locale, but when Perl did that it broke everything in sight.  However,
I'm not planning on doing that bit until we've got ZLE out of the way.

-- 
Peter Stephenson <pws@csr.com>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


**********************************************************************
This email and any files transmitted with it are confidential and
intended solely for the use of the individual or entity to whom they
are addressed. If you have received this email in error please notify
the system manager.

**********************************************************************


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: UTF-8 input [was Re: PATCH: zle_params.c]
  2005-02-10 14:22       ` Peter Stephenson
@ 2005-02-10 14:51         ` Bart Schaefer
  2005-02-10 15:06           ` Peter Stephenson
  0 siblings, 1 reply; 13+ messages in thread
From: Bart Schaefer @ 2005-02-10 14:51 UTC (permalink / raw)
  To: Zsh hackers list

On Feb 10,  2:22pm, Peter Stephenson wrote:
} Subject: Re: UTF-8 input [was Re: PATCH: zle_params.c]
}
} I think $KEYS is OK.  The current intention is for the input to key
} bindings to remain multibyte strings (metafied where necessary).
} Modifying BUFFER and other parameters converts from multibyte strings to
} wide characters automatically; I've already written that bit (because it
} was easy).  So if you feed back KEYS into the system from a function it
} will pass through the mbtowc stuff at the appropriate level.


So if, for example, I do

	BUFFER="$LBUFFER$KEYS$RBUFFER"

this is going to convert LBUFFER and RBUFFER from wide chars to metafied
on expansion, and then convert the whole quoted string back to wide chars
upon assignment?  Or am I missing something crucial, e.g. that BUFFER and
its L/R components are stored as raw bytes?
 
} I suspect we will want a flag such as ${(m)#KEYS} to treat the string
} as a multibyte character string instead of raw bytes and maybe a
} ${(M)#KEYS} not to.

(M) is already taken, as are (w) and (W).

I presume you mean (m) to be like (c) in that it works only with ${#...}.
Otherwise I'm even more confused.

} [...] argue for it to handle multibyte strings by default in a suitable
} locale, but when Perl did that it broke everything in sight.

I'm not arguing for that ... if anything, I'm arguing against it.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: UTF-8 input [was Re: PATCH: zle_params.c]
  2005-02-10 14:51         ` Bart Schaefer
@ 2005-02-10 15:06           ` Peter Stephenson
  0 siblings, 0 replies; 13+ messages in thread
From: Peter Stephenson @ 2005-02-10 15:06 UTC (permalink / raw)
  To: Zsh hackers list

Bart Schaefer wrote:
> So if, for example, I do
> 
> 	BUFFER="$LBUFFER$KEYS$RBUFFER"
> 
> this is going to convert LBUFFER and RBUFFER from wide chars to metafied
> on expansion, and then convert the whole quoted string back to wide chars
> upon assignment?

Yes, the ZLE parameter interface is responsible for the conversion of
the bits of the buffer in both directions, and, as it happens, $KEYS
doesn't need converting.

> } I suspect we will want a flag such as ${(m)#KEYS} to treat the string
> } as a multibyte character string instead of raw bytes and maybe a
> } ${(M)#KEYS} not to.
> 
> (M) is already taken, as are (w) and (W).

I missed that, but maybe we don't need it if the default is raw bytes.

> I presume you mean (m) to be like (c) in that it works only with ${#...}.
> Otherwise I'm even more confused.

Well, it would be anywhere the distinction made sense, but counting
lengths is the only obvious one.  Anywhere involving pattern matching
would need to go with the locale, I would think; anything else is too
horrendous to contemplate.

> } [...] argue for it to handle multibyte strings by default in a suitable
> } locale, but when Perl did that it broke everything in sight.
> 
> I'm not arguing for that ... if anything, I'm arguing against it.

Compatibility with other shells would be the only good reason I can see,
but that could be provided by an option.  There are plenty of
do_something_done_by_other_shells_however_idiotic options.

-- 
Peter Stephenson <pws@csr.com>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


**********************************************************************
This email and any files transmitted with it are confidential and
intended solely for the use of the individual or entity to whom they
are addressed. If you have received this email in error please notify
the system manager.

**********************************************************************


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2005-02-10 15:06 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-01-26 18:06 PATCH: zle_params.c Peter Stephenson
2005-01-26 18:35 ` Clint Adams
2005-01-29  3:47 ` UTF-8 input [was Re: PATCH: zle_params.c] Clint Adams
2005-01-30  1:07   ` Peter Stephenson
2005-01-30  6:35     ` Bart Schaefer
2005-01-31 11:46       ` Peter Stephenson
2005-01-31 16:18         ` Bart Schaefer
2005-01-31 17:01           ` Peter Stephenson
2005-01-31 18:29             ` Bart Schaefer
2005-02-01 10:37               ` Peter Stephenson
2005-02-10 14:22       ` Peter Stephenson
2005-02-10 14:51         ` Bart Schaefer
2005-02-10 15:06           ` Peter Stephenson

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).