From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 5792 invoked by alias); 10 Sep 2015 19:02:55 -0000 Mailing-List: contact zsh-workers-help@zsh.org; run by ezmlm Precedence: bulk X-No-Archive: yes List-Id: Zsh Workers List List-Post: List-Help: X-Seq: 36478 Received: (qmail 2308 invoked from network); 10 Sep 2015 19:02:52 -0000 X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on f.primenet.com.au X-Spam-Level: X-Spam-Status: No, score=-1.9 required=5.0 tests=BAYES_00 autolearn=ham autolearn_force=no version=3.4.0 X-Originating-IP: [80.3.228.158] X-Spam: 0 X-Authority: v=2.1 cv=AJvf2gUA c=1 sm=1 tr=0 a=P+FLVI8RzFchTbbqTxIDRw==:117 a=P+FLVI8RzFchTbbqTxIDRw==:17 a=NLZqzBF-AAAA:8 a=kj9zAlcOel0A:10 a=hD80L64hAAAA:8 a=yTpqj2Z6neFHdqsdNxgA:9 a=CjuIK1q_8ugA:10 Date: Thu, 10 Sep 2015 19:57:13 +0100 From: Peter Stephenson To: Peter Stephenson Cc: zsh-workers@zsh.org Subject: Re: Pasting UTF-8 characters with bracketed-paste-magic seems broken in 5.1 Message-ID: <20150910195713.340861cb@ntlworld.com> In-Reply-To: <20150910172840.0a1899f4@pwslap01u.europe.root.pri> References: <20150906155751.GD3721@sym.noone.org> <150910073920.ZM21998@torch.brasslantern.com> <20150910145714.GO3721@sym.noone.org> <150910084516.ZM22892@torch.brasslantern.com> <20150910170705.0fbeb302@pwslap01u.europe.root.pri> <150910091649.ZM3715@torch.brasslantern.com> <20150910172840.0a1899f4@pwslap01u.europe.root.pri> X-Mailer: Claws Mail 3.11.1 (GTK+ 2.24.28; x86_64-redhat-linux-gnu) MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit On Thu, 10 Sep 2015 17:28:40 +0100 Peter Stephenson wrote: > Wouldn't be hard to add [[:INCOMPLETE::]] or [[:INVALID:]] to the > pattern code, but that's an extra step... Easy to write, though slightly less convenient to use than you might hope. 
The point is that we treat invalid and incomplete characters byte by byte, so you are guaranteed to detect [[:INCOMPLETE:]] at the first byte, but you can't in general guarantee how the rest will be treated, particularly since we don't insist multibyte means UTF-8. So [[:INCOMPLETE:]]* is about the best you can do to determine your sequence is incomplete. But in general you can't be sure the sequence is ever going to be complete anyway, so this isn't so much of a limitation, and I've documented it. It should now be possible to do more in shell code... diff --git a/Doc/Zsh/expn.yo b/Doc/Zsh/expn.yo index d44b40a..de12c85 100644 --- a/Doc/Zsh/expn.yo +++ b/Doc/Zsh/expn.yo @@ -1956,6 +1956,20 @@ ifzman(the zmanref(zshparam) manual page)\ ifnzman(noderef(Parameters Used By The Shell))\ . ) +item(tt([:INCOMPLETE:]))( +Matches a byte that starts an incomplete multibyte character. +Note that there may be a sequence of several bytes that +taken together form the prefix of a multibyte character. To +test for a potentially incomplete byte sequence, use the pattern +`tt([[:INCOMPLETE:]]*)'. This will never match a sequence starting +with a valid multibyte character. +) +item(tt([:INVALID:]))( +Matches a byte that does not start a valid multibyte character. +Note that this may be a continuation byte of an incomplete multibyte +character as any part of a multibyte string consisting of invalid and +incomplete multibyte characters is treated as single bytes. +) item(tt([:WORD:]))( The character is treated as part of a word; this test is sensitive to the value of the tt(WORDCHARS) parameter diff --git a/Src/Zle/comp.h b/Src/Zle/comp.h index 34da2ca..023c418 100644 --- a/Src/Zle/comp.h +++ b/Src/Zle/comp.h @@ -202,8 +202,9 @@ struct cpattern { * TODO: this will change. 
*/ #ifdef MULTIBYTE_SUPPORT -#define PATMATCHRANGE(r, c, ip, mtp) mb_patmatchrange(r, c, ip, mtp) -#define PATMATCHINDEX(r, i, cp, mtp) mb_patmatchindex(r, i, cp, mtp) +#define PATMATCHRANGE(r, c, ip, mtp) \ + mb_patmatchrange(r, c, ZMB_VALID, ip, mtp) +#define PATMATCHINDEX(r, i, cp, mtp) mb_patmatchindex(r, i, cp, mtp) #define CONVCAST(c) ((wchar_t)(c)) #define CHR_INVALID (WEOF) #else diff --git a/Src/pattern.c b/Src/pattern.c index b4ba33e..3b55ccf 100644 --- a/Src/pattern.c +++ b/Src/pattern.c @@ -145,7 +145,7 @@ typedef union upat *Upat; * * P_ANY, P_ANYOF: the operand is a null terminated * string. Normal characters match as expected. Characters - * in the range Meta+PP_ALPHA..Meta+PP_UNKNWN do the appropriate + * in the range Meta+PP_ALPHA..Meta+PP_UNKWN do the appropriate * Posix range tests. This relies on imeta returning true for these * characters. We treat unknown POSIX ranges as never matching. * PP_RANGE means the next two (possibly metafied) characters form @@ -1119,7 +1119,7 @@ patgetglobflags(char **strp, long *assertp, int *ignore) static const char *colon_stuffs[] = { "alpha", "alnum", "ascii", "blank", "cntrl", "digit", "graph", "lower", "print", "punct", "space", "upper", "xdigit", "IDENT", - "IFS", "IFSSPACE", "WORD", NULL + "IFS", "IFSSPACE", "WORD", "INCOMPLETE", "INVALID", NULL }; /* @@ -1870,9 +1870,9 @@ static int globdots; /* Glob initial dots? */ #ifdef MULTIBYTE_SUPPORT /* Get a character from the start point in a string */ -#define CHARREF(x, y) charref((x), (y)) +#define CHARREF(x, y) charref((x), (y), (int *)NULL) static wchar_t -charref(char *x, char *y) +charref(char *x, char *y, int *zmb_ind) { wchar_t wc; size_t ret; @@ -1886,9 +1886,13 @@ charref(char *x, char *y) /* Error. */ /* Reset the shift state for next time. */ memset(&shiftstate, 0, sizeof(shiftstate)); + if (zmb_ind) + *zmb_ind = (ret == MB_INVALID) ? 
ZMB_INVALID : ZMB_INCOMPLETE; return WCHAR_INVALID(*x); } + if (zmb_ind) + *zmb_ind = ZMB_VALID; return wc; } @@ -2580,10 +2584,11 @@ patmatch(Upat prog) fail = 1; else { #ifdef MULTIBYTE_SUPPORT - wchar_t cr = CHARREF(patinput, patinend); + int zmb_ind; + wchar_t cr = charref(patinput, patinend, &zmb_ind); char *scanop = (char *)P_OPERAND(scan); if (patglobflags & GF_MULTIBYTE) { - if (mb_patmatchrange(scanop, cr, NULL, NULL) ^ + if (mb_patmatchrange(scanop, cr, zmb_ind, NULL, NULL) ^ (P_OP(scan) == P_ANYOF)) fail = 1; else @@ -3351,6 +3356,9 @@ patmatch(Upat prog) * The null-terminated specification is in range; the test * character is in ch. * + * zmb_ind is one of the ZMB_* enum values defined in zsh.h, indicating + * whether ch is a valid, incomplete or invalid multibyte character. + * * indptr is used by completion matching, which is why this * function is exported. If indptr is not NULL we set *indptr * to the index of the character in the range string, adjusted @@ -3367,7 +3375,7 @@ patmatch(Upat prog) /**/ mod_export int -mb_patmatchrange(char *range, wchar_t ch, wint_t *indptr, int *mtp) +mb_patmatchrange(char *range, wchar_t ch, int zmb_ind, wint_t *indptr, int *mtp) { wchar_t r1, r2; @@ -3476,6 +3484,14 @@ mb_patmatchrange(char *range, wchar_t ch, wint_t *indptr, int *mtp) *indptr += r2 - r1; } break; + case PP_INCOMPLETE: + if (zmb_ind == ZMB_INCOMPLETE) + return 1; + break; + case PP_INVALID: + if (zmb_ind == ZMB_INVALID) + return 1; + break; case PP_UNKWN: DPUTS(1, "BUG: unknown posix range passed through.\n"); break; @@ -3545,6 +3561,8 @@ mb_patmatchindex(char *range, wint_t ind, wint_t *chr, int *mtp) case PP_IFS: case PP_IFSSPACE: case PP_WORD: + case PP_INCOMPLETE: + case PP_INVALID: if (!ind) { *mtp = swtype; return 1; @@ -3698,6 +3716,10 @@ patmatchrange(char *range, int ch, int *indptr, int *mtp) if (indptr && r1 < r2) *indptr += r2 - r1; break; + case PP_INCOMPLETE: + case PP_INVALID: + /* Never true if not in multibyte mode */ + break; case PP_UNKWN: DPUTS(1, "BUG: 
unknown posix range passed through.\n"); break; @@ -3768,6 +3790,8 @@ patmatchindex(char *range, int ind, int *chr, int *mtp) case PP_IFS: case PP_IFSSPACE: case PP_WORD: + case PP_INCOMPLETE: + case PP_INVALID: if (!ind) { *mtp = swtype; return 1; @@ -3851,9 +3875,10 @@ static int patrepeat(Upat p, char *charstart) case P_ANYBUT: while (scan < patinend) { #ifdef MULTIBYTE_SUPPORT - wchar_t cr = CHARREF(scan, patinend); + int zmb_ind; + wchar_t cr = charref(scan, patinend, &zmb_ind); if (patglobflags & GF_MULTIBYTE) { - if (mb_patmatchrange(opnd, cr, NULL, NULL) ^ + if (mb_patmatchrange(opnd, cr, zmb_ind, NULL, NULL) ^ (P_OP(p) == P_ANYOF)) break; } else if (patmatchrange(opnd, (int)cr, NULL, NULL) ^ diff --git a/Src/zsh.h b/Src/zsh.h index a99c900..4e2cb65 100644 --- a/Src/zsh.h +++ b/Src/zsh.h @@ -1562,13 +1562,15 @@ typedef struct zpc_disables_save *Zpc_disables_save; #define PP_IFS 15 #define PP_IFSSPACE 16 #define PP_WORD 17 +#define PP_INCOMPLETE 18 +#define PP_INVALID 19 /* Special value for last definition */ -#define PP_LAST 17 +#define PP_LAST 19 /* Unknown type. Not used in a valid token. */ -#define PP_UNKWN 18 +#define PP_UNKWN 20 /* Range: token followed by the (possibly multibyte) start and end */ -#define PP_RANGE 19 +#define PP_RANGE 21 /* Globbing flags: lower 8 bits gives approx count */ #define GF_LCMATCHUC 0x0100 @@ -1577,6 +1579,15 @@ typedef struct zpc_disables_save *Zpc_disables_save; #define GF_MATCHREF 0x0800 #define GF_MULTIBYTE 0x1000 /* Use multibyte if supported by build */ +enum { + /* Valid multibyte character from charref */ + ZMB_VALID, + /* Incomplete multibyte character from charref */ + ZMB_INCOMPLETE, + /* Invalid multibyte character from charref */ + ZMB_INVALID +}; + /* Dummy Patprog pointers. Used mainly in executable code, but the * pattern code needs to know about it, too. 
*/ diff --git a/Test/D07multibyte.ztst b/Test/D07multibyte.ztst index 3fadd80..ace191f 100644 --- a/Test/D07multibyte.ztst +++ b/Test/D07multibyte.ztst @@ -525,3 +525,9 @@ fi done 0:Invalid characters in pattern matching + + [[ $'\xe3' == [[:INCOMPLETE:]] ]] || print fail 1 + [[ $'\xe3\x83' == [[:INCOMPLETE:]][[:INVALID:]] ]] || print fail 2 + [[ $'\xe3\x83\x9b' != [[:INCOMPLETE:][:INVALID:]] ]] || print fail 3 + [[ $'\xe3\x83\x9b' = ? ]] || print fail 4 +0:Testing incomplete and invalid multibyte character components