From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 22937 invoked from network); 2 Nov 2006 18:37:53 -0000 X-Spam-Checker-Version: SpamAssassin 3.1.7 (2006-10-05) on f.primenet.com.au X-Spam-Level: X-Spam-Status: No, score=-2.4 required=5.0 tests=AWL,BAYES_00, FORGED_RCVD_HELO autolearn=ham version=3.1.7 Received: from news.dotsrc.org (HELO a.mx.sunsite.dk) (130.225.247.88) by ns1.primenet.com.au with SMTP; 2 Nov 2006 18:37:53 -0000 Received-SPF: none (ns1.primenet.com.au: domain at sunsite.dk does not designate permitted sender hosts) Received: (qmail 10841 invoked from network); 2 Nov 2006 18:37:43 -0000 Received: from sunsite.dk (130.225.247.90) by a.mx.sunsite.dk with SMTP; 2 Nov 2006 18:37:43 -0000 Received: (qmail 11220 invoked by alias); 2 Nov 2006 18:37:40 -0000 Mailing-List: contact zsh-workers-help@sunsite.dk; run by ezmlm Precedence: bulk X-No-Archive: yes X-Seq: 22952 Received: (qmail 11208 invoked from network); 2 Nov 2006 18:37:39 -0000 Received: from news.dotsrc.org (HELO a.mx.sunsite.dk) (130.225.247.88) by sunsite.dk with SMTP; 2 Nov 2006 18:37:39 -0000 Received: (qmail 10537 invoked from network); 2 Nov 2006 18:37:39 -0000 Received: from cluster-c.mailcontrol.com (168.143.177.190) by a.mx.sunsite.dk with SMTP; 2 Nov 2006 18:37:31 -0000 Received: from cameurexb01.EUROPE.ROOT.PRI ([62.189.241.200]) by rly21c.srv.mailcontrol.com (MailControl) with ESMTP id kA2Iawwe025701 for ; Thu, 2 Nov 2006 18:37:09 GMT Received: from news01.csr.com ([10.103.143.38]) by cameurexb01.EUROPE.ROOT.PRI with Microsoft SMTPSVC(6.0.3790.1830); Thu, 2 Nov 2006 18:37:01 +0000 Date: Thu, 2 Nov 2006 18:37:01 +0000 From: Peter Stephenson To: Zsh hackers list Subject: PATCH: multibyte delimiters for substitutions and parameter flags Message-Id: <20061102183701.60fe2efa.pws@csr.com> Organization: Cambridge Silicon Radio X-Mailer: Sylpheed version 2.2.9 (GTK+ 2.8.20; i386-redhat-linux-gnu) Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit X-OriginalArrivalTime: 02 Nov 2006 18:37:01.0735 (UTC) FILETIME=[E2F82B70:01C6FEAD] X-Scanned-By: MailControl A-07-06-65 (www.mailcontrol.com) on 10.67.0.131 This is supposed to fix multibyte delimiters for substitutions in modifiers when used in globbing or parameters (not yet history, which is separate code), and for delimiters in the arguments of parameter flags and similar. Index: Src/glob.c =================================================================== RCS file: /cvsroot/zsh/zsh/Src/glob.c,v retrieving revision 1.54 diff -u -r1.54 glob.c --- Src/glob.c 1 Nov 2006 12:25:22 -0000 1.54 +++ Src/glob.c 2 Nov 2006 18:33:58 -0000 @@ -1243,9 +1243,10 @@ else { /* ... or a user name */ char sav, *tt; + int arglen; /* Find matching delimiters */ - tt = get_strarg(s); + tt = get_strarg(s, &arglen); if (!*tt) { zerr("missing end of name"); data = 0; @@ -1255,7 +1256,7 @@ sav = *tt; *tt = '\0'; - if ((pw = getpwnam(s + 1))) + if ((pw = getpwnam(s + arglen))) data = pw->pw_uid; else { zerr("unknown user"); @@ -1268,7 +1269,7 @@ data = 0; #endif /* !USE_GETPWNAM */ if (sav) - s = tt + 1; + s = tt + arglen; else s = tt; } @@ -1283,8 +1284,9 @@ else { /* ...or a delimited group name. 
*/ char sav, *tt; + int arglen; - tt = get_strarg(s); + tt = get_strarg(s, &arglen); if (!*tt) { zerr("missing end of name"); data = 0; @@ -1294,7 +1296,7 @@ sav = *tt; *tt = '\0'; - if ((gr = getgrnam(s + 1))) + if ((gr = getgrnam(s + arglen))) data = gr->gr_gid; else { zerr("unknown group"); @@ -1307,7 +1309,7 @@ data = 0; #endif /* !USE_GETGRNAM */ if (sav) - s = tt + 1; + s = tt + arglen; else s = tt; } @@ -1438,8 +1440,7 @@ tt = NULL; } } else { - plus = 1; - tt = get_strarg(s); + tt = get_strarg(s, &plus); if (!*tt) { zerr("missing end of string"); Index: Src/params.c =================================================================== RCS file: /cvsroot/zsh/zsh/Src/params.c,v retrieving revision 1.120 diff -u -r1.120 params.c --- Src/params.c 11 Sep 2006 11:09:15 -0000 1.120 +++ Src/params.c 2 Nov 2006 18:34:00 -0000 @@ -947,7 +947,7 @@ int *prevcharlen, int *nextcharlen) { int hasbeg = 0, word = 0, rev = 0, ind = 0, down = 0, l, i, ishash; - int keymatch = 0, needtok = 0; + int keymatch = 0, needtok = 0, arglen; char *s = *str, *sep = NULL, *t, sav, *d, **ta, **p, *tt, c; zlong num = 1, beg = 0, r = 0; Patprog pprog = NULL; @@ -1004,28 +1004,28 @@ * special interpretation by getindex() of `*' or `@'. */ break; case 'n': - t = get_strarg(++s); + t = get_strarg(++s, &arglen); if (!*t) goto flagerr; sav = *t; *t = '\0'; - num = mathevalarg(s + 1, &d); + num = mathevalarg(s + arglen, &d); if (!num) num = 1; *t = sav; - s = t; + s = t + arglen - 1; break; case 'b': hasbeg = 1; - t = get_strarg(++s); + t = get_strarg(++s, &arglen); if (!*t) goto flagerr; sav = *t; *t = '\0'; - if ((beg = mathevalarg(s + 1, &d)) > 0) + if ((beg = mathevalarg(s + arglen, &d)) > 0) beg--; *t = sav; - s = t; + s = t + arglen - 1; break; case 'p': escapes = 1; @@ -1033,15 +1033,16 @@ case 's': /* This gives the string that separates words * * (for use with the `w' flag). 
*/ - t = get_strarg(++s); + t = get_strarg(++s, &arglen); if (!*t) goto flagerr; sav = *t; *t = '\0'; - sep = escapes ? getkeystring(s + 1, &waste, GETKEYS_SEP, NULL) - : dupstring(s + 1); + s += arglen; + sep = escapes ? getkeystring(s, &waste, GETKEYS_SEP, NULL) + : dupstring(s); *t = sav; - s = t; + s = t + arglen - 1; break; default: flagerr: Index: Src/subst.c =================================================================== RCS file: /cvsroot/zsh/zsh/Src/subst.c,v retrieving revision 1.65 diff -u -r1.65 subst.c --- Src/subst.c 1 Nov 2006 12:25:22 -0000 1.65 +++ Src/subst.c 2 Nov 2006 18:34:01 -0000 @@ -1137,62 +1137,113 @@ return ret; } + +/* + * Look for a delimited portion of a string. The first (possibly + * multibyte) character at s is the delimiter. Various forms + * of brackets are treated separately, as documented. + * + * Returns a pointer to the final delimiter. Sets *lenp to the + * length of the final delimiter; a NUL causes *lenp to be set + * to zero since we shouldn't advance past it. (The string is + * tokenized, so a NUL is a real end of string.) + */ + /**/ char * -get_strarg(char *s) +get_strarg(char *s, int *lenp) { - char t = *s++; - - if (!t) - return s - 1; + convchar_t del; + int len; + char tok = 0; + + MB_METACHARINIT(); + len = MB_METACHARLENCONV(s, &del); + if (!len) { + *lenp = 0; + return s; + } - switch (t) { - case '(': - t = ')'; +#ifdef MULTIBYTE_SUPPORT + if (del == WEOF) + del = (wint_t)((*s == Meta) ? 
s[1] ^ 32 : *s); +#endif + s += len; + switch (del) { + case ZWC('('): + del = ZWC(')'); break; case '[': - t = ']'; + del = ZWC(']'); break; case '{': - t = '}'; + del = ZWC('}'); break; case '<': - t = '>'; + del = ZWC('>'); break; case Inpar: - t = Outpar; + tok = Outpar; break; case Inang: - t = Outang; + tok = Outang; break; case Inbrace: - t = Outbrace; + tok = Outbrace; break; case Inbrack: - t = Outbrack; + tok = Outbrack; break; } - while (*s && *s != t) - s++; + if (tok) { + /* + * Looking for a matching token; we want the literal byte, + * not a decoded multibyte character, so search specially. + */ + while (*s && *s != tok) + s++; + } else { + convchar_t del2; + len = 0; + while (*s) { + len = MB_METACHARLENCONV(s, &del2); +#ifdef MULTIBYTE_SUPPORT + if (del2 == WEOF) + del2 = (wint_t)((*s == Meta) ? s[1] ^ 32 : *s); +#endif + if (del == del2) + break; + s += len; + } + } + *lenp = len; return s; } +/* + * Get an integer argument; update *s to the end of the + * final delimiter. *delmatchp is set to 1 if we have matching + * delimiters and there was no error in the evaluation, else 0. + */ + /**/ static int -get_intarg(char **s) +get_intarg(char **s, int *delmatchp) { - char *t = get_strarg(*s + 1); + int arglen; + char *t = get_strarg(*s, &arglen); char *p, sav; zlong ret; + *delmatchp = 0; if (!*t) return -1; sav = *t; *t = '\0'; - p = dupstring(*s + 2); - *s = t; + p = dupstring(*s + arglen); + *s = t + arglen; *t = sav; if (parsestr(p)) return -1; @@ -1204,6 +1255,7 @@ return -1; if (ret < 0) ret = -ret; + *delmatchp = 1; return ret < 0 ? -ret : ret; } @@ -1540,8 +1592,8 @@ int escapes = 0; int klen; #define UNTOK(C) (itok(C) ? 
ztokens[(C) - Pound] : (C)) -#define UNTOK_AND_ESCAPE(X) {\ - untokenize(X = dupstring(s + 1));\ +#define UNTOK_AND_ESCAPE(X, S) {\ + untokenize(X = dupstring(S));\ if (escapes) {\ X = getkeystring(X, &klen, GETKEYS_SEP, NULL);\ X = metafy(X, klen, META_HREALLOC);\ @@ -1549,6 +1601,9 @@ } for (s++; (c = *s) != ')' && c != Outpar; s++, tt = 0) { + int arglen; /* length of modifier argument */ + int delmatch; /* integer delimiters matched OK */ + switch (c) { case ')': case Outpar: @@ -1578,9 +1633,11 @@ flags |= SUB_SUBSTR; break; case 'I': - flnum = get_intarg(&s); + s++; + flnum = get_intarg(&s, &delmatch); if (flnum < 0) goto flagerr; + s--; break; case 'L': @@ -1658,16 +1715,16 @@ tt = 1; /* fall through */ case 'j': - t = get_strarg(++s); + t = get_strarg(++s, &arglen); if (*t) { sav = *t; *t = '\0'; if (tt) - UNTOK_AND_ESCAPE(spsep) + UNTOK_AND_ESCAPE(spsep, s + arglen) else - UNTOK_AND_ESCAPE(sep) + UNTOK_AND_ESCAPE(sep, s + arglen) *t = sav; - s = t; + s = t + arglen - 1; } else goto flagerr; break; @@ -1676,43 +1733,43 @@ tt = 1; /* fall through */ case 'r': - sav = s[1]; - num = get_intarg(&s); + s++; + num = get_intarg(&s, &delmatch); if (num < 0) goto flagerr; if (tt) prenum = num; else postnum = num; - if (UNTOK(s[1]) != UNTOK(sav)) + if (!delmatch) break; - t = get_strarg(++s); + t = get_strarg(s, &arglen); if (!*t) goto flagerr; sav = *t; *t = '\0'; if (tt) - UNTOK_AND_ESCAPE(premul) + UNTOK_AND_ESCAPE(premul, s + arglen) else - UNTOK_AND_ESCAPE(postmul) + UNTOK_AND_ESCAPE(postmul, s + arglen) *t = sav; sav = *s; - s = t + 1; + s = t + arglen; if (UNTOK(*s) != UNTOK(sav)) { s--; break; } - t = get_strarg(s); + t = get_strarg(s, &arglen); if (!*t) goto flagerr; sav = *t; *t = '\0'; if (tt) - UNTOK_AND_ESCAPE(preone) + UNTOK_AND_ESCAPE(preone, s + arglen) else - UNTOK_AND_ESCAPE(postone) + UNTOK_AND_ESCAPE(postone, s + arglen) *t = sav; - s = t; + s = t + arglen - 1; break; case 'm': @@ -3251,9 +3308,10 @@ void modify(char **str, char **ptr) { - char 
*ptr1, *ptr2, *ptr3, del, *lptr, c, *test, *sep, *t, *tt, tc, *e; - char *copy, *all, *tmp, sav; - int gbal, wall, rec, al, nl; + char *ptr1, *ptr2, *ptr3, *lptr, c, *test, *sep, *t, *tt, tc, *e; + char *copy, *all, *tmp, sav, sav1, *ptr1end; + int gbal, wall, rec, al, nl, charlen, delmatch; + convchar_t del; test = NULL; @@ -3282,20 +3340,48 @@ break; case 's': - /* TODO: multibyte delimiter */ c = **ptr; (*ptr)++; ptr1 = *ptr; - del = *ptr1++; - for (ptr2 = ptr1; *ptr2 != del && *ptr2; ptr2++); + MB_METACHARINIT(); + charlen = MB_METACHARLENCONV(ptr1, &del); +#ifdef MULTIBYTE_SUPPORT + if (del == WEOF) + del = (wint_t)((*ptr1 == Meta) ? ptr1[1] ^ 32 : *ptr1); +#endif + ptr1 += charlen; + for (ptr2 = ptr1, charlen = 0; *ptr2; ptr2 += charlen) { + convchar_t del2; + charlen = MB_METACHARLENCONV(ptr2, &del2); +#ifdef MULTIBYTE_SUPPORT + if (del2 == WEOF) + del2 = (wint_t)((*ptr2 == Meta) ? + ptr2[1] ^ 32 : *ptr2); +#endif + if (del2 == del) + break; + } if (!*ptr2) { zerr("bad substitution"); return; } - *ptr2++ = '\0'; - for (ptr3 = ptr2; *ptr3 != del && *ptr3; ptr3++); - if ((sav = *ptr3)) - *ptr3++ = '\0'; + ptr1end = ptr2; + ptr2 += charlen; + sav1 = *ptr1end; + *ptr1end = '\0'; + for (ptr3 = ptr2, charlen = 0; *ptr3; ptr3 += charlen) { + convchar_t del3; + charlen = MB_METACHARLENCONV(ptr3, &del3); +#ifdef MULTIBYTE_SUPPORT + if (del3 == WEOF) + del3 = (wint_t)((*ptr3 == Meta) ? 
+ ptr3[1] ^ 32 : *ptr3); +#endif + if (del3 == del) + break; + } + sav = *ptr3; + *ptr3 = '\0'; if (*ptr1) { zsfree(hsubl); hsubl = ztrdup(ptr1); @@ -3313,10 +3399,9 @@ for (tt = hsubr = ztrdup(ptr2); *tt; tt++) if (inull(*tt) && *tt != Bnullkeep) chuck(tt--); - ptr2[-1] = del; - if (sav) - ptr3[-1] = sav; - *ptr = ptr3 - 1; + *ptr1end = sav1; + *ptr3 = sav; + *ptr = ptr3 + charlen - 1; break; case '&': @@ -3335,13 +3420,13 @@ case 'W': wall = 1; (*ptr)++; - ptr1 = get_strarg(ptr2 = *ptr); + ptr1 = get_strarg(ptr2 = *ptr, &charlen); if ((sav = *ptr1)) *ptr1 = '\0'; - sep = dupstring(ptr2 + 1); + sep = dupstring(ptr2 + charlen); if (sav) *ptr1 = sav; - *ptr = ptr1 + 1; + *ptr = ptr1 + charlen; c = '\0'; break; @@ -3350,8 +3435,8 @@ (*ptr)++; break; case 'F': - rec = get_intarg(ptr); (*ptr)++; + rec = get_intarg(ptr, &delmatch); break; default: *ptr = lptr; Index: Test/D04parameter.ztst =================================================================== RCS file: /cvsroot/zsh/zsh/Test/D04parameter.ztst,v retrieving revision 1.21 diff -u -r1.21 D04parameter.ztst --- Test/D04parameter.ztst 13 Sep 2006 20:55:30 -0000 1.21 +++ Test/D04parameter.ztst 2 Nov 2006 18:34:01 -0000 @@ -867,3 +867,17 @@ >andsomekept >andsomekept + file=/one/two/three/four + print ${file:fh} + print ${file:F.1.h} + print ${file:F+2+h} + print ${file:F(3)h} + print ${file:F<4>h} + print ${file:F{5}h} +0:Modifiers with repetition +>/ +>/one/two/three +>/one/two +>/one +>/ +>/ Index: Test/D07multibyte.ztst =================================================================== RCS file: /cvsroot/zsh/zsh/Test/D07multibyte.ztst,v retrieving revision 1.10 diff -u -r1.10 D07multibyte.ztst --- Test/D07multibyte.ztst 13 Sep 2006 20:55:30 -0000 1.10 +++ Test/D07multibyte.ztst 2 Nov 2006 18:34:01 -0000 @@ -297,3 +297,17 @@ >«κατέβην ¥«χθὲς»£ ¥¥«εἰς»£ «Πειραιᾶ >ςκατέβην ηςχθὲςΓλ τηςεἰςΓλ ςΠειραιᾶ # er... yeah, that looks right... 
+ + foo=picobarn + print ${foo:s£bar£rod£:s¥rod¥stick¥} +0:Delimiters in modifiers +>picostickn + +# TODO: if we get paired multibyte bracket delimiters to work +# (as Emacs does, the smug so-and-so), the following should change. + foo=bar + print ${(r£5£¥X¥)foo} + print ${(l«10«»Y»£HI£)foo} +0:Delimiters in parameter flags +>barXX +>YYYYYHIbar -- Peter Stephenson Software Engineer CSR PLC, Churchill House, Cambridge Business Park, Cowley Road Cambridge, CB4 0WZ, UK Tel: +44 (0)1223 692070 To access the latest news from CSR copy this link into a web browser: http://www.csr.com/email_sig.php