From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 7269 invoked by alias); 29 Sep 2015 18:43:16 -0000 Mailing-List: contact zsh-workers-help@zsh.org; run by ezmlm Precedence: bulk X-No-Archive: yes List-Id: Zsh Workers List List-Post: List-Help: X-Seq: 36700 Received: (qmail 8697 invoked from network); 29 Sep 2015 18:43:11 -0000 X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on f.primenet.com.au X-Spam-Level: X-Spam-Status: No, score=-1.9 required=5.0 tests=BAYES_00 autolearn=ham autolearn_force=no version=3.4.0 X-Originating-IP: [80.3.228.158] X-Spam: 0 X-Authority: v=2.1 cv=UKUgZ3ry c=1 sm=1 tr=0 a=P+FLVI8RzFchTbbqTxIDRw==:117 a=P+FLVI8RzFchTbbqTxIDRw==:17 a=NLZqzBF-AAAA:8 a=kj9zAlcOel0A:10 a=pYzSvKr_zD14daSHMi0A:9 a=xorPnZPuRDUrQFey:21 a=MdtEcypsxccSomK6:21 a=CjuIK1q_8ugA:10 Date: Tue, 29 Sep 2015 19:37:26 +0100 From: Peter Stephenson To: Peter Stephenson , zsh-workers@zsh.org Subject: Re: Substitution ${...///} slows down when certain UTF character occurs Message-ID: <20150929193726.38235c76@ntlworld.com> In-Reply-To: <20150929094436.32b62692@pwslap01u.europe.root.pri> References: <150926134410.ZM17546@torch.brasslantern.com> <150927091121.ZM25721@torch.brasslantern.com> <20150928095142.385a33eb@pwslap01u.europe.root.pri> <20150928202312.6679b38e@ntlworld.com> <20150929094436.32b62692@pwslap01u.europe.root.pri> X-Mailer: Claws Mail 3.11.1 (GTK+ 2.24.28; x86_64-redhat-linux-gnu) MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit This uses the new interface. I haven't done any testing apart from the normal test suite. There's one change I still want to make, which is to put the remaining allocation onto the heap --- it should be used infrequently enough to make this feasible. However, it's probably worth seeing what the effects are before that. There's a reasonable chance there's still some glitch with metafied characters. 
We could probably do with some more parameter substitution tests involving Mikael's interesting characters, possibly more failure cases, too. pws diff --git a/Src/glob.c b/Src/glob.c index 8bf7352..0594f0a 100644 --- a/Src/glob.c +++ b/Src/glob.c @@ -2450,29 +2450,46 @@ matchpat(char *a, char *b) /* please do not laugh at this code. */ /* Having found a match in getmatch, decide what part of string - * to return. The matched part starts b characters into string s - * and finishes e characters in: 0 <= b <= e <= strlen(s) + * to return. The matched part starts b characters into string imd->ustr + * and finishes e characters in: 0 <= b <= e <= imd->ulen on input * (yes, empty matches should work). - * fl is a set of the SUB_* matches defined in zsh.h from SUB_MATCH onwards; - * the lower parts are ignored. - * replstr is the replacement string for a substitution + * + * imd->flags is a set of the SUB_* matches defined in zsh.h from + * SUB_MATCH onwards; the lower parts are ignored. + * + * imd->replstr is the replacement string for a substitution + * + * imd->replstr is metafied and the values put in imd->repllist are metafied. */ /**/ static char * -get_match_ret(char *s, int b, int e, int fl, char *replstr, - LinkList repllist) +get_match_ret(Imatchdata imd, int b, int e) { - char buf[80], *r, *p, *rr; - int ll = 0, l = strlen(s), bl = 0, t = 0, i; - + char buf[80], *r, *p, *rr, *replstr = imd->replstr; + int ll = 0, bl = 0, t = 0, add = 0, fl = imd->flags, i; + + /* Account for b and e referring to unmetafied string */ + for (p = imd->ustr; p < imd->ustr + b; p++) + if (imeta(*p)) + add++; + b += add; + for (; p < imd->ustr + e; p++) + if (imeta(*p)) + add++; + e += add; + for (; p < imd->ustr + imd->ulen; p++) + if (imeta(*p)) + add++; + + /* Everything now refers to meatfied lengths. 
*/ if (replstr || (fl & SUB_LIST)) { if (fl & SUB_DOSUBST) { replstr = dupstring(replstr); singsub(&replstr); untokenize(replstr); } - if ((fl & (SUB_GLOBAL|SUB_LIST)) && repllist) { + if ((fl & (SUB_GLOBAL|SUB_LIST)) && imd->repllist) { /* We are replacing the chunk, just add this to the list */ Repldata rd = (Repldata) ((fl & SUB_LIST) ? zalloc(sizeof(*rd)) : zhalloc(sizeof(*rd))); @@ -2480,30 +2497,32 @@ get_match_ret(char *s, int b, int e, int fl, char *replstr, rd->e = e; rd->replstr = replstr; if (fl & SUB_LIST) - zaddlinknode(repllist, rd); + zaddlinknode(imd->repllist, rd); else - addlinknode(repllist, rd); - return s; + addlinknode(imd->repllist, rd); + return imd->mstr; } ll += strlen(replstr); } if (fl & SUB_MATCH) /* matched portion */ ll += 1 + (e - b); if (fl & SUB_REST) /* unmatched portion */ - ll += 1 + (l - (e - b)); + ll += 1 + (imd->mlen - (e - b)); if (fl & SUB_BIND) { /* position of start of matched portion */ - sprintf(buf, "%d ", MB_METASTRLEN2END(s, 0, s+b) + 1); + sprintf(buf, "%d ", MB_METASTRLEN2END(imd->mstr, 0, imd->mstr+b) + 1); ll += (bl = strlen(buf)); } if (fl & SUB_EIND) { /* position of end of matched portion */ - sprintf(buf + bl, "%d ", MB_METASTRLEN2END(s, 0, s+e) + 1); + sprintf(buf + bl, "%d ", + MB_METASTRLEN2END(imd->mstr, 0, imd->mstr+e) + 1); ll += (bl = strlen(buf)); } if (fl & SUB_LEN) { /* length of matched portion */ - sprintf(buf + bl, "%d ", MB_METASTRLEN2END(s+b, 0, s+e)); + sprintf(buf + bl, "%d ", MB_METASTRLEN2END(imd->mstr+b, 0, + imd->mstr+e)); ll += (bl = strlen(buf)); } if (bl) @@ -2513,7 +2532,7 @@ get_match_ret(char *s, int b, int e, int fl, char *replstr, if (fl & SUB_MATCH) { /* copy matched portion to new buffer */ - for (i = b, p = s + b; i < e; i++) + for (i = b, p = imd->mstr + b; i < e; i++) *rr++ = *p++; t = 1; } @@ -2523,12 +2542,12 @@ get_match_ret(char *s, int b, int e, int fl, char *replstr, if (t) *rr++ = ' '; /* there may be unmatched bits at both beginning and end of string */ - for (i = 0, 
p = s; i < b; i++) + for (i = 0, p = imd->mstr; i < b; i++) *rr++ = *p++; if (replstr) for (p = replstr; *p; ) *rr++ = *p++; - for (i = e, p = s + e; i < l; i++) + for (i = e, p = imd->mstr + e; i < imd->mlen; i++) *rr++ = *p++; t = 1; } @@ -2710,26 +2729,18 @@ set_pat_end(Patprog p, char null_me) /* * Increment *tp over character which may be multibyte. - * Return number of bytes that remain in the character after unmetafication. + * Return number of bytes. + * All unmetafied here. */ /**/ -static int iincchar(char **tp) +static int iincchar(char **tp, int left) { char *t = *tp; - int mbclen = mb_metacharlenconv(t, NULL); - int umlen = 0; - - while (mbclen--) { - umlen++; - if (*t++ == Meta) { - t++; - mbclen--; - } - } - *tp = t; + int mbclen = mb_charlenconv(t, left, NULL); + *tp = t + mbclen; - return umlen; + return mbclen; } /**/ @@ -2737,7 +2748,7 @@ static int igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, LinkList *repllistp) { - char *s = *sp, *t, *tmatch; + char *s = *sp, *t, *tmatch, *send; /* * Note that ioff counts (possibly multibyte) characters in the * character set (Meta's are not included), while l counts characters in @@ -2752,36 +2763,52 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, */ int ioff, l = strlen(*sp), matched = 1, umltot = ztrlen(*sp); int umlen, nmatches; - /* - * List of bits of matches to concatenate with replacement string. - * The data is a struct repldata. It is not used in cases like - * ${...//#foo/bar} even though SUB_GLOBAL is set, since the match - * is anchored. It goes on the heap. 
- */ - LinkList repllist = NULL; + struct patstralloc patstralloc; + struct imatchdata imd; + + (void)patallocstr(p, s, l, umltot, 1, &patstralloc); + s = patstralloc.alloced; + DPUTS(!s, "forced patallocstr failed"); + send = s + umltot; + + imd.mstr = *sp; + imd.mlen = l; + imd.ustr = s; + imd.ulen = umltot; + imd.flags = fl; + imd.replstr = replstr; + imd.repllist = NULL; /* perform must-match test for complex closures */ if (p->mustoff) { - /* - * Yuk. Probably we should rewrite this whole function to - * use an unmetafied test string. - * - * Use META_HEAPDUP because we need a terminating NULL. - */ - char *muststr = metafy((char *)p + p->mustoff, - p->patmlen, META_HEAPDUP); + char *muststr = (char *)p + p->mustoff; - if (!strstr(s, muststr)) - matched = 0; + matched = 0; + if (p->patmlen <= umltot) + { + for (t = s; t <= send - p->patmlen; t++) + { + if (!memcmp(muststr, t, p->patmlen)) { + matched = 1; + break; + } + } + } } /* in case we used the prog before... */ p->flags &= ~(PAT_NOTSTART|PAT_NOTEND); if (fl & SUB_ALL) { - int i = matched && pattrylen(p, s, -1, -1, NULL, 0); - *sp = get_match_ret(*sp, 0, i ? l : 0, fl, i ? replstr : 0, NULL); + int i = matched && pattrylen(p, s, umltot, 0, &patstralloc, 0); + if (!i) { + /* Perform under no-match conditions */ + umltot = 0; + imd.replstr = NULL; + } + *sp = get_match_ret(&imd, 0, umltot); + patfreestr(&patstralloc); if (! **sp && (((fl & SUB_MATCH) && !i) || ((fl & SUB_REST) && i))) return 0; return 1; @@ -2809,25 +2836,27 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, * Largest/smallest possible match at head of string. * First get the longest match... */ - if (pattrylen(p, s, -1, -1, NULL, 0)) { - /* patmatchlen returns metafied length, as we need */ + if (pattrylen(p, s, umltot, 0, &patstralloc, 0)) { + /* patmatchlen returns unmetafied length in this case */ int mlen = patmatchlen(); if (!(fl & SUB_LONG) && !(p->flags & PAT_PURES)) { + send = s + mlen; /* * ... 
now we know whether it's worth looking for the * shortest, which we do by brute force. */ mb_charinit(); - for (t = s, umlen = 0; t < s + mlen; ) { + for (t = s, umlen = 0; t < send; ) { set_pat_end(p, *t); - if (pattrylen(p, s, t - s, umlen, NULL, 0)) { + if (pattrylen(p, s, umlen, 0, &patstralloc, 0)) { mlen = patmatchlen(); break; } - umlen += iincchar(&t); + umlen += iincchar(&t, send - t); } } - *sp = get_match_ret(*sp, 0, mlen, fl, replstr, NULL); + *sp = get_match_ret(&imd, 0, mlen); + patfreestr(&patstralloc); return 1; } break; @@ -2845,20 +2874,23 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, */ mb_charinit(); tmatch = NULL; - for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) { + for (ioff = 0, t = s, umlen = umltot; t < send; ioff++) { set_pat_start(p, t-s); - if (pattrylen(p, t, s + l - t, umlen, NULL, ioff)) + if (pattrylen(p, t, umlen, 0, &patstralloc, ioff)) tmatch = t; if (fl & SUB_START) break; - umlen -= iincchar(&t); + umlen -= iincchar(&t, send - t); } if (tmatch) { - *sp = get_match_ret(*sp, tmatch - s, l, fl, replstr, NULL); + *sp = get_match_ret(&imd, tmatch - s, umltot); + patfreestr(&patstralloc); return 1; } - if (!(fl & SUB_START) && pattrylen(p, s + l, 0, 0, NULL, ioff)) { - *sp = get_match_ret(*sp, l, l, fl, replstr, NULL); + if (!(fl & SUB_START) && pattrylen(p, s + umltot, 0, 0, + &patstralloc, ioff)) { + *sp = get_match_ret(&imd, umltot, umltot); + patfreestr(&patstralloc); return 1; } break; @@ -2868,18 +2900,21 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, * move forward along string until we get a match. * * Again there's no optimisation. 
*/ mb_charinit(); - for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) { + for (ioff = 0, t = s, umlen = umltot; t < send ; ioff++) { set_pat_start(p, t-s); - if (pattrylen(p, t, s + l - t, umlen, NULL, ioff)) { - *sp = get_match_ret(*sp, t-s, l, fl, replstr, NULL); + if (pattrylen(p, t, umlen, 0, &patstralloc, ioff)) { + *sp = get_match_ret(&imd, t-s, umltot); + patfreestr(&patstralloc); return 1; } if (fl & SUB_START) break; - umlen -= iincchar(&t); + umlen -= iincchar(&t, send - t); } - if (!(fl & SUB_START) && pattrylen(p, s + l, 0, 0, NULL, ioff)) { - *sp = get_match_ret(*sp, l, l, fl, replstr, NULL); + if (!(fl & SUB_START) && pattrylen(p, send, 0, 0, + &patstralloc, ioff)) { + *sp = get_match_ret(&imd, umltot, umltot); + patfreestr(&patstralloc); return 1; } break; @@ -2887,18 +2922,20 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, case SUB_SUBSTR: /* Smallest at start, but matching substrings. */ set_pat_start(p, l); - if (!(fl & SUB_GLOBAL) && pattrylen(p, s + l, -1, -1, NULL, 0) && + if (!(fl & SUB_GLOBAL) && + pattrylen(p, send, 0, 0, &patstralloc, 0) && !--n) { - *sp = get_match_ret(*sp, 0, 0, fl, replstr, NULL); + *sp = get_match_ret(&imd, 0, 0); + patfreestr(&patstralloc); return 1; } /* fall through */ case (SUB_SUBSTR|SUB_LONG): /* longest or smallest at start with substrings */ t = s; if (fl & SUB_GLOBAL) { - repllist = (fl & SUB_LIST) ? znewlinklist() : newlinklist(); + imd.repllist = (fl & SUB_LIST) ? znewlinklist() : newlinklist(); if (repllistp) - *repllistp = repllist; + *repllistp = imd.repllist; } ioff = 0; /* offset into string */ umlen = umltot; @@ -2906,10 +2943,10 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, do { /* loop over all matches for global substitution */ matched = 0; - for (; t < s + l; ioff++) { + for (; t < send; ioff++) { /* Find the longest match from this position. 
*/ set_pat_start(p, t-s); - if (pattrylen(p, t, s + l - t, umlen, NULL, ioff)) { + if (pattrylen(p, t, umlen, 0, &patstralloc, ioff)) { char *mpos = t + patmatchlen(); if (!(fl & SUB_LONG) && !(p->flags & PAT_PURES)) { char *ptr; @@ -2923,19 +2960,18 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, */ for (ptr = t, umlen2 = 0; ptr < mpos;) { set_pat_end(p, *ptr); - if (pattrylen(p, t, ptr - t, umlen2, - NULL, ioff)) { + if (pattrylen(p, t, umlen2, 0, + &patstralloc, ioff)) { mpos = t + patmatchlen(); break; } - umlen2 += iincchar(&ptr); + umlen2 += iincchar(&ptr, mpos - ptr); } } if (!--n || (n <= 0 && (fl & SUB_GLOBAL))) { - *sp = get_match_ret(*sp, t-s, mpos-s, fl, - replstr, repllist); + *sp = get_match_ret(&imd, t-s, mpos-s); if (mpos == t) - mpos += mb_metacharlenconv(mpos, NULL); + mpos += mb_charlenconv(mpos, send - mpos, NULL); } if (!(fl & SUB_GLOBAL)) { if (n) { @@ -2945,9 +2981,10 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, * the next character, even if it overlaps * with what we just found. */ - umlen -= iincchar(&t); + umlen -= iincchar(&t, send - t); continue; } else { + patfreestr(&patstralloc); return 1; } } @@ -2958,11 +2995,11 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, matched = 1; while (t < mpos) { ioff++; - umlen -= iincchar(&t); + umlen -= iincchar(&t, send - t); } break; } - umlen -= iincchar(&t); + umlen -= iincchar(&t, send - t); } } while (matched); /* @@ -2972,8 +3009,9 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, */ set_pat_start(p, l); if ((fl & (SUB_LONG|SUB_GLOBAL)) == SUB_LONG && - pattrylen(p, s + l, -1, -1, NULL, 0) && !--n) { - *sp = get_match_ret(*sp, 0, 0, fl, replstr, repllist); + pattrylen(p, send, 0, 0, &patstralloc, 0) && !--n) { + *sp = get_match_ret(&imd, 0, 0); + patfreestr(&patstralloc); return 1; } break; @@ -2983,8 +3021,10 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, /* Longest/shortest at end, matching substrings. 
*/ if (!(fl & SUB_LONG)) { set_pat_start(p, l); - if (pattrylen(p, s + l, 0, 0, NULL, umltot) && !--n) { - *sp = get_match_ret(*sp, l, l, fl, replstr, NULL); + if (pattrylen(p, send, 0, 0, &patstralloc, umltot) && + !--n) { + *sp = get_match_ret(&imd, umltot, umltot); + patfreestr(&patstralloc); return 1; } } @@ -3001,13 +3041,13 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, nmatches = 0; tmatch = NULL; mb_charinit(); - for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) { + for (ioff = 0, t = s, umlen = umltot; t < send; ioff++) { set_pat_start(p, t-s); - if (pattrylen(p, t, s + l - t, umlen, NULL, ioff)) { + if (pattrylen(p, t, umlen, 0, &patstralloc, ioff)) { nmatches++; tmatch = t; } - umlen -= iincchar(&t); + umlen -= iincchar(&t, send - t); } if (nmatches) { char *mpos; @@ -3017,14 +3057,14 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, */ n = nmatches - n; mb_charinit(); - for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) { + for (ioff = 0, t = s, umlen = umltot; t < send; ioff++) { set_pat_start(p, t-s); - if (pattrylen(p, t, s + l - t, umlen, NULL, ioff) && + if (pattrylen(p, t, umlen, 0, &patstralloc, ioff) && !n--) { tmatch = t; break; } - umlen -= iincchar(&t); + umlen -= iincchar(&t, send - t); } } mpos = tmatch + patmatchlen(); @@ -3032,29 +3072,31 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, if (!(fl & SUB_LONG) && !(p->flags & PAT_PURES)) { for (t = tmatch, umlen = 0; t < mpos; ) { set_pat_end(p, *t); - if (pattrylen(p, tmatch, t - tmatch, umlen, - NULL, ioff)) { + if (pattrylen(p, tmatch, umlen, 0, + &patstralloc, ioff)) { mpos = tmatch + patmatchlen(); break; } - umlen += iincchar(&t); + umlen += iincchar(&t, mpos - t); } } - *sp = get_match_ret(*sp, tmatch-s, mpos-s, fl, - replstr, NULL); + *sp = get_match_ret(&imd, tmatch-s, mpos-s); + patfreestr(&patstralloc); return 1; } set_pat_start(p, l); - if ((fl & SUB_LONG) && pattrylen(p, s + l, 0, 0, NULL, umltot) && + if ((fl & 
SUB_LONG) && pattrylen(p, send, 0, 0, + &patstralloc, umltot) && !--n) { - *sp = get_match_ret(*sp, l, l, fl, replstr, NULL); + *sp = get_match_ret(&imd, umltot, umltot); + patfreestr(&patstralloc); return 1; } break; } } - if (repllist && nonempty(repllist)) { + if (imd.repllist && nonempty(imd.repllist)) { /* Put all the bits of a global search and replace together. */ LinkNode nd; Repldata rd; @@ -3062,10 +3104,15 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, char *ptr, *start; int i; + /* + * Use metafied string again. + * Results from get_match_ret in repllist are all metafied. + */ + s = *sp; if (!(fl & SUB_LIST)) { lleft = 0; /* size of returned string */ - i = 0; /* start of last chunk we got from *sp */ - for (nd = firstnode(repllist); nd; incnode(nd)) { + i = 0; /* start of last chunk we got from *sp */ + for (nd = firstnode(imd.repllist); nd; incnode(nd)) { rd = (Repldata) getdata(nd); lleft += rd->b - i; /* previous chunk of *sp */ lleft += strlen(rd->replstr); /* the replaced bit */ @@ -3074,7 +3121,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, lleft += l - i; /* final chunk from *sp */ start = t = zhalloc(lleft+1); i = 0; - for (nd = firstnode(repllist); nd; incnode(nd)) { + for (nd = firstnode(imd.repllist); nd; incnode(nd)) { rd = (Repldata) getdata(nd); memcpy(t, s + i, rd->b - i); t += rd->b - i; @@ -3087,13 +3134,19 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, start[lleft] = '\0'; *sp = (char *)start; } + patfreestr(&patstralloc); return 1; } - if (fl & SUB_LIST) /* safety: don't think this can happen */ + if (fl & SUB_LIST) { /* safety: don't think this can happen */ + patfreestr(&patstralloc); return 0; + } /* munge the whole string: no match, so no replstr */ - *sp = get_match_ret(*sp, 0, 0, fl, 0, 0); + imd.replstr = NULL; + imd.repllist = NULL; + *sp = get_match_ret(&imd, 0, 0); + patfreestr(&patstralloc); return (fl & SUB_RETFAIL) ? 
0 : 1; } @@ -3111,7 +3164,7 @@ static int igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, LinkList *repllistp) { - char *s = *sp, *t; + char *s = *sp, *t, *send; /* * Note that ioff and uml count characters in the character * set (Meta's are not included), while l counts characters in the @@ -3119,36 +3172,48 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, * lengths. */ int ioff, l = strlen(*sp), uml = ztrlen(*sp), matched = 1, umlen; - /* - * List of bits of matches to concatenate with replacement string. - * The data is a struct repldata. It is not used in cases like - * ${...//#foo/bar} even though SUB_GLOBAL is set, since the match - * is anchored. It goes on the heap. - */ - LinkList repllist = NULL; + struct patstralloc patstralloc; + struct imatchdata imd; + + (void)patallocstr(p, s, l, uml, 1, &patstralloc); + s = patstralloc.alloced; + DPUTS(!s, "forced patallocstr failed"); + send = s + uml; + + imd.mstr = *sp; + imd.mlen = l; + imd.ustr = s; + imd.ulen = uml; + imd.flags = fl; + imd.replstr = replstr; + imd.repllist = NULL; /* perform must-match test for complex closures */ if (p->mustoff) { - /* - * Yuk. Probably we should rewrite this whole function to - * use an unmetafied test string. - * - * Use META_HEAPDUP because we need a terminating NULL. - */ - char *muststr = metafy((char *)p + p->mustoff, - p->patmlen, META_HEAPDUP); + char *muststr = (char *)p + p->mustoff; - if (!strstr(s, muststr)) - matched = 0; + matched = 0; + if (p->patmlen <= uml) + { + for (t = s; t <= send - p->patmlen; t++) + { + if (!memcmp(muststr, t, p->patmlen)) { + matched = 1; + break; + } + } + } } /* in case we used the prog before... */ p->flags &= ~(PAT_NOTSTART|PAT_NOTEND); if (fl & SUB_ALL) { - int i = matched && pattry(p, s); - *sp = get_match_ret(*sp, 0, i ? l : 0, fl, i ? replstr : 0, NULL); + int i = matched && pattrylen(p, s, uml, 0, &patstralloc, 0); + if (!i) + imd.replstr = NULL; + *sp = get_match_ret(&imd, 0, i ? l : 0); if (! 
**sp && (((fl & SUB_MATCH) && !i) || ((fl & SUB_REST) && i))) return 0; return 1; @@ -3161,23 +3226,25 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, * Largest/smallest possible match at head of string. * First get the longest match... */ - if (pattry(p, s)) { + if (pattrylen(p, s, uml, 0, &patstralloc, 0)) { /* patmatchlen returns metafied length, as we need */ int mlen = patmatchlen(); if (!(fl & SUB_LONG) && !(p->flags & PAT_PURES)) { + send = s + mlen; /* * ... now we know whether it's worth looking for the * shortest, which we do by brute force. */ for (t = s, umlen = 0; t < s + mlen; METAINC(t), umlen++) { set_pat_end(p, *t); - if (pattrylen(p, s, t - s, umlen, NULL, 0)) { + if (pattrylen(p, s, umlen, 0, &patstralloc, 0)) { mlen = patmatchlen(); break; } } } - *sp = get_match_ret(*sp, 0, mlen, fl, replstr, NULL); + *sp = get_match_ret(&imd, 0, mlen); + patfreestr(&patstralloc); return 1; } break; @@ -3186,17 +3253,13 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, /* Smallest possible match at tail of string: * * move back down string until we get a match. * * There's no optimization here. */ - for (ioff = uml, t = s + l, umlen = 0; t >= s; + for (ioff = uml, t = send, umlen = 0; t >= s; t--, ioff--, umlen++) { - if (t > s && t[-1] == Meta) - t--; set_pat_start(p, t-s); - if (pattrylen(p, t, s + l - t, umlen, NULL, ioff)) { - *sp = get_match_ret(*sp, t - s, l, fl, replstr, NULL); + if (pattrylen(p, t, umlen, 0, &patstralloc, ioff)) { + *sp = get_match_ret(&imd, t - s, uml); return 1; } - if (t > s+1 && t[-2] == Meta) - t--; } break; @@ -3204,61 +3267,59 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, /* Largest possible match at tail of string: * * move forward along string until we get a match. * * Again there's no optimisation. 
*/ - for (ioff = 0, t = s, umlen = uml; t < s + l; - ioff++, METAINC(t), umlen--) { + for (ioff = 0, t = s, umlen = uml; t < send; + ioff++, t++, umlen--) { set_pat_start(p, t-s); - if (pattrylen(p, t, s + l - t, umlen, NULL, ioff)) { - *sp = get_match_ret(*sp, t-s, l, fl, replstr, NULL); + if (pattrylen(p, t, send - t, umlen, &patstralloc, ioff)) { + *sp = get_match_ret(&imd, t-s, uml); return 1; } - if (*t == Meta) - t++; } break; case SUB_SUBSTR: /* Smallest at start, but matching substrings. */ set_pat_start(p, l); - if (!(fl & SUB_GLOBAL) && pattry(p, s + l) && !--n) { - *sp = get_match_ret(*sp, 0, 0, fl, replstr, NULL); + if (!(fl & SUB_GLOBAL) && + pattrylen(p, send, 0, 0, &patstralloc, 0) && !--n) { + *sp = get_match_ret(&imd, 0, 0); return 1; } /* fall through */ case (SUB_SUBSTR|SUB_LONG): /* longest or smallest at start with substrings */ t = s; if (fl & SUB_GLOBAL) { - repllist = newlinklist(); + imd.repllist = newlinklist(); if (repllistp) - *repllistp = repllist; + *repllistp = imd.repllist; } ioff = 0; /* offset into string */ umlen = uml; do { /* loop over all matches for global substitution */ matched = 0; - for (; t < s + l; METAINC(t), ioff++, umlen--) { + for (; t < send; t++, ioff++, umlen--) { /* Find the longest match from this position. 
*/ set_pat_start(p, t-s); - if (pattrylen(p, t, s + l - t, umlen, NULL, ioff)) { + if (pattrylen(p, t, send - t, umlen, &patstralloc, ioff)) { char *mpos = t + patmatchlen(); if (!(fl & SUB_LONG) && !(p->flags & PAT_PURES)) { char *ptr; int umlen2; for (ptr = t, umlen2 = 0; ptr < mpos; - METAINC(ptr), umlen2++) { + ptr++, umlen2++) { set_pat_end(p, *ptr); if (pattrylen(p, t, ptr - t, umlen2, - NULL, ioff)) { + &patstralloc, ioff)) { mpos = t + patmatchlen(); break; } } } if (!--n || (n <= 0 && (fl & SUB_GLOBAL))) { - *sp = get_match_ret(*sp, t-s, mpos-s, fl, - replstr, repllist); + *sp = get_match_ret(&imd, t-s, mpos-s); if (mpos == t) - METAINC(mpos); + mpos++; } if (!(fl & SUB_GLOBAL)) { if (n) { @@ -3278,13 +3339,13 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, * which is already marked for replacement. */ matched = 1; - for ( ; t < mpos; t++, ioff++, umlen--) - if (*t == Meta) - t++; + while (t < mpos) { + ioff++; + umlen--; + t++; + } break; } - if (*t == Meta) - t++; } } while (matched); /* @@ -3294,8 +3355,9 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, */ set_pat_start(p, l); if ((fl & (SUB_LONG|SUB_GLOBAL)) == SUB_LONG && - pattry(p, s + l) && !--n) { - *sp = get_match_ret(*sp, 0, 0, fl, replstr, repllist); + pattrylen(p, send, 0, 0, &patstralloc, 0) && !--n) { + *sp = get_match_ret(&imd, 0, 0); + patfreestr(&patstralloc); return 1; } break; @@ -3305,47 +3367,50 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, /* Longest/shortest at end, matching substrings. 
*/ if (!(fl & SUB_LONG)) { set_pat_start(p, l); - if (pattrylen(p, s + l, 0, 0, NULL, uml) && !--n) { - *sp = get_match_ret(*sp, l, l, fl, replstr, NULL); + if (pattrylen(p, send, 0, 0, &patstralloc, uml) && !--n) { + *sp = get_match_ret(&imd, uml, uml); + patfreestr(&patstralloc); return 1; } } - for (ioff = uml - 1, t = s + l - 1, umlen = 1; t >= s; + for (ioff = uml - 1, t = send - 1, umlen = 1; t >= s; t--, ioff--, umlen++) { - if (t > s && t[-1] == Meta) - t--; set_pat_start(p, t-s); - if (pattrylen(p, t, s + l - t, umlen, NULL, ioff) && !--n) { + if (pattrylen(p, t, send - t, umlen, &patstralloc, ioff) && + !--n) { /* Found the longest match */ char *mpos = t + patmatchlen(); if (!(fl & SUB_LONG) && !(p->flags & PAT_PURES)) { char *ptr; int umlen2; for (ptr = t, umlen2 = 0; ptr < mpos; - METAINC(ptr), umlen2++) { + ptr++, umlen2++) { set_pat_end(p, *ptr); - if (pattrylen(p, t, ptr - t, umlen2, NULL, ioff)) { + if (pattrylen(p, t, umlen2, 0, &patstralloc, + ioff)) { mpos = t + patmatchlen(); break; } } } - *sp = get_match_ret(*sp, t-s, mpos-s, fl, - replstr, NULL); + *sp = get_match_ret(&imd, t-s, mpos-s); + patfreestr(&patstralloc); return 1; } } set_pat_start(p, l); - if ((fl & SUB_LONG) && pattrylen(p, s + l, 0, 0, NULL, uml) && + if ((fl & SUB_LONG) && pattrylen(p, send, 0, 0, + &patstralloc, uml) && !--n) { - *sp = get_match_ret(*sp, l, l, fl, replstr, NULL); + *sp = get_match_ret(&imd, uml, uml); + patfreestr(&patstralloc); return 1; } break; } } - if (repllist && nonempty(repllist)) { + if (imd.repllist && nonempty(imd.repllist)) { /* Put all the bits of a global search and replace together. */ LinkNode nd; Repldata rd; @@ -3353,8 +3418,13 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, char *ptr, *start; int i; + /* + * Use metafied string again. + * Results from get_match_ret in repllist are all metafied. 
+ */ + s = *sp; i = 0; /* start of last chunk we got from *sp */ - for (nd = firstnode(repllist); nd; incnode(nd)) { + for (nd = firstnode(imd.repllist); nd; incnode(nd)) { rd = (Repldata) getdata(nd); lleft += rd->b - i; /* previous chunk of *sp */ lleft += strlen(rd->replstr); /* the replaced bit */ @@ -3363,7 +3433,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, lleft += l - i; /* final chunk from *sp */ start = t = zhalloc(lleft+1); i = 0; - for (nd = firstnode(repllist); nd; incnode(nd)) { + for (nd = firstnode(imd.repllist); nd; incnode(nd)) { rd = (Repldata) getdata(nd); memcpy(t, s + i, rd->b - i); t += rd->b - i; @@ -3375,11 +3445,15 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, memcpy(t, s + i, l - i); start[lleft] = '\0'; *sp = (char *)start; + patfreestr(&patstralloc); return 1; } /* munge the whole string: no match, so no replstr */ - *sp = get_match_ret(*sp, 0, 0, fl, 0, 0); + imd.replstr = NULL; + imd.repllist = NULL; + *sp = get_match_ret(&imd, 0, 0); + patfreestr(&patstralloc); return 1; } diff --git a/Src/pattern.c b/Src/pattern.c index 03ba37d..8de372c 100644 --- a/Src/pattern.c +++ b/Src/pattern.c @@ -2204,7 +2204,10 @@ pattrylen(Patprog prog, char *string, int len, int unmetalen, * the pattern module) at which we are trying to match. * This is added in to the positions recorded in patbeginp and patendp * when we are looking for substrings. Currently this only happens - * in the parameter substitution code. + * in the parameter substitution code. It refers to a real character + * offset, i.e. is already in the form ready for presentation to the + * general public --- this is necessary as we don't have the + * information to convert it down here. * * Note this is a character offset, i.e. a single possibly metafied and * possibly multibyte character counts as 1. 
@@ -2292,7 +2295,8 @@ pattryrefs(Patprog prog, char *string, int stringlen, int unmetalenin, */ if (!patstralloc->progstrunmeta) { - patstralloc->progstrunmeta = dupstring(progstr); + patstralloc->progstrunmeta = + dupstrpfx(progstr, (int)prog->patmlen); unmetafy(patstralloc->progstrunmeta, &patstralloc->progstrunmetalen); } @@ -2346,7 +2350,7 @@ pattryrefs(Patprog prog, char *string, int stringlen, int unmetalenin, * In the orignal structure, but it might be unmetafied * for use with an unmetafied test string. */ - patinlen = (int)prog->patmlen; + patinlen = pstrlen; /* if matching files, must update globbing flags */ patglobflags = prog->globend; @@ -2360,7 +2364,7 @@ pattryrefs(Patprog prog, char *string, int stringlen, int unmetalenin, * Unmetafied: pstrlen contains unmetafied * length in bytes. */ - str = metafy(patinstart, pstrlen, META_ALLOC); + str = metafy(patinstart, pstrlen, META_DUP); mlen = CHARSUB(patinstart, patinstart + pstrlen); } else { str = ztrduppfx(patinstart, patinlen); @@ -2454,8 +2458,8 @@ pattryrefs(Patprog prog, char *string, int stringlen, int unmetalenin, /* * Optimization: if we didn't find any Meta characters * to begin with, we don't need to look for them now. - * Only do this if we did the unmetfication internally, - * since otherwise it's too hard to work out. + * + * For patstralloc pased in, we want the unmetafied length. */ if (patstralloc == &patstralloc_struct && patstralloc->unmetalen != origlen) { @@ -2588,7 +2592,9 @@ pattryrefs(Patprog prog, char *string, int stringlen, int unmetalenin, /* * Return length of previous succesful match. This is - * in metafied bytes, i.e. includes a count of Meta characters. + * in metafied bytes, i.e. includes a count of Meta characters, + * unless the match was done on an unmetafied string using + * a patstralloc stuct, in which case it, too is unmetafed. * Unusual and futile attempt at modular encapsulation. 
*/ diff --git a/Src/zsh.h b/Src/zsh.h index 32f2e0c..15fa5e4 100644 --- a/Src/zsh.h +++ b/Src/zsh.h @@ -480,6 +480,7 @@ typedef struct heap *Heap; typedef struct heapstack *Heapstack; typedef struct histent *Histent; typedef struct hookdef *Hookdef; +typedef struct imatchdata *Imatchdata; typedef struct jobfile *Jobfile; typedef struct job *Job; typedef struct linkedmod *Linkedmod; @@ -1593,6 +1594,31 @@ typedef struct zpc_disables_save *Zpc_disables_save; /* Range: token followed by the (possibly multibyte) start and end */ #define PP_RANGE 21 +/* + * Argument to get_match_ret() in glob.c + */ +struct imatchdata { + /* Metafied trial string */ + char *mstr; + /* Its length */ + int mlen; + /* Unmetafied string */ + char *ustr; + /* Its length */ + int ulen; + /* Flags (SUB_*) */ + int flags; + /* Replacement string (metafied) */ + char *replstr; + /* + * List of bits of matches to concatenate with replacement string. + * The data is a struct repldata. It is not used in cases like + * ${...//#foo/bar} even though SUB_GLOBAL is set, since the match + * is anchored. It goes on the heap. + */ + LinkList repllist; +}; + /* Globbing flags: lower 8 bits gives approx count */ #define GF_LCMATCHUC 0x0100 #define GF_IGNCASE 0x0200