From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 2627 invoked by alias); 21 Oct 2011 10:11:56 -0000 Mailing-List: contact zsh-workers-help@zsh.org; run by ezmlm Precedence: bulk X-No-Archive: yes List-Id: Zsh Workers List List-Post: List-Help: X-Seq: 29837 Received: (qmail 2187 invoked from network); 21 Oct 2011 10:11:53 -0000 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on f.primenet.com.au X-Spam-Level: X-Spam-Status: No, score=-4.3 required=5.0 tests=BAYES_00,DKIM_SIGNED, DKIM_VALID,DKIM_VALID_AU,RCVD_IN_DNSWL_MED autolearn=ham version=3.3.1 Received-SPF: none (ns1.primenet.com.au: domain at spodhuis.org does not designate permitted sender hosts) DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=spodhuis.org; s=d201107; h=In-Reply-To:Content-Transfer-Encoding:Content-Type:MIME-Version:References:Message-ID:Subject:Cc:To:From:Date; bh=rxa8zRPmyj6XBpnaj44Ll241TM0pKlQpgiGmpgofPuI=; b=jyVp36jLlzR14g5fGG3r7iPi27GCuNVhPfS8dy1cvI2YgFi2Xuy8AJ0lnFSMAUXuE0FNISpO8UBI3MDqGY8W2XrmGPb2NRlE/u2d0tsjG4k7T6HUQjOLXAhX+VaFSSQLPim9VdFSPol5qNeOpRA0Tj11INGIhRXuS8gwvXXSgLQ=; Date: Fri, 21 Oct 2011 05:56:25 -0400 From: Phil Pennock To: Peter Stephenson Cc: zsh-workers@zsh.org Subject: [patch] Re: UTF-8 and PCRE and metafy Message-ID: <20111021095624.GA23272@redoubt.spodhuis.org> Mail-Followup-To: Peter Stephenson , zsh-workers@zsh.org References: <20110308065216.GB79682@redoubt.spodhuis.org> <20110308095850.12843492@pwslap01u.europe.root.pri> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="lrZ03NoBR/3+SXJZ" Content-Disposition: inline Content-Transfer-Encoding: 8bit In-Reply-To: <20110308095850.12843492@pwslap01u.europe.root.pri> --lrZ03NoBR/3+SXJZ Content-Type: text/plain; charset=utf-8 Content-Disposition: inline Content-Transfer-Encoding: 8bit On 2011-03-08 at 09:58 +0000, Peter Stephenson wrote: > On Tue, 8 Mar 2011 01:52:16 -0500 > Phil Pennock wrote: > > I'm guessing I need a bunch of calls to metafy() to process the > > 
results of extraction in zpcre_get_substrings() ? > > You'll need to unmetafy any string getting passed into > pcre_get_substring_list() and metafy() the resulting captures coming > out. You should duplicate any string that needs unmetafying, since > otherwise it's in place and you may need the metafied form later (you do > for the string passed in as the first argument). Okay, it took me far too long to get back around to this, sorry. :( Attached is what looks to me to be a correct patch. With bash_rematch set, I can do: % [[ 'foo→bar' =~ .([^[:ascii:]]). ]] % echo $BASH_REMATCH o→b → % [[ 'foo→bar' =~ .(→.). ]] % echo $BASH_REMATCH o→ba →b I'm not sure when I should be using the wcs_strdup() functions and the like; what I've got appears to work. None of what I've added appears to be specific to UTF-8. Is it reasonable to add tests to D07multibyte.ztst for this, with the zsh/pcre dependency? Can anyone spot any cases I've missed in zsh/pcre? Does anyone know of a system extended regexp library which supports multibyte characters? I think I should be making the same changes to zsh/regex but am not sure where to actually test those changes. 
Regards, -Phil --lrZ03NoBR/3+SXJZ Content-Type: text/x-diff; charset=us-ascii Content-Disposition: inline; filename="pcre-utf8.patch" Index: Src/Modules/pcre.c =================================================================== RCS file: /home/cvsroot/zsh/Src/Modules/pcre.c,v retrieving revision 1.18 diff -a -u -p -r1.18 pcre.c --- Src/Modules/pcre.c 20 Jan 2010 11:17:11 -0000 1.18 +++ Src/Modules/pcre.c 21 Oct 2011 09:43:29 -0000 @@ -77,6 +77,7 @@ bin_pcre_compile(char *nam, char **args, { int pcre_opts = 0, pcre_errptr; const char *pcre_error; + char *target; if(OPT_ISSET(ops,'a')) pcre_opts |= PCRE_ANCHORED; if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS; @@ -92,8 +93,13 @@ bin_pcre_compile(char *nam, char **args, if (pcre_pattern) pcre_free(pcre_pattern); - pcre_pattern = pcre_compile(*args, pcre_opts, &pcre_error, &pcre_errptr, NULL); + target = ztrdup(*args); + unmetafy(target, NULL); + + pcre_pattern = pcre_compile(target, pcre_opts, &pcre_error, &pcre_errptr, NULL); + free(target); + if (pcre_pattern == NULL) { zwarnnam(nam, "error in regex: %s", pcre_error); @@ -161,7 +167,7 @@ zpcre_get_substrings(char *arg, int *ove sprintf(offset_all, "%d %d", ovec[0], ovec[1]); setsparam("ZPCRE_OP", ztrdup(offset_all)); } - match_all = ztrdup(captures[0]); + match_all = metafy(captures[0], -1, META_DUP); setsparam(matchvar, match_all); /* * If we're setting match, mbegin, mend we only do @@ -169,7 +175,15 @@ zpcre_get_substrings(char *arg, int *ove * (c.f. regex.c). 
*/ if (!want_begin_end || nelem) { - matches = zarrdup(&captures[capture_start]); + char **x, **y; + y = &captures[capture_start]; + matches = x = (char **) zalloc(sizeof(char *) * (arrlen(y) + 1)); + do { + if (*y) + *x++ = metafy(*y, -1, META_DUP); + else + *x++ = NULL; + } while (*y++); setaparam(substravar, matches); } @@ -255,6 +269,7 @@ bin_pcre_match(char *nam, char **args, O { int ret, capcount, *ovec, ovecsize, c; char *matched_portion = NULL; + char *plaintext = NULL; char *receptacle = NULL; int return_value = 1; /* The subject length and offset start are both int values in pcre_exec */ @@ -292,22 +307,23 @@ bin_pcre_match(char *nam, char **args, O ovecsize = (capcount+1)*3; ovec = zalloc(ovecsize*sizeof(int)); - subject_len = (int)strlen(*args); + plaintext = ztrdup(*args); + subject_len = (int)strlen(plaintext); if (offset_start < 0 || offset_start >= subject_len) ret = PCRE_ERROR_NOMATCH; else - ret = pcre_exec(pcre_pattern, pcre_hints, *args, subject_len, offset_start, 0, ovec, ovecsize); + ret = pcre_exec(pcre_pattern, pcre_hints, plaintext, subject_len, offset_start, 0, ovec, ovecsize); if (ret==0) return_value = 0; else if (ret==PCRE_ERROR_NOMATCH) /* no match */; else if (ret>0) { - zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, + zpcre_get_substrings(plaintext, ovec, ret, matched_portion, receptacle, want_offset_pair, 0, 0); return_value = 0; } else { - zwarnnam(nam, "error in pcre_exec"); + zwarnnam(nam, "error in pcre_exec [%d]", ret); } if (ovec) @@ -322,7 +338,8 @@ cond_pcre_match(char **a, int id) { pcre *pcre_pat; const char *pcre_err; - char *lhstr, *rhre, *avar=NULL; + char *lhstr, *rhre, *lhstr_plain, *rhre_plain, *avar=NULL; + char *p; int r = 0, pcre_opts = 0, pcre_errptr, capcnt, *ov, ovsize; int return_value = 0; @@ -331,6 +348,10 @@ cond_pcre_match(char **a, int id) lhstr = cond_str(a,0,0); rhre = cond_str(a,1,0); + lhstr_plain = ztrdup(lhstr); + rhre_plain = ztrdup(rhre); + unmetafy(lhstr_plain, NULL); + 
unmetafy(rhre_plain, NULL); pcre_pat = NULL; ov = NULL; @@ -339,7 +360,7 @@ cond_pcre_match(char **a, int id) switch(id) { case CPCRE_PLAIN: - pcre_pat = pcre_compile(rhre, pcre_opts, &pcre_err, &pcre_errptr, NULL); + pcre_pat = pcre_compile(rhre_plain, pcre_opts, &pcre_err, &pcre_errptr, NULL); if (pcre_pat == NULL) { zwarn("failed to compile regexp /%s/: %s", rhre, pcre_err); break; @@ -347,7 +368,7 @@ cond_pcre_match(char **a, int id) pcre_fullinfo(pcre_pat, NULL, PCRE_INFO_CAPTURECOUNT, &capcnt); ovsize = (capcnt+1)*3; ov = zalloc(ovsize*sizeof(int)); - r = pcre_exec(pcre_pat, NULL, lhstr, strlen(lhstr), 0, 0, ov, ovsize); + r = pcre_exec(pcre_pat, NULL, lhstr_plain, strlen(lhstr_plain), 0, 0, ov, ovsize); /* r < 0 => error; r==0 match but not enough size in ov * r > 0 => (r-1) substrings found; r==1 => no substrings */ @@ -356,13 +377,16 @@ cond_pcre_match(char **a, int id) return_value = 1; break; } - else if (r==PCRE_ERROR_NOMATCH) return 0; /* no match */ + else if (r==PCRE_ERROR_NOMATCH) { + return_value = 0; /* no match */ + break; + } else if (r<0) { - zwarn("pcre_exec() error: %d", r); + zwarn("pcre_exec() error [%d]", r); break; } else if (r>0) { - zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0, + zpcre_get_substrings(lhstr_plain, ov, r, NULL, avar, 0, isset(BASHREMATCH), !isset(BASHREMATCH)); return_value = 1; @@ -371,6 +395,10 @@ cond_pcre_match(char **a, int id) break; } + if (lhstr_plain) + free(lhstr_plain); + if(rhre_plain) + free(rhre_plain); if (pcre_pat) pcre_free(pcre_pat); if (ov) --lrZ03NoBR/3+SXJZ--