From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on inbox.vuxu.org X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=DKIM_ADSP_CUSTOM_MED, FORGED_GMAIL_RCVD,FREEMAIL_FROM,MAILING_LIST_MULTI,RCVD_IN_DNSWL_NONE autolearn=ham autolearn_force=no version=3.4.2 Received: from primenet.com.au (ns1.primenet.com.au [203.24.36.2]) by inbox.vuxu.org (OpenSMTPD) with ESMTP id a8c77491 for ; Tue, 17 Dec 2019 11:16:05 +0000 (UTC) Received: (qmail 15495 invoked by alias); 17 Dec 2019 11:15:58 -0000 Mailing-List: contact zsh-workers-help@zsh.org; run by ezmlm Precedence: bulk X-No-Archive: yes List-Id: Zsh Workers List List-Post: List-Help: List-Unsubscribe: X-Seq: 45071 Received: (qmail 19424 invoked by uid 1010); 17 Dec 2019 11:15:58 -0000 X-Qmail-Scanner-Diagnostics: from 195-159-176-226.customer.powertech.no by f.primenet.com.au (envelope-from , uid 7791) with qmail-scanner-2.11 (clamdscan: 0.102.1/25663. spamassassin: 3.4.2. Clear:RC:0(195.159.176.226):SA:0(1.6/5.0):. Processed in 2.804716 secs); 17 Dec 2019 11:15:58 -0000 X-Envelope-From: gcszd-zsh-workers@m.gmane.org X-Qmail-Scanner-Mime-Attachments: | X-Qmail-Scanner-Zip-Files: | Received-SPF: none (ns1.primenet.com.au: domain at m.gmane.org does not designate permitted sender hosts) X-Injected-Via-Gmane: http://gmane.org/ To: zsh-workers@zsh.org From: Stephane Chazelas Subject: [PATCH] Re: regexp-replace and ^, word boundary or look-behind operators Date: Tue, 17 Dec 2019 11:11:13 +0000 Message-ID: <20191217111113.z242f4g6sx7xdwru@chaz.gmail.com> References: <20191216211013.6opkv5sy4wvp3yn2@chaz.gmail.com> <20191216212706.i3xvf6hn5h3jwkjh@chaz.gmail.com> <20191217073846.4usg2hnsk66bhqvl@chaz.gmail.com> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii User-Agent: NeoMutt/20180716 Content-Disposition: inline In-Reply-To: <20191217073846.4usg2hnsk66bhqvl@chaz.gmail.com> 2019-12-17 07:38:46 +0000, Stephane Chazelas: > 2019-12-16 21:27:06 +0000, Stephane Chazelas: > [...] > > PCRE should be OK, so it could be just a matter of > > exposing it via the pcre_match builtin > [...] > > D'oh, it's there already with the -b, -n options. > > I'll try and suggest a regexp-replace improvement using that. [...] There's another issue in that the zero-width matches cause infinite loops. Here's my first attempt at fixing those issues (also fixing a few issues in the zpgrep example functions while I'm at it): diff --git a/Doc/Zsh/contrib.yo b/Doc/Zsh/contrib.yo index d32ba018d..61e6a434f 100644 --- a/Doc/Zsh/contrib.yo +++ b/Doc/Zsh/contrib.yo @@ -4301,6 +4301,9 @@ and arithmetic expressions which will be replaced: in particular, a reference to tt($MATCH) will be replaced by the text matched by the pattern. The return status is 0 if at least one match was performed, else 1. + +Note that if not using PCRE, using the tt(^) or word boundary operators +(where available) may not work properly. ) findex(run-help) item(tt(run-help) var(cmd))( diff --git a/Functions/Example/zpgrep b/Functions/Example/zpgrep index 8b1edaa1c..556e58cd6 100644 --- a/Functions/Example/zpgrep +++ b/Functions/Example/zpgrep @@ -2,24 +2,31 @@ # zpgrep() { -local file pattern +local file pattern ret pattern=$1 shift +ret=1 if ((! ARGC)) then set -- - fi -pcre_compile $pattern +zmodload zsh/pcre || return +pcre_compile -- "$pattern" pcre_study for file do if [[ "$file" == - ]] then - while read -u0 buf; do pcre_match $buf && print $buf; done + while IFS= read -ru0 buf; do + pcre_match -- "$buf" && ret=0 && print -r -- "$buf" + done else - while read -u0 buf; do pcre_match $buf && print $buf; done < "$file" + while IFS= read -ru0 buf; do + pcre_match -- "$buf" && ret=0 && print -r -- "$buf" + done < "$file" fi done +return "$ret" } diff --git a/Functions/Misc/regexp-replace b/Functions/Misc/regexp-replace index dec105524..41ea9d79e 100644 --- a/Functions/Misc/regexp-replace +++ b/Functions/Misc/regexp-replace @@ -8,36 +8,79 @@ # $ and backtick substitutions; in particular, $MATCH will be replaced # by the portion of the string matched by the regular expression. -integer pcre +# we use positional parameters instead of variables to avoid +# clashing with the user's variable. Make sure we start with 3 and only +# 3 elements: +argv=("$1" "$2" "$3") -[[ -o re_match_pcre ]] && pcre=1 +# $4 records whether pcre is enabled as that information would otherwise +# be lost after emulate -L zsh +4=0 +[[ -o re_match_pcre ]] && 4=1 emulate -L zsh -(( pcre )) && setopt re_match_pcre - -# $4 is the string to be matched -4=${(P)1} -# $5 is the final string -5= -# 6 indicates if we made a change -6= + + local MATCH MBEGIN MEND local -a match mbegin mend -while [[ -n $4 ]]; do - if [[ $4 =~ $2 ]]; then - # append initial part and subsituted match - 5+=${4[1,MBEGIN-1]}${(e)3} - # truncate remaining string - 4=${4[MEND+1,-1]} - # indicate we did something - 6=1 - else - break - fi -done -5+=$4 - -eval ${1}=${(q)5} -# status 0 if we did something, else 1. -[[ -n $6 ]] +if (( $4 )); then + # if using pcre, we're using pcre_match and a running offset + # That's needed for ^, \A, \b, and look-behind operators to work + # properly. + + zmodload zsh/pcre || return 2 + pcre_compile -- "$2" && pcre_study || return 2 + + # $4 is the current *byte* offset, $5, $6 reserved for later + 4=0 5= 6=1 + + local ZPCRE_OP IFS=' ' + while pcre_match -b -n $4 -- "${(P)1}"; do + # append offsets and computed replacement to the array + argv+=($=ZPCRE_OP ${(e)3}) + + # for 0-width matches, increase offset by 1 to avoid + # infinite loop + 4=$((argv[-2] + (argv[-3] == argv[-2]))) + done + + (($# > 6)) || return # no match + + set +o multibyte + + # $5 contains the result, $6 the current offset + for 2 3 4 in "$@[7,-1]"; do + 5+=${(P)1[$6,$2]}$4 + 6=$(($3 + 1)) + done + 5+=${(P)1[$6,-1]} +else + # in ERE, we can't use an offset so ^, (and \<, \b, \B, [[:<:]] where + # available) won't work properly. + + # $4 is the string to be matched + 4=${(P)1} + + while [[ -n $4 ]]; do + if [[ $4 =~ $2 ]]; then + # append initial part and substituted match + 5+=${4[1,MBEGIN-1]}${(e)3} + # truncate remaining string + if ((MEND < MBEGIN)); then + # zero-width match, skip one character for the next match + ((MEND++)) + 5+=${4[1]} + fi + 4=${4[MEND+1,-1]} + # indicate we did something + 6=1 + else + break + fi + done + [[ -n $6 ]] || return # no match + 5+=$4 +fi + +eval $1=\$5