From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on inbox.vuxu.org X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=DKIM_ADSP_CUSTOM_MED, FORGED_GMAIL_RCVD,FREEMAIL_FROM,MAILING_LIST_MULTI,RCVD_IN_DNSWL_NONE autolearn=ham autolearn_force=no version=3.4.2 Received: from primenet.com.au (ns1.primenet.com.au [203.24.36.2]) by inbox.vuxu.org (OpenSMTPD) with ESMTP id 77484566 for ; Wed, 1 Jan 2020 14:05:52 +0000 (UTC) Received: (qmail 18158 invoked by alias); 1 Jan 2020 14:05:47 -0000 Mailing-List: contact zsh-workers-help@zsh.org; run by ezmlm Precedence: bulk X-No-Archive: yes List-Id: Zsh Workers List List-Post: List-Help: List-Unsubscribe: X-Seq: 45180 Received: (qmail 467 invoked by uid 1010); 1 Jan 2020 14:05:47 -0000 X-Qmail-Scanner-Diagnostics: from 195-159-176-226.customer.powertech.no by f.primenet.com.au (envelope-from , uid 7791) with qmail-scanner-2.11 (clamdscan: 0.102.1/25677. spamassassin: 3.4.2. Clear:RC:0(195.159.176.226):SA:0(1.6/5.0):. Processed in 2.314869 secs); 01 Jan 2020 14:05:47 -0000 X-Envelope-From: gcszd-zsh-workers@m.gmane.org X-Qmail-Scanner-Mime-Attachments: | X-Qmail-Scanner-Zip-Files: | Received-SPF: none (ns1.primenet.com.au: domain at m.gmane.org does not designate permitted sender hosts) X-Injected-Via-Gmane: http://gmane.org/ To: zsh-workers@zsh.org From: Stephane Chazelas Subject: [PATCH v2] regexp-replace and ^, word boundary or look-behind operators Date: Wed, 1 Jan 2020 14:03:43 +0000 Message-ID: <20200101140343.qwfx2xaojumuds3d@chaz.gmail.com> References: <20191216211013.6opkv5sy4wvp3yn2@chaz.gmail.com> <20191216212706.i3xvf6hn5h3jwkjh@chaz.gmail.com> <20191217073846.4usg2hnsk66bhqvl@chaz.gmail.com> <20191217111113.z242f4g6sx7xdwru@chaz.gmail.com> <2ea6feb3-a686-4d83-ab27-6a582424487c@www.fastmail.com> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii User-Agent: NeoMutt/20180716 Content-Disposition: inline In-Reply-To: <2ea6feb3-a686-4d83-ab27-6a582424487c@www.fastmail.com> 2019-12-18 00:22:53 +0000, Daniel Shahaf: [...] > > + > > +Note that if not using PCRE, using the tt(^) or word boundary operators > > +(where available) may not work properly. > > Suggest to avoid the double negative: > > 1. s/not using PCRE/using POSIX ERE's/ > > 2. Add "(ERE's)" after "POSIX extended regular expressions" in the first paragraph > > I'll push a minor change to that first paragraph in a moment. [...] Thanks, I've incorporated that suggesting and fixed an issue with PCRE when the replacement was empty or generated more than one element. diff --git a/Doc/Zsh/contrib.yo b/Doc/Zsh/contrib.yo index 558342711..9a804fc11 100644 --- a/Doc/Zsh/contrib.yo +++ b/Doc/Zsh/contrib.yo @@ -4284,7 +4284,7 @@ See also the tt(pager), tt(prompt) and tt(rprompt) styles below. findex(regexp-replace) item(tt(regexp-replace) var(var) var(regexp) var(replace))( Use regular expressions to perform a global search and replace operation -on a variable. POSIX extended regular expressions are used, +on a variable. POSIX extended regular expressions (ERE) are used, unless the option tt(RE_MATCH_PCRE) has been set, in which case Perl-compatible regular expressions are used (this requires the shell to be linked against the tt(pcre) @@ -4302,6 +4302,9 @@ and arithmetic expressions which will be replaced: in particular, a reference to tt($MATCH) will be replaced by the text matched by the pattern. The return status is 0 if at least one match was performed, else 1. + +Note that if using POSIX EREs, the tt(^) or word boundary operators +(where available) may not work properly. ) findex(run-help) item(tt(run-help) var(cmd))( diff --git a/Functions/Example/zpgrep b/Functions/Example/zpgrep index 8b1edaa1c..556e58cd6 100644 --- a/Functions/Example/zpgrep +++ b/Functions/Example/zpgrep @@ -2,24 +2,31 @@ # zpgrep() { -local file pattern +local file pattern ret pattern=$1 shift +ret=1 if ((! ARGC)) then set -- - fi -pcre_compile $pattern +zmodload zsh/pcre || return +pcre_compile -- "$pattern" pcre_study for file do if [[ "$file" == - ]] then - while read -u0 buf; do pcre_match $buf && print $buf; done + while IFS= read -ru0 buf; do + pcre_match -- "$buf" && ret=0 && print -r -- "$buf" + done else - while read -u0 buf; do pcre_match $buf && print $buf; done < "$file" + while IFS= read -ru0 buf; do + pcre_match -- "$buf" && ret=0 && print -r -- "$buf" + done < "$file" fi done +return "$ret" } diff --git a/Functions/Misc/regexp-replace b/Functions/Misc/regexp-replace index dec105524..0d5948075 100644 --- a/Functions/Misc/regexp-replace +++ b/Functions/Misc/regexp-replace @@ -8,36 +8,84 @@ # $ and backtick substitutions; in particular, $MATCH will be replaced # by the portion of the string matched by the regular expression. -integer pcre +# we use positional parameters instead of variables to avoid +# clashing with the user's variable. Make sure we start with 3 and only +# 3 elements: +argv=("$1" "$2" "$3") -[[ -o re_match_pcre ]] && pcre=1 +# $4 records whether pcre is enabled as that information would otherwise +# be lost after emulate -L zsh +4=0 +[[ -o re_match_pcre ]] && 4=1 emulate -L zsh -(( pcre )) && setopt re_match_pcre - -# $4 is the string to be matched -4=${(P)1} -# $5 is the final string -5= -# 6 indicates if we made a change -6= + + local MATCH MBEGIN MEND local -a match mbegin mend -while [[ -n $4 ]]; do - if [[ $4 =~ $2 ]]; then - # append initial part and subsituted match - 5+=${4[1,MBEGIN-1]}${(e)3} - # truncate remaining string - 4=${4[MEND+1,-1]} - # indicate we did something - 6=1 - else - break - fi -done -5+=$4 - -eval ${1}=${(q)5} -# status 0 if we did something, else 1. -[[ -n $6 ]] +if (( $4 )); then + # if using pcre, we're using pcre_match and a running offset + # That's needed for ^, \A, \b, and look-behind operators to work + # properly. + + zmodload zsh/pcre || return 2 + pcre_compile -- "$2" && pcre_study || return 2 + + # $4 is the current *byte* offset, $5, $6 reserved for later use + 4=0 6= + + local ZPCRE_OP + while pcre_match -b -n $4 -- "${(P)1}"; do + # append offsets and computed replacement to the array + # we need to perform the evaluation in a scalar assignment so that if + # it generates an array, the elements are converted to string (by + # joining with the first chararacter of $IFS as usual) + 5=${(e)3} + argv+=(${(s: :)ZPCRE_OP} "$5") + + # for 0-width matches, increase offset by 1 to avoid + # infinite loop + 4=$((argv[-2] + (argv[-3] == argv[-2]))) + done + + (($# > 6)) || return # no match + + set +o multibyte + + # $5 contains the result, $6 the current offset + 5= 6=1 + for 2 3 4 in "$@[7,-1]"; do + 5+=${(P)1[$6,$2]}$4 + 6=$(($3 + 1)) + done + 5+=${(P)1[$6,-1]} +else + # in ERE, we can't use an offset so ^, (and \<, \b, \B, [[:<:]] where + # available) won't work properly. + + # $4 is the string to be matched + 4=${(P)1} + + while [[ -n $4 ]]; do + if [[ $4 =~ $2 ]]; then + # append initial part and substituted match + 5+=${4[1,MBEGIN-1]}${(e)3} + # truncate remaining string + if ((MEND < MBEGIN)); then + # zero-width match, skip one character for the next match + ((MEND++)) + 5+=${4[1]} + fi + 4=${4[MEND+1,-1]} + # indicate we did something + 6=1 + else + break + fi + done + [[ -n $6 ]] || return # no match + 5+=$4 +fi + +eval $1=\$5