%prep if ! zmodload -F zsh/re2 C:re2-match 2>/dev/null then ZTST_unimplemented="the zsh/re2 module is not available" return 0 fi # Load the rest of the builtins zmodload zsh/re2 # TODO: use future mechanism to switch =~ to use re2 and test =~ too # Find a UTF-8 locale. setopt multibyte # Don't let LC_* override our choice of locale. unset -m LC_\* mb_ok= langs=(en_{US,GB}.{UTF-,utf}8 en.UTF-8 $(locale -a 2>/dev/null | egrep 'utf8|UTF-8')) for LANG in $langs; do if [[ é = ? ]]; then mb_ok=1 break; fi done if [[ -z $mb_ok ]]; then ZTST_unimplemented="no UTF-8 locale or multibyte mode is not implemented" else print -u $ZTST_fd Testing RE2 multibyte with locale $LANG mkdir multibyte.tmp && cd multibyte.tmp fi %test [[ 'foo→bar' -re2-match .([^[:ascii:]]). ]] print $MATCH print $match[1] 0:Basic non-ASCII regexp matching >o→b >→ MATCH='' [[ ÷x -re2-match '^(\p{Sm})(\p{Latin})$' ]] print "$? <$MATCH> .${match[1]}|${match[2]}." 0:Unicode character class names & extracting correct widths >0 <÷x> .÷|x. [[ alphabeta -re2-match a([^a]+)a ]] echo "$? basic" print $MATCH print $match[1] [[ ! alphabeta -re2-match a(.+)a ]] echo "$? negated op" [[ alphabeta -re2-match ^b ]] echo "$? failed match" # default matches on first, then takes longest substring # -longest keeps looking [[ abb -re2-match a(b|bb) ]] echo "$? first .${MATCH}.${match[1]}." [[ abb -re2-match-longest a(b|bb) ]] echo "$? longest .${MATCH}.${match[1]}." [[ alphabeta -re2-match ab ]]; echo "$? unanchored" [[ alphabeta -re2-match ^ab ]]; echo "$? anchored" [[ alphabeta -re2-match '^a(\w+)a$' ]] echo "$? perl class used" echo ".${MATCH}. .${match[1]}." [[ alphabeta -re2-match-posix '^a(\w+)a$' ]] echo "$? POSIX-mode, should inhibit Perl class" [[ alphabeta -re2-match-posixperl '^a(\w+)a$' ]] echo "$? POSIX-mode with Perl classes enabled .${match[1]}." unset MATCH match [[ alphabeta -re2-match ^a([^a]+)a([^a]+)a$ ]] echo "$? matched, set vars" echo ".$MATCH. ${#MATCH}" echo ".${(j:|:)match[*]}." unset MATCH match [[ alphabeta -re2-match fr(.+)d ]] echo "$? unmatched, not setting MATCH/match" echo ".$MATCH. ${#MATCH}" echo ".${(j:|:)match[*]}." 0:Basic matching & result codes >0 basic >alpha >lph >1 negated op >1 failed match >0 first .ab.b. >0 longest .abb.bb. >0 unanchored >1 anchored >0 perl class used >.alphabeta. .lphabet. *?\(eval\):*: re2 rexexp compilation failed: invalid escape sequence: \w >1 POSIX-mode, should inhibit Perl class >0 POSIX-mode with Perl classes enabled .lphabet. >0 matched, set vars >.alphabeta. 9 >.lph|bet. >1 unmatched, not setting MATCH/match >.. 0 >.. m() { unset MATCH MBEGIN MEND match mbegin mend [[ $2 -re2-match $3 ]] print $? $1: m:${MATCH}: ma:${(j:|:)match}: MBEGIN=$MBEGIN MEND=$MEND mbegin="(${mbegin[*]})" mend="(${mend[*]})" } data='alpha beta gamma delta' m uncapturing $data '\b\w+\b' m capturing $data '\b(\w+)\b' m 'capture 2' $data '\b(\w+)\s+(\w+)\b' m 'capture repeat' $data '\b(?:(\w+)\s+)+(\w+)\b' 0:Beginning and end testing >0 uncapturing: m:alpha: ma:: MBEGIN=1 MEND=5 mbegin=() mend=() >0 capturing: m:alpha: ma:alpha: MBEGIN=1 MEND=5 mbegin=(1) mend=(5) >0 capture 2: m:alpha beta: ma:alpha|beta: MBEGIN=1 MEND=10 mbegin=(1 7) mend=(5 10) >0 capture repeat: m:alpha beta gamma delta: ma:gamma|delta: MBEGIN=1 MEND=22 mbegin=(12 18) mend=(16 22) unset match mend s=$'\u00a0' [[ $s -re2-match '^.$' ]] && print OK [[ A${s}B -re2-match .(.). && $match[1] == $s ]] && print OK [[ A${s}${s}B -re2-match A([^[:ascii:]]*)B && $mend[1] == 3 ]] && print OK unset s 0:Raw IMETA characters in input string >OK >OK >OK [[ foo -re2-match f.+ ]] ; print $? [[ foo -re2-match x.+ ]] ; print $? [[ ! foo -re2-match f.+ ]] ; print $? [[ ! foo -re2-match x.+ ]] ; print $? [[ foo -re2-match f.+ && bar -re2-match b.+ ]] ; print $? [[ foo -re2-match x.+ && bar -re2-match b.+ ]] ; print $? [[ foo -re2-match f.+ && bar -re2-match x.+ ]] ; print $? [[ ! foo -re2-match f.+ && bar -re2-match b.+ ]] ; print $? [[ foo -re2-match f.+ && ! bar -re2-match b.+ ]] ; print $? [[ ! ( foo -re2-match f.+ && bar -re2-match b.+ ) ]] ; print $? [[ ! foo -re2-match x.+ && bar -re2-match b.+ ]] ; print $? [[ foo -re2-match x.+ && ! bar -re2-match b.+ ]] ; print $? [[ ! ( foo -re2-match x.+ && bar -re2-match b.+ ) ]] ; print $? 0:Regex result inversion detection >0 >1 >1 >0 >0 >1 >1 >1 >1 >1 >0 >1 >0 # Subshell because crash on failure ( [[ test.txt -re2-match '^(.*_)?(test)' ]] echo $match[2] ) 0:regression for segmentation fault (pcre, dup for re2), workers/38307 >test setopt BASH_REMATCH KSH_ARRAYS unset MATCH MBEGIN MEND match mbegin mend BASH_REMATCH [[ alphabeta -re2-match '^a([^a]+)(a)([^a]+)a$' ]] print "$? bash_rematch" print "m:${MATCH}: ma:${(j:|:)match}:" print MBEGIN=$MBEGIN MEND=$MEND mbegin="(${mbegin[*]})" mend="(${mend[*]})" print "BASH_REMATCH=[${(j:, :)BASH_REMATCH[@]}]" print "[0]=${BASH_REMATCH[0]} [1]=${BASH_REMATCH[1]}" 0:bash_rematch works >0 bash_rematch >m:: ma:: >MBEGIN= MEND= mbegin=() mend=() >BASH_REMATCH=[alphabeta, lph, a, bet] >[0]=alphabeta [1]=lph unsetopt BASH_REMATCH KSH_ARRAYS m() { local label="$1" text="$2" rc out shift 2 unset MATCH match # can't capture stderr sanely for amalgamation, need compile to happen in parent re2_compile "$@" rc=$? if (( rc )); then print "${rc}-NoCompile $label"; return 1; fi print -n "$rc:" re2_match "$text" print $? $label: m:${MATCH}: ma:${(j:|:)match}: } # m cmd-clean alphabeta lph m cmd-anchored-nomatch alphabeta -a lph.+ m cmd-anchored-match alphabeta -a alp.+ m case-mismatch alphabeta 'A\w+' m case-insensitive-pattern alphabeta -i 'A\w+' m case-insensitive-text Alphabeta -i 'a\w+' m case-sensitive-text Alphabeta 'a\w+' m non-posix-okay-normal ÷1 '^(\p{Sm})\d$' m non-posix-reject-normal ÷x '^(\p{Sm})\d$' print -u2 'stderr start non-posix-posixmode' m non-posix-posixmode ÷1 -P '^(\p{Sm})\d$' print -u2 'stderr end non-posix-posixmode' m literal-match x1 -L x1 m literal-nomatch x1 -L .1 m literal-match-substr abcd -L bc m literal-nomatch-anchored abcd -aL bc m not-longest abb 'a(b|bb)' m longest abb -l 'a(b|bb)' 0:re2 compile/match testing with anonymous var >0:0 cmd-clean: m:lph: ma:: >0:1 cmd-anchored-nomatch: m:: ma:: >0:0 cmd-anchored-match: m:alphabeta: ma:: >0:1 case-mismatch: m:: ma:: >0:0 case-insensitive-pattern: m:alphabeta: ma:: >0:0 case-insensitive-text: m:Alphabeta: ma:: >0:0 case-sensitive-text: m:abeta: ma:: >0:0 non-posix-okay-normal: m:÷1: ma:÷: >0:1 non-posix-reject-normal: m:: ma:: >1-NoCompile non-posix-posixmode ?stderr start non-posix-posixmode *?m:re2_compile:*: re2 rexexp compilation failed: invalid escape sequence: \p ?stderr end non-posix-posixmode >0:0 literal-match: m:x1: ma:: >0:1 literal-nomatch: m:: ma:: >0:0 literal-match-substr: m:bc: ma:: >0:1 literal-nomatch-anchored: m:: ma:: >0:0 not-longest: m:ab: ma:b: >0:0 longest: m:abb: ma:bb: ### We've dropped multi-line support for now, rather than debug RE2/cre2 ### interactions and figure out how I (pdp) am mis-reading docs. Should ### we add it, this is the test which exposed the presence of problems: # m multiline-reject-nom $'ab\ncd' '^cd' # set -x # m multiline-okay $'ab\ncd' -m '^cd' # set +x #0:re2 multiline matching #>0:1 multiline-reject-nom: m:: ma:: #>0:0 multiline-okay: m:cd: ma:: m posix-simple a1d -Pa '([[:alpha:]])([[:digit:]])([[:alpha:]])' # print -u2 'stderr start posix-reject-perlclass' m posix-reject-perlclass a1d -Pa '(\w)(\d)(\w)' print -u2 'stderr end posix-reject-perlclass' m posix-perlclass-enabled a1d -Pac '(\w)(\d)(\w)' m boundaries-normal 'def efg' '\be(.)' print -u2 'stderr start posix-reject-boundaries' m posix-reject-boundaries 'def efg' -P '\be(.)' print -u2 'stderr end posix-reject-boundaries' m posix-boundaries-enabled 'def efg' -Pw '\be(.)' m posix-perlclass-boundaries 'de1g e2h' -Pcw '\be(\d)(\w)' m posix-pcb-mattered 'de1g e2h' -Pcw 'e(\d)(\w)' 0:re2 POSIX mode with various features added back >0:0 posix-simple: m:a1d: ma:a|1|d: ?stderr start posix-reject-perlclass *?m:re2_compile:*: re2 rexexp compilation failed: invalid escape sequence: \\w ?stderr end posix-reject-perlclass >1-NoCompile posix-reject-perlclass >0:0 posix-perlclass-enabled: m:a1d: ma:a|1|d: >0:0 boundaries-normal: m:ef: ma:f: ?stderr start posix-reject-boundaries *?m:re2_compile:*: re2 rexexp compilation failed: invalid escape sequence: \\b ?stderr end posix-reject-boundaries >1-NoCompile posix-reject-boundaries >0:0 posix-boundaries-enabled: m:ef: ma:f: >0:0 posix-perlclass-boundaries: m:e2h: ma:2|h: >0:0 posix-pcb-mattered: m:e1g: ma:1|g: re2_compile -i '^([aeiou])(\w{2})' mintov() { local label="$1"; shift unset MATCH match T1 t1 re2_match "$@" print "$? $label MATCH=<$MATCH> match=<${(j:|:)match}> T1=<$T1> t1=<${(j:|:)t1}>" } mintov not-first not_first mintov simple orange mintov redir-arr -a t1 orange mintov redir-var -v T1 orange mintov redir-both -v T1 -a t1 orange mintov normal-after orange 0:re2_match capturing to named vars >1 not-first MATCH=<> match=<> T1=<> t1=<> >0 simple MATCH= match= T1=<> t1=<> >0 redir-arr MATCH= match=<> T1=<> t1= >0 redir-var MATCH=<> match= T1= t1=<> >0 redir-both MATCH=<> match=<> T1= t1= >0 normal-after MATCH= match= T1=<> t1=<> re2_compile '^([aeiou])(\w{2})' re2_match orange && echo "yes-1" re2_match -P '^t.{3}' orange || echo "no-2" re2_match -P '^t.{3}' tangerine && echo "yes-3" re2_match tangerine || echo "no-4" re2_match orange && echo "yes-5 ${match[2]}" 0:re2_match -P pattern works & doesn't mess with anonymous >yes-1 >no-2 >yes-3 >no-4 >yes-5 ra re2_compile '^(\p{Sm})(?!\d+)(?:.)$' 1:re2 check no crash on unsupported syntax ?(eval):re2_compile:1: re2 rexexp compilation failed: invalid perl operator: (?! re2_compile '(fred' 1:re2 complain parens not closed ?(eval):re2_compile:1: re2 rexexp compilation failed: missing ): (fred %clean unfunction -m 'm*'