1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
%prep
# Probe for the -re2-match condition only; when the module cannot be loaded,
# record the reason in ZTST_unimplemented and stop processing this file.
if ! zmodload -F zsh/re2 C:re2-match 2>/dev/null; then
  ZTST_unimplemented="the zsh/re2 module is not available"
  return 0
fi
# Load the rest of the builtins
zmodload zsh/re2
# TODO: use future mechanism to switch =~ to use re2 and test =~ too
# Find a UTF-8 locale.
setopt multibyte
# Don't let LC_* override our choice of locale.
unset -m LC_\*
mb_ok=
# Candidates: a few common English UTF-8 locale names first, then every
# UTF-8 locale the system reports.  "grep -E" is used instead of the
# obsolescent "egrep", which recent GNU grep releases warn about on stderr.
langs=(en_{US,GB}.{UTF-,utf}8 en.UTF-8
       $(locale -a 2>/dev/null | grep -E 'utf8|UTF-8'))
for LANG in $langs; do
  # The locale is usable if the non-ASCII "é" is matched by "?" as exactly
  # one character.
  if [[ é = ? ]]; then
    mb_ok=1
    break
  fi
done
if [[ -z $mb_ok ]]; then
  ZTST_unimplemented="no UTF-8 locale or multibyte mode is not implemented"
else
  print -u $ZTST_fd Testing RE2 multibyte with locale $LANG
  # Run the tests from a scratch directory.
  mkdir multibyte.tmp && cd multibyte.tmp
fi
%test
# Each "." must consume one whole character on either side of the group, and
# the group must capture the single non-ASCII arrow character.
[[ 'foo→bar' -re2-match .([^[:ascii:]]). ]]
print $MATCH
print $match[1]
0:Basic non-ASCII regexp matching
>o→b
>→
# Clear MATCH first so that the printed value can only come from this match.
MATCH=''
# \p{Sm} is expected to take the "÷" and \p{Latin} the "x".
[[ ÷x -re2-match '^(\p{Sm})(\p{Latin})$' ]]
print "$? <$MATCH> .${match[1]}|${match[2]}."
0:Unicode character class names & extracting correct widths
>0 <÷x> .÷|x.
# Plain match: status 0, MATCH holds the whole match, match[1] the group.
[[ alphabeta -re2-match a([^a]+)a ]]
echo "$? basic"
print $MATCH
print $match[1]
# A leading "!" inverts the status of a pattern that does match.
[[ ! alphabeta -re2-match a(.+)a ]]
echo "$? negated op"
[[ alphabeta -re2-match ^b ]]
echo "$? failed match"
# By default the first alternative that matches wins;
# -re2-match-longest keeps looking for the longest overall match.
[[ abb -re2-match a(b|bb) ]]
echo "$? first .${MATCH}.${match[1]}."
[[ abb -re2-match-longest a(b|bb) ]]
echo "$? longest .${MATCH}.${match[1]}."
# Matching is unanchored unless the pattern anchors itself.
[[ alphabeta -re2-match ab ]]; echo "$? unanchored"
[[ alphabeta -re2-match ^ab ]]; echo "$? anchored"
# Perl classes such as \w: accepted by default, rejected (with a compile
# error on stderr) by -re2-match-posix, accepted again by -re2-match-posixperl.
[[ alphabeta -re2-match '^a(\w+)a$' ]]
echo "$? perl class used"
echo ".${MATCH}. .${match[1]}."
[[ alphabeta -re2-match-posix '^a(\w+)a$' ]]
echo "$? POSIX-mode, should inhibit Perl class"
[[ alphabeta -re2-match-posixperl '^a(\w+)a$' ]]
echo "$? POSIX-mode with Perl classes enabled .${match[1]}."
# Start from unset variables to see exactly what a successful match sets.
unset MATCH match
[[ alphabeta -re2-match ^a([^a]+)a([^a]+)a$ ]]
echo "$? matched, set vars"
echo ".$MATCH. ${#MATCH}"
echo ".${(j:|:)match[*]}."
# A failed match must not set MATCH or match.
unset MATCH match
[[ alphabeta -re2-match fr(.+)d ]]
echo "$? unmatched, not setting MATCH/match"
echo ".$MATCH. ${#MATCH}"
echo ".${(j:|:)match[*]}."
0:Basic matching & result codes
>0 basic
>alpha
>lph
>1 negated op
>1 failed match
>0 first .ab.b.
>0 longest .abb.bb.
>0 unanchored
>1 anchored
>0 perl class used
>.alphabeta. .lphabet.
*?\(eval\):*: re2 rexexp compilation failed: invalid escape sequence: \\w
>1 POSIX-mode, should inhibit Perl class
>0 POSIX-mode with Perl classes enabled .lphabet.
>0 matched, set vars
>.alphabeta. 9
>.lph|bet.
>1 unmatched, not setting MATCH/match
>.. 0
>..
# m LABEL SUBJECT REGEX
# Unset every result variable, run one -re2-match of SUBJECT against REGEX,
# then print the status, LABEL, the match variables and the offset variables
# on a single line.
m() {
  local tag=$1 subject=$2 regex=$3 rc
  unset MATCH MBEGIN MEND match mbegin mend
  [[ $subject -re2-match $regex ]]
  rc=$?
  print "$rc ${tag}: m:${MATCH}: ma:${(j:|:)match}: MBEGIN=$MBEGIN MEND=$MEND mbegin=(${mbegin[*]}) mend=(${mend[*]})"
}
# One four-word subject is used to check the whole-match offsets
# (MBEGIN/MEND) and the per-group offsets (mbegin/mend).
data='alpha beta gamma delta'
m uncapturing $data '\b\w+\b'
m capturing $data '\b(\w+)\b'
m 'capture 2' $data '\b(\w+)\s+(\w+)\b'
# A group inside a repeated non-capturing group reports its last iteration.
m 'capture repeat' $data '\b(?:(\w+)\s+)+(\w+)\b'
0:Beginning and end testing
>0 uncapturing: m:alpha: ma:: MBEGIN=1 MEND=5 mbegin=() mend=()
>0 capturing: m:alpha: ma:alpha: MBEGIN=1 MEND=5 mbegin=(1) mend=(5)
>0 capture 2: m:alpha beta: ma:alpha|beta: MBEGIN=1 MEND=10 mbegin=(1 7) mend=(5 10)
>0 capture repeat: m:alpha beta gamma delta: ma:gamma|delta: MBEGIN=1 MEND=22 mbegin=(12 18) mend=(16 22)
unset match mend
# U+00A0; per the test title its encoding is expected to contain a byte that
# zsh handles specially (IMETA) -- NOTE(review): confirm against the zsh source.
s=$'\u00a0'
# The character must count as exactly one "." ...
[[ $s -re2-match '^.$' ]] && print OK
# ... must come back unchanged from a capture group ...
[[ A${s}B -re2-match .(.). && $match[1] == $s ]] && print OK
# ... and two of them after "A" must end at character offset 3.
[[ A${s}${s}B -re2-match A([^[:ascii:]]*)B && $mend[1] == 3 ]] && print OK
unset s
0:Raw IMETA characters in input string
>OK
>OK
>OK
# Truth table for "!", "&&" and grouping around -re2-match.  "f.+" matches
# foo and "b.+" matches bar; "x.+" matches neither.  Each line prints the
# status of the whole [[ ... ]] expression.
[[ foo -re2-match f.+ ]] ; print $?
[[ foo -re2-match x.+ ]] ; print $?
[[ ! foo -re2-match f.+ ]] ; print $?
[[ ! foo -re2-match x.+ ]] ; print $?
[[ foo -re2-match f.+ && bar -re2-match b.+ ]] ; print $?
[[ foo -re2-match x.+ && bar -re2-match b.+ ]] ; print $?
[[ foo -re2-match f.+ && bar -re2-match x.+ ]] ; print $?
[[ ! foo -re2-match f.+ && bar -re2-match b.+ ]] ; print $?
[[ foo -re2-match f.+ && ! bar -re2-match b.+ ]] ; print $?
[[ ! ( foo -re2-match f.+ && bar -re2-match b.+ ) ]] ; print $?
[[ ! foo -re2-match x.+ && bar -re2-match b.+ ]] ; print $?
[[ foo -re2-match x.+ && ! bar -re2-match b.+ ]] ; print $?
[[ ! ( foo -re2-match x.+ && bar -re2-match b.+ ) ]] ; print $?
0:Regex result inversion detection
>0
>1
>1
>0
>0
>1
>1
>1
>1
>1
>0
>1
>0
# Run in a subshell: if this regresses the failure mode is a crash, and the
# crash must not take the test run down with it.
# The optional first group takes no part in the match; the second group
# must still be reported in match[2].
( [[ test.txt -re2-match '^(.*_)?(test)' ]]
echo $match[2] )
0:regression for segmentation fault (pcre, dup for re2), workers/38307
>test
# With BASH_REMATCH set the results are expected in the BASH_REMATCH array
# only: MATCH, match and the offset variables stay unset.  KSH_ARRAYS makes
# array subscripts zero-based, so [0] is the whole match and [1] the first
# group.
setopt BASH_REMATCH KSH_ARRAYS
unset MATCH MBEGIN MEND match mbegin mend BASH_REMATCH
[[ alphabeta -re2-match '^a([^a]+)(a)([^a]+)a$' ]]
print "$? bash_rematch"
print "m:${MATCH}: ma:${(j:|:)match}:"
print MBEGIN=$MBEGIN MEND=$MEND mbegin="(${mbegin[*]})" mend="(${mend[*]})"
print "BASH_REMATCH=[${(j:, :)BASH_REMATCH[@]}]"
print "[0]=${BASH_REMATCH[0]} [1]=${BASH_REMATCH[1]}"
0:bash_rematch works
>0 bash_rematch
>m:: ma::
>MBEGIN= MEND= mbegin=() mend=()
>BASH_REMATCH=[alphabeta, lph, a, bet]
>[0]=alphabeta [1]=lph
# Undo the option changes made for the bash_rematch test.
unsetopt BASH_REMATCH KSH_ARRAYS
# m LABEL TEXT [re2_compile arguments...]
# Compile a pattern with re2_compile, then match TEXT against it with
# re2_match.
#   On compile failure: prints "<rc>-NoCompile LABEL" and returns 1.
#   Otherwise: prints "<compile rc>:<match rc> LABEL: m:<MATCH>: ma:<groups>:"
#   where <groups> is the match array joined with "|".
m() {
  local label="$1" text="$2" rc
  shift 2
  unset MATCH match
  # The compile must run in this shell, not in a subshell, so its stderr
  # cannot be captured here; the tests check it via expected-stderr lines.
  re2_compile "$@"
  rc=$?
  if (( rc )); then
    print "${rc}-NoCompile $label"
    return 1
  fi
  print -n "$rc:"
  re2_match "$text"
  print $? $label: m:${MATCH}: ma:${(j:|:)match}:
}
#
# Each call is: m LABEL TEXT [re2_compile options] PATTERN.
# Going by the labels: -a anchors the match, -i ignores case, -P selects
# POSIX mode, -L takes the pattern literally, -l prefers the longest match.
m cmd-clean alphabeta lph
m cmd-anchored-nomatch alphabeta -a lph.+
m cmd-anchored-match alphabeta -a alp.+
m case-mismatch alphabeta 'A\w+'
m case-insensitive-pattern alphabeta -i 'A\w+'
m case-insensitive-text Alphabeta -i 'a\w+'
m case-sensitive-text Alphabeta 'a\w+'
m non-posix-okay-normal ÷1 '^(\p{Sm})\d$'
m non-posix-reject-normal ÷x '^(\p{Sm})\d$'
# The stderr markers bracket the compile error expected from the next call.
print -u2 'stderr start non-posix-posixmode'
m non-posix-posixmode ÷1 -P '^(\p{Sm})\d$'
print -u2 'stderr end non-posix-posixmode'
m literal-match x1 -L x1
m literal-nomatch x1 -L .1
m literal-match-substr abcd -L bc
m literal-nomatch-anchored abcd -aL bc
m not-longest abb 'a(b|bb)'
m longest abb -l 'a(b|bb)'
0:re2 compile/match testing with anonymous var
>0:0 cmd-clean: m:lph: ma::
>0:1 cmd-anchored-nomatch: m:: ma::
>0:0 cmd-anchored-match: m:alphabeta: ma::
>0:1 case-mismatch: m:: ma::
>0:0 case-insensitive-pattern: m:alphabeta: ma::
>0:0 case-insensitive-text: m:Alphabeta: ma::
>0:0 case-sensitive-text: m:abeta: ma::
>0:0 non-posix-okay-normal: m:÷1: ma:÷:
>0:1 non-posix-reject-normal: m:: ma::
>1-NoCompile non-posix-posixmode
?stderr start non-posix-posixmode
*?m:re2_compile:*: re2 rexexp compilation failed: invalid escape sequence: \\p
?stderr end non-posix-posixmode
>0:0 literal-match: m:x1: ma::
>0:1 literal-nomatch: m:: ma::
>0:0 literal-match-substr: m:bc: ma::
>0:1 literal-nomatch-anchored: m:: ma::
>0:0 not-longest: m:ab: ma:b:
>0:0 longest: m:abb: ma:bb:
### We've dropped multi-line support for now, rather than debug RE2/cre2
### interactions and figure out how I (pdp) am mis-reading docs. Should
### we add it, this is the test which exposed the presence of problems:
# m multiline-reject-nom $'ab\ncd' '^cd'
# set -x
# m multiline-okay $'ab\ncd' -m '^cd'
# set +x
#0:re2 multiline matching
#>0:1 multiline-reject-nom: m:: ma::
#>0:0 multiline-okay: m:cd: ma::
# POSIX mode (-P) on its own accepts bracket classes only.  Going by the
# labels, -c adds the Perl classes back and -w adds \b word boundaries back.
m posix-simple a1d -Pa '([[:alpha:]])([[:digit:]])([[:alpha:]])'
#
# The stderr markers bracket the compile error expected from the next call.
print -u2 'stderr start posix-reject-perlclass'
m posix-reject-perlclass a1d -Pa '(\w)(\d)(\w)'
print -u2 'stderr end posix-reject-perlclass'
m posix-perlclass-enabled a1d -Pac '(\w)(\d)(\w)'
m boundaries-normal 'def efg' '\be(.)'
print -u2 'stderr start posix-reject-boundaries'
m posix-reject-boundaries 'def efg' -P '\be(.)'
print -u2 'stderr end posix-reject-boundaries'
m posix-boundaries-enabled 'def efg' -Pw '\be(.)'
# Same subject with and without \b: the boundary moves the match from the
# "e1g" inside the first word to the "e2h" that forms the second word.
m posix-perlclass-boundaries 'de1g e2h' -Pcw '\be(\d)(\w)'
m posix-pcb-mattered 'de1g e2h' -Pcw 'e(\d)(\w)'
0:re2 POSIX mode with various features added back
>0:0 posix-simple: m:a1d: ma:a|1|d:
?stderr start posix-reject-perlclass
*?m:re2_compile:*: re2 rexexp compilation failed: invalid escape sequence: \\w
?stderr end posix-reject-perlclass
>1-NoCompile posix-reject-perlclass
>0:0 posix-perlclass-enabled: m:a1d: ma:a|1|d:
>0:0 boundaries-normal: m:ef: ma:f:
?stderr start posix-reject-boundaries
*?m:re2_compile:*: re2 rexexp compilation failed: invalid escape sequence: \\b
?stderr end posix-reject-boundaries
>1-NoCompile posix-reject-boundaries
>0:0 posix-boundaries-enabled: m:ef: ma:f:
>0:0 posix-perlclass-boundaries: m:e2h: ma:2|h:
>0:0 posix-pcb-mattered: m:e1g: ma:1|g:
# Compile once; the re2_match calls that follow pass no pattern and so are
# expected to match against this compiled pattern.
re2_compile -i '^([aeiou])(\w{2})'
# mintov LABEL [re2_match arguments...]
# Unset the default result variables (MATCH, match) and the alternative ones
# used by the tests (T1, t1), run re2_match with the remaining arguments,
# and print its status, LABEL and all four variables so the caller can see
# which of them were filled in.
mintov() {
  local tag="$1" rc
  shift
  unset MATCH match T1 t1
  re2_match "$@"
  rc=$?
  print "$rc $tag MATCH=<$MATCH> match=<${(j:|:)match}> T1=<$T1> t1=<${(j:|:)t1}>"
}
# "-a NAME" sends the capture groups to array NAME instead of match;
# "-v NAME" sends the whole match to scalar NAME instead of MATCH.
mintov not-first not_first
mintov simple orange
mintov redir-arr -a t1 orange
mintov redir-var -v T1 orange
mintov redir-both -v T1 -a t1 orange
# Without the options, results go back to MATCH and match.
mintov normal-after orange
0:re2_match capturing to named vars
>1 not-first MATCH=<> match=<> T1=<> t1=<>
>0 simple MATCH=<ora> match=<o|ra> T1=<> t1=<>
>0 redir-arr MATCH=<ora> match=<> T1=<> t1=<o|ra>
>0 redir-var MATCH=<> match=<o|ra> T1=<ora> t1=<>
>0 redir-both MATCH=<> match=<> T1=<ora> t1=<o|ra>
>0 normal-after MATCH=<ora> match=<o|ra> T1=<> t1=<>
# "re2_match -P PATTERN" matches against a one-off pattern.  The pattern
# compiled here must still be the one in effect afterwards, which the last
# two calls verify.
re2_compile '^([aeiou])(\w{2})'
re2_match orange && echo "yes-1"
re2_match -P '^t.{3}' orange || echo "no-2"
re2_match -P '^t.{3}' tangerine && echo "yes-3"
re2_match tangerine || echo "no-4"
re2_match orange && echo "yes-5 ${match[2]}"
0:re2_match -P pattern works & doesn't mess with anonymous
>yes-1
>no-2
>yes-3
>no-4
>yes-5 ra
# "(?!" is rejected by the regex compiler; the command must fail with an
# error message rather than crash.
re2_compile '^(\p{Sm})(?!\d+)(?:.)$'
1:re2 check no crash on unsupported syntax
?(eval):re2_compile:1: re2 rexexp compilation failed: invalid perl operator: (?!
# An unbalanced "(" must be reported as a compile error.
re2_compile '(fred'
1:re2 complain parens not closed
?(eval):re2_compile:1: re2 rexexp compilation failed: missing ): (fred
%clean
# -m treats the argument as a pattern: remove every function whose name
# starts with "m", which covers the helpers m and mintov defined by the tests.
unfunction -m 'm*'
|