zsh-workers
 help / color / mirror / code / Atom feed
* [PATCH] PCRE/NUL: pass NUL in for text, handle NUL out
@ 2017-06-15 20:40 ` Phil Pennock
  2017-06-16  6:41   ` Stephane Chazelas
                     ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Phil Pennock @ 2017-06-15 20:40 UTC (permalink / raw)
  To: zsh-workers

[-- Attachment #1: Type: text/plain, Size: 4975 bytes --]

The regexp itself is always NUL-terminated, so detect embedded NULs and
zwarn on their presence.
---
 Src/Modules/pcre.c | 37 ++++++++++++++++++++++++++-----------
 Test/V07pcre.ztst  |  5 +++++
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/Src/Modules/pcre.c b/Src/Modules/pcre.c
index 5fd67963d..27191d709 100644
--- a/Src/Modules/pcre.c
+++ b/Src/Modules/pcre.c
@@ -75,7 +75,7 @@ zpcre_utf8_enabled(void)
 static int
 bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func))
 {
-    int pcre_opts = 0, pcre_errptr;
+    int pcre_opts = 0, pcre_errptr, target_len;
     const char *pcre_error;
     char *target;
     
@@ -89,15 +89,19 @@ bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func))
 	pcre_opts |= PCRE_UTF8;
 
     pcre_hints = NULL;  /* Is this necessary? */
-    
+
     if (pcre_pattern)
 	pcre_free(pcre_pattern);
 
     target = ztrdup(*args);
-    unmetafy(target, NULL);
+    unmetafy(target, &target_len);
+
+    if ((int)strlen(target) != target_len) {
+	zwarnnam(nam, "embedded NULs in PCRE pattern terminate pattern");
+    }
 
     pcre_pattern = pcre_compile(target, pcre_opts, &pcre_error, &pcre_errptr, NULL);
-    
+
     free(target);
 
     if (pcre_pattern == NULL)
@@ -167,7 +171,12 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar,
 	    sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
 	    setsparam("ZPCRE_OP", ztrdup(offset_all));
 	}
-	match_all = metafy(captures[0], -1, META_DUP);
+	/*
+	 * Result strings can contain embedded NULs; the length of each is the
+	 * difference between the two values in each paired entry in ovec.
+	 * ovec is length 2*(1+capture_list_length)
+	 */
+	match_all = metafy(captures[0], ovec[1] - ovec[0], META_DUP);
 	setsparam(matchvar, match_all);
 	/*
 	 * If we're setting match, mbegin, mend we only do
@@ -176,13 +185,16 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar,
 	 */
 	if (!want_begin_end || nelem) {
 	    char **x, **y;
+	    int vec_off;
 	    y = &captures[capture_start];
 	    matches = x = (char **) zalloc(sizeof(char *) * (arrlen(y) + 1));
+	    vec_off = 2;
 	    do {
 		if (*y)
-		    *x++ = metafy(*y, -1, META_DUP);
+		    *x++ = metafy(*y, ovec[vec_off+1]-ovec[vec_off], META_DUP);
 		else
 		    *x++ = NULL;
+		vec_off += 2;
 	    } while (*y++);
 	    setaparam(substravar, matches);
 	}
@@ -318,8 +330,7 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
     ovec = zalloc(ovecsize*sizeof(int));
 
     plaintext = ztrdup(*args);
-    unmetafy(plaintext, NULL);
-    subject_len = (int)strlen(plaintext);
+    unmetafy(plaintext, &subject_len);
 
     if (offset_start > 0 && offset_start >= subject_len)
 	ret = PCRE_ERROR_NOMATCH;
@@ -351,6 +362,7 @@ cond_pcre_match(char **a, int id)
     const char *pcre_err;
     char *lhstr, *rhre, *lhstr_plain, *rhre_plain, *avar=NULL;
     int r = 0, pcre_opts = 0, pcre_errptr, capcnt, *ov, ovsize;
+    int lhstr_plain_len, rhre_plain_len;
     int return_value = 0;
 
     if (zpcre_utf8_enabled())
@@ -362,8 +374,8 @@ cond_pcre_match(char **a, int id)
     rhre = cond_str(a,1,0);
     lhstr_plain = ztrdup(lhstr);
     rhre_plain = ztrdup(rhre);
-    unmetafy(lhstr_plain, NULL);
-    unmetafy(rhre_plain, NULL);
+    unmetafy(lhstr_plain, &lhstr_plain_len);
+    unmetafy(rhre_plain, &rhre_plain_len);
     pcre_pat = NULL;
     ov = NULL;
     ovsize = 0;
@@ -373,6 +385,9 @@ cond_pcre_match(char **a, int id)
 
     switch(id) {
 	 case CPCRE_PLAIN:
+		if ((int)strlen(rhre_plain) != rhre_plain_len) {
+		    zwarn("embedded NULs in PCRE pattern terminate pattern");
+		}
 		pcre_pat = pcre_compile(rhre_plain, pcre_opts, &pcre_err, &pcre_errptr, NULL);
 		if (pcre_pat == NULL) {
 		    zwarn("failed to compile regexp /%s/: %s", rhre, pcre_err);
@@ -381,7 +396,7 @@ cond_pcre_match(char **a, int id)
                 pcre_fullinfo(pcre_pat, NULL, PCRE_INFO_CAPTURECOUNT, &capcnt);
     		ovsize = (capcnt+1)*3;
 		ov = zalloc(ovsize*sizeof(int));
-    		r = pcre_exec(pcre_pat, NULL, lhstr_plain, strlen(lhstr_plain), 0, 0, ov, ovsize);
+    		r = pcre_exec(pcre_pat, NULL, lhstr_plain, lhstr_plain_len, 0, 0, ov, ovsize);
 		/* r < 0 => error; r==0 match but not enough size in ov
 		 * r > 0 => (r-1) substrings found; r==1 => no substrings
 		 */
diff --git a/Test/V07pcre.ztst b/Test/V07pcre.ztst
index ad1770712..03cb95791 100644
--- a/Test/V07pcre.ztst
+++ b/Test/V07pcre.ztst
@@ -131,6 +131,11 @@
 >78884; ZPCRE_OP: 25 30
 >90210; ZPCRE_OP: 31 36
 
+# Embedded NULs allowed in plaintext, but not in RE (although \0 as two-chars allowed)
+  [[ $'a\0bc\0d' =~ '^(a\0.)(.+)$' ]]
+  print "${#MATCH}; ${#match[1]}; ${#match[2]}"
+>6; 3; 3
+
 # Subshell because crash on failure
   ( setopt re_match_pcre
     [[ test.txt =~ '^(.*_)?(test)' ]]
-- 
2.13.1


[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 996 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] PCRE/NUL: pass NUL in for text, handle NUL out
  2017-06-15 20:40 ` [PATCH] PCRE/NUL: pass NUL in for text, handle NUL out Phil Pennock
@ 2017-06-16  6:41   ` Stephane Chazelas
  2017-06-17  3:10     ` Bart Schaefer
  2017-06-20 20:07   ` Phil Pennock
  2017-06-21  8:59   ` Peter Stephenson
  2 siblings, 1 reply; 7+ messages in thread
From: Stephane Chazelas @ 2017-06-16  6:41 UTC (permalink / raw)
  To: Phil Pennock; +Cc: zsh-workers

2017-06-15 16:40:50 -0400, Phil Pennock:
> The regexp itself is always NUL-terminated, so detect embedded NULs and
> zwarn on their presence.
[...]

Thanks Phil.

Maybe an improvement could be to replace those NULs with a
litteral \0. Though that would not be valid if the NUL is
preceded by a backslash or within \Q...\E. So it's probably
just as well to use the warning to let the user do the escaping
himself.



Having a way to use \Q...\E reliably would be nice by the way

As in a variant of:

    [[ $x =~ "...\Q$y\E" ]]

That still works if $y contains \E. (I can't think of a
reasonable way though).

bash32+/ksh93 support:

    [[ $x =~ ..."$y"...]]

as in quoting $y prevents interpretation of RE operators in
them. It's quite buggy in ksh93, and in bash, that means the
syntax is limited to POSIX ERE even on systems where EREs have
many extensions like  \<, back-references... (except if you
store the regex in a variable which you then leave unquoted:
ws='\<' we='\>'; [[ $var = $ws"$word"$we ]], not [[ $var =
\<"$word"\> ]]) so I'm not sure we want to go there.

Solution for now in zsh is to escape like:

   [[ $x =~ "\b\Q${word//\\E/\\E\\\\E\\Q}\E" ]]

(possibly still doesn't work in locales that have characters
whose encoding ends in 0x5c bytes (same as \), but nobody should
use those really in this day and age).

-- 
Stephane


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] PCRE/NUL: pass NUL in for text, handle NUL out
  2017-06-16  6:41   ` Stephane Chazelas
@ 2017-06-17  3:10     ` Bart Schaefer
  2017-06-17  6:31       ` Stephane Chazelas
  0 siblings, 1 reply; 7+ messages in thread
From: Bart Schaefer @ 2017-06-17  3:10 UTC (permalink / raw)
  To: zsh-workers

On Jun 16,  7:41am, Stephane Chazelas wrote:
}
} Solution for now in zsh is to escape like:
} 
}    [[ $x =~ "\b\Q${word//\\E/\\E\\\\E\\Q}\E" ]]

Hmm, wouldn't "\b\Q${(b)word}\E" be sufficient there?  If fact if
you've applied ${(b)word} do you even need \E and \Q ?


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] PCRE/NUL: pass NUL in for text, handle NUL out
  2017-06-17  3:10     ` Bart Schaefer
@ 2017-06-17  6:31       ` Stephane Chazelas
  0 siblings, 0 replies; 7+ messages in thread
From: Stephane Chazelas @ 2017-06-17  6:31 UTC (permalink / raw)
  To: Bart Schaefer; +Cc: zsh-workers

2017-06-16 20:10:49 -0700, Bart Schaefer:
> On Jun 16,  7:41am, Stephane Chazelas wrote:
> }
> } Solution for now in zsh is to escape like:
> } 
> }    [[ $x =~ "\b\Q${word//\\E/\\E\\\\E\\Q}\E" ]]
> 
> Hmm, wouldn't "\b\Q${(b)word}\E" be sufficient there?  In fact if
> you've applied ${(b)word} do you even need \E and \Q ?

Not really

Inside \Q...\E PCREs, only \E is special, and there's no
escaping you may do. It's like strong quotes. Changing ? to \?
would change the meaning of the regexp. And wouldn't help for \E

Outside of \Q...\E where what needs to be escaped on whether the
regexp has a (?x)), there are things like . or $ (or blanks with
(?x)) it would still leave unescaped.

PCREs (as opposed to some ERE implementations that have things
like \<, \=) are good though in that AFAICT, there are only \x
operators where x is an ASCII alnum, so adding a \ in front of
every ASCII non-alnum should be enough I would think (as long as
we're not inside [...] or things like \g{...}). So a an
equivalent of ${(b)var} for PCRE should not too difficult.

Quoting both ERE and PCRE is a problem in theory for (?x) and
blanks where "\ " is unspecified in ERE, but in practice, I
don't think any ERE implementation would ever have "\ " as a
special operator. So I think it should be a matter of quoting
only (and not more than):

ASCII [[:space:]]
$^*()+[]{}.?\|

(and again (from a security standpoint at least), that quoting
could be fooled in some locales like those that have BIG5-HKSCS
or GB18030 as the charset where some characters whose encoding
contains the encoding of other characters including ASCII ones).

-- 
Stephane


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] PCRE/NUL: pass NUL in for text, handle NUL out
  2017-06-15 20:40 ` [PATCH] PCRE/NUL: pass NUL in for text, handle NUL out Phil Pennock
  2017-06-16  6:41   ` Stephane Chazelas
@ 2017-06-20 20:07   ` Phil Pennock
  2017-06-21  8:59   ` Peter Stephenson
  2 siblings, 0 replies; 7+ messages in thread
From: Phil Pennock @ 2017-06-20 20:07 UTC (permalink / raw)
  To: zsh-workers

On 2017-06-15 at 16:40 -0400, Phil Pennock wrote:
> The regexp itself is always NUL-terminated, so detect embedded NULs and
> zwarn on their presence.

*nudge* Reminder that my commit-bit was handed in for safekeeping, so
someone else will need to merge this.

-Phil


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] PCRE/NUL: pass NUL in for text, handle NUL out
  2017-06-15 20:40 ` [PATCH] PCRE/NUL: pass NUL in for text, handle NUL out Phil Pennock
  2017-06-16  6:41   ` Stephane Chazelas
  2017-06-20 20:07   ` Phil Pennock
@ 2017-06-21  8:59   ` Peter Stephenson
  2017-06-21 22:48     ` Phil Pennock
  2 siblings, 1 reply; 7+ messages in thread
From: Peter Stephenson @ 2017-06-21  8:59 UTC (permalink / raw)
  To: zsh-workers

On Thu, 15 Jun 2017 16:40:50 -0400
Phil Pennock <zsh-workers+phil.pennock@spodhuis.org> wrote:
> diff --git a/Test/V07pcre.ztst b/Test/V07pcre.ztst
> index ad1770712..03cb95791 100644
> --- a/Test/V07pcre.ztst
> +++ b/Test/V07pcre.ztst
> @@ -131,6 +131,11 @@
>  >78884; ZPCRE_OP: 25 30
>  >90210; ZPCRE_OP: 31 36
>  
> +# Embedded NULs allowed in plaintext, but not in RE (although \0 as two-chars allowed)
> +  [[ $'a\0bc\0d' =~ '^(a\0.)(.+)$' ]]
> +  print "${#MATCH}; ${#match[1]}; ${#match[2]}"
> +>6; 3; 3

Not sure what's going on here but there needs to be a status /
description line.

Was there are a response to Stephane's further suggestion?

pws


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] PCRE/NUL: pass NUL in for text, handle NUL out
  2017-06-21  8:59   ` Peter Stephenson
@ 2017-06-21 22:48     ` Phil Pennock
  0 siblings, 0 replies; 7+ messages in thread
From: Phil Pennock @ 2017-06-21 22:48 UTC (permalink / raw)
  To: Peter Stephenson; +Cc: zsh-workers

On 2017-06-21 at 09:59 +0100, Peter Stephenson wrote:
> On Thu, 15 Jun 2017 16:40:50 -0400
> Phil Pennock <zsh-workers+phil.pennock@spodhuis.org> wrote:
> > +# Embedded NULs allowed in plaintext, but not in RE (although \0 as two-chars allowed)
> > +  [[ $'a\0bc\0d' =~ '^(a\0.)(.+)$' ]]
> > +  print "${#MATCH}; ${#match[1]}; ${#match[2]}"
> > +>6; 3; 3
> 
> Not sure what's going on here but there needs to be a status /
> description line.

How strange, that test passed when I tried it out locally but I would
have expected a failure.  You're right, sorry about that.

Status/description line:

0:ensure ASCII NUL passes in and out of matched plaintext

I didn't see a point in trying to figure out text-matches of exact
NUL-containing strings in the test-suite, when we could just match the
length of the strings.  There is logic for passing the full string in
now, and for fixing the length of $MATCH, and separately for the lengths
of the items in $match; testing both that $match works twice (ie, ovec
iteration is not busted) and that `.+` swallows the NUL, not just
explicit \0 matching.  It seemed a concise way to test all the changes
and detect regression.

> Was there are a response to Stephane's further suggestion?

The <<replace NUL with "\0">> suggestion was withdrawn within the same
paragraph, I agree that the behaviour I implemented is the best choice.
;)  As for changing \Q...\E support, that's an entirely different issue,
it's a feature request where this patch is a bug-fix.

My response is "sounds great, feature patches welcome; any issues with
this bug-fix?"

-Phil


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2017-06-21 23:06 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <CGME20170615204203epcas5p4b0a2bcbcfe0843d979c653df6b8352db@epcas5p4.samsung.com>
2017-06-15 20:40 ` [PATCH] PCRE/NUL: pass NUL in for text, handle NUL out Phil Pennock
2017-06-16  6:41   ` Stephane Chazelas
2017-06-17  3:10     ` Bart Schaefer
2017-06-17  6:31       ` Stephane Chazelas
2017-06-20 20:07   ` Phil Pennock
2017-06-21  8:59   ` Peter Stephenson
2017-06-21 22:48     ` Phil Pennock

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).