zsh-workers
 help / color / mirror / code / Atom feed
* PATCH: New options for the PCRE module (to replace my previous)
@ 2009-03-25  9:20 Jon Strait
  2009-03-25 10:02 ` Peter Stephenson
  0 siblings, 1 reply; 4+ messages in thread
From: Jon Strait @ 2009-03-25  9:20 UTC (permalink / raw)
  To: zsh workers

[-- Attachment #1: Type: text/plain, Size: 309 bytes --]

A few adjustments since last time, with documentation.

No reset of the special variables is done on a match failure.  Nobody
has given any feedback about this since my email to Phil, and I have
also discovered that this behavior is already entrenched within the
extended globbing flags (#m and #b).


Jon



[-- Attachment #2: pcre.c.patch --]
[-- Type: text/x-patch, Size: 3747 bytes --]

--- pcre-old.c	2009-02-08 14:23:52.000000000 -0800
+++ pcre.c	2009-03-24 18:12:04.000000000 -0700
@@ -138,9 +138,11 @@
 
 /**/
 static int
-zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar, int matchedinarr)
+zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar, 
+    int want_offset_pair, int matchedinarr)
 {
     char **captures, *match_all, **matches;
+    char offset_all[50];
     int capture_start = 1;
 
     if (matchedinarr)
@@ -149,9 +151,14 @@
 	matchvar = "MATCH";
     if (substravar == NULL)
 	substravar = "match";
-
+    
     /* captures[0] will be entire matched string, [1] first substring */
-    if(!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) {
+    if (!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) {
+	/* Set to the offsets of the complete match */
+	if (want_offset_pair) {
+	    sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
+	    setsparam("ZPCRE_OP", ztrdup(offset_all));
+	}
 	match_all = ztrdup(captures[0]);
 	setsparam(matchvar, match_all);
 	matches = zarrdup(&captures[capture_start]);
@@ -164,12 +171,32 @@
 
 /**/
 static int
+getposint(char *instr, char *nam)
+{
+    char *eptr;
+    int ret;
+
+    ret = (int)zstrtol(instr, &eptr, 10);
+    if (*eptr || ret < 0) {
+	zwarnnam(nam, "integer expected: %s", instr);
+	return -1;
+    }
+
+    return ret;
+}
+
+/**/
+static int
 bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
 {
     int ret, capcount, *ovec, ovecsize, c;
     char *matched_portion = NULL;
     char *receptacle = NULL;
     int return_value = 1;
+    /* The subject length and offset start are both int values in pcre_exec */
+    int subject_len;
+    int offset_start = 0;
+    int want_offset_pair = 0;
 
     if (pcre_pattern == NULL) {
 	zwarnnam(nam, "no pattern has been compiled");
@@ -182,6 +209,12 @@
     if(OPT_HASARG(ops,c='v')) {
 	matched_portion = OPT_ARG(ops,c);
     }
+    if(OPT_HASARG(ops,c='n')) { /* The offset position to start the search, in bytes. */
+	offset_start = getposint(OPT_ARG(ops,c), nam);
+    }
+    /* For the entire match, 'Return' the offset byte positions instead of the matched string */
+    if(OPT_ISSET(ops,'b')) want_offset_pair = 1; 
+    
     if(!*args) {
 	zwarnnam(nam, "not enough arguments");
     }
@@ -195,12 +228,17 @@
     ovecsize = (capcount+1)*3;
     ovec = zalloc(ovecsize*sizeof(int));
     
-    ret = pcre_exec(pcre_pattern, pcre_hints, *args, strlen(*args), 0, 0, ovec, ovecsize);
-    
+    subject_len = (int)strlen(*args);
+
+    if (offset_start < 0 || offset_start >= subject_len)
+	ret = PCRE_ERROR_NOMATCH;
+    else
+	ret = pcre_exec(pcre_pattern, pcre_hints, *args, subject_len, offset_start, 0, ovec, ovecsize);
+
     if (ret==0) return_value = 0;
     else if (ret==PCRE_ERROR_NOMATCH) /* no match */;
     else if (ret>0) {
-	zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, 0);
+	zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, want_offset_pair, 0);
 	return_value = 0;
     }
     else {
@@ -259,7 +297,7 @@
 		    break;
 		}
                 else if (r>0) {
-		    zpcre_get_substrings(lhstr, ov, r, NULL, avar, isset(BASHREMATCH));
+		    zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0, isset(BASHREMATCH));
 		    return_value = 1;
 		    break;
 		}
@@ -291,7 +329,7 @@
 
 static struct builtin bintab[] = {
     BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimxs",  NULL),
-    BUILTIN("pcre_match",   0, bin_pcre_match,   1, 1, 0, "a:v:",    NULL),
+    BUILTIN("pcre_match",   0, bin_pcre_match,   1, 1, 0, "a:v:n:b",    NULL),
     BUILTIN("pcre_study",   0, bin_pcre_study,   0, 0, 0, NULL,    NULL)
 };
 

[-- Attachment #3: mod_pcre.yo.patch --]
[-- Type: text/x-patch, Size: 2495 bytes --]

--- mod_pcre-old.yo	2007-05-02 03:58:57.000000000 -0700
+++ mod_pcre.yo	2009-03-25 00:36:47.000000000 -0700
@@ -6,7 +6,7 @@
 
 startitem()
 findex(pcre_compile)
-item(tt(pcre_compile) [ tt(-aimx) ] var(PCRE))(
+item(tt(pcre_compile) [ tt(-aimxs) ] var(PCRE))(
 Compiles a perl-compatible regular expression.
 
 Option tt(-a) will force the pattern to be anchored.
@@ -15,6 +15,8 @@
 tt(^) and tt($) will match newlines within the pattern.
 Option tt(-x) will compile an extended pattern, wherein
 whitespace and tt(#) comments are ignored.
+Option tt(-s) makes the dot metacharacter match all characters, 
+including those that indicate newline.
 )
 findex(pcre_study)
 item(tt(pcre_study))(
@@ -22,7 +24,7 @@
 matching.
 )
 findex(pcre_match)
-item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] var(string))(
+item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] [ tt(-n) var(offset) ] [ tt(-b) ] var(string))(
 Returns successfully if tt(string) matches the previously-compiled
 PCRE.
 
@@ -32,7 +34,36 @@
 case it will set the array var(arr).  Similarly, the variable
 var(MATCH) will be set to the entire matched portion of the
 string, unless the tt(-v) option is given, in which case the variable
-var(var) will be set.
+var(var) will be set.  A tt(-n) option starts searching for a match from the byte
+var(offset) position in var(string).  If the tt(-b) option is set, the variable 
+var(ZPCRE_OP) will be set to an offset pair string, representing the byte offset positions of 
+the entire matched portion within the var(string).  For example, a var(ZPCRE_OP)
+set to "32 45" indicates that the matched portion began on byte offset 32 and ended
+on byte offset 44.  Here, byte offset position 45 is the position directly after the matched
+portion.  Keep in mind that the byte position isn't necessarily the same as the character
+position when UTF-8 characters are involved.  Consequently, the byte offset positions
+are only to be relied on in the context of using them for subsequent searches of
+var(string), using an offset position as an argument to the tt(-n) option.  This is mostly
+used to implement the "find all non-overlapping matches" functionality.
+
+A simple example of "find all non-overlapping matches":
+
+example(
+pcre_compile -m "^example\spattern$"
+
+accum=()
+
+pcre_match -b -- $string
+
+while [[ $? -eq 0 ]] do
+    b=($=ZPCRE_OP)
+    accum+=$match[1]
+    pcre_match -b -n $b[2] -- $string
+done
+   
+print -l $accum
+
+)
 )
 enditem()
 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: PATCH: New options for the PCRE module (to replace my previous)
  2009-03-25  9:20 PATCH: New options for the PCRE module (to replace my previous) Jon Strait
@ 2009-03-25 10:02 ` Peter Stephenson
  2009-03-25 11:21   ` Jon Strait
  0 siblings, 1 reply; 4+ messages in thread
From: Peter Stephenson @ 2009-03-25 10:02 UTC (permalink / raw)
  To: zsh workers

On Wed, 25 Mar 2009 02:20:02 -0700
Jon Strait <jstrait@moonloop.net> wrote:
> A few adjustments since last time, with documentation.
> 
> No reset of the special variables is done on a match failure.

Er, I can't remember what Phil said (I haven't been following this in any
detail), but the documentation now says variables aren't altered on a
failure, so presumably that is now incorrect.  I don't think this is
crucial as long as it's documented correctly.

Could you in any case send a documentation patch against the current source
and with lines wrapped to 80 columns?

Thanks.

-- 
Peter Stephenson <pws@csr.com>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: PATCH: New options for the PCRE module (to replace my previous)
  2009-03-25 10:02 ` Peter Stephenson
@ 2009-03-25 11:21   ` Jon Strait
  2009-03-25 11:30     ` Peter Stephenson
  0 siblings, 1 reply; 4+ messages in thread
From: Jon Strait @ 2009-03-25 11:21 UTC (permalink / raw)
  Cc: zsh workers


[-- Attachment #1.1: Type: text/plain, Size: 870 bytes --]

Peter Stephenson wrote:
> On Wed, 25 Mar 2009 02:20:02 -0700
> Jon Strait <jstrait@moonloop.net> wrote:
>   
>> A few adjustments since last time, with documentation.
>>
>> No reset of the special variables is done on a match failure.
>>     
>
> Er, I can't remember what Phil said (I haven't been following this in any
> detail), but the documentation now says variables aren't altered on a
> failure, so presumably that is now incorrect.  I don't think this is
> crucial as long as it's documented correctly.
>
> Could you in any case send a documentation patch against the current source
> and with lines wrapped to 80 columns?
>
> Thanks.
>
>   
No, I ended up keeping the original behavior:  On match failure, none of 
the special variables are modified (reset).

Here is the updated doc patch.

Please let me know if anything I added isn't clear enough.  Thanks.

[-- Attachment #1.2: Type: text/html, Size: 1352 bytes --]

[-- Attachment #2: mod_pcre.yo.patch --]
[-- Type: text/x-patch, Size: 2654 bytes --]

--- mod_pcre-old.yo	2009-01-15 01:49:06.000000000 -0800
+++ mod_pcre.yo	2009-03-25 03:55:58.000000000 -0700
@@ -6,7 +6,7 @@
 
 startitem()
 findex(pcre_compile)
-item(tt(pcre_compile) [ tt(-aimx) ] var(PCRE))(
+item(tt(pcre_compile) [ tt(-aimxs) ] var(PCRE))(
 Compiles a perl-compatible regular expression.
 
 Option tt(-a) will force the pattern to be anchored.
@@ -15,6 +15,8 @@
 tt(^) and tt($) will match newlines within the pattern.
 Option tt(-x) will compile an extended pattern, wherein
 whitespace and tt(#) comments are ignored.
+Option tt(-s) makes the dot metacharacter match all characters, 
+including those that indicate newline.
 )
 findex(pcre_study)
 item(tt(pcre_study))(
@@ -22,7 +24,8 @@
 matching.
 )
 findex(pcre_match)
-item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] var(string))(
+item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] \
+[ tt(-n) var(offset) ] [ tt(-b) ] var(string))(
 Returns successfully if tt(string) matches the previously-compiled
 PCRE.
 
@@ -33,8 +36,38 @@
 case it will set the array var(arr).  Similarly, the variable
 var(MATCH) will be set to the entire matched portion of the
 string, unless the tt(-v) option is given, in which case the variable
-var(var) will be set.
-No variables are altered if there is no successful match.
+var(var) will be set. 
+No variables are altered if there is no successful match. 
+A tt(-n) option starts searching for a match from the 
+byte var(offset) position in var(string).  If the tt(-b) option is given, 
+the variable var(ZPCRE_OP) will be set to an offset pair string, 
+representing the byte offset positions of the entire matched portion 
+within the var(string).  For example, a var(ZPCRE_OP) set to "32 45" indicates
+that the matched portion began on byte offset 32 and ended on byte offset 44. 
+Here, byte offset position 45 is the position directly after the matched
+portion.  Keep in mind that the byte position isn't necessarily the same 
+as the character position when UTF-8 characters are involved.  
+Consequently, the byte offset positions are only to be relied on in the
+context of using them for subsequent searches on var(string), using an offset
+position as an argument to the tt(-n) option.  This is mostly
+used to implement the "find all non-overlapping matches" functionality.
+
+A simple example of "find all non-overlapping matches":
+
+example(
+string="The following zip codes: 78884 90210 99513"
+pcre_compile -m "\d{5}"
+accum=()
+pcre_match -b -- $string
+while [[ $? -eq 0 ]] do
+    b=($=ZPCRE_OP)
+    accum+=$MATCH
+    pcre_match -b -n $b[2] -- $string
+done
+print -l $accum
+
+
+)
 )
 enditem()
 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: PATCH: New options for the PCRE module (to replace my previous)
  2009-03-25 11:21   ` Jon Strait
@ 2009-03-25 11:30     ` Peter Stephenson
  0 siblings, 0 replies; 4+ messages in thread
From: Peter Stephenson @ 2009-03-25 11:30 UTC (permalink / raw)
  To: zsh workers

On Wed, 25 Mar 2009 04:21:36 -0700
Jon Strait <jstrait@moonloop.net> wrote:
> Here is the updated doc patch.

I've committed both parts of the change (stripped some trailing whitespace
but otherwise it's unchanged).  Thanks.

-- 
Peter Stephenson <pws@csr.com>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2009-03-25 11:33 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-03-25  9:20 PATCH: New options for the PCRE module (to replace my previous) Jon Strait
2009-03-25 10:02 ` Peter Stephenson
2009-03-25 11:21   ` Jon Strait
2009-03-25 11:30     ` Peter Stephenson

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).