zsh-workers
 help / color / mirror / code / Atom feed
From: Jon Strait <jstrait@moonloop.net>
To: zsh workers <zsh-workers@sunsite.dk>
Subject: PATCH: New options for the PCRE module
Date: Thu, 26 Feb 2009 23:15:50 -0800	[thread overview]
Message-ID: <49A79326.5070703@moonloop.net> (raw)

[-- Attachment #1: Type: text/plain, Size: 1770 bytes --]

Hi all,

I thought the PCRE module could use a little enhancement, so I added a 
few things that may be useful to some.

Let's see if I can get through all of this before the coffee wears off...

1.  A new '-s' option to pcre_compile.  This is the frequently set 
PCRE_DOTALL option, allowing the dot character to match a newline as well.

2. For pcre_match, a '-n offset' option for starting the search at the 
given byte offset in the match string, and a '-b' option for setting the 
variable ZPCRE_OP to the pair of offsets delimiting the entire 
successful pattern match.  For example, if a pattern matches with the 
'-b' option set, a ZPCRE_OP set to the string "32 45" indicates that the 
entire match started at byte position 32 and ended at byte position 44.  
That is, PCRE reports the zero-based, half-open byte range from 32 up to, 
but not including, 45.

All of this is to enable the 'find all' functionality.  For example, if 
I want all of the non-overlapping matches within a string, I can now do:

accum=()

pcre_match -b -- $match_string

while [[ -n $ZPCRE_OP ]] do
    b=($=ZPCRE_OP)
    accum+=$MATCH
    pcre_match -b -n $(( b[2] )) -- $match_string
done
   
print -l $accum

To be on the safe side regarding multi-byte characters, I'm assuming 
that the returned offsets are only meant to be passed back to pcre_match 
and not used for indexing into the match string, because the offsets are 
byte counts, not character counts.

3.  A needed correction: all of the module's external variables are now 
unset on each match attempt, so that a failed match will be obvious.

Could someone please point me to the doc files that would need updating 
(for the zshmodules man page)?  Or, if someone here has that part 
automated, I can send them whatever targeted write-up they want.



[-- Attachment #2: pcre.patch --]
[-- Type: text/x-patch, Size: 4207 bytes --]

--- pcre.c	2007-07-09 02:30:42.000000000 -0700
+++ pcre-new.c	2009-02-26 22:10:46.000000000 -0800
@@ -82,6 +82,7 @@
     if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS;
     if(OPT_ISSET(ops,'m')) pcre_opts |= PCRE_MULTILINE;
     if(OPT_ISSET(ops,'x')) pcre_opts |= PCRE_EXTENDED;
+    if(OPT_ISSET(ops,'s')) pcre_opts |= PCRE_DOTALL;
     
     if (zpcre_utf8_enabled())
 	pcre_opts |= PCRE_UTF8;
@@ -137,20 +138,23 @@
 
 /**/
 static int
-zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar, int matchedinarr)
+zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar, 
+    int want_offset_pair, int matchedinarr)
 {
     char **captures, *match_all, **matches;
+    char offset_all[50];
     int capture_start = 1;
 
     if (matchedinarr)
 	capture_start = 0;
-    if (matchvar == NULL)
-	matchvar = "MATCH";
-    if (substravar == NULL)
-	substravar = "match";
-
+    
     /* captures[0] will be entire matched string, [1] first substring */
-    if(!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) {
+    if (!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) {
+	/* Set to the offsets of the complete match */
+	if (want_offset_pair) {
+	    sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
+	    setsparam("ZPCRE_OP", ztrdup(offset_all));
+	}
 	match_all = ztrdup(captures[0]);
 	setsparam(matchvar, match_all);
 	matches = zarrdup(&captures[capture_start]);
@@ -163,12 +167,30 @@
 
 /**/
 static int
+getposint(char *instr, char *nam)
+{
+    char *eptr;
+    int ret;
+
+    ret = (int)zstrtol(instr, &eptr, 10);
+    if (*eptr || ret < 0) {
+	zwarnnam(nam, "integer expected: %s", instr);
+	return -1;
+    }
+
+    return ret;
+}
+
+/**/
+static int
 bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
 {
     int ret, capcount, *ovec, ovecsize, c;
     char *matched_portion = NULL;
     char *receptacle = NULL;
     int return_value = 1;
+    int offset_start = 0;
+    int want_offset_pair = 0;
 
     if (pcre_pattern == NULL) {
 	zwarnnam(nam, "no pattern has been compiled");
@@ -181,6 +203,12 @@
     if(OPT_HASARG(ops,c='v')) {
 	matched_portion = OPT_ARG(ops,c);
     }
+    if(OPT_HASARG(ops,c='n')) { /* The offset position to start the search */
+	offset_start = getposint(OPT_ARG(ops,c), nam);
+    }
+    /* For the entire match, 'Return' the offset positions in addition to the matched string */
+    if(OPT_ISSET(ops,'b')) want_offset_pair = 1; 
+    
     if(!*args) {
 	zwarnnam(nam, "not enough arguments");
     }
@@ -194,12 +222,22 @@
     ovecsize = (capcount+1)*3;
     ovec = zalloc(ovecsize*sizeof(int));
     
-    ret = pcre_exec(pcre_pattern, pcre_hints, *args, strlen(*args), 0, 0, ovec, ovecsize);
+    ret = pcre_exec(pcre_pattern, pcre_hints, *args, strlen(*args), offset_start, 0, ovec, ovecsize);
+
+    if (matched_portion == NULL)
+	matched_portion = "MATCH";
+    if (receptacle == NULL)
+	receptacle = "match";
+
+    /* Reset the external variables */
+    unsetparam(matched_portion);
+    unsetparam(receptacle);
+    unsetparam("ZPCRE_OP");
     
     if (ret==0) return_value = 0;
     else if (ret==PCRE_ERROR_NOMATCH) /* no match */;
     else if (ret>0) {
-	zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, 0);
+	zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, want_offset_pair, 0);
 	return_value = 0;
     }
     else {
@@ -258,7 +296,7 @@
 		    break;
 		}
                 else if (r>0) {
-		    zpcre_get_substrings(lhstr, ov, r, NULL, avar, isset(BASHREMATCH));
+		    zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0, isset(BASHREMATCH));
 		    return_value = 1;
 		    break;
 		}
@@ -289,8 +327,8 @@
 #endif /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
 
 static struct builtin bintab[] = {
-    BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimx",  NULL),
-    BUILTIN("pcre_match",   0, bin_pcre_match,   1, 1, 0, "a:v:",    NULL),
+    BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimxs",  NULL),
+    BUILTIN("pcre_match",   0, bin_pcre_match,   1, 1, 0, "a:v:n:b",    NULL),
     BUILTIN("pcre_study",   0, bin_pcre_study,   0, 0, 0, NULL,    NULL)
 };
 

             reply	other threads:[~2009-02-27  7:27 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-02-27  7:15 Jon Strait [this message]
2009-02-27  8:33 ` Phil Pennock
2009-02-27 23:10 Jon Strait

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=49A79326.5070703@moonloop.net \
    --to=jstrait@moonloop.net \
    --cc=zsh-workers@sunsite.dk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).