zsh-workers
 help / color / mirror / code / Atom feed
From: Oliver Kiddle <opk@zsh.org>
To: Zsh workers <zsh-workers@zsh.org>
Subject: Re: PATCH: migrate pcre module to pcre2
Date: Thu, 11 May 2023 01:26:53 +0200	[thread overview]
Message-ID: <45573-1683761213.262419@qFbu.zwAQ.DmqT> (raw)
In-Reply-To: <81584-1683329746.147485@6Tk5.mCsC.BbNC>

On 6 May, I wrote:
> Would it perhaps be useful to add support for PCRE2's alternative DFA
> algorithm? This might meet the needs Phil was considering with the re2
> based module he once posted but never committed. What form might that
> take?

This adds a -d option to pcre_match to do this. Any better suggestions
for the option letter?

The differences are documented in pcre2matching(3). That man page says
that in addition to not supporting various features like captures
that DFA is substantially slower. What this misses is that it avoids
the problem of pathological cases that exhibit terrible worst-case
performance with a backtracking algorithm.

It returns all possible matches from the same start position so the
result needs to be an array. This uses $match or whatever array is
specified with -a which is what we use for captures with the
backtracking algorithm.

Oliver

diff --git a/Doc/Zsh/mod_pcre.yo b/Doc/Zsh/mod_pcre.yo
index 6d073985d..da73ac85a 100644
--- a/Doc/Zsh/mod_pcre.yo
+++ b/Doc/Zsh/mod_pcre.yo
@@ -25,7 +25,7 @@ may result in faster matching.
 )
 findex(pcre_match)
 item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] \
-[ tt(-A) var(assoc) ] [ tt(-n) var(offset) ] [ tt(-b) ] var(string))(
+[ tt(-A) var(assoc) ] [ tt(-n) var(offset) ] [ tt(-bd) ] var(string))(
 Returns successfully if tt(string) matches the previously-compiled
 PCRE.
 
@@ -69,6 +69,10 @@ print -l $accum)
 )
 enditem()
 
+The option tt(-d) uses the alternative breadth-first DFA search algorithm of
+pcre. This sets tt(match), or the array given with tt(-a), to all the matches
+found from the same start point in the subject.
+
 The tt(zsh/pcre) module makes available the following test condition:
 
 startitem()
diff --git a/Src/Modules/pcre.c b/Src/Modules/pcre.c
index 6be1f76e2..96f3c6e65 100644
--- a/Src/Modules/pcre.c
+++ b/Src/Modules/pcre.c
@@ -305,30 +305,29 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
     pcre2_match_data *pcre_mdata = NULL;
     char *matched_portion = NULL;
     char *plaintext = NULL;
-    char *receptacle = NULL;
-    char *named = ".pcre.match";
+    char *receptacle;
+    char *named = NULL;
     int return_value = 1;
     /* The subject length and offset start are both int values in pcre_exec */
     int subject_len;
     int offset_start = 0;
     int want_offset_pair = 0;
+    int use_dfa = 0;
 
     if (pcre_pattern == NULL) {
 	zwarnnam(nam, "no pattern has been compiled");
 	return 1;
     }
 
-    matched_portion = "MATCH";
-    receptacle = "match";
-    if(OPT_HASARG(ops,c='a')) {
-	receptacle = OPT_ARG(ops,c);
-    }
-    if(OPT_HASARG(ops,c='v')) {
-	matched_portion = OPT_ARG(ops,c);
-    }
-    if (OPT_HASARG(ops, c='A')) {
-	named = OPT_ARG(ops, c);
+    if (!(use_dfa = OPT_ISSET(ops, 'd'))) {
+	matched_portion = OPT_HASARG(ops, c='v') ? OPT_ARG(ops, c) : "MATCH";
+	named = OPT_HASARG(ops, c='A') ? OPT_ARG(ops, c) : ".pcre.match";
+    } else if (OPT_HASARG(ops, c='v') || OPT_HASARG(ops, c='A')) {
+	zwarnnam(nam, "-d cannot be combined with -%c", c);
+	return 1;
     }
+    receptacle = OPT_HASARG(ops, 'a') ? OPT_ARG(ops, 'a') : "match";
+
     if(OPT_HASARG(ops,c='n')) { /* The offset position to start the search, in bytes. */
 	if ((offset_start = getposint(OPT_ARG(ops,c), nam)) < 0)
 	    return 1;
@@ -341,7 +340,25 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
 
     if (offset_start > 0 && offset_start >= subject_len)
 	ret = PCRE2_ERROR_NOMATCH;
-    else {
+    else if (use_dfa) {
+	PCRE2_SIZE old, wscount = 128, capcount = 128;
+	void *workspace = zhalloc(sizeof(int) * wscount);
+	pcre_mdata = pcre2_match_data_create(capcount, NULL);
+	do {
+	    ret = pcre2_dfa_match(pcre_pattern, (PCRE2_SPTR) plaintext, subject_len,
+		offset_start, 0, pcre_mdata, NULL, (int *) workspace, wscount);
+	    if (ret == PCRE2_ERROR_DFA_WSSIZE) {
+		old = wscount;
+		wscount += wscount / 2;
+		workspace = hrealloc(workspace, sizeof(int) * old, sizeof(int) * wscount);
+	    } else if (ret == 0) {
+		capcount += capcount / 2;
+		pcre2_match_data_free(pcre_mdata);
+		pcre_mdata = pcre2_match_data_create(capcount, NULL);
+	    } else
+		break;
+	} while(1);
+    } else {
 	pcre_mdata = pcre2_match_data_create_from_pattern(pcre_pattern, NULL);
 	ret = pcre2_match(pcre_pattern, (PCRE2_SPTR) plaintext, subject_len,
 		offset_start, 0, pcre_mdata, NULL);
@@ -350,12 +367,14 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
     if (ret==0) return_value = 0;
     else if (ret == PCRE2_ERROR_NOMATCH) /* no match */;
     else if (ret>0) {
-	zpcre_get_substrings(pcre_pattern, plaintext, pcre_mdata, ret, matched_portion,
-		receptacle, named, want_offset_pair, 0, 0);
+	zpcre_get_substrings(pcre_pattern, plaintext, pcre_mdata, ret,
+		matched_portion, receptacle, named, want_offset_pair, use_dfa, 0);
 	return_value = 0;
     }
     else {
-	zwarnnam(nam, "error in pcre2_match [%d]", ret);
+	PCRE2_UCHAR buffer[256];
+	pcre2_get_error_message(ret, buffer, sizeof(buffer));
+	zwarnnam(nam, "error in pcre matching for /%s/: %s", plaintext, buffer);
     }
     
     if (pcre_mdata)
@@ -466,7 +485,7 @@ static struct conddef cotab[] = {
 
 static struct builtin bintab[] = {
     BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimxs",  NULL),
-    BUILTIN("pcre_match",   0, bin_pcre_match,   1, 1, 0, "A:a:v:n:b",    NULL),
+    BUILTIN("pcre_match",   0, bin_pcre_match,   1, 1, 0, "A:a:v:n:bd",    NULL),
     BUILTIN("pcre_study",   0, bin_pcre_study,   0, 0, 0, NULL,    NULL)
 };
 
diff --git a/Test/V07pcre.ztst b/Test/V07pcre.ztst
index 027fea3aa..585698d05 100644
--- a/Test/V07pcre.ztst
+++ b/Test/V07pcre.ztst
@@ -196,3 +196,8 @@
 >  [package]=name-12345
 >  [version]=12345
 >)
+
+  pcre_compile 'cat(er(pillar)?)?'
+  pcre_match -d 'the caterpillar catchment' && print $match
+0:pcre_match -d
+>caterpillar cater cat


  parent reply	other threads:[~2023-05-10 23:27 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-05-05 23:35 Oliver Kiddle
2023-05-06  4:44 ` named capture groups with PCRE matching (Was: PATCH: migrate pcre module to pcre2) Stephane Chazelas
2023-05-06 18:56   ` Bart Schaefer
2023-05-06 23:06     ` Oliver Kiddle
2023-05-10 23:26 ` Oliver Kiddle [this message]
2023-05-12 23:12 ` PATCH: migrate pcre module to pcre2 Phil Pennock
2023-05-24 18:11   ` Oliver Kiddle
2023-06-13  7:35 ` Jun T
2023-06-19  8:20   ` Jun T

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=45573-1683761213.262419@qFbu.zwAQ.DmqT \
    --to=opk@zsh.org \
    --cc=zsh-workers@zsh.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).