zsh-workers
 help / color / mirror / code / Atom feed
* PATCH: migrate pcre module to pcre2
@ 2023-05-05 23:35 Oliver Kiddle
  2023-05-06  4:44 ` named capture groups with PCRE matching (Was: PATCH: migrate pcre module to pcre2) Stephane Chazelas
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Oliver Kiddle @ 2023-05-05 23:35 UTC (permalink / raw)
  To: Zsh workers

The old PCRE library that zsh uses is considered "end of life, and is no
longer being actively maintained". The newer PCRE2 has a slighty
different interface. It has been around since 2015 so this is not new.
Actual version numbers are 8.x and 10.x rather than 1 and 2. Migrating
seems to be the sensible way forward rather than, for example, adding a
separate zsh/pcre2 module. That way, zsh users don't need to be
concerned with the change.

I've mapped the pcre_study builtin which was named according to the
old interface onto pcre2_jit_compile. We previously had autoconf tests
for all of pcre_compile, pcre_study and pcre_exec. Building PCRE2 with
--disable-jit doesn't affect what symbols the resulting library contains
so I saw no point in checking for anything more than pcre2_compile_8
(and pcre2.h).

It remains the case that if you build with --enable-pcre, then pcre is
linked into everything. As before, LDFLAGS='-Wl,--as-needed' (or
-zignore depending on your linker) is a workaround.

The new interface handles embedded NULs in the regex so the warnings
that were added for that in workers/41308 could be removed and test
cases adapted. Aside from that this adds a test case for an empty capture
which came about as part of satisfying myself that I wasn't introducing
a bug when removing the *x++ = NULL; line in zpcre_get_substrings().

Would it perhaps be useful to add support for PCRE2's alternative DFA
algorithm? This might meet the needs Phil was considering with the re2
based module he once posted but never committed. What form might that
take?

When looking at PCRE documentation for this, other ideas for extensions
I saw would be to support:

 * named capture groups
   (either to an associative array or to variables directly?)
 * callouts
   (would need to consider the interface, either eval a string
   or name a function, then either variables or parameters for
   the callout interface. And reentrancy may be a concern.)

Oliver

diff --git a/Src/Modules/pcre.c b/Src/Modules/pcre.c
index 46875a59b..079ecc2c5 100644
--- a/Src/Modules/pcre.c
+++ b/Src/Modules/pcre.c
@@ -34,11 +34,11 @@
 #define CPCRE_PLAIN 0
 
 /**/
-#if defined(HAVE_PCRE_COMPILE) && defined(HAVE_PCRE_EXEC)
-#include <pcre.h>
+#if defined(HAVE_PCRE2_COMPILE_8) && defined(HAVE_PCRE2_H)
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
 
-static pcre *pcre_pattern;
-static pcre_extra *pcre_hints;
+static pcre2_code *pcre_pattern;
 
 /**/
 static int
@@ -54,8 +54,8 @@ zpcre_utf8_enabled(void)
 	return 0;
 
     if ((have_utf8_pcre == -1) &&
-	(pcre_config(PCRE_CONFIG_UTF8, &have_utf8_pcre))) {
-	    have_utf8_pcre = -2; /* erk, failed to ask */
+       (pcre2_config(PCRE2_CONFIG_UNICODE, &have_utf8_pcre))) {
+           have_utf8_pcre = -2; /* erk, failed to ask */
     }
 
     return (have_utf8_pcre == 1) && (!strcmp(nl_langinfo(CODESET), "UTF-8"));
@@ -69,115 +69,87 @@ zpcre_utf8_enabled(void)
 static int
 bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func))
 {
-    int pcre_opts = 0, pcre_errptr, target_len;
-    const char *pcre_error;
+    uint32_t pcre_opts = 0;
+    int target_len;
+    int pcre_error;
+    PCRE2_SIZE pcre_offset;
     char *target;
     
-    if(OPT_ISSET(ops,'a')) pcre_opts |= PCRE_ANCHORED;
-    if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS;
-    if(OPT_ISSET(ops,'m')) pcre_opts |= PCRE_MULTILINE;
-    if(OPT_ISSET(ops,'x')) pcre_opts |= PCRE_EXTENDED;
-    if(OPT_ISSET(ops,'s')) pcre_opts |= PCRE_DOTALL;
+    if (OPT_ISSET(ops, 'a')) pcre_opts |= PCRE2_ANCHORED;
+    if (OPT_ISSET(ops, 'i')) pcre_opts |= PCRE2_CASELESS;
+    if (OPT_ISSET(ops, 'm')) pcre_opts |= PCRE2_MULTILINE;
+    if (OPT_ISSET(ops, 'x')) pcre_opts |= PCRE2_EXTENDED;
+    if (OPT_ISSET(ops, 's')) pcre_opts |= PCRE2_DOTALL;
     
     if (zpcre_utf8_enabled())
-	pcre_opts |= PCRE_UTF8;
-
-#ifdef HAVE_PCRE_STUDY
-    if (pcre_hints)
-#ifdef PCRE_CONFIG_JIT
-	pcre_free_study(pcre_hints);
-#else
-	pcre_free(pcre_hints);
-#endif
-    pcre_hints = NULL;
-#endif
+	pcre_opts |= PCRE2_UTF;
 
     if (pcre_pattern)
-	pcre_free(pcre_pattern);
+	pcre2_code_free(pcre_pattern);
     pcre_pattern = NULL;
 
     target = ztrdup(*args);
     unmetafy(target, &target_len);
 
-    if ((int)strlen(target) != target_len) {
-	zwarnnam(nam, "embedded NULs in PCRE pattern terminate pattern");
-    }
-
-    pcre_pattern = pcre_compile(target, pcre_opts, &pcre_error, &pcre_errptr, NULL);
+    pcre_pattern = pcre2_compile((PCRE2_SPTR) target, (PCRE2_SIZE) target_len,
+	    pcre_opts, &pcre_error, &pcre_offset, NULL);
 
     free(target);
 
     if (pcre_pattern == NULL)
     {
-	zwarnnam(nam, "error in regex: %s", pcre_error);
+	PCRE2_UCHAR buffer[256];
+	pcre2_get_error_message(pcre_error, buffer, sizeof(buffer));
+	zwarnnam(nam, "error in regex: %s", buffer);
 	return 1;
     }
     
     return 0;
 }
 
-/**/
-#ifdef HAVE_PCRE_STUDY
-
 /**/
 static int
 bin_pcre_study(char *nam, UNUSED(char **args), UNUSED(Options ops), UNUSED(int func))
 {
-    const char *pcre_error;
-
     if (pcre_pattern == NULL)
     {
 	zwarnnam(nam, "no pattern has been compiled for study");
 	return 1;
     }
-    
-    if (pcre_hints)
-#ifdef PCRE_CONFIG_JIT
-	pcre_free_study(pcre_hints);
-#else
-	pcre_free(pcre_hints);
-#endif
-    pcre_hints = NULL;
 
-    pcre_hints = pcre_study(pcre_pattern, 0, &pcre_error);
-    if (pcre_error != NULL)
-    {
-	zwarnnam(nam, "error while studying regex: %s", pcre_error);
-	return 1;
+    int jit = 0;
+    if (!pcre2_config(PCRE2_CONFIG_JIT, &jit) && jit) {
+	if (pcre2_jit_compile(pcre_pattern, PCRE2_JIT_COMPLETE) < 0) {
+	    zwarnnam(nam, "error while studying regex");
+	    return 1;
+	}
     }
     
     return 0;
 }
 
-/**/
-#else /* !HAVE_PCRE_STUDY */
-
-# define bin_pcre_study bin_notavail
-
-/**/
-#endif /* !HAVE_PCRE_STUDY */
-
-/**/
 static int
-zpcre_get_substrings(char *arg, int *ovec, int captured_count, char *matchvar,
-		     char *substravar, int want_offset_pair, int matchedinarr,
-		     int want_begin_end)
+zpcre_get_substrings(char *arg, pcre2_match_data *mdata, int captured_count,
+	char *matchvar, char *substravar, int want_offset_pair,
+	int matchedinarr, int want_begin_end)
 {
-    char **captures, *match_all, **matches;
+    PCRE2_SIZE *ovec;
+    char *match_all, **matches;
     char offset_all[50];
     int capture_start = 1;
 
     if (matchedinarr) {
-	/* bash-style captures[0] entire-matched string in the array */
+	/* bash-style ovec[0] entire-matched string in the array */
 	capture_start = 0;
     }
 
-    /* captures[0] will be entire matched string, [1] first substring */
-    if (!pcre_get_substring_list(arg, ovec, captured_count, (const char ***)&captures)) {
-	int nelem = arrlen(captures)-1;
+    /* ovec[0] will be entire matched string, [1] first substring */
+    ovec = pcre2_get_ovector_pointer(mdata);
+    if (ovec) {
+	int nelem = captured_count - 1;
 	/* Set to the offsets of the complete match */
 	if (want_offset_pair) {
-	    sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
+	    sprintf(offset_all, "%ld %ld", ovec[0], ovec[1]);
 	    setsparam("ZPCRE_OP", ztrdup(offset_all));
 	}
 	/*
@@ -186,7 +158,7 @@ zpcre_get_substrings(char *arg, int *ovec, int captured_count, char *matchvar,
 	 * ovec is length 2*(1+capture_list_length)
 	 */
 	if (matchvar) {
-	    match_all = metafy(captures[0], ovec[1] - ovec[0], META_DUP);
+	    match_all = metafy(arg + ovec[0], ovec[1] - ovec[0], META_DUP);
 	    setsparam(matchvar, match_all);
 	}
 	/*
@@ -201,16 +173,12 @@ zpcre_get_substrings(char *arg, int *ovec, int captured_count, char *matchvar,
 	 */
 	if (substravar &&
 	    (!want_begin_end || nelem)) {
-	    char **x, **y;
+	    char **x;
 	    int vec_off, i;
-	    y = &captures[capture_start];
 	    matches = x = (char **) zalloc(sizeof(char *) * (captured_count+1-capture_start));
-	    for (i = capture_start; i < captured_count; i++, y++) {
+	    for (i = capture_start; i < captured_count; i++) {
 		vec_off = 2*i;
-		if (*y)
-		    *x++ = metafy(*y, ovec[vec_off+1]-ovec[vec_off], META_DUP);
-		else
-		    *x++ = NULL;
+		*x++ = metafy(arg + ovec[vec_off], ovec[vec_off+1]-ovec[vec_off], META_DUP);
 	    }
 	    *x = NULL;
 	    setaparam(substravar, matches);
@@ -247,7 +215,8 @@ zpcre_get_substrings(char *arg, int *ovec, int captured_count, char *matchvar,
 	    setiparam("MEND", offs + !isset(KSHARRAYS) - 1);
 	    if (nelem) {
 		char **mbegin, **mend, **bptr, **eptr;
-		int i, *ipair;
+		int i;
+		size_t *ipair;
 
 		bptr = mbegin = zalloc(sizeof(char*)*(nelem+1));
 		eptr = mend = zalloc(sizeof(char*)*(nelem+1));
@@ -287,8 +256,6 @@ zpcre_get_substrings(char *arg, int *ovec, int captured_count, char *matchvar,
 		setaparam("mend", mend);
 	    }
 	}
-
-	pcre_free_substring_list((const char **)captures);
     }
 
     return 0;
@@ -314,7 +281,8 @@ getposint(char *instr, char *nam)
 static int
 bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
 {
-    int ret, capcount, *ovec, ovecsize, c;
+    int ret, c;
+    pcre2_match_data *pcre_mdata = NULL;
     char *matched_portion = NULL;
     char *plaintext = NULL;
     char *receptacle = NULL;
@@ -344,36 +312,30 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
     /* For the entire match, 'Return' the offset byte positions instead of the matched string */
     if(OPT_ISSET(ops,'b')) want_offset_pair = 1;
 
-    if ((ret = pcre_fullinfo(pcre_pattern, pcre_hints, PCRE_INFO_CAPTURECOUNT, &capcount)))
-    {
-	zwarnnam(nam, "error %d in fullinfo", ret);
-	return 1;
-    }
-
-    ovecsize = (capcount+1)*3;
-    ovec = zalloc(ovecsize*sizeof(int));
-
     plaintext = ztrdup(*args);
     unmetafy(plaintext, &subject_len);
 
     if (offset_start > 0 && offset_start >= subject_len)
-	ret = PCRE_ERROR_NOMATCH;
-    else
-	ret = pcre_exec(pcre_pattern, pcre_hints, plaintext, subject_len, offset_start, 0, ovec, ovecsize);
+	ret = PCRE2_ERROR_NOMATCH;
+    else {
+	pcre_mdata = pcre2_match_data_create_from_pattern(pcre_pattern, NULL);
+	ret = pcre2_match(pcre_pattern, (PCRE2_SPTR) plaintext, subject_len,
+		offset_start, 0, pcre_mdata, NULL);
+    }
 
     if (ret==0) return_value = 0;
-    else if (ret==PCRE_ERROR_NOMATCH) /* no match */;
+    else if (ret == PCRE2_ERROR_NOMATCH) /* no match */;
     else if (ret>0) {
-	zpcre_get_substrings(plaintext, ovec, ret, matched_portion, receptacle,
+	zpcre_get_substrings(plaintext, pcre_mdata, ret, matched_portion, receptacle,
 			     want_offset_pair, 0, 0);
 	return_value = 0;
     }
     else {
-	zwarnnam(nam, "error in pcre_exec [%d]", ret);
+	zwarnnam(nam, "error in pcre2_match [%d]", ret);
     }
     
-    if (ovec)
-	zfree(ovec, ovecsize*sizeof(int));
+    if (pcre_mdata)
+	pcre2_match_data_free(pcre_mdata);
     zsfree(plaintext);
 
     return return_value;
@@ -383,17 +345,19 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
 static int
 cond_pcre_match(char **a, int id)
 {
-    pcre *pcre_pat;
-    const char *pcre_err;
+    pcre2_code *pcre_pat = NULL;
+    int pcre_err;
+    PCRE2_SIZE pcre_erroff;
     char *lhstr, *rhre, *lhstr_plain, *rhre_plain, *avar, *svar;
-    int r = 0, pcre_opts = 0, pcre_errptr, capcnt, *ov, ovsize;
+    int r = 0, pcre_opts = 0;
+    pcre2_match_data *pcre_mdata = NULL;
     int lhstr_plain_len, rhre_plain_len;
     int return_value = 0;
 
     if (zpcre_utf8_enabled())
-	pcre_opts |= PCRE_UTF8;
+	pcre_opts |= PCRE2_UTF;
     if (isset(REMATCHPCRE) && !isset(CASEMATCH))
-	pcre_opts |= PCRE_CASELESS;
+	pcre_opts |= PCRE2_CASELESS;
 
     lhstr = cond_str(a,0,0);
     rhre = cond_str(a,1,0);
@@ -401,9 +365,6 @@ cond_pcre_match(char **a, int id)
     rhre_plain = ztrdup(rhre);
     unmetafy(lhstr_plain, &lhstr_plain_len);
     unmetafy(rhre_plain, &rhre_plain_len);
-    pcre_pat = NULL;
-    ov = NULL;
-    ovsize = 0;
 
     if (isset(BASHREMATCH)) {
 	svar = NULL;
@@ -415,27 +376,27 @@ cond_pcre_match(char **a, int id)
 
     switch(id) {
 	 case CPCRE_PLAIN:
-		if ((int)strlen(rhre_plain) != rhre_plain_len) {
-		    zwarn("embedded NULs in PCRE pattern terminate pattern");
-		}
-		pcre_pat = pcre_compile(rhre_plain, pcre_opts, &pcre_err, &pcre_errptr, NULL);
-		if (pcre_pat == NULL) {
-		    zwarn("failed to compile regexp /%s/: %s", rhre, pcre_err);
+		if (!(pcre_pat = pcre2_compile((PCRE2_SPTR) rhre_plain,
+			(PCRE2_SIZE) rhre_plain_len, pcre_opts,
+			&pcre_err, &pcre_erroff, NULL)))
+		{
+		    PCRE2_UCHAR buffer[256];
+		    pcre2_get_error_message(pcre_err, buffer, sizeof(buffer));
+		    zwarn("failed to compile regexp /%s/: %s", rhre, buffer);
 		    break;
 		}
-                pcre_fullinfo(pcre_pat, NULL, PCRE_INFO_CAPTURECOUNT, &capcnt);
-    		ovsize = (capcnt+1)*3;
-		ov = zalloc(ovsize*sizeof(int));
-    		r = pcre_exec(pcre_pat, NULL, lhstr_plain, lhstr_plain_len, 0, 0, ov, ovsize);
-		/* r < 0 => error; r==0 match but not enough size in ov
+		pcre_mdata = pcre2_match_data_create_from_pattern(pcre_pat, NULL);
+		r = pcre2_match(pcre_pat, (PCRE2_SPTR8) lhstr_plain, lhstr_plain_len,
+			0, 0, pcre_mdata, NULL);
+		/* r < 0 => error; r==0 match but not enough size in match data
 		 * r > 0 => (r-1) substrings found; r==1 => no substrings
 		 */
     		if (r==0) {
-		    zwarn("reportable zsh problem: pcre_exec() returned 0");
+		    zwarn("reportable zsh problem: pcre2_match() returned 0");
 		    return_value = 1;
 		    break;
 		}
-	        else if (r==PCRE_ERROR_NOMATCH) {
+		else if (r == PCRE2_ERROR_NOMATCH) {
 		    return_value = 0; /* no match */
 		    break;
 		}
@@ -444,7 +405,7 @@ cond_pcre_match(char **a, int id)
 		    break;
 		}
                 else if (r>0) {
-		    zpcre_get_substrings(lhstr_plain, ov, r, svar, avar, 0,
+		    zpcre_get_substrings(lhstr_plain, pcre_mdata, r, svar, avar, 0,
 					 isset(BASHREMATCH),
 					 !isset(BASHREMATCH));
 		    return_value = 1;
@@ -457,10 +418,10 @@ cond_pcre_match(char **a, int id)
 	free(lhstr_plain);
     if(rhre_plain)
 	free(rhre_plain);
+    if (pcre_mdata)
+	pcre2_match_data_free(pcre_mdata);
     if (pcre_pat)
-	pcre_free(pcre_pat);
-    if (ov)
-	zfree(ov, ovsize*sizeof(int));
+	pcre2_code_free(pcre_pat);
 
     return return_value;
 }
@@ -489,11 +450,11 @@ static struct builtin bintab[] = {
 
 static struct features module_features = {
     bintab, sizeof(bintab)/sizeof(*bintab),
-#if defined(HAVE_PCRE_COMPILE) && defined(HAVE_PCRE_EXEC)
+#if defined(HAVE_PCRE2_COMPILE_8) && defined(HAVE_PCRE2_H)
     cotab, sizeof(cotab)/sizeof(*cotab),
-#else /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
+#else /* !(HAVE_PCRE2_COMPILE_8 && HAVE_PCRE2_H) */
     NULL, 0,
-#endif /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
+#endif /* !(HAVE_PCRE2_COMPILE_8 && HAVE_PCRE2_H) */
     NULL, 0,
     NULL, 0,
     0
@@ -540,19 +501,9 @@ cleanup_(Module m)
 int
 finish_(UNUSED(Module m))
 {
-#if defined(HAVE_PCRE_COMPILE) && defined(HAVE_PCRE_EXEC)
-#ifdef HAVE_PCRE_STUDY
-    if (pcre_hints)
-#ifdef PCRE_CONFIG_JIT
-	pcre_free_study(pcre_hints);
-#else
-	pcre_free(pcre_hints);
-#endif
-    pcre_hints = NULL;
-#endif
-
+#if defined(HAVE_PCRE2_COMPILE_8) && defined(HAVE_PCRE2_H)
     if (pcre_pattern)
-	pcre_free(pcre_pattern);
+	pcre2_code_free(pcre_pattern);
     pcre_pattern = NULL;
 #endif
 
diff --git a/Test/V07pcre.ztst b/Test/V07pcre.ztst
index 22a0b64c7..6eb366964 100644
--- a/Test/V07pcre.ztst
+++ b/Test/V07pcre.ztst
@@ -117,12 +117,17 @@
 >78884; ZPCRE_OP: 25 30
 >90210; ZPCRE_OP: 31 36
 
-# Embedded NULs allowed in plaintext, but not in RE (although \0 as two-chars allowed)
+# Embedded NULs allowed in plaintext, in RE, pcre supports \0 as two-chars
   [[ $'a\0bc\0d' =~ '^(a\0.)(.+)$' ]]
   print "${#MATCH}; ${#match[1]}; ${#match[2]}"
 0:ensure ASCII NUL passes in and out of matched plaintext
 >6; 3; 3
 
+# PCRE2 supports NULs also in the RE
+  [[ $'a\0b\0c' =~ $'^(.\0)+' ]] && print "${#MATCH}; ${#match[1]}"
+0:ensure ASCII NUL works also in the regex
+>4; 2
+
 # Ensure the long-form infix operator works
   [[ foo -pcre-match ^f..$ ]]
   print $?
@@ -169,7 +174,11 @@
   [[ é =~ '^..\z' ]]; echo $?
   LANG=$LANG_SAVE
   [[ é =~ '^.\z' ]]; echo $?
-0:swich between C/UTF-8 locales
+0:switch between C/UTF-8 locales
 >0
 >0
 >0
+
+  [[ abc =~ 'a(d*)bc' ]] && print "$#MATCH; $#match; ${#match[1]}"
+0:empty capture
+>3; 1; 0
diff --git a/configure.ac b/configure.ac
index d8a17791a..4710d1659 100644
--- a/configure.ac
+++ b/configure.ac
@@ -438,7 +438,7 @@ fi],
 
 dnl Do you want to look for pcre support?
 AC_ARG_ENABLE(pcre,
-AS_HELP_STRING([--enable-pcre],[enable the search for the pcre library (may create run-time library dependencies)]))
+AS_HELP_STRING([--enable-pcre],[enable the search for the pcre2 library (may create run-time library dependencies)]))
 
 dnl Do you want to look for capability support?
 AC_ARG_ENABLE(cap,
@@ -652,13 +652,12 @@ AC_HEADER_SYS_WAIT
 
 oldcflags="$CFLAGS"
 if test x$enable_pcre = xyes; then
-AC_CHECK_PROG([PCRECONF], pcre-config, pcre-config)
-dnl Typically (meaning on this single RedHat 9 box in front of me)
-dnl pcre-config --cflags produces a -I output which needs to go into
+AC_CHECK_PROG([PCRECONF], pcre2-config, pcre2-config)
+dnl pcre2-config --cflags may produce a -I output which needs to go into
 dnl CPPFLAGS else configure's preprocessor tests don't pick it up,
 dnl producing a warning.
-if test "x$ac_cv_prog_PCRECONF" = xpcre-config; then
-  CPPFLAGS="$CPPFLAGS `pcre-config --cflags`"
+if test "x$ac_cv_prog_PCRECONF" = xpcre2-config; then
+  CPPFLAGS="$CPPFLAGS `pcre2-config --cflags`"
 fi
 fi
 
@@ -668,9 +667,10 @@ AC_CHECK_HEADERS(sys/time.h sys/times.h sys/select.h termcap.h termio.h \
 		 locale.h errno.h stdio.h stdarg.h varargs.h stdlib.h \
 		 unistd.h sys/capability.h \
 		 utmp.h utmpx.h sys/types.h pwd.h grp.h poll.h sys/mman.h \
-		 netinet/in_systm.h pcre.h langinfo.h wchar.h stddef.h \
+		 netinet/in_systm.h langinfo.h wchar.h stddef.h \
 		 sys/stropts.h iconv.h ncurses.h ncursesw/ncurses.h \
 		 ncurses/ncurses.h)
+AC_CHECK_HEADERS([pcre2.h],,,[#define PCRE2_CODE_UNIT_WIDTH 8])
 if test x$dynamic = xyes; then
   AC_CHECK_HEADERS(dlfcn.h)
   AC_CHECK_HEADERS(dl.h)
@@ -948,9 +948,7 @@ if test "x$ac_found_iconv" = "xyes"; then
 fi
 
 if test x$enable_pcre = xyes; then
-dnl pcre-config should probably be employed here
-dnl AC_SEARCH_LIBS(pcre_compile, pcre)
-  LIBS="`$ac_cv_prog_PCRECONF --libs` $LIBS"
+  LIBS="`$ac_cv_prog_PCRECONF --libs8` $LIBS"
 fi
 
 dnl ---------------------
@@ -1313,7 +1311,7 @@ AC_CHECK_FUNCS(strftime strptime mktime timelocal \
 	       pathconf sysconf \
 	       tgetent tigetflag tigetnum tigetstr setupterm initscr resize_term \
 	       getcchar setcchar waddwstr wget_wch win_wch use_default_colors \
-	       pcre_compile pcre_study pcre_exec \
+	       pcre2_compile_8 \
 	       nl_langinfo \
 	       erand48 open_memstream \
 	       posix_openpt \


^ permalink raw reply	[flat|nested] 9+ messages in thread

* named capture groups with PCRE matching (Was: PATCH: migrate pcre module to pcre2)
  2023-05-05 23:35 PATCH: migrate pcre module to pcre2 Oliver Kiddle
@ 2023-05-06  4:44 ` Stephane Chazelas
  2023-05-06 18:56   ` Bart Schaefer
  2023-05-10 23:26 ` PATCH: migrate pcre module to pcre2 Oliver Kiddle
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 9+ messages in thread
From: Stephane Chazelas @ 2023-05-06  4:44 UTC (permalink / raw)
  To: Oliver Kiddle; +Cc: Zsh workers

2023-05-06 01:35:46 +0200, Oliver Kiddle:
[...]
> When looking at PCRE documentation for this, other ideas for extensions
> I saw would be to support:
> 
>  * named capture groups
>    (either to an associative array or to variables directly?)
[...]

I was thinking about that when answering
https://unix.stackexchange.com/questions/742664/bash-posix-regex-optional-groups/742680#742680
recently.

Maybe we could do:

typeset -A match
if [[ aa =~ '(?<foo>.)(?<bar>)' ]]
  print -r -- $match[foo] $match[bar]

That is if $match is an associative array, store the named
capture groups there. Same for pcre_match -a array (or add a -A
assoc there)

There's the question of $mbegin / $mend. Should all three
$match, $mbegin and $mend be automatically converted to assoc if
any of them is an assoc?

-- 
Stephane


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: named capture groups with PCRE matching (Was: PATCH: migrate pcre module to pcre2)
  2023-05-06  4:44 ` named capture groups with PCRE matching (Was: PATCH: migrate pcre module to pcre2) Stephane Chazelas
@ 2023-05-06 18:56   ` Bart Schaefer
  2023-05-06 23:06     ` Oliver Kiddle
  0 siblings, 1 reply; 9+ messages in thread
From: Bart Schaefer @ 2023-05-06 18:56 UTC (permalink / raw)
  To: Zsh workers

On Fri, May 5, 2023 at 9:44 PM Stephane Chazelas <stephane@chazelas.org> wrote:
>
> 2023-05-06 01:35:46 +0200, Oliver Kiddle:
> [...]
> >  * named capture groups
> >    (either to an associative array or to variables directly?)
> [...]
>
> Maybe [...]
>
> [...] if $match is an associative array, store the named
> capture groups there. Same for pcre_match -a array (or add a -A
> assoc there)

Since this would be new functionality anyway, perhaps it's a good
opportunity to take advantage of namespaces, e.g. ${.pcre.match[foo]}
etc., instead of overloading the existing variables.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: named capture groups with PCRE matching (Was: PATCH: migrate pcre module to pcre2)
  2023-05-06 18:56   ` Bart Schaefer
@ 2023-05-06 23:06     ` Oliver Kiddle
  0 siblings, 0 replies; 9+ messages in thread
From: Oliver Kiddle @ 2023-05-06 23:06 UTC (permalink / raw)
  To: Bart Schaefer; +Cc: Zsh workers

Bart Schaefer wrote:
> > >  * named capture groups

> Since this would be new functionality anyway, perhaps it's a good
> opportunity to take advantage of namespaces, e.g. ${.pcre.match[foo]}
> etc., instead of overloading the existing variables.

You can mix named and unnamed captures in Perl so overloading is perhaps
better avoided. And I agree we should make good use of the namespaces.
This uses that but also adds -A to pcre_match as suggested by Stephane.

Oliver

diff --git a/Doc/Zsh/mod_pcre.yo b/Doc/Zsh/mod_pcre.yo
index c2817f519..6d073985d 100644
--- a/Doc/Zsh/mod_pcre.yo
+++ b/Doc/Zsh/mod_pcre.yo
@@ -20,12 +20,12 @@ including those that indicate newline.
 )
 findex(pcre_study)
 item(tt(pcre_study))(
-Studies the previously-compiled PCRE which may result in faster
-matching.
+Requests JIT compilation for the previously-compiled PCRE which
+may result in faster matching.
 )
 findex(pcre_match)
 item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] \
-[ tt(-n) var(offset) ] [ tt(-b) ] var(string))(
+[ tt(-A) var(assoc) ] [ tt(-n) var(offset) ] [ tt(-b) ] var(string))(
 Returns successfully if tt(string) matches the previously-compiled
 PCRE.
 
@@ -36,7 +36,9 @@ substrings, unless the tt(-a) option is given, in which
 case it will set the array var(arr).  Similarly, the variable
 tt(MATCH) will be set to the entire matched portion of the
 string, unless the tt(-v) option is given, in which case the variable
-var(var) will be set.
+var(var) will be set. Furthermore, any named captures will
+be stored in the associative array tt(.pcre.match) unless an
+alternative is given with tt(-A).
 No variables are altered if there is no successful match.
 A tt(-n) option starts searching for a match from the
 byte var(offset) position in var(string).  If the tt(-b) option is given,
diff --git a/Src/Modules/pcre.c b/Src/Modules/pcre.c
index 079ecc2c5..6be1f76e2 100644
--- a/Src/Modules/pcre.c
+++ b/Src/Modules/pcre.c
@@ -129,14 +129,17 @@ bin_pcre_study(char *nam, UNUSED(char **args), UNUSED(Options ops), UNUSED(int f
 }
 
 static int
-zpcre_get_substrings(char *arg, pcre2_match_data *mdata, int captured_count,
-	char *matchvar, char *substravar, int want_offset_pair,
-	int matchedinarr, int want_begin_end)
+zpcre_get_substrings(pcre2_code *pat, char *arg, pcre2_match_data *mdata,
+	int captured_count, char *matchvar, char *substravar, char *namedassoc,
+	int want_offset_pair, int matchedinarr, int want_begin_end)
 {
     PCRE2_SIZE *ovec;
     char *match_all, **matches;
     char offset_all[50];
     int capture_start = 1;
+    int vec_off;
+    PCRE2_SPTR ntable; /* table of named captures */
+    uint32_t ncount, nsize;
 
     if (matchedinarr) {
 	/* bash-style ovec[0] entire-matched string in the array */
@@ -174,7 +177,7 @@ zpcre_get_substrings(char *arg, pcre2_match_data *mdata, int captured_count,
 	if (substravar &&
 	    (!want_begin_end || nelem)) {
 	    char **x;
-	    int vec_off, i;
+	    int i;
 	    matches = x = (char **) zalloc(sizeof(char *) * (captured_count+1-capture_start));
 	    for (i = capture_start; i < captured_count; i++) {
 		vec_off = 2*i;
@@ -184,6 +187,23 @@ zpcre_get_substrings(char *arg, pcre2_match_data *mdata, int captured_count,
 	    setaparam(substravar, matches);
 	}
 
+	if (!pcre2_pattern_info(pat, PCRE2_INFO_NAMECOUNT, &ncount) && ncount
+		&& !pcre2_pattern_info(pat, PCRE2_INFO_NAMEENTRYSIZE, &nsize)
+		&& !pcre2_pattern_info(pat, PCRE2_INFO_NAMETABLE, &ntable))
+	{
+	    char **hash, **hashptr;
+	    uint32_t nidx;
+	    hashptr = hash = (char **)zshcalloc((ncount+1)*2*sizeof(char *));
+	    for (nidx = 0; nidx < ncount; nidx++) {
+		vec_off = (ntable[nsize * nidx] << 9) + 2 * ntable[nsize * nidx + 1];
+		/* would metafy the key but pcre limits characters in the name */
+		*hashptr++ = ztrdup((char *) ntable + nsize * nidx + 2);
+		*hashptr++ = metafy(arg + ovec[vec_off],
+			ovec[vec_off+1]-ovec[vec_off], META_DUP);
+	    }
+	    sethparam(namedassoc, hash);
+	}
+
 	if (want_begin_end) {
 	    /*
 	     * cond-infix rather than builtin; also not bash; so we set a bunch
@@ -286,6 +306,7 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
     char *matched_portion = NULL;
     char *plaintext = NULL;
     char *receptacle = NULL;
+    char *named = ".pcre.match";
     int return_value = 1;
     /* The subject length and offset start are both int values in pcre_exec */
     int subject_len;
@@ -305,6 +326,9 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
     if(OPT_HASARG(ops,c='v')) {
 	matched_portion = OPT_ARG(ops,c);
     }
+    if (OPT_HASARG(ops, c='A')) {
+	named = OPT_ARG(ops, c);
+    }
     if(OPT_HASARG(ops,c='n')) { /* The offset position to start the search, in bytes. */
 	if ((offset_start = getposint(OPT_ARG(ops,c), nam)) < 0)
 	    return 1;
@@ -326,8 +350,8 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
     if (ret==0) return_value = 0;
     else if (ret == PCRE2_ERROR_NOMATCH) /* no match */;
     else if (ret>0) {
-	zpcre_get_substrings(plaintext, pcre_mdata, ret, matched_portion, receptacle,
-			     want_offset_pair, 0, 0);
+	zpcre_get_substrings(pcre_pattern, plaintext, pcre_mdata, ret, matched_portion,
+		receptacle, named, want_offset_pair, 0, 0);
 	return_value = 0;
     }
     else {
@@ -405,9 +429,8 @@ cond_pcre_match(char **a, int id)
 		    break;
 		}
                 else if (r>0) {
-		    zpcre_get_substrings(lhstr_plain, pcre_mdata, r, svar, avar, 0,
-					 isset(BASHREMATCH),
-					 !isset(BASHREMATCH));
+		    zpcre_get_substrings(pcre_pat, lhstr_plain, pcre_mdata, r, svar, avar,
+			    ".pcre.match", 0, isset(BASHREMATCH), !isset(BASHREMATCH));
 		    return_value = 1;
 		    break;
 		}
@@ -443,7 +466,7 @@ static struct conddef cotab[] = {
 
 static struct builtin bintab[] = {
     BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimxs",  NULL),
-    BUILTIN("pcre_match",   0, bin_pcre_match,   1, 1, 0, "a:v:n:b",    NULL),
+    BUILTIN("pcre_match",   0, bin_pcre_match,   1, 1, 0, "A:a:v:n:b",    NULL),
     BUILTIN("pcre_study",   0, bin_pcre_study,   0, 0, 0, NULL,    NULL)
 };
 
diff --git a/Test/V07pcre.ztst b/Test/V07pcre.ztst
index 6eb366964..027fea3aa 100644
--- a/Test/V07pcre.ztst
+++ b/Test/V07pcre.ztst
@@ -182,3 +182,17 @@
   [[ abc =~ 'a(d*)bc' ]] && print "$#MATCH; $#match; ${#match[1]}"
 0:empty capture
 >3; 1; 0
+
+  [[ category/name-12345 =~ '(?x)^
+    (?<category> [^/]* ) /
+    (?<package>
+      (?<name> \w+ ) -
+      (?<version> \d+ ))$' ]]
+  typeset -p1 .pcre.match
+0:named captures
+>typeset -g -A .pcre.match=(
+>  [category]=category
+>  [name]=name
+>  [package]=name-12345
+>  [version]=12345
+>)


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: PATCH: migrate pcre module to pcre2
  2023-05-05 23:35 PATCH: migrate pcre module to pcre2 Oliver Kiddle
  2023-05-06  4:44 ` named capture groups with PCRE matching (Was: PATCH: migrate pcre module to pcre2) Stephane Chazelas
@ 2023-05-10 23:26 ` Oliver Kiddle
  2023-05-12 23:12 ` Phil Pennock
  2023-06-13  7:35 ` Jun T
  3 siblings, 0 replies; 9+ messages in thread
From: Oliver Kiddle @ 2023-05-10 23:26 UTC (permalink / raw)
  To: Zsh workers

On 6 May, I wrote:
> Would it perhaps be useful to add support for PCRE2's alternative DFA
> algorithm? This might meet the needs Phil was considering with the re2
> based module he once posted but never committed. What form might that
> take?

This adds a -d option to pcre_match to do this. Any better suggestions
for the option letter?

The differences are documented in pcre2matching(3). That man page says
that in addition to not supporting various features like captures
that DFA is substantially slower. What this misses is that it avoids
the problem of pathological cases that exhibit terrible worst-case
performance with a backtracking algorithm.

It returns all possible matches from the same start position so the
result needs to be an array. This uses $match or whatever array is
specified with -a which is what we use for captures with the
backtracking algorithm.

Oliver

diff --git a/Doc/Zsh/mod_pcre.yo b/Doc/Zsh/mod_pcre.yo
index 6d073985d..da73ac85a 100644
--- a/Doc/Zsh/mod_pcre.yo
+++ b/Doc/Zsh/mod_pcre.yo
@@ -25,7 +25,7 @@ may result in faster matching.
 )
 findex(pcre_match)
 item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] \
-[ tt(-A) var(assoc) ] [ tt(-n) var(offset) ] [ tt(-b) ] var(string))(
+[ tt(-A) var(assoc) ] [ tt(-n) var(offset) ] [ tt(-bd) ] var(string))(
 Returns successfully if tt(string) matches the previously-compiled
 PCRE.
 
@@ -69,6 +69,10 @@ print -l $accum)
 )
 enditem()
 
+The option tt(-d) uses the alternative breadth-first DFA search algorithm of
+pcre. This sets tt(match), or the array given with tt(-a), to all the matches
+found from the same start point in the subject.
+
 The tt(zsh/pcre) module makes available the following test condition:
 
 startitem()
diff --git a/Src/Modules/pcre.c b/Src/Modules/pcre.c
index 6be1f76e2..96f3c6e65 100644
--- a/Src/Modules/pcre.c
+++ b/Src/Modules/pcre.c
@@ -305,30 +305,29 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
     pcre2_match_data *pcre_mdata = NULL;
     char *matched_portion = NULL;
     char *plaintext = NULL;
-    char *receptacle = NULL;
-    char *named = ".pcre.match";
+    char *receptacle;
+    char *named = NULL;
     int return_value = 1;
     /* The subject length and offset start are both int values in pcre_exec */
     int subject_len;
     int offset_start = 0;
     int want_offset_pair = 0;
+    int use_dfa = 0;
 
     if (pcre_pattern == NULL) {
 	zwarnnam(nam, "no pattern has been compiled");
 	return 1;
     }
 
-    matched_portion = "MATCH";
-    receptacle = "match";
-    if(OPT_HASARG(ops,c='a')) {
-	receptacle = OPT_ARG(ops,c);
-    }
-    if(OPT_HASARG(ops,c='v')) {
-	matched_portion = OPT_ARG(ops,c);
-    }
-    if (OPT_HASARG(ops, c='A')) {
-	named = OPT_ARG(ops, c);
+    if (!(use_dfa = OPT_ISSET(ops, 'd'))) {
+	matched_portion = OPT_HASARG(ops, c='v') ? OPT_ARG(ops, c) : "MATCH";
+	named = OPT_HASARG(ops, c='A') ? OPT_ARG(ops, c) : ".pcre.match";
+    } else if (OPT_HASARG(ops, c='v') || OPT_HASARG(ops, c='A')) {
+	zwarnnam(nam, "-d cannot be combined with -%c", c);
+	return 1;
     }
+    receptacle = OPT_HASARG(ops, 'a') ? OPT_ARG(ops, 'a') : "match";
+
     if(OPT_HASARG(ops,c='n')) { /* The offset position to start the search, in bytes. */
 	if ((offset_start = getposint(OPT_ARG(ops,c), nam)) < 0)
 	    return 1;
@@ -341,7 +340,25 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
 
     if (offset_start > 0 && offset_start >= subject_len)
 	ret = PCRE2_ERROR_NOMATCH;
-    else {
+    else if (use_dfa) {
+	PCRE2_SIZE old, wscount = 128, capcount = 128;
+	void *workspace = zhalloc(sizeof(int) * wscount);
+	pcre_mdata = pcre2_match_data_create(capcount, NULL);
+	do {
+	    ret = pcre2_dfa_match(pcre_pattern, (PCRE2_SPTR) plaintext, subject_len,
+		offset_start, 0, pcre_mdata, NULL, (int *) workspace, wscount);
+	    if (ret == PCRE2_ERROR_DFA_WSSIZE) {
+		old = wscount;
+		wscount += wscount / 2;
+		workspace = hrealloc(workspace, sizeof(int) * old, sizeof(int) * wscount);
+	    } else if (ret == 0) {
+		capcount += capcount / 2;
+		pcre2_match_data_free(pcre_mdata);
+		pcre_mdata = pcre2_match_data_create(capcount, NULL);
+	    } else
+		break;
+	} while(1);
+    } else {
 	pcre_mdata = pcre2_match_data_create_from_pattern(pcre_pattern, NULL);
 	ret = pcre2_match(pcre_pattern, (PCRE2_SPTR) plaintext, subject_len,
 		offset_start, 0, pcre_mdata, NULL);
@@ -350,12 +367,14 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
     if (ret==0) return_value = 0;
     else if (ret == PCRE2_ERROR_NOMATCH) /* no match */;
     else if (ret>0) {
-	zpcre_get_substrings(pcre_pattern, plaintext, pcre_mdata, ret, matched_portion,
-		receptacle, named, want_offset_pair, 0, 0);
+	zpcre_get_substrings(pcre_pattern, plaintext, pcre_mdata, ret,
+		matched_portion, receptacle, named, want_offset_pair, use_dfa, 0);
 	return_value = 0;
     }
     else {
-	zwarnnam(nam, "error in pcre2_match [%d]", ret);
+	PCRE2_UCHAR buffer[256];
+	pcre2_get_error_message(ret, buffer, sizeof(buffer));
+	zwarnnam(nam, "error in pcre matching for /%s/: %s", plaintext, buffer);
     }
     
     if (pcre_mdata)
@@ -466,7 +485,7 @@ static struct conddef cotab[] = {
 
 static struct builtin bintab[] = {
     BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimxs",  NULL),
-    BUILTIN("pcre_match",   0, bin_pcre_match,   1, 1, 0, "A:a:v:n:b",    NULL),
+    BUILTIN("pcre_match",   0, bin_pcre_match,   1, 1, 0, "A:a:v:n:bd",    NULL),
     BUILTIN("pcre_study",   0, bin_pcre_study,   0, 0, 0, NULL,    NULL)
 };
 
diff --git a/Test/V07pcre.ztst b/Test/V07pcre.ztst
index 027fea3aa..585698d05 100644
--- a/Test/V07pcre.ztst
+++ b/Test/V07pcre.ztst
@@ -196,3 +196,8 @@
 >  [package]=name-12345
 >  [version]=12345
 >)
+
+  pcre_compile 'cat(er(pillar)?)?'
+  pcre_match -d 'the caterpillar catchment' && print $match
+0:pcre_match -d
+>caterpillar cater cat


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: PATCH: migrate pcre module to pcre2
  2023-05-05 23:35 PATCH: migrate pcre module to pcre2 Oliver Kiddle
  2023-05-06  4:44 ` named capture groups with PCRE matching (Was: PATCH: migrate pcre module to pcre2) Stephane Chazelas
  2023-05-10 23:26 ` PATCH: migrate pcre module to pcre2 Oliver Kiddle
@ 2023-05-12 23:12 ` Phil Pennock
  2023-05-24 18:11   ` Oliver Kiddle
  2023-06-13  7:35 ` Jun T
  3 siblings, 1 reply; 9+ messages in thread
From: Phil Pennock @ 2023-05-12 23:12 UTC (permalink / raw)
  To: Oliver Kiddle; +Cc: Zsh workers

On 2023-05-06 at 01:35 +0200, Oliver Kiddle wrote:
> The old PCRE library that zsh uses is considered "end of life, and is no
> longer being actively maintained". The newer PCRE2 has a slighty
> different interface. It has been around since 2015 so this is not new.
> Actual version numbers are 8.x and 10.x rather than 1 and 2. Migrating
> seems to be the sensible way forward rather than, for example, adding a
> separate zsh/pcre2 module. That way, zsh users don't need to be
> concerned with the change.

Before anything else, this is great to see, thank you.

> Would it perhaps be useful to add support for PCRE2's alternative DFA
> algorithm? This might meet the needs Phil was considering with the re2
> based module he once posted but never committed. What form might that
> take?

I handed in my commit bit a while back because I used it so rarely and
it felt better for commit to be only the people more actively involved.
Shells and their provenance matter.  So I can't commit.

My main motivation with RE2 was to get to something which all vendors
will actually include when distributing.  The reason can't be licensing,
since the PCRE lib is BSD licensed.

I'd seen RE2 pick up quite a bit of steam and wanted to explore to see
how feasible it was.

Ideally, we'd have a contact within Apple who can tell us what they
might or might not consider, so that we can have working regexps with
semantics younger than POSIX.2 (1992) and some more actual utility.  I'm
tired of dealing with regexps being so sub-par 30 years later.  Trying
to write tools which work both for people using macOS and people using
Linux today pretty much means "write a client in Go".  Which is fine,
but sometimes a scripted solution is desirable.

I wrote the zsh/regex module because folks felt that we needed to match
Bash's semantics and use ERE by default when I added the  =~  syntactic
sugar.  I still see the benefit of that, but wonder if not doing so
would have led to a better net outcome, as distributors would have had
to enable a modern RE engine to get =~ working ... or would it have just
left =~ not working on random platforms.

-Phil


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: PATCH: migrate pcre module to pcre2
  2023-05-12 23:12 ` Phil Pennock
@ 2023-05-24 18:11   ` Oliver Kiddle
  0 siblings, 0 replies; 9+ messages in thread
From: Oliver Kiddle @ 2023-05-24 18:11 UTC (permalink / raw)
  To: Zsh workers

On 12 May, Phil Pennock wrote:
> I handed in my commit bit a while back because I used it so rarely and
> it felt better for commit to be only the people more actively involved.
> Shells and their provenance matter.  So I can't commit.

If I remember correctly, you indicated at the time that it shouldn't be
committed, at least not at that time. We probably have more of a problem
with patches getting missed than having to be backed out later. Trust
and competence don't expire if not used.

> My main motivation with RE2 was to get to something which all vendors
> will actually include when distributing.  The reason can't be licensing,
> since the PCRE lib is BSD licensed.

Do Apple include any of pcre, pcre2 and re2 already with macOS? It's
difficult to guess at their motivations. Zsh's build system linking pcre
into every ELF file by default if you enable pcre may not help. They
must have put some thought into at least their initial packaging of zsh
as they stripped out some of the shell functions.

> I'd seen RE2 pick up quite a bit of steam and wanted to explore to see
> how feasible it was.
>
> Ideally, we'd have a contact within Apple who can tell us what they
> might or might not consider, so that we can have working regexps with
> semantics younger than POSIX.2 (1992) and some more actual utility.  I'm

The one time I had a chance to speak to an actual Apple employee, and
that was someone far removed from the Unix layer, they only reinforced
my general impression that Apple really don't care about the shell and
Unix part of macOS in general. The change of default shell was likely
due to licencing alone. Did Apple at least provide 5.8.1 as a security
patch? RHEL 8 still comes with a vulnerable 5.8 so it would seem that
us doing security only releases is a waste of time.

I generally stick with zsh pattern matching out of habit. =~ seems to be
a popular feature, though, I guess people know and understand regexes.
Does bash ignore the system regex library and use the GNU one on all
systems? That's equivalent to us pulling something like TRE or PCRE into
the zsh source tree and forcing its use.

Oliver


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: PATCH: migrate pcre module to pcre2
  2023-05-05 23:35 PATCH: migrate pcre module to pcre2 Oliver Kiddle
                   ` (2 preceding siblings ...)
  2023-05-12 23:12 ` Phil Pennock
@ 2023-06-13  7:35 ` Jun T
  2023-06-19  8:20   ` Jun T
  3 siblings, 1 reply; 9+ messages in thread
From: Jun T @ 2023-06-13  7:35 UTC (permalink / raw)
  To: zsh-workers

If pcre-config is not available then pcre module is not built
even if pcre2 is available.

The simplest patch is the following.
Or is it better to set enable_pcre=no in configure.ac
if pcre2-config is not available?


diff --git a/Src/Modules/pcre.mdd b/Src/Modules/pcre.mdd
index 6eb3c691b..26c7f6eee 100644
--- a/Src/Modules/pcre.mdd
+++ b/Src/Modules/pcre.mdd
@@ -1,5 +1,5 @@
 name=zsh/pcre
-link=`if test x$enable_pcre = xyes && (pcre-config --version >/dev/null 2>/dev/null); then echo dynamic; else echo no; fi`
+link=`if test x$enable_pcre = xyes && (pcre2-config --version >/dev/null 2>/dev/null); then echo dynamic; else echo no; fi`
 load=no
 
 autofeatures="b:pcre_compile b:pcre_study b:pcre_match"




^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: PATCH: migrate pcre module to pcre2
  2023-06-13  7:35 ` Jun T
@ 2023-06-19  8:20   ` Jun T
  0 siblings, 0 replies; 9+ messages in thread
From: Jun T @ 2023-06-19  8:20 UTC (permalink / raw)
  To: zsh-workers


> 2023/06/13 16:35, I wrote
> 
> Or is it better to set enable_pcre=no in configure.ac
> if pcre2-config is not available?

I think this is better. Availability of pcre2-config
need not be checked in two places.
Moreover, currently, pcre2-config is called at configure.ac:956
even if it doesn't exist.

PCRECONF is renamed PCRE_CONFIG. User can override PCRE_CONFIG as
./configure --enable-pcre PCRE_CONFIG=/path/to/pcre2-conf
if pcre2-config is not in PATH.

oldcflags is removed because it is not used anywhere.


diff --git a/Src/Modules/pcre.mdd b/Src/Modules/pcre.mdd
index 6eb3c691b..3e1579117 100644
--- a/Src/Modules/pcre.mdd
+++ b/Src/Modules/pcre.mdd
@@ -1,5 +1,5 @@
 name=zsh/pcre
-link=`if test x$enable_pcre = xyes && (pcre-config --version >/dev/null 2>/dev/null); then echo dynamic; else echo no; fi`
+link=`if test x$enable_pcre = xyes; then echo dynamic; else echo no; fi`
 load=no
 
 autofeatures="b:pcre_compile b:pcre_study b:pcre_match"
diff --git a/configure.ac b/configure.ac
index ba76f9a60..c5263035e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -440,6 +440,17 @@ dnl Do you want to look for pcre support?
 AC_ARG_ENABLE(pcre,
 AS_HELP_STRING([--enable-pcre],[enable the search for the pcre2 library (may create run-time library dependencies)]))
 
+AC_ARG_VAR(PCRE_CONFIG, [pathname of pcre2-config if it is not in PATH])
+if test "x$enable_pcre" = xyes; then
+  AC_CHECK_PROG([PCRE_CONFIG], pcre2-config, pcre2-config)
+  if test "x$PCRE_CONFIG" = x; then
+    enable_pcre=no
+    AC_MSG_WARN([pcre2-config not found: pcre module is disabled.])
+    AC_MSG_NOTICE(
+      [Set PCRE_CONFIG to pathname of pcre2-config if it is not in PATH.])
+  fi
+fi
+
 dnl Do you want to look for capability support?
 AC_ARG_ENABLE(cap,
 AS_HELP_STRING([--enable-cap],[enable the search for POSIX capabilities (may require additional headers to be added by hand)]))
@@ -655,15 +666,12 @@ AC_HEADER_DIRENT
 AC_HEADER_STAT
 AC_HEADER_SYS_WAIT
 
-oldcflags="$CFLAGS"
-if test x$enable_pcre = xyes; then
-AC_CHECK_PROG([PCRECONF], pcre2-config, pcre2-config)
 dnl pcre2-config --cflags may produce a -I output which needs to go into
 dnl CPPFLAGS else configure's preprocessor tests don't pick it up,
 dnl producing a warning.
-if test "x$ac_cv_prog_PCRECONF" = xpcre2-config; then
-  CPPFLAGS="$CPPFLAGS `pcre2-config --cflags`"
-fi
+if test "x$enable_pcre" = xyes; then
+  CPPFLAGS="`$PCRE_CONFIG --cflags` $CPPFLAGS"
+  AC_CHECK_HEADERS([pcre2.h],,,[#define PCRE2_CODE_UNIT_WIDTH 8])
 fi
 
 AC_CHECK_HEADERS(sys/time.h sys/times.h sys/select.h termcap.h termio.h \
@@ -675,7 +683,6 @@ AC_CHECK_HEADERS(sys/time.h sys/times.h sys/select.h termcap.h termio.h \
 		 netinet/in_systm.h langinfo.h wchar.h stddef.h \
 		 sys/stropts.h iconv.h ncurses.h ncursesw/ncurses.h \
 		 ncurses/ncurses.h)
-AC_CHECK_HEADERS([pcre2.h],,,[#define PCRE2_CODE_UNIT_WIDTH 8])
 if test x$dynamic = xyes; then
   AC_CHECK_HEADERS(dlfcn.h)
   AC_CHECK_HEADERS(dl.h)
@@ -952,10 +959,6 @@ if test "x$ac_found_iconv" = "xyes"; then
     [Define as const if the declaration of iconv() needs const.])
 fi
 
-if test x$enable_pcre = xyes; then
-  LIBS="`$ac_cv_prog_PCRECONF --libs8` $LIBS"
-fi
-
 dnl ---------------------
 dnl CHECK TERMCAP LIBRARY
 dnl ---------------------
@@ -1316,7 +1319,6 @@ AC_CHECK_FUNCS(strftime strptime mktime timelocal \
 	       pathconf sysconf \
 	       tgetent tigetflag tigetnum tigetstr setupterm initscr resize_term \
 	       getcchar setcchar waddwstr wget_wch win_wch use_default_colors \
-	       pcre2_compile_8 \
 	       nl_langinfo \
 	       erand48 open_memstream \
 	       posix_openpt \
@@ -1371,6 +1373,11 @@ if test x$zsh_cv_func_realpath_accepts_null = xyes; then
   AC_DEFINE(REALPATH_ACCEPTS_NULL)
 fi
 
+if test x$enable_pcre = xyes; then
+  LIBS="`$PCRE_CONFIG --libs8` $LIBS"
+  AC_CHECK_FUNCS(pcre2_compile_8)
+fi
+
 if test x$enable_cap = xyes; then
   AC_CHECK_FUNCS(cap_get_proc)
 fi




^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2023-06-19  8:21 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-05 23:35 PATCH: migrate pcre module to pcre2 Oliver Kiddle
2023-05-06  4:44 ` named capture groups with PCRE matching (Was: PATCH: migrate pcre module to pcre2) Stephane Chazelas
2023-05-06 18:56   ` Bart Schaefer
2023-05-06 23:06     ` Oliver Kiddle
2023-05-10 23:26 ` PATCH: migrate pcre module to pcre2 Oliver Kiddle
2023-05-12 23:12 ` Phil Pennock
2023-05-24 18:11   ` Oliver Kiddle
2023-06-13  7:35 ` Jun T
2023-06-19  8:20   ` Jun T

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).