zsh-workers
 help / color / mirror / code / Atom feed
* PATCH: =~ regex match
@ 2007-04-26  4:19 Phil Pennock
  2007-04-26  5:12 ` Phil Pennock
  2007-04-26  9:31 ` Peter Stephenson
  0 siblings, 2 replies; 6+ messages in thread
From: Phil Pennock @ 2007-04-26  4:19 UTC (permalink / raw)
  To: zsh-workers

My mental processes push me towards regexps perhaps more than is
healthy.  zsh/pcre's -pcre-match conditional operator is firmly in my
toolkit.

Earlier today I discovered that bash has a =~ operator for doing
extended regexp comparisons.  I became envious, so I did something about
it.  Which didn't involve giving up all the conveniences of zsh which
have spoiled me.

The below patch makes =~ an operator which silently auto-loads zsh/pcre
and uses -pcre-match from there.  This is not bash-compatible in that:

 1: it's PCRE -- far more to my liking :^)
 2: if the regexp pattern is bad, it returns false/1 not false/2.

Further, I became aware of =~ because someone was ranting about bash3.2
breaking their scripts by insisting that the regexp must not be quoted.
zsh's =~ is more compatible with older bash scripts by accepting either.

The Src/cond.c @@ -139,7 +153,8 @@ hunk covers a case where a bug in my
development version exposed a bug in zsh's internal-protection error
handling, whereby the else branch of an "if non-null and other condition"
test assumed that the pointer was non-null.

My uncovering that bug does more to demonstrate how little I understand
things, so if there are other issues, please do let me know.

This =~ will return an error cleanly if zsh/pcre is not available for
whatever reason.

Index: Src/cond.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/cond.c,v
retrieving revision 1.8
diff -p -u -r1.8 cond.c
--- Src/cond.c	30 May 2006 22:35:03 -0000	1.8
+++ Src/cond.c	26 Apr 2007 04:05:18 -0000
@@ -34,7 +34,7 @@ int tracingcond;
 
 static char *condstr[COND_MOD] = {
     "!", "&&", "||", "==", "!=", "<", ">", "-nt", "-ot", "-ef", "-eq",
-    "-ne", "-lt", "-gt", "-le", "-ge"
+    "-ne", "-lt", "-gt", "-le", "-ge", "=~"
 };
 
 /*
@@ -53,14 +53,14 @@ int
 evalcond(Estate state, char *fromtest)
 {
     struct stat *st;
-    char *left, *right;
+    char *left, *right, *overridename;
     Wordcode pcode;
     wordcode code;
     int ctype, htok = 0, ret;
 
  rec:
 
-    left = right = NULL;
+    left = right = overridename = NULL;
     pcode = state->pc++;
     code = *pcode;
     ctype = WC_COND_TYPE(code);
@@ -92,13 +92,27 @@ evalcond(Estate state, char *fromtest)
 	    state->pc = pcode + (WC_COND_SKIP(code) + 1);
 	    return ret;
 	}
+    case COND_REGEX:
+	{
+	    int loaded;
+	    loaded = load_module_silence("zsh/pcre", 1);
+	    if (!loaded) {
+		zwarnnam(fromtest, "zsh/pcre not available for regex");
+		return 2;
+	    }
+	    ctype = COND_MODI;
+	    overridename = "-pcre-match";
+	}
     case COND_MOD:
     case COND_MODI:
 	{
 	    Conddef cd;
-	    char *name = ecgetstr(state, EC_NODUP, NULL), **strs;
+	    char *name = overridename;
+	    char **strs;
 	    int l = WC_COND_SKIP(code);
 
+	    if (name == NULL)
+		name = ecgetstr(state, EC_NODUP, NULL);
 	    if (ctype == COND_MOD)
 		strs = ecgetarr(state, l, EC_DUP, NULL);
 	    else {
@@ -139,7 +153,8 @@ evalcond(Estate state, char *fromtest)
 		    return !cd->handler(strs, cd->condid);
 		} else {
 		    zwarnnam(fromtest,
-			     "unrecognized condition: `%s'", name);
+			     "unrecognized condition: `%s'",
+			     name ? name : "<null>");
 		}
 	    }
 	    /* module not found, error */
Index: Src/parse.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/parse.c,v
retrieving revision 1.64
diff -p -u -r1.64 parse.c
--- Src/parse.c	23 Apr 2007 17:24:23 -0000	1.64
+++ Src/parse.c	26 Apr 2007 04:05:19 -0000
@@ -2124,6 +2124,12 @@ par_cond_triple(char *a, char *b, char *
 	ecstr(a);
 	ecstr(c);
 	ecadd(ecnpats++);
+    } else if ((b[0] == Equals || b[0] == '=') &&
+               (b[1] == '~' || b[1] == Tilde) && !b[2]) {
+	ecadd(WCB_COND(COND_REGEX, 0));
+	ecstr(a);
+	ecstr(c);
+	ecadd(ecnpats++);
     } else if (b[0] == '-') {
 	if ((t0 = get_cond_num(b + 1)) > -1) {
 	    ecadd(WCB_COND(t0 + COND_NT, 0));
Index: Src/text.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/text.c,v
retrieving revision 1.19
diff -p -u -r1.19 text.c
--- Src/text.c	23 Apr 2007 15:24:00 -0000	1.19
+++ Src/text.c	26 Apr 2007 04:05:19 -0000
@@ -640,7 +640,7 @@ gettext2(Estate state)
 	    {
 		static char *c1[] = {
 		    "=", "!=", "<", ">", "-nt", "-ot", "-ef", "-eq",
-		    "-ne", "-lt", "-gt", "-le", "-ge"
+		    "-ne", "-lt", "-gt", "-le", "-ge", "=~"
 		};
 
 		int ctype;
@@ -724,7 +724,7 @@ gettext2(Estate state)
 			}
 			break;
 		    default:
-			if (ctype <= COND_GE) {
+			if (ctype < COND_MOD) {
 			    /* Binary test: `a = b' etc. */
 			    taddstr(ecgetstr(state, EC_NODUP, NULL));
 			    taddstr(" ");
Index: Src/zsh.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/zsh.h,v
retrieving revision 1.112
diff -p -u -r1.112 zsh.h
--- Src/zsh.h	29 Mar 2007 21:35:39 -0000	1.112
+++ Src/zsh.h	26 Apr 2007 04:05:22 -0000
@@ -519,8 +519,9 @@ struct timedfn {
 #define COND_GT    13
 #define COND_LE    14
 #define COND_GE    15
-#define COND_MOD   16
-#define COND_MODI  17
+#define COND_REGEX 16
+#define COND_MOD   17
+#define COND_MODI  18
 
 typedef int (*CondHandler) _((char **, int));
 
Index: Src/Modules/pcre.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Modules/pcre.c,v
retrieving revision 1.11
diff -p -u -r1.11 pcre.c
--- Src/Modules/pcre.c	5 Apr 2007 16:20:15 -0000	1.11
+++ Src/Modules/pcre.c	26 Apr 2007 04:05:22 -0000
@@ -173,6 +173,10 @@ cond_pcre_match(char **a, int id)
     switch(id) {
 	 case CPCRE_PLAIN:
 		 pcre_pat = pcre_compile(rhre, pcre_opts, &pcre_err, &pcre_errptr, NULL);
+		 if (pcre_pat == NULL) {
+		     zwarn("failed to compile regexp: %s", rhre);
+		     return 0;
+		 }
                  pcre_fullinfo(pcre_pat, NULL, PCRE_INFO_CAPTURECOUNT, &capcnt);
     		 ovsize = (capcnt+1)*3;
 		 ov = zalloc(ovsize*sizeof(int));
@@ -191,6 +195,7 @@ cond_pcre_match(char **a, int id)
 
 static struct conddef cotab[] = {
     CONDDEF("pcre-match", CONDF_INFIX, cond_pcre_match, 0, 0, CPCRE_PLAIN)
+    /* CONDDEF can register =~ but it won't be found */
 };
 
 /**/


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: PATCH: =~ regex match
  2007-04-26  4:19 PATCH: =~ regex match Phil Pennock
@ 2007-04-26  5:12 ` Phil Pennock
  2007-04-26  9:31 ` Peter Stephenson
  1 sibling, 0 replies; 6+ messages in thread
From: Phil Pennock @ 2007-04-26  5:12 UTC (permalink / raw)
  To: zsh-workers

On 2007-04-25 at 21:19 -0700, Phil Pennock wrote:
> The below patch makes =~ an operator which silently auto-loads zsh/pcre
> and uses -pcre-match from there.

Documentation probably a good thing.

Index: Doc/Zsh/cond.yo
===================================================================
RCS file: /cvsroot/zsh/zsh/Doc/Zsh/cond.yo,v
retrieving revision 1.3
diff -p -u -r1.3 cond.yo
--- Doc/Zsh/cond.yo	22 May 2000 15:01:35 -0000	1.3
+++ Doc/Zsh/cond.yo	26 Apr 2007 05:11:00 -0000
@@ -109,6 +109,11 @@ backward compatibility and should be con
 item(var(string) tt(!=) var(pattern))(
 true if var(string) does not match var(pattern).
 )
+item(var(string) tt(=~) var(regexp))(
+true if var(string) matches the PCRE regular expression
+var(regexp).  Requires the tt(zsh/pcre) module to be present,
+which is a compile-time option.
+)
 item(var(string1) tt(<) var(string2))(
 true if var(string1) comes before var(string2)
 based on ASCII value of their characters.


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: PATCH: =~ regex match
  2007-04-26  4:19 PATCH: =~ regex match Phil Pennock
  2007-04-26  5:12 ` Phil Pennock
@ 2007-04-26  9:31 ` Peter Stephenson
  2007-04-26 20:19   ` Phil Pennock
  1 sibling, 1 reply; 6+ messages in thread
From: Peter Stephenson @ 2007-04-26  9:31 UTC (permalink / raw)
  To: zsh-workers

Phil Pennock wrote:
> My mental processes push me towards regexps perhaps more than is
> healthy.  zsh/pcre's -pcre-match conditional operator is firmly in my
> toolkit.
> 
> Earlier today I discovered that bash has a =~ operator for doing
> extended regexp comparisons.  I became envious, so I did something about
> it.  Which didn't involve giving up all the conveniences of zsh which
> have spoiled me.
> 
> The below patch makes =~ an operator which silently auto-loads zsh/pcre
> and uses -pcre-match from there.  This is not bash-compatible in that:
> 
>  1: it's PCRE -- far more to my liking :^)
>  2: if the regexp pattern is bad, it returns false/1 not false/2.

I thought about =~ and it seemed to me that since it would be largely
there for bash compatibility it would be better to do it with the system
regexp library, which would be more compatible and wouldn't depend on
optional packages.  It shouldn't be too hard to do.  We probably
wouldn't support BASH_REMATCH, however.

-- 
Peter Stephenson <pws@csr.com>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


To access the latest news from CSR copy this link into a web browser:  http://www.csr.com/email_sig.php

To get further information regarding CSR, please visit our Investor Relations page at http://ir.csr.com/csr/about/overview


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: PATCH: =~ regex match
  2007-04-26  9:31 ` Peter Stephenson
@ 2007-04-26 20:19   ` Phil Pennock
  2007-04-27  0:06     ` Phil Pennock
  2007-04-27  9:33     ` Peter Stephenson
  0 siblings, 2 replies; 6+ messages in thread
From: Phil Pennock @ 2007-04-26 20:19 UTC (permalink / raw)
  To: zsh-workers

On 2007-04-26 at 10:31 +0100, Peter Stephenson wrote:
> I thought about =~ and it seemed to me that since it would be largely
> there for bash compatibility it would be better to do it with the system
> regexp library, which would be more compatible and wouldn't depend on
> optional packages.  It shouldn't be too hard to do.  We probably
> wouldn't support BASH_REMATCH, however.

Bash uses extended regexps, so the PCRE stuff should be vaguely a
superset, although it might be worth adding a different match operator,
overloading with condid, to specify some PCRE options to pcre_compile; I
was thinking sticking another case before CPCRE_PLAIN in cond_pcre_match
which sets pcre_opts and then falls through to the plain case.  That
would mostly affect newline handling.

I was also thinking about how to deal with UTF8, which is another
potential advantage to sticking with PCRE.  Zsh isn't specifically
UTF-8 when in widechar, is it?  Is the "right" way something like
(untested):

#if defined(MULTIBYTE_SUPPORT) && defined(HAVE_NL_LANGINFO) && defined(CODESET)
  {
    static int have_utf8_pcre = -1;

    if (!strcmp(nl_langinfo(CODESET), "UTF-8")) {
      if (have_utf8_pcre == -1) {
        if (pcre_config(PCRE_CONFIG_UTF8, &have_utf8_pcre)) {
	  have_utf8_pcre = -2; /* erk, failed to ask */
	}
      }

      if (have_utf8_pcre > 0) {
        pcre_opts |= PCRE_UTF8;
      }
    }
  }
#endif

Which means that in non-UTF-8 multibyte locales, you'll get per-octet
regexps, but in UTF-8 locales, a multibyte zsh with a libpcre also built
with UTF-8 support will let you get "proper" matching.

I'm envious of the =~ operator but that doesn't mean that I want to lose
the funky stuff of PCRE when I use it -- I like negative lookahead
assertions, freak that I am.

As to BASH_REMATCH ... how frowned upon are new zsh options which
auto-set for compatibility?  It wouldn't be hard, since the
infrastructure's all already in place.  Call the zsh option BASH_REMATCH
to set the BASH_REMATCH variable.  :^)

% [[ alphabetical =~ ^a([^a]+)a([^a]+)a ]] && print -l $match
lph
betic

Change the last parameter in cond_pcre_match()'s call to
zpcre_get_substrings() to be non-NULL if the zsh option is set so that a
different receptacle to "match" is set.

If I code this up, is it likely to make it in?  If not, I won't bother
as full bash compatibility isn't so important to me, only having =~.
It's not like POSIX is involved here ...

Thanks,
-Phil


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: PATCH: =~ regex match
  2007-04-26 20:19   ` Phil Pennock
@ 2007-04-27  0:06     ` Phil Pennock
  2007-04-27  9:33     ` Peter Stephenson
  1 sibling, 0 replies; 6+ messages in thread
From: Phil Pennock @ 2007-04-27  0:06 UTC (permalink / raw)
  To: zsh-workers

On 2007-04-26 at 13:19 -0700, Phil Pennock wrote:
> As to BASH_REMATCH ... how frowned upon are new zsh options which
> auto-set for compatibility?  It wouldn't be hard, since the
> infrastructure's all already in place.  Call the zsh option BASH_REMATCH
> to set the BASH_REMATCH variable.  :^)

I just double-checked something in passing and discovered that Bash uses
the equivalent of KSH_ARRAYS, so the variable would need to be marked
similarly to that and provided with the entire matched portion of the
string in index 0.

Would it be sufficient to assume that if someone's truly after bash
compat, ksh_arrays will be set, so inside the test isset(BASHREMATCH)
also check isset(KSHARRAYS) and if and only if that is set too, then
prepend the entire matched portion to the array?

Normal:
  set $MATCH to matched portion,
      ${match[@]} to captured substrings
BASH_REMATCH:
  set $MATCH to matched portion,
      ${BASH_REMATCH} to captured substrings
  => not full compat
BASH_REMATCH && KSH_ARRAYS
  set $MATCH to matched portion,
      ${BASH_REMATCH[0]} to matched portion too,
      ${BASH_REMATCH[1...n]} to captured substrings
  => full compat

Which highlights that =~ should also be setting $MATCH, not just $match.

Would it be okay to extend the -pcre-match operator to do the same?

What about the pcre_match builtin function -- should that also be
setting $MATCH and accept a "-v var" option to set a different variable,
similarly to "-a arr"?  If so, any preferences as to what the option
should be?

I'm willing to do this work, provided people think that it would be
fairly likely to be accepted.

Regards,
-Phil


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: PATCH: =~ regex match
  2007-04-26 20:19   ` Phil Pennock
  2007-04-27  0:06     ` Phil Pennock
@ 2007-04-27  9:33     ` Peter Stephenson
  1 sibling, 0 replies; 6+ messages in thread
From: Peter Stephenson @ 2007-04-27  9:33 UTC (permalink / raw)
  To: zsh-workers

Phil Pennock <zsh-workers+phil.pennock@spodhuis.org> wrote:
> I was also thinking about how to deal with UTF8, which is another
> potential advantage to sticking with PCRE.  Zsh isn't specifically
> UTF-8 when in widechar, is it?

That's correct, but that's actually an advantage of the system regular
expression libraries, which will use the locale in the same way as the
rest of the system to handle multibyte strings.

> #if defined(MULTIBYTE_SUPPORT) && defined(HAVE_NL_LANGINFO) && defined(CODESET)
>   {
>     static int have_utf8_pcre = -1;
> 
>     if (!strcmp(nl_langinfo(CODESET), "UTF-8")) {
>       if (have_utf8_pcre == -1) {
>         if (pcre_config(PCRE_CONFIG_UTF8, &have_utf8_pcre)) {
> 	  have_utf8_pcre = -2; /* erk, failed to ask */
> 	}
>       }
> 
>       if (have_utf8_pcre > 0) {
>         pcre_opts |= PCRE_UTF8;
>       }
>     }
>   }
> #endif
> 
> Which means that in non-UTF-8 multibyte locales, you'll get per-octet
> regexps, but in UTF-8 locales, a multibyte zsh with a libpcre also
> built with UTF-8 support will let you get "proper" matching.

You might want to add that to the pcre library, if appropriate; you
probably also need to test for isset(MULTIBYTE) since unsetting the
multibyte option is supposed to force all strings to be single bytes.

> I'm envious of the =~ operator but that doesn't mean that I want to
> lose the funky stuff of PCRE when I use it -- I like negative
> lookahead assertions, freak that I am.

I don't think there's any question of removing -pcre-match.

> As to BASH_REMATCH ... how frowned upon are new zsh options which
> auto-set for compatibility?  It wouldn't be hard, since the
> infrastructure's all already in place.  Call the zsh option
> BASH_REMATCH to set the BASH_REMATCH variable.  :^)

That would be perfectly sensible.

> If I code this up, is it likely to make it in?  If not, I won't bother
> as full bash compatibility isn't so important to me, only having =~.
> It's not like POSIX is involved here ...

Well, actually it is, since basic shell features should use basic system
features wherever possible rather than requiring optional libraries.  If
we're going to add =~ because it's in bash I don't see any real point
in duplicating -pcre-match to do it, and the POSIX
regcomp/regexec/regerror/regfree should be available just about
everywhere.

When that happens...

> I just double-checked something in passing and discovered that Bash
> uses the equivalent of KSH_ARRAYS, so the variable would need to be
> marked similarly to that and provided with the entire matched portion
> of the string in index 0.

We'll do it the usual way and respect the setting of KSH_ARRAYS.  This
is on in bash compatibility mode.  If that's not set, but BASH_REMATCH
is, we'll put the first match in $BASH_REMATCH[1].

-- 
Peter Stephenson <pws@csr.com>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


To access the latest news from CSR copy this link into a web browser:  http://www.csr.com/email_sig.php

To get further information regarding CSR, please visit our Investor Relations page at http://ir.csr.com/csr/about/overview


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2007-04-27  9:35 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-04-26  4:19 PATCH: =~ regex match Phil Pennock
2007-04-26  5:12 ` Phil Pennock
2007-04-26  9:31 ` Peter Stephenson
2007-04-26 20:19   ` Phil Pennock
2007-04-27  0:06     ` Phil Pennock
2007-04-27  9:33     ` Peter Stephenson

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).