zsh-workers
 help / color / mirror / code / Atom feed
* PATCH: short-circuiting glob exclusion operator
@ 2016-03-21 18:36 Peter Stephenson
  2016-03-21 20:39 ` Mikael Magnusson
  2016-03-21 22:54 ` Bart Schaefer
  0 siblings, 2 replies; 14+ messages in thread
From: Peter Stephenson @ 2016-03-21 18:36 UTC (permalink / raw)
  To: Zsh Hackers' List

This is a request for comments for now.  I'm not yet entirely sure the
pattern syntax with two tildes isn't going to cause problems (though I
don't see what they would be), or that I've identified all the side
effects that can happen.

As an ultra brief summary, ~~ in a glob introduces a pattern that will
prune directories on the way through in addition to acting in the normal
exclusion fashion.  So **/*~~*/foo stops at and ignores any directories
called foo at any level(#1) on the way down.  This is unlike a single
tilde where exclusions are only performed after the complete set of
matches is generated, so it's potentially much more efficient.

(#1) Except the top.  It's a bit anhoying you need  (|*/).

pws

diff --git a/Doc/Zsh/expn.yo b/Doc/Zsh/expn.yo
index c6e7b6f..13ef139 100644
--- a/Doc/Zsh/expn.yo
+++ b/Doc/Zsh/expn.yo
@@ -2083,6 +2083,31 @@ Multiple patterns can be excluded by
 In the exclusion pattern (var(y)), `tt(/)' and `tt(.)' are not treated
 specially the way they usually are in globbing.
 )
+item(var(x)tt(~~)var(y))(
+(Requires tt(EXTENDED_GLOB) to be set.)
+This alternative form of the tt(~) pattern is only handled during
+filename generation (globbing).  The second tilde is not special
+in other forms of pattern matching.
+
+This is similar to the form with a single tilde in that the pattern var(x)
+must match and anything matched by var(y) is excluded from the match.
+However, in this case var(y) is used additionally to prune intermediate
+directories.  In other words, var(y) can match not just a complete file
+name to be returned, but also any directory by which the file is
+reached.  This is much more efficient than the previous form for deep
+directory hierarchies as the directory is excluded as soon as it is
+reached and is never entered.  As with the single-tilde form, tt(/)
+is not special in var(y) and so may be matched by pattern characters.
+
+Only a single tt(~~) is useful in each filename generation pattern.
+
+For example, `tt(**/*.o~~(*/|)lib)' searches recursively for files with
+the suffix `tt(.o)', but if a directory named `tt(lib)' is reached
+at the top level or in any subdirectory found recursively it is not
+searched.  With the pattern `tt(**/*~~(*/|)lib+LPAR()/+RPAR()), which searches
+recursively for directories, directories name tt(lib) are not only not
+searched but are not returned as a match.
+)
 item(var(x)tt(#))(
 (Requires tt(EXTENDED_GLOB) to be set.)
 Matches zero or more occurrences of the pattern var(x).
diff --git a/Src/glob.c b/Src/glob.c
index 2051016..a7a44f4 100644
--- a/Src/glob.c
+++ b/Src/glob.c
@@ -185,6 +185,7 @@ struct globdata {
     int gd_gf_follow, gd_gf_sorts, gd_gf_nsorts;
     struct globsort gd_gf_sortlist[MAX_SORTS];
     LinkList gd_gf_pre_words, gd_gf_post_words;
+    Patprog gd_exclude;
 
     char *gd_glob_pre, *gd_glob_suf;
 };
@@ -259,7 +260,7 @@ struct complist {
 
 /**/
 static void
-addpath(char *s, int l)
+addpath(const char *s, int l)
 {
     DPUTS(!pathbuf, "BUG: pathbuf not initialised");
     while (pathpos + l + 1 >= pathbufsz)
@@ -459,6 +460,38 @@ insert(char *s, int checked)
     return;
 }
 
+/*
+ * Decide whether to prune at this point in scanning. 1 = yes.
+ *
+ * We'll make temporary use of pathbuf for this; it's convenient because
+ * most of a long path won't need copying and if we are later going to
+ * add this file to it we know the buffer will be large enough.
+ *
+ * The filename fn is here metafied.
+ */
+/**/
+static int
+scanner_prune(const char *fn, int len)
+{
+    int ret, savepathpos, saverrsfound, savforceerrs;
+    if (!curglobdata.gd_exclude)
+	return 0;
+    savepathpos = pathpos;
+    addpath(fn, len);
+    if (pathpos && pathbuf[pathpos-1] == '/')
+	pathbuf[pathpos-1] = '\0';
+    /*
+     * As we're in the middle of a glob, be careful about
+     * the counters needed for approximation across directories.
+     */
+    patsaveerrs(&saverrsfound, &savforceerrs);
+    ret = pattry(curglobdata.gd_exclude, pathbuf);
+    patrestoreerrs(saverrsfound, savforceerrs);
+    pathbuf[pathpos = savepathpos] = '\0';
+
+    return ret;
+}
+
 /* Do the globbing:  scanner is called recursively *
  * with successive bits of the path until we've    *
  * tried all of it.                                */
@@ -510,6 +543,8 @@ scanner(Complist q, int shortcircuit)
 	    }
 	    pathbufcwd = pathpos;
 	}
+	if (scanner_prune(str, l))
+	    return;
 	if (q->next) {
 	    /* Not the last path section. Just add it to the path. */
 	    int oppos = pathpos;
@@ -563,6 +598,8 @@ scanner(Complist q, int shortcircuit)
 		continue;
 	    errsfound = errssofar;
 	    if (pattry(p, fn)) {
+		if (scanner_prune(fn, strlen(fn)))
+		    continue;
 		/* if this name matchs the pattern... */
 		if (pbcwdsav == pathbufcwd &&
 		    strlen(fn) + pathpos - pathbufcwd >= PATH_MAX) {
@@ -675,9 +712,10 @@ scanner(Complist q, int shortcircuit)
 
 /**/
 static Complist
-parsecomplist(char *instr)
+parsecomplist(char *instr, char **post_exclude)
 {
     Patprog p1;
+    struct patcompargs compargs;
     Complist l1;
     char *str;
     int compflags = gf_noglobdots ? (PAT_FILE|PAT_NOGLD) : PAT_FILE;
@@ -699,11 +737,11 @@ parsecomplist(char *instr)
 
 	    /* Now get the next path component if there is one. */
 	    l1 = (Complist) zhalloc(sizeof *l1);
-	    if ((l1->next = parsecomplist(instr)) == NULL) {
+	    if ((l1->next = parsecomplist(instr, post_exclude)) == NULL) {
 		errflag |= ERRFLAG_ERROR;
 		return NULL;
 	    }
-	    l1->pat = patcompile(NULL, compflags | PAT_ANY, NULL);
+	    l1->pat = patcompile(NULL, compflags | PAT_ANY, &compargs);
 	    l1->closure = 1;		/* ...zero or more times. */
 	    l1->follow = follow;
 	    return l1;
@@ -715,8 +753,9 @@ parsecomplist(char *instr)
 	!skipparens(Inpar, Outpar, (char **)&str) &&
         *str == zpc_special[ZPC_HASH] && str[-2] == '/') {
 	instr++;
-	if (!(p1 = patcompile(instr, compflags, &instr)))
+	if (!(p1 = patcompile(instr, compflags, &compargs)))
 	    return NULL;
+	instr = compargs.endexp;
 	if (instr[0] == '/' && instr[1] == Outpar && instr[2] == Pound) {
 	    int pdflag = 0;
 
@@ -730,21 +769,24 @@ parsecomplist(char *instr)
 	    /* special case (/)# to avoid infinite recursion */
 	    l1->closure = (*((char *)p1 + p1->startoff)) ? 1 + pdflag : 0;
 	    l1->follow = 0;
-	    l1->next = parsecomplist(instr);
+	    l1->next = parsecomplist(instr, NULL);
 	    return (l1->pat) ? l1 : NULL;
 	}
     } else {
 	/* parse single path component */
-	if (!(p1 = patcompile(instr, compflags|PAT_FILET, &instr)))
+	if (!(p1 = patcompile(instr, compflags|PAT_FILET, &compargs)))
 	    return NULL;
+	instr = compargs.endexp;
 	/* then do the remaining path components */
 	if (*instr == '/' || !*instr) {
 	    int ef = *instr == '/';
 
+	    if (post_exclude && compargs.post_exclude)
+		*post_exclude = dupstring(compargs.post_exclude);
 	    l1 = (Complist) zhalloc(sizeof *l1);
 	    l1->pat = p1;
 	    l1->closure = 0;
-	    l1->next = ef ? parsecomplist(instr+1) : NULL;
+	    l1->next = ef ? parsecomplist(instr+1, post_exclude) : NULL;
 	    return (ef && !l1->next) ? NULL : l1;
 	}
     }
@@ -756,7 +798,7 @@ parsecomplist(char *instr)
 
 /**/
 static Complist
-parsepat(char *str)
+parsepat(char *str, char **post_exclude)
 {
     long assert;
     int ignore;
@@ -784,8 +826,9 @@ parsepat(char *str)
 	pathbuf[pathpos = 1] = '\0';
     } else			/* pattern is relative to pwd */
 	pathbuf[pathpos = 0] = '\0';
+    *post_exclude = NULL;
 
-    return parsecomplist(str);
+    return parsecomplist(str, post_exclude);
 }
 
 /* get number after qualifier */
@@ -1187,6 +1230,7 @@ zglob(LinkList list, LinkNode np, int nountok)
     int first = 0, end = -1;		/* index of first match to return */
 					/* and index+1 of the last match */
     struct globdata saved;		/* saved glob state              */
+    char *post_exclude;			/* text after ~~ */
     int nobareglob = !isset(BAREGLOBQUAL);
     int shortcircuit = 0;		/* How many files to match;      */
 					/* 0 means no limit              */
@@ -1774,7 +1818,7 @@ zglob(LinkList list, LinkNode np, int nountok)
 			    qfirst = qn;
 			    for (qlast = qfirst; qlast->next;
 				 qlast = qlast->next)
-				;			    
+				;
 			} else
 			    qfirst = dup_qual_list(qn, &qlast);
 			/* ... link into new `or' chain ... */
@@ -1804,8 +1848,15 @@ zglob(LinkList list, LinkNode np, int nountok)
 	} else if (newquals)
 	    quals = newquals;
     }
-    q = parsepat(str);
-    if (!q || errflag) {	/* if parsing failed */
+    q = parsepat(str, &post_exclude);
+    if (q && !errflag && post_exclude) {
+	/* Exclusion after ~~: this is a normal pattern, not a file. */
+	curglobdata.gd_exclude = patcompile(post_exclude, 0, NULL);
+    } else {
+	curglobdata.gd_exclude = NULL;
+    }
+    if (!q || errflag || (post_exclude && !curglobdata.gd_exclude)) {
+	/* if parsing failed */
 	restore_globstate(saved);
 	if (unset(BADPATTERN)) {
 	    if (!nountok)
diff --git a/Src/pattern.c b/Src/pattern.c
index 72c7d97..4e5cc0c 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -521,21 +521,24 @@ patcompstart(void)
  * Top level pattern compilation subroutine
  * exp is a null-terminated, metafied string.
  * inflags is an or of some PAT_* flags.
- * endexp, if non-null, is set to a pointer to the end of the
- *   part of exp which was compiled.  This is used when
- *   compiling patterns for directories which must be
+ *
+ * compargs, if non-null, contains pointers that may be set
+ * to indicate the following. This is used when matching files
+ * as in this case the pattern has multiple parts.
+ * - endexp is set to the end of the part of exp which was compiled.
+ *   This is used when compiling patterns for directories which must be
  *   matched recursively.
  */
 
 /**/
 mod_export Patprog
-patcompile(char *exp, int inflags, char **endexp)
+patcompile(char *exp, int inflags, Patcompargs compargs)
 {
     int flags = 0;
     long len = 0;
     long startoff;
     Upat pscan;
-    char *lng, *strp = NULL;
+    char *lng, *strp = NULL, *post_exclude = NULL;
     Patprog p;
 
     queue_signals();
@@ -602,7 +605,7 @@ patcompile(char *exp, int inflags, char **endexp)
 	if (!strp || (*strp && *strp != '/')) {
 	    /* No, do normal compilation. */
 	    strp = NULL;
-	    if (patcompswitch(0, &flags) == 0) {
+	    if (patcompswitch(0, &flags, &post_exclude) == 0) {
 		unqueue_signals();
 		return NULL;
 	    }
@@ -735,13 +738,53 @@ patcompile(char *exp, int inflags, char **endexp)
 	p = newp;
     }
 
-    if (endexp)
-	*endexp = patparse;
+    if (compargs) {
+	compargs->endexp = patparse;
+	if ((compargs->post_exclude = post_exclude))
+	{
+	    /*
+	     * The top-level ~~ is the last thing that can appear.
+	     */
+	    compargs->endexp += strlen(patparse);
+	}
+    }
 
     unqueue_signals();
     return p;
 }
 
+/* Flags for exclusion with ~ */
+enum {
+    TILDE_SINGLE = 1,
+    TILDE_DOUBLE = 2,
+    TILDE_SLASH = 4,
+};
+
+/*
+ * Look for an active tilde; if 2 doubled, which can only happen
+ * at top level (not in parentheses) for a file glob.
+ *
+ * If term_match, we're looking for a terminator (something that can't
+ * be part of a pattern itself) to follow; otherwise, we're
+ * looking for a tilde without a terminator following.
+ */
+
+static int
+pattilde(int toplevel, int term_match)
+{
+    int ret;
+    if (*patparse != zpc_special[ZPC_TILDE])
+	return 0;
+    ret = (toplevel &&
+	   (patflags & PAT_FILET) &&
+	   patparse[1] == zpc_special[ZPC_TILDE]) ? TILDE_DOUBLE : TILDE_SINGLE;
+    if (patparse[ret] == '/' || !memchr(zpc_special, patparse[ret],
+					ZPC_SEG_COUNT))
+	return term_match ? ret : 0;
+    else
+	return term_match ? 0 : ret;
+}
+
 /*
  * Main body or parenthesized subexpression in pattern
  * Parenthesis (and any ksh_glob gubbins) will have been removed.
@@ -749,7 +792,7 @@ patcompile(char *exp, int inflags, char **endexp)
 
 /**/
 static long
-patcompswitch(int paren, int *flagp)
+patcompswitch(int paren, int *flagp, char **post_exclude)
 {
     long starter, br, ender, excsync = 0;
     int parno = 0;
@@ -783,17 +826,35 @@ patcompswitch(int paren, int *flagp)
 
     *flagp |= flags & (P_HSTART|P_PURESTR);
 
-    while (*patparse == zpc_chars[ZPC_BAR] ||
-	   (*patparse == zpc_special[ZPC_TILDE] &&
-	    (patparse[1] == '/' ||
-	     !memchr(zpc_special, patparse[1], ZPC_SEG_COUNT)))) {
-	int tilde = *patparse++ == zpc_special[ZPC_TILDE];
-	long gfnode = 0, newbr;
+    for (;;) {
+	int tilde;
+	long gfnode, newbr;
+	if (*patparse == zpc_chars[ZPC_BAR]) {
+	    /* handle alternation */
+	    tilde = 0;
+	    patparse++;
+	} else if ((tilde = pattilde(!paren, 1))) {
+	    patparse += tilde;
+	    /* handle exclusion... */
+	} else {
+	    break;
+	}
 
+	gfnode = 0;
 	*flagp &= ~P_PURESTR;
 
 	if (tilde) {
 	    union upat up;
+	    if (tilde == TILDE_DOUBLE) {
+		/*
+		 * Double tilde in top-level glob indicates pruning a
+		 * path: handle specially
+		 */
+		*post_exclude = patparse;
+		patparse += strlen(patparse);
+		break;
+	    }
+
 	    /* excsync remembers the P_EXCSYNC node before a chain of
 	     * exclusions:  all point back to this.  only the
 	     * original (non-excluded) branch gets a trailing P_EXCSYNC.
@@ -825,7 +886,7 @@ patcompswitch(int paren, int *flagp)
 	    patadd((char *)&up, 0, sizeof(up), 0);
 	    /* / is not treated as special if we are at top level */
 	    if (!paren && zpc_special[ZPC_SLASH] == '/') {
-		tilde++;
+		tilde |= TILDE_SLASH;
 		zpc_special[ZPC_SLASH] = Marker;
 	    }
 	} else {
@@ -864,7 +925,7 @@ patcompswitch(int paren, int *flagp)
 	    }
 	}
 	newbr = patcompbranch(&flags, paren);
-	if (tilde == 2) {
+	if (tilde & TILDE_SLASH) {
 	    /* restore special treatment of / */
 	    zpc_special[ZPC_SLASH] = '/';
 	}
@@ -935,8 +996,7 @@ patcompbranch(int *flagp, int paren)
 
     starter = chain = 0;
     while (!memchr(zpc_special, *patparse, ZPC_SEG_COUNT) ||
-	   (*patparse == zpc_special[ZPC_TILDE] && patparse[1] != '/' &&
-	    memchr(zpc_special, patparse[1], ZPC_SEG_COUNT))) {
+	   pattilde(!paren, 0)) {
 	if ((*patparse == zpc_special[ZPC_INPAR] &&
 	     patparse[1] == zpc_special[ZPC_HASH]) ||
 	    (*patparse == zpc_special[ZPC_KSH_AT] && patparse[1] == Inpar &&
@@ -1298,9 +1358,7 @@ patcomppiece(int *flagp, int paren)
 	     */
 	    if (kshchar ||
 		(memchr(zpc_special, *patparse, ZPC_NO_KSH_GLOB) &&
-		 (*patparse != zpc_special[ZPC_TILDE] ||
-		  patparse[1] == '/' ||
-		  !memchr(zpc_special, patparse[1], ZPC_SEG_COUNT)))) {
+		 !pattilde(!paren, 0))) {
 		break;
 	    }
     	}
@@ -1507,7 +1565,7 @@ patcomppiece(int *flagp, int paren)
 		 */
 		if (!(starter = patcompnot(1, &flags2)))
 		    return 0;
-	    } else if (!(starter = patcompswitch(1, &flags2)))
+	    } else if (!(starter = patcompswitch(1, &flags2, NULL)))
 		return 0;
 	    flags |= flags2 & P_HSTART;
 	    break;
@@ -1759,7 +1817,8 @@ patcompnot(int paren, int *flagsp)
     pattail(starter, excl = patnode(P_EXCLUDE));
     up.p = NULL;
     patadd((char *)&up, 0, sizeof(up), 0);
-    if (!(br = (paren ? patcompswitch(1, &dummy) : patcompbranch(&dummy, 0))))
+    if (!(br = (paren ? patcompswitch(1, &dummy, NULL) :
+		patcompbranch(&dummy, 0))))
 	return 0;
     pattail(br, patnode(P_EXCEND));
     n = patnode(P_NOTHING); /* just so much easier */
@@ -2024,6 +2083,26 @@ pattrystart(void)
     errsfound = 0;
 }
 
+/* Save the error count during a glob. */
+
+/**/
+void
+patsaveerrs(int *errsfoundp, int *forceerrsp)
+{
+    *errsfoundp = errsfound;
+    *forceerrsp = forceerrs;
+}
+
+/* Restore the error count during a glob. */
+
+/**/
+void
+patrestoreerrs(int errsfoundin, int forceerrsin)
+{
+    errsfound = errsfoundin;
+    forceerrs = forceerrsin;
+}
+
 /*
  * Fix up string length stuff.
  *
diff --git a/Src/zsh.h b/Src/zsh.h
index eee31da..765392a 100644
--- a/Src/zsh.h
+++ b/Src/zsh.h
@@ -511,6 +511,7 @@ typedef struct options	 *Options;
 typedef struct optname   *Optname;
 typedef struct param     *Param;
 typedef struct paramdef  *Paramdef;
+typedef struct patcompargs *Patcompargs;
 typedef struct patstralloc  *Patstralloc;
 typedef struct patprog   *Patprog;
 typedef struct prepromptfn *Prepromptfn;
@@ -1500,6 +1501,14 @@ struct patstralloc {
     int progstrunmetalen;	/* Length of the foregoing */
 };
 
+struct patcompargs {
+    char *endexp;		/* End of the part of the input
+				 * expression that was compiled.*/
+    char *post_exclude;         /* Expression following ~ in top-level
+				 * file match: handled specially.
+				 * Enabled by PAT_FILET flag. */
+};
+
 /* Flags used in pattern matchers (Patprog) and passed down to patcompile */
 
 #define PAT_FILE	0x0001	/* Pattern is a file name */
diff --git a/Test/D02glob.ztst b/Test/D02glob.ztst
index a6b704a..f140b30 100644
--- a/Test/D02glob.ztst
+++ b/Test/D02glob.ztst
@@ -655,3 +655,28 @@
   [[ "1" = [$~cset] ]] || print Fail 5
   [[ "b" != [$~cset] ]] || print Fail 6
 0:character set specified as active variabe
+
+  (cd glob.tmp
+   setopt extendedglob
+   mkdir DT DT/one DT/two DT/three
+   for dir in one two three; do
+     mkdir DT/$dir/sub{1..3}
+     touch DT/$dir/sub{1..3}/file.{a,b,c}
+   done
+   print -l DT/o*/*3/file.*
+   print -l DT/**/*~~DT(.N)
+   # Attention: EXCLUSIONS BLOW YOUR MIND warning...
+   print -l DT/**/*.*~~(*sub[123]*.[ab]|*sub[13]*)
+   print -l DT/**/*.*~~*/(sub[12]|one|two)
+  )
+0:~~ exclusions
+>DT/one/sub3/file.a
+>DT/one/sub3/file.b
+>DT/one/sub3/file.c
+>
+>DT/one/sub2/file.c
+>DT/three/sub2/file.c
+>DT/two/sub2/file.c
+>DT/three/sub3/file.a
+>DT/three/sub3/file.b
+>DT/three/sub3/file.c


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: PATCH: short-circuiting glob exclusion operator
  2016-03-21 18:36 PATCH: short-circuiting glob exclusion operator Peter Stephenson
@ 2016-03-21 20:39 ` Mikael Magnusson
  2016-03-21 22:54 ` Bart Schaefer
  1 sibling, 0 replies; 14+ messages in thread
From: Mikael Magnusson @ 2016-03-21 20:39 UTC (permalink / raw)
  To: Peter Stephenson; +Cc: Zsh Hackers' List

On Mon, Mar 21, 2016 at 7:36 PM, Peter Stephenson
<p.stephenson@samsung.com> wrote:
> This is a request for comments for now.  I'm not yet entirely sure the
> pattern syntax with two tildes isn't going to cause problems (though I
> don't see what they would be), or that I've identified all the side
> effects that can happen.
>
> As an ultra brief summary, ~~ in a glob introduces a pattern that will
> prune directories on the way through in addition to acting in the normal
> exclusion fashion.  So **/*~~*/foo stops at and ignores any directories
> called foo at any level(#1) on the way down.  This is unlike a single
> tilde where exclusions are only performed after the complete set of
> matches is generated, so it's potentially much more efficient.
>
> (#1) Except the top.  It's a bit anhoying you need  (|*/).

Is it like (^foo/)##* then, except possibly easier to combine with
other patterns, and/or to stop on more complicated expressions? I
assume you could do **/*~~*/foo/(|*/)bar where * also can match across
multiple slashes to stop recursing in bar directories if we're already
in a foo directory. It's not immediately obvious to me how to achieve
that with (^foo/)##* so it's probably not possible ;).

Here's a possible problem with double tilde as the syntax:

% touch foo~ hi~ hello
% echo *~~hi*
foo~

If ~~ was special, this would mean * with hi* removed, and print hello
foo~ instead of just foo~, which could break existing users. Or more
dramatically like in this fictional example,

# remove all backup files except some I
# like to keep for plot purposes
rm *~~importantfile*

With the change this would wipe out both backups and originals.

-- 
Mikael Magnusson


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: PATCH: short-circuiting glob exclusion operator
  2016-03-21 18:36 PATCH: short-circuiting glob exclusion operator Peter Stephenson
  2016-03-21 20:39 ` Mikael Magnusson
@ 2016-03-21 22:54 ` Bart Schaefer
  2016-03-22  9:46   ` Peter Stephenson
  1 sibling, 1 reply; 14+ messages in thread
From: Bart Schaefer @ 2016-03-21 22:54 UTC (permalink / raw)
  To: Zsh Hackers' List

On Mar 21,  6:36pm, Peter Stephenson wrote:
}
} This is a request for comments for now.  I'm not yet entirely sure the
} pattern syntax with two tildes isn't going to cause problems (though I
} don't see what they would be), or that I've identified all the side
} effects that can happen.

As Mikael has already pointed out, files with names ending in a tilde
followed by an exclusion pattern may cause trouble with this.

(Incidentally, you already can't use nameddirs or ~/ in exclusions, so
that's not at issue.)

} As an ultra brief summary, ~~ in a glob introduces a pattern that will
} prune directories on the way through in addition to acting in the normal
} exclusion fashion.

I don't suppose it would instead be possible to special-case something at
the END of the pattern?  E.g. **/*~*/foo/** where the trailing /** means
to prune at whatever precedes it.  I suppose that conflicts with the new
GLOB_STAR_SHORT option, although ultimately it means exactly the same in
context.

Aside:  Do we really want to add this before we've figured out the
background for "Extended globbing seems to have become much slower"?


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: PATCH: short-circuiting glob exclusion operator
  2016-03-21 22:54 ` Bart Schaefer
@ 2016-03-22  9:46   ` Peter Stephenson
  2016-03-22 11:46     ` Jesper Nygårds
  2016-03-26 15:40     ` Bart Schaefer
  0 siblings, 2 replies; 14+ messages in thread
From: Peter Stephenson @ 2016-03-22  9:46 UTC (permalink / raw)
  To: Zsh Hackers' List

On Mon, 21 Mar 2016 15:54:21 -0700
Bart Schaefer <schaefer@brasslantern.com> wrote:
> On Mar 21,  6:36pm, Peter Stephenson wrote:
> }
> } This is a request for comments for now.  I'm not yet entirely sure the
> } pattern syntax with two tildes isn't going to cause problems (though I
> } don't see what they would be), or that I've identified all the side
> } effects that can happen.
> 
> As Mikael has already pointed out, files with names ending in a tilde
> followed by an exclusion pattern may cause trouble with this.

Yes, they do: the logic was that if it saw "~~" in the middle, the first
"~" couldn't be an exclusion, because there was nothing after it other
than another "~" (exclusions have to be non-empty patterns), so it was
treated as quoted.  This would change.

~# might work instead.

> } As an ultra brief summary, ~~ in a glob introduces a pattern that will
> } prune directories on the way through in addition to acting in the normal
> } exclusion fashion.
> 
> I don't suppose it would instead be possible to special-case something at
> the END of the pattern?  E.g. **/*~*/foo/** where the trailing /** means
> to prune at whatever precedes it.

That's quite hard to parse; you're happily looking at a normal "~" and
then you find right at the end that actually it doesn't belong within
the pattern at all, instead it belongs higher up with the hierarchy of
file patterns (although ~ is already hacked a bit to apply to the whole
pattern in ways I haven't quite got to the bottom of again).

What worries me most is pat~foo~bar is supposed to exclude both
foo and bar, and I'm not sure how that could fit in here.

> I suppose that conflicts with the new GLOB_STAR_SHORT option, although
> ultimately it means exactly the same in context.

It doesn't, really, since directories aren't specially handled after the
~, or could in principle be specially handled by different means.

> Aside:  Do we really want to add this before we've figured out the
> background for "Extended globbing seems to have become much slower"?

No hurry, but I have no idea where that's going and we already seem to
have excluded most of the things we know about...

pws


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: PATCH: short-circuiting glob exclusion operator
  2016-03-22  9:46   ` Peter Stephenson
@ 2016-03-22 11:46     ` Jesper Nygårds
  2016-03-23  9:56       ` Peter Stephenson
  2016-03-26 15:40     ` Bart Schaefer
  1 sibling, 1 reply; 14+ messages in thread
From: Jesper Nygårds @ 2016-03-22 11:46 UTC (permalink / raw)
  To: Zsh Hackers' List

[-- Attachment #1: Type: text/plain, Size: 1175 bytes --]

On Tue, Mar 22, 2016 at 10:46 AM, Peter Stephenson <p.stephenson@samsung.com
> wrote:

> On Mon, 21 Mar 2016 15:54:21 -0700
> Bart Schaefer <schaefer@brasslantern.com> wrote:
>
> > Aside:  Do we really want to add this before we've figured out the
> > background for "Extended globbing seems to have become much slower"?
>
> No hurry, but I have no idea where that's going and we already seem to
> have excluded most of the things we know about...
>
>
Concerning the "Extended globbing..." thread: since I switched to using
'find' instead of zsh globbing, I haven't looked any more at the problem
discussed in that thread. I'd be happy to help with whatever testing anyone
finds relevant, but I was a bit worried that this was a problem only I was
seeing.

In fact, the directory I was having problems with, and running tests
within, is a collection of Java open source projects, so it would certainly
be possible to reconstruct the directory hierarchy on someone else's
computer, if anyone wants to run tests with my particular hierarchy. I'd be
happy to provide instructions if anyone wants to do that. But apart from
that, I'm afraid I can't be of much help in fixing it.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: PATCH: short-circuiting glob exclusion operator
  2016-03-22 11:46     ` Jesper Nygårds
@ 2016-03-23  9:56       ` Peter Stephenson
  0 siblings, 0 replies; 14+ messages in thread
From: Peter Stephenson @ 2016-03-23  9:56 UTC (permalink / raw)
  To: Zsh Hackers' List

On Tue, 22 Mar 2016 12:46:43 +0100
Jesper Nygårds <jesper.nygards@gmail.com> wrote:
> In fact, the directory I was having problems with, and running tests
> within, is a collection of Java open source projects, so it would certainly
> be possible to reconstruct the directory hierarchy on someone else's
> computer, if anyone wants to run tests with my particular hierarchy. I'd be
> happy to provide instructions if anyone wants to do that. But apart from
> that, I'm afraid I can't be of much help in fixing it.

If it's easy to assemble the instructions, it might be useful, for the
record, and one of us might get a chance...

pws


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: PATCH: short-circuiting glob exclusion operator
  2016-03-22  9:46   ` Peter Stephenson
  2016-03-22 11:46     ` Jesper Nygårds
@ 2016-03-26 15:40     ` Bart Schaefer
  2016-04-07 20:10       ` Peter Stephenson
  1 sibling, 1 reply; 14+ messages in thread
From: Bart Schaefer @ 2016-03-26 15:40 UTC (permalink / raw)
  To: zsh-workers

On Mar 22,  9:46am, Peter Stephenson wrote:
} Subject: Re: PATCH: short-circuiting glob exclusion operator
}
} On Mon, 21 Mar 2016 15:54:21 -0700
} Bart Schaefer <schaefer@brasslantern.com> wrote:
} > 
} > As Mikael has already pointed out, files with names ending in a tilde
} > followed by an exclusion pattern may cause trouble with this.
} 
} Yes, they do: [...]
} 
} ~# might work instead.

Maybe this should be a glob qualifier instead of a pattern, since it does
not make sense in generic pattern context.  We have Y3 to short-circuit
after 3 matches; perhaps Y~ to short-circuit when an exclusion matches?
Or are qualifiers also parsed too late?


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: PATCH: short-circuiting glob exclusion operator
  2016-03-26 15:40     ` Bart Schaefer
@ 2016-04-07 20:10       ` Peter Stephenson
  2016-04-08  3:18         ` Bart Schaefer
  0 siblings, 1 reply; 14+ messages in thread
From: Peter Stephenson @ 2016-04-07 20:10 UTC (permalink / raw)
  To: zsh-workers

On Sat, 26 Mar 2016 08:40:42 -0700
Bart Schaefer <schaefer@brasslantern.com> wrote:
> Maybe this should be a glob qualifier instead of a pattern, since it does
> not make sense in generic pattern context.  We have Y3 to short-circuit
> after 3 matches; perhaps Y~ to short-circuit when an exclusion matches?
> Or are qualifiers also parsed too late?

It looks like it has to be Y- (or something else); a ~ in parentheses is
taken as an indication that the parentheses are for grouping, not flags,
and it's too late to change that assumption now.  But '-' means 'not' or
'something other than the postive integer you might have been lead to
expect', and stuff.

Otherwise this seems to work.

pws


commit 30bace3e5571530abc68d8629e8f215bd87e7457
Author: Peter Stephenson <p.w.stephenson@ntlworld.com>
Date:   Mon Mar 21 20:02:22 2016 +0000

    (Y-) globbing flag: addapted patch from 38199
    
    This allows exclusions with ~ to match against intermediate directories
    in order to prune searches early.

diff --git a/Doc/Zsh/expn.yo b/Doc/Zsh/expn.yo
index c6e7b6f..81bb897 100644
--- a/Doc/Zsh/expn.yo
+++ b/Doc/Zsh/expn.yo
@@ -2082,6 +2082,23 @@ Multiple patterns can be excluded by
 `var(foo)tt(~)var(bar)tt(~)var(baz)'.
 In the exclusion pattern (var(y)), `tt(/)' and `tt(.)' are not treated
 specially the way they usually are in globbing.
+
+In filename generation a top-level pattern after a tt(~) (i.e. outside
+any parentheses) is treated specially if the globbing flag tt(Y-) is
+also preseent.  As before, the pattern var(x) must match and anything
+matched by var(y) is excluded from the match.  However, in this case
+var(y) is used additionally to prune intermediate directories.  In other
+words, var(y) can match not just a complete file name to be returned,
+but also any directory by which the file is reached.  This is much more
+efficient than the previous form for deep directory hierarchies as the
+directory is excluded as soon as it is reached and is never entered.
+
+For example, `tt(**/*.o~(*/|)lib+LPAR()Y-+RPAR())' searches recursively
+for files with the suffix `tt(.o)', but if a directory named `tt(lib)'
+is reached at the top level or in any subdirectory found recursively it
+is not searched.  With the pattern `tt(**/*~(*/|)lib+LPAR()/Y-+RPAR()),
+which searches recursively for directories, directories named tt(lib)
+are not only not searched but are not returned as a match.
 )
 item(var(x)tt(#))(
 (Requires tt(EXTENDED_GLOB) to be set.)
@@ -2734,6 +2751,12 @@ matches in directory traversal order will be considered.
 
 Implies tt(oN) when no tt(o)var(c) qualifier is used.
 )
+item(tt(Y-))(
+Also implies short-circuit, but in this case with exclusions using the
+tt(~) extended glob pattern.  Any intermediate directory that matches
+the excluded pattern causes globbing to stop at that point.  For
+more detail see the description of the var(x)tt(~)var(y) pattern above.
+)
 item(tt(o)var(c))(
 specifies how the names of the files should be sorted. If var(c) is
 tt(n) they are sorted by name; if it is tt(L) they
diff --git a/Src/glob.c b/Src/glob.c
index 2051016..5c59be5 100644
--- a/Src/glob.c
+++ b/Src/glob.c
@@ -183,8 +183,10 @@ struct globdata {
     int gd_gf_nullglob, gd_gf_markdirs, gd_gf_noglobdots, gd_gf_listtypes;
     int gd_gf_numsort;
     int gd_gf_follow, gd_gf_sorts, gd_gf_nsorts;
+    int gd_exclude_shortcut;
     struct globsort gd_gf_sortlist[MAX_SORTS];
     LinkList gd_gf_pre_words, gd_gf_post_words;
+    Patprog gd_exclude;
 
     char *gd_glob_pre, *gd_glob_suf;
 };
@@ -259,7 +261,7 @@ struct complist {
 
 /**/
 static void
-addpath(char *s, int l)
+addpath(const char *s, int l)
 {
     DPUTS(!pathbuf, "BUG: pathbuf not initialised");
     while (pathpos + l + 1 >= pathbufsz)
@@ -459,6 +461,38 @@ insert(char *s, int checked)
     return;
 }
 
+/*
+ * Decide whether to prune at this point in scanning. 1 = yes.
+ *
+ * We'll make temporary use of pathbuf for this; it's convenient because
+ * most of a long path won't need copying and if we are later going to
+ * add this file to it we know the buffer will be large enough.
+ *
+ * The filename fn is here metafied.
+ */
+/**/
+static int
+scanner_prune(const char *fn, int len)
+{
+    int ret, savepathpos, saverrsfound, savforceerrs;
+    if (!curglobdata.gd_exclude)
+	return 0;
+    savepathpos = pathpos;
+    addpath(fn, len);
+    if (pathpos && pathbuf[pathpos-1] == '/')
+	pathbuf[pathpos-1] = '\0';
+    /*
+     * As we're in the middle of a glob, be careful about
+     * the counters needed for approximation across directories.
+     */
+    patsaveerrs(&saverrsfound, &savforceerrs);
+    ret = pattry(curglobdata.gd_exclude, pathbuf);
+    patrestoreerrs(saverrsfound, savforceerrs);
+    pathbuf[pathpos = savepathpos] = '\0';
+
+    return ret;
+}
+
 /* Do the globbing:  scanner is called recursively *
  * with successive bits of the path until we've    *
  * tried all of it.                                */
@@ -510,6 +544,8 @@ scanner(Complist q, int shortcircuit)
 	    }
 	    pathbufcwd = pathpos;
 	}
+	if (scanner_prune(str, l))
+	    return;
 	if (q->next) {
 	    /* Not the last path section. Just add it to the path. */
 	    int oppos = pathpos;
@@ -563,6 +599,8 @@ scanner(Complist q, int shortcircuit)
 		continue;
 	    errsfound = errssofar;
 	    if (pattry(p, fn)) {
+		if (scanner_prune(fn, strlen(fn)))
+		    continue;
 		/* if this name matchs the pattern... */
 		if (pbcwdsav == pathbufcwd &&
 		    strlen(fn) + pathpos - pathbufcwd >= PATH_MAX) {
@@ -675,9 +713,10 @@ scanner(Complist q, int shortcircuit)
 
 /**/
 static Complist
-parsecomplist(char *instr)
+parsecomplist(char *instr, char **post_exclude)
 {
     Patprog p1;
+    struct patcompargs compargs;
     Complist l1;
     char *str;
     int compflags = gf_noglobdots ? (PAT_FILE|PAT_NOGLD) : PAT_FILE;
@@ -699,11 +738,11 @@ parsecomplist(char *instr)
 
 	    /* Now get the next path component if there is one. */
 	    l1 = (Complist) zhalloc(sizeof *l1);
-	    if ((l1->next = parsecomplist(instr)) == NULL) {
+	    if ((l1->next = parsecomplist(instr, post_exclude)) == NULL) {
 		errflag |= ERRFLAG_ERROR;
 		return NULL;
 	    }
-	    l1->pat = patcompile(NULL, compflags | PAT_ANY, NULL);
+	    l1->pat = patcompile(NULL, compflags | PAT_ANY, &compargs);
 	    l1->closure = 1;		/* ...zero or more times. */
 	    l1->follow = follow;
 	    return l1;
@@ -715,8 +754,9 @@ parsecomplist(char *instr)
 	!skipparens(Inpar, Outpar, (char **)&str) &&
         *str == zpc_special[ZPC_HASH] && str[-2] == '/') {
 	instr++;
-	if (!(p1 = patcompile(instr, compflags, &instr)))
+	if (!(p1 = patcompile(instr, compflags, &compargs)))
 	    return NULL;
+	instr = compargs.endexp;
 	if (instr[0] == '/' && instr[1] == Outpar && instr[2] == Pound) {
 	    int pdflag = 0;
 
@@ -730,21 +770,28 @@ parsecomplist(char *instr)
 	    /* special case (/)# to avoid infinite recursion */
 	    l1->closure = (*((char *)p1 + p1->startoff)) ? 1 + pdflag : 0;
 	    l1->follow = 0;
-	    l1->next = parsecomplist(instr);
+	    l1->next = parsecomplist(instr, NULL);
 	    return (l1->pat) ? l1 : NULL;
 	}
     } else {
 	/* parse single path component */
-	if (!(p1 = patcompile(instr, compflags|PAT_FILET, &instr)))
+	if (post_exclude)
+	    compflags |= PAT_FILET|PAT_EXCL_PRUNE;
+	else
+	    compflags |= PAT_FILET;
+	if (!(p1 = patcompile(instr, compflags, &compargs)))
 	    return NULL;
+	instr = compargs.endexp;
 	/* then do the remaining path components */
 	if (*instr == '/' || !*instr) {
 	    int ef = *instr == '/';
 
+	    if (post_exclude && compargs.post_exclude)
+		*post_exclude = dupstring(compargs.post_exclude);
 	    l1 = (Complist) zhalloc(sizeof *l1);
 	    l1->pat = p1;
 	    l1->closure = 0;
-	    l1->next = ef ? parsecomplist(instr+1) : NULL;
+	    l1->next = ef ? parsecomplist(instr+1, post_exclude) : NULL;
 	    return (ef && !l1->next) ? NULL : l1;
 	}
     }
@@ -756,7 +803,7 @@ parsecomplist(char *instr)
 
 /**/
 static Complist
-parsepat(char *str)
+parsepat(char *str, char **post_exclude)
 {
     long assert;
     int ignore;
@@ -785,7 +832,7 @@ parsepat(char *str)
     } else			/* pattern is relative to pwd */
 	pathbuf[pathpos = 0] = '\0';
 
-    return parsecomplist(str);
+    return parsecomplist(str, post_exclude);
 }
 
 /* get number after qualifier */
@@ -1187,6 +1234,7 @@ zglob(LinkList list, LinkNode np, int nountok)
     int first = 0, end = -1;		/* index of first match to return */
 					/* and index+1 of the last match */
     struct globdata saved;		/* saved glob state              */
+    char *post_exclude = NULL;		/* text after ~~ */
     int nobareglob = !isset(BAREGLOBQUAL);
     int shortcircuit = 0;		/* How many files to match;      */
 					/* 0 means no limit              */
@@ -1220,7 +1268,7 @@ zglob(LinkList list, LinkNode np, int nountok)
     gf_listtypes = gf_follow = 0;
     gf_noglobdots = unset(GLOBDOTS);
     gf_numsort = isset(NUMERICGLOBSORT);
-    gf_sorts = gf_nsorts = 0;
+    gf_sorts = gf_nsorts = curglobdata.gd_exclude_shortcut = 0;
     gf_pre_words = gf_post_words = NULL;
 
     /* Check for qualifiers */
@@ -1550,12 +1598,18 @@ zglob(LinkList list, LinkNode np, int nountok)
 		    shortcircuit = !(sense & 1);
 		    if (shortcircuit) {
 			/* Parse the argument. */
-			data = qgetnum(&s);
-			if ((shortcircuit = data) != data) {
-			    /* Integer overflow */
-			    zerr("value too big: Y%s", s_saved);
-			    restore_globstate(saved);
-			    return;
+			if (*s == '-') {
+			    curglobdata.gd_exclude_shortcut = 1;
+			    shortcircuit = 0; /* handled differently */
+			    s++;
+			} else {
+			    data = qgetnum(&s);
+			    if ((shortcircuit = data) != data) {
+				/* Integer overflow */
+				zerr("value too big: Y%s", s_saved);
+				restore_globstate(saved);
+				return;
+			    }
 			}
 		    }
 		    break;
@@ -1774,7 +1828,7 @@ zglob(LinkList list, LinkNode np, int nountok)
 			    qfirst = qn;
 			    for (qlast = qfirst; qlast->next;
 				 qlast = qlast->next)
-				;			    
+				;
 			} else
 			    qfirst = dup_qual_list(qn, &qlast);
 			/* ... link into new `or' chain ... */
@@ -1804,8 +1858,20 @@ zglob(LinkList list, LinkNode np, int nountok)
 	} else if (newquals)
 	    quals = newquals;
     }
-    q = parsepat(str);
-    if (!q || errflag) {	/* if parsing failed */
+    q = parsepat(str, curglobdata.gd_exclude_shortcut ? &post_exclude : NULL);
+    if (q && !errflag && post_exclude) {
+	/*
+	 * Exclusion after ~ with (Y-).
+	 * This is a normal pattern, not a file, but indicate the
+	 * special case as if we encounter a further top-level tilde
+	 * it indicates patterns to be or'd in the exclusion.
+	 */
+	curglobdata.gd_exclude = patcompile(post_exclude, PAT_EXCL_PRUNE, NULL);
+    } else {
+	curglobdata.gd_exclude = NULL;
+    }
+    if (!q || errflag || (post_exclude && !curglobdata.gd_exclude)) {
+	/* if parsing failed */
 	restore_globstate(saved);
 	if (unset(BADPATTERN)) {
 	    if (!nountok)
diff --git a/Src/pattern.c b/Src/pattern.c
index 4e2f236..7ef64a7 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -521,21 +521,24 @@ patcompstart(void)
  * Top level pattern compilation subroutine
  * exp is a null-terminated, metafied string.
  * inflags is an or of some PAT_* flags.
- * endexp, if non-null, is set to a pointer to the end of the
- *   part of exp which was compiled.  This is used when
- *   compiling patterns for directories which must be
+ *
+ * compargs, if non-null, contains pointers that may be set
+ * to indicate the following. This is used when matching files
+ * as in this case the pattern has multiple parts.
+ * - endexp is set to the end of the part of exp which was compiled.
+ *   This is used when compiling patterns for directories which must be
  *   matched recursively.
  */
 
 /**/
 mod_export Patprog
-patcompile(char *exp, int inflags, char **endexp)
+patcompile(char *exp, int inflags, Patcompargs compargs)
 {
     int flags = 0;
     long len = 0;
     long startoff;
     Upat pscan;
-    char *lng, *strp = NULL;
+    char *lng, *strp = NULL, *post_exclude = NULL;
     Patprog p;
 
     queue_signals();
@@ -602,7 +605,7 @@ patcompile(char *exp, int inflags, char **endexp)
 	if (!strp || (*strp && *strp != '/')) {
 	    /* No, do normal compilation. */
 	    strp = NULL;
-	    if (patcompswitch(0, &flags) == 0) {
+	    if (patcompswitch(0, &flags, &post_exclude) == 0) {
 		unqueue_signals();
 		return NULL;
 	    }
@@ -735,13 +738,47 @@ patcompile(char *exp, int inflags, char **endexp)
 	p = newp;
     }
 
-    if (endexp)
-	*endexp = patparse;
+    if (compargs) {
+	compargs->endexp = patparse;
+	if ((compargs->post_exclude = post_exclude))
+	{
+	    /*
+	     * The top-level ~~ is the last thing that can appear.
+	     */
+	    compargs->endexp += strlen(patparse);
+	}
+    }
 
     unqueue_signals();
     return p;
 }
 
+/* Flags for exclusion with ~ */
+enum {
+    TILDE_SINGLE = 1,
+    TILDE_SLASH = 2,
+};
+
+/*
+ * Look for an active tilde.
+ *
+ * If term_match, we're looking for a terminator (something that can't
+ * be part of a pattern itself) to follow; otherwise, we're
+ * looking for a tilde without a terminator following.
+ */
+
+static int
+pattilde(int toplevel, int term_match)
+{
+    if (*patparse != zpc_special[ZPC_TILDE])
+	return 0;
+    if (patparse[1] == '/' || !memchr(zpc_special, patparse[1],
+				      ZPC_SEG_COUNT))
+	return term_match ? TILDE_SINGLE : 0;
+    else
+	return term_match ? 0 : TILDE_SINGLE;
+}
+
 /*
  * Main body or parenthesized subexpression in pattern
  * Parenthesis (and any ksh_glob gubbins) will have been removed.
@@ -749,7 +786,7 @@ patcompile(char *exp, int inflags, char **endexp)
 
 /**/
 static long
-patcompswitch(int paren, int *flagp)
+patcompswitch(int paren, int *flagp, char **post_exclude)
 {
     long starter, br, ender, excsync = 0;
     int parno = 0;
@@ -783,17 +820,45 @@ patcompswitch(int paren, int *flagp)
 
     *flagp |= flags & (P_HSTART|P_PURESTR);
 
-    while (*patparse == zpc_chars[ZPC_BAR] ||
-	   (*patparse == zpc_special[ZPC_TILDE] &&
-	    (patparse[1] == '/' ||
-	     !memchr(zpc_special, patparse[1], ZPC_SEG_COUNT)))) {
-	int tilde = *patparse++ == zpc_special[ZPC_TILDE];
-	long gfnode = 0, newbr;
+    for (;;) {
+	int tilde;
+	long gfnode, newbr;
+	if (*patparse == zpc_chars[ZPC_BAR]) {
+	    /* handle alternation */
+	    tilde = 0;
+	    patparse++;
+	} else if ((tilde = pattilde(!paren, 1))) {
+	    patparse += tilde;
+	    if ((patflags & (PAT_FILET|PAT_EXCL_PRUNE)) == PAT_EXCL_PRUNE)
+	    {
+		/*
+		 * We're looking at something like the second tilde in
+		 * foo~bar1~bar2(Y-).  We need to treat this like
+		 * foo excluding bar1|bar2, so treat the tilde as a bar.
+		 */
+		tilde = 0;
+	    }
+	    /* else handle exclusion... */
+	} else {
+	    break;
+	}
 
+	gfnode = 0;
 	*flagp &= ~P_PURESTR;
 
 	if (tilde) {
 	    union upat up;
+	    if ((patflags & (PAT_FILET|PAT_EXCL_PRUNE)) ==
+		(PAT_FILET|PAT_EXCL_PRUNE)) {
+		/*
+		 * Prruning a partial path, not just the fully expanded
+		 * file: handle specially
+		 */
+		*post_exclude = patparse;
+		patparse += strlen(patparse);
+		break;
+	    }
+
 	    /* excsync remembers the P_EXCSYNC node before a chain of
 	     * exclusions:  all point back to this.  only the
 	     * original (non-excluded) branch gets a trailing P_EXCSYNC.
@@ -825,7 +890,7 @@ patcompswitch(int paren, int *flagp)
 	    patadd((char *)&up, 0, sizeof(up), 0);
 	    /* / is not treated as special if we are at top level */
 	    if (!paren && zpc_special[ZPC_SLASH] == '/') {
-		tilde++;
+		tilde |= TILDE_SLASH;
 		zpc_special[ZPC_SLASH] = Marker;
 	    }
 	} else {
@@ -864,7 +929,7 @@ patcompswitch(int paren, int *flagp)
 	    }
 	}
 	newbr = patcompbranch(&flags, paren);
-	if (tilde == 2) {
+	if (tilde & TILDE_SLASH) {
 	    /* restore special treatment of / */
 	    zpc_special[ZPC_SLASH] = '/';
 	}
@@ -935,8 +1000,7 @@ patcompbranch(int *flagp, int paren)
 
     starter = chain = 0;
     while (!memchr(zpc_special, *patparse, ZPC_SEG_COUNT) ||
-	   (*patparse == zpc_special[ZPC_TILDE] && patparse[1] != '/' &&
-	    memchr(zpc_special, patparse[1], ZPC_SEG_COUNT))) {
+	   pattilde(!paren, 0)) {
 	if ((*patparse == zpc_special[ZPC_INPAR] &&
 	     patparse[1] == zpc_special[ZPC_HASH]) ||
 	    (*patparse == zpc_special[ZPC_KSH_AT] && patparse[1] == Inpar &&
@@ -1298,9 +1362,7 @@ patcomppiece(int *flagp, int paren)
 	     */
 	    if (kshchar ||
 		(memchr(zpc_special, *patparse, ZPC_NO_KSH_GLOB) &&
-		 (*patparse != zpc_special[ZPC_TILDE] ||
-		  patparse[1] == '/' ||
-		  !memchr(zpc_special, patparse[1], ZPC_SEG_COUNT)))) {
+		 !pattilde(!paren, 0))) {
 		break;
 	    }
     	}
@@ -1507,7 +1569,7 @@ patcomppiece(int *flagp, int paren)
 		 */
 		if (!(starter = patcompnot(1, &flags2)))
 		    return 0;
-	    } else if (!(starter = patcompswitch(1, &flags2)))
+	    } else if (!(starter = patcompswitch(1, &flags2, NULL)))
 		return 0;
 	    flags |= flags2 & P_HSTART;
 	    break;
@@ -1759,7 +1821,8 @@ patcompnot(int paren, int *flagsp)
     pattail(starter, excl = patnode(P_EXCLUDE));
     up.p = NULL;
     patadd((char *)&up, 0, sizeof(up), 0);
-    if (!(br = (paren ? patcompswitch(1, &dummy) : patcompbranch(&dummy, 0))))
+    if (!(br = (paren ? patcompswitch(1, &dummy, NULL) :
+		patcompbranch(&dummy, 0))))
 	return 0;
     pattail(br, patnode(P_EXCEND));
     n = patnode(P_NOTHING); /* just so much easier */
@@ -2040,6 +2103,26 @@ pattrystart(void)
     errsfound = 0;
 }
 
+/* Save the error count during a glob. */
+
+/**/
+void
+patsaveerrs(int *errsfoundp, int *forceerrsp)
+{
+    *errsfoundp = errsfound;
+    *forceerrsp = forceerrs;
+}
+
+/* Restore the error count during a glob. */
+
+/**/
+void
+patrestoreerrs(int errsfoundin, int forceerrsin)
+{
+    errsfound = errsfoundin;
+    forceerrs = forceerrsin;
+}
+
 /*
  * Fix up string length stuff.
  *
diff --git a/Src/zsh.h b/Src/zsh.h
index eee31da..7f58354 100644
--- a/Src/zsh.h
+++ b/Src/zsh.h
@@ -511,6 +511,7 @@ typedef struct options	 *Options;
 typedef struct optname   *Optname;
 typedef struct param     *Param;
 typedef struct paramdef  *Paramdef;
+typedef struct patcompargs *Patcompargs;
 typedef struct patstralloc  *Patstralloc;
 typedef struct patprog   *Patprog;
 typedef struct prepromptfn *Prepromptfn;
@@ -1500,6 +1501,14 @@ struct patstralloc {
     int progstrunmetalen;	/* Length of the foregoing */
 };
 
+struct patcompargs {
+    char *endexp;		/* End of the part of the input
+				 * expression that was compiled.*/
+    char *post_exclude;         /* Expression following ~ in top-level
+				 * file match: handled specially.
+				 * Enabled by PAT_FILET flag. */
+};
+
 /* Flags used in pattern matchers (Patprog) and passed down to patcompile */
 
 #define PAT_FILE	0x0001	/* Pattern is a file name */
@@ -1515,6 +1524,12 @@ struct patstralloc {
 #define PAT_NOTEND	0x0400	/* End of string is not real end */
 #define PAT_HAS_EXCLUDP	0x0800	/* (internal): top-level path1~path2. */
 #define PAT_LCMATCHUC   0x1000  /* equivalent to setting (#l) */
+#define PAT_EXCL_PRUNE  0x2000  /* With PAT_FILET, any trailing ~ pattern
+				 * can prune directories, too.
+				 * Without PAT_FILET, this is that
+				 * pattern, so subsequent "~"s have the
+				 * effect of "|".
+				 */
 
 /**
  * Indexes into the array of active pattern characters.
diff --git a/Test/D02glob.ztst b/Test/D02glob.ztst
index a6b704a..83d36f6 100644
--- a/Test/D02glob.ztst
+++ b/Test/D02glob.ztst
@@ -655,3 +655,28 @@
   [[ "1" = [$~cset] ]] || print Fail 5
   [[ "b" != [$~cset] ]] || print Fail 6
 0:character set specified as active variabe
+
+  (cd glob.tmp
+   setopt extendedglob
+   mkdir DT DT/one DT/two DT/three
+   for dir in one two three; do
+     mkdir DT/$dir/sub{1..3}
+     touch DT/$dir/sub{1..3}/file.{a,b,c}
+   done
+   print -l DT/o*/*3/file.*
+   print -l DT/**/*~DT(.NY-)
+   # Attention: EXCLUSIONS BLOW YOUR MIND warning...
+   print -l DT/**/*.*~(*sub[123]*.[ab]|*sub[13]*)(Y-)
+   print -l DT/**/*.*~*/(sub[12]|one|two)(Y-)
+  )
+0:~~ exclusions
+>DT/one/sub3/file.a
+>DT/one/sub3/file.b
+>DT/one/sub3/file.c
+>
+>DT/one/sub2/file.c
+>DT/three/sub2/file.c
+>DT/two/sub2/file.c
+>DT/three/sub3/file.a
+>DT/three/sub3/file.b
+>DT/three/sub3/file.c


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: PATCH: short-circuiting glob exclusion operator
  2016-04-07 20:10       ` Peter Stephenson
@ 2016-04-08  3:18         ` Bart Schaefer
  2016-04-09 18:22           ` Peter Stephenson
  0 siblings, 1 reply; 14+ messages in thread
From: Bart Schaefer @ 2016-04-08  3:18 UTC (permalink / raw)
  To: zsh-workers

On Apr 7,  9:10pm, Peter Stephenson wrote:
}
} It looks like it has to be Y- (or something else)

I considered suggesting (Y/) for "short-circuit directories".

} @@ -735,13 +738,47 @@ patcompile(char *exp, int inflags, char **endexp)
} +	    /*
} +	     * The top-level ~~ is the last thing that can appear.
} +	     */

Is that comment about ~~ still meaningful?


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: PATCH: short-circuiting glob exclusion operator
  2016-04-08  3:18         ` Bart Schaefer
@ 2016-04-09 18:22           ` Peter Stephenson
  2016-04-09 19:00             ` Mikael Magnusson
  0 siblings, 1 reply; 14+ messages in thread
From: Peter Stephenson @ 2016-04-09 18:22 UTC (permalink / raw)
  To: zsh-workers

On Thu, 7 Apr 2016 20:18:30 -0700
Bart Schaefer <schaefer@brasslantern.com> wrote:
> On Apr 7,  9:10pm, Peter Stephenson wrote:
> }
> } It looks like it has to be Y- (or something else)
> 
> I considered suggesting (Y/) for "short-circuit directories".

That might be a good idea.

> } @@ -735,13 +738,47 @@ patcompile(char *exp, int inflags, char **endexp)
> } +	    /*
> } +	     * The top-level ~~ is the last thing that can appear.
> } +	     */
> 
> Is that comment about ~~ still meaningful?

There are a few comments that need updating.

pws


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: PATCH: short-circuiting glob exclusion operator
  2016-04-09 18:22           ` Peter Stephenson
@ 2016-04-09 19:00             ` Mikael Magnusson
  2016-04-09 19:51               ` Peter Stephenson
  0 siblings, 1 reply; 14+ messages in thread
From: Mikael Magnusson @ 2016-04-09 19:00 UTC (permalink / raw)
  To: Peter Stephenson; +Cc: zsh workers

On Sat, Apr 9, 2016 at 8:22 PM, Peter Stephenson
<p.w.stephenson@ntlworld.com> wrote:
> On Thu, 7 Apr 2016 20:18:30 -0700
> Bart Schaefer <schaefer@brasslantern.com> wrote:
>> On Apr 7,  9:10pm, Peter Stephenson wrote:
>> }
>> } It looks like it has to be Y- (or something else)
>>
>> I considered suggesting (Y/) for "short-circuit directories".
>
> That might be a good idea.

The only argument against this syntax (and I'm not saying it's a
strong argument) is that you have to choose between being able to
shortcircuit a subtree, or excluding a particular match that happens
to be a directory (while still including children of that directory).
Unless like before this only applies to the final ~ pattern in the,
er, pattern?

-- 
Mikael Magnusson


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: PATCH: short-circuiting glob exclusion operator
  2016-04-09 19:00             ` Mikael Magnusson
@ 2016-04-09 19:51               ` Peter Stephenson
  2016-04-10  5:40                 ` Bart Schaefer
  0 siblings, 1 reply; 14+ messages in thread
From: Peter Stephenson @ 2016-04-09 19:51 UTC (permalink / raw)
  To: zsh workers

On Sat, 9 Apr 2016 21:00:36 +0200
Mikael Magnusson <mikachu@gmail.com> wrote:
> The only argument against this syntax (and I'm not saying it's a
> strong argument) is that you have to choose between being able to
> shortcircuit a subtree, or excluding a particular match that happens
> to be a directory (while still including children of that directory).
> Unless like before this only applies to the final ~ pattern in the,
> er, pattern?

They're not actually in competition; I meant to ensure it was possible
to specify both, which I've done.

pws



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: PATCH: short-circuiting glob exclusion operator
  2016-04-09 19:51               ` Peter Stephenson
@ 2016-04-10  5:40                 ` Bart Schaefer
  2016-04-10 12:11                   ` Peter Stephenson
  0 siblings, 1 reply; 14+ messages in thread
From: Bart Schaefer @ 2016-04-10  5:40 UTC (permalink / raw)
  To: zsh workers

On Apr 9,  8:51pm, Peter Stephenson wrote:
} Subject: Re: PATCH: short-circuiting glob exclusion operator
}
} On Sat, 9 Apr 2016 21:00:36 +0200
} Mikael Magnusson <mikachu@gmail.com> wrote:
} > The only argument against this syntax (and I'm not saying it's a
} > strong argument) is that you have to choose between being able to
} > shortcircuit a subtree, or excluding a particular match that happens
} > to be a directory (while still including children of that directory).
} 
} They're not actually in competition; I meant to ensure it was possible
} to specify both, which I've done.

I think Mikael is asking about specifying both conditions in the *same*
glob pattern, i.e. **/*~*/foo~*/bar(Y-) such that bar directories are
short-circuited whereas only paths ending in foo are excluded, but any
files below a directory foo remain.  Your doc and the comments about
treating multiple ~ as if they were | seem to imply that both foo and
bar would be short-circuited in this example.

Of course one can still write **/*~*/foo~*/bar/* without (Y-).


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: PATCH: short-circuiting glob exclusion operator
  2016-04-10  5:40                 ` Bart Schaefer
@ 2016-04-10 12:11                   ` Peter Stephenson
  0 siblings, 0 replies; 14+ messages in thread
From: Peter Stephenson @ 2016-04-10 12:11 UTC (permalink / raw)
  To: zsh workers

On Sat, 09 Apr 2016 22:40:42 -0700
Bart Schaefer <schaefer@brasslantern.com> wrote:
> On Apr 9,  8:51pm, Peter Stephenson wrote:
> I think Mikael is asking about specifying both conditions in the *same*
> glob pattern, i.e. **/*~*/foo~*/bar(Y-) such that bar directories are
> short-circuited whereas only paths ending in foo are excluded, but any
> files below a directory foo remain.  Your doc and the comments about
> treating multiple ~ as if they were | seem to imply that both foo and
> bar would be short-circuited in this example.
> 
> Of course one can still write **/*~*/foo~*/bar/* without (Y-).

Right, you have to pick one or the other meaning.

If this becomes important we could force (#e) only to match on
a complete match, which is sort of what it's for.  But I wouldn't
expect this to be a real problem.

pws


^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2016-04-10 12:11 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-21 18:36 PATCH: short-circuiting glob exclusion operator Peter Stephenson
2016-03-21 20:39 ` Mikael Magnusson
2016-03-21 22:54 ` Bart Schaefer
2016-03-22  9:46   ` Peter Stephenson
2016-03-22 11:46     ` Jesper Nygårds
2016-03-23  9:56       ` Peter Stephenson
2016-03-26 15:40     ` Bart Schaefer
2016-04-07 20:10       ` Peter Stephenson
2016-04-08  3:18         ` Bart Schaefer
2016-04-09 18:22           ` Peter Stephenson
2016-04-09 19:00             ` Mikael Magnusson
2016-04-09 19:51               ` Peter Stephenson
2016-04-10  5:40                 ` Bart Schaefer
2016-04-10 12:11                   ` Peter Stephenson

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/zsh/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).