* Set operations @ 2008-09-27 9:02 Phil Pennock 2008-09-27 20:48 ` Peter Stephenson 0 siblings, 1 reply; 3+ messages in thread From: Phil Pennock @ 2008-09-27 9:02 UTC (permalink / raw) To: Zsh hackers list Somehow, until a recent post from Bart, I'd missed ${array:#element} to remove an element from an array. That was the clean removal method I'd been missing for using -U unique arrays as sets. So the below is a first pass implementation of set arithmetic; however, the safesub stuff shows that my big problem is, when joining existing elements together with |, escaping pattern-characters in the original array elements. It does try to be fairly resistant to non-normal setups without going full emulate, since it only uses a few features. So given 'b' naming the set variable with the elements to remove, I do a first pass approximation like this: safesub=("${(@)${(@)${(@)${(P@)b//\\/\\\\}//\|/\\|}//\?/\\?}//\*/\\*}") instead of just, for set subtraction of b from a: "${(P@)a:#${(Pj,|,)~b}}" However, this obviously doesn't deal with the full set of pattern-matching characters, so is incomplete. So, before I head off and implement a parameter expansion flag to escape pattern-matching characters (honouring current pattern-affecting options), the question is whether I'm missing something obvious to do this set arithmetic cleanly already, or if there's already a clean way to escape pattern characters? Any feedback on the complete set (so to speak) of functionality or naming or implementation appreciated, with an eye to adding this to the standard zsh install (autoloaded function set_operations to define these?) Or is the thing to do to overload the arithmetic operations in arithmetic expansion, for let, to be able to handle -U arrays as sets natively? :-) (let diff=a-b ...) (The number of times I work with large lists of machines interactively in shell and want to deal with them as sets and manipulate appropriately is somewhat high and I'm fed up of switching to Python.) 
Regards, -Phil
#----------------------------8< cut here >8------------------------------
# Set arithmetic on zsh "typeset -U" (unique) arrays.
#
# Every function takes the NAMES of the set variables, not their values.
# Naming convention:
#   *_new   NAME A B   store the result in the global unique array NAME
#   *_in    NAME B     update the set NAME in place
#   *_print A B        print the result, shell-quoted with (q), on stdout
#
# Subtraction builds an alternation pattern ("x|y|z") from the subtrahend
# and removes matching elements with ${...:#pattern}.  Before joining, the
# characters \ | ? * are backslash-escaped in each element so they match
# literally.  Other pattern characters are NOT escaped, so the escaping is
# incomplete.
#
# Limitation: the helper locals used below (name, new, old, a, b, tmp,
# safesub, safe_a, safe_b, safe_name) shadow caller variables with the same
# names, so sets must not use those names.

# newset NAME [ELEMENT...]: create the global unique array NAME.
function newset {
  setopt local_options no_ksh_arrays
  local name="$1"; shift
  typeset -gUa $name
  set -A $name "$@"
}

# copyset_tofrom NEW OLD: copy the elements of OLD into the global set NEW.
function copyset_tofrom {
  setopt local_options no_ksh_arrays
  local new="$1" old="$2"
  typeset -gUa $new
  set -A $new "${(P@)old}"
}

# copyset_fromto OLD NEW: copyset_tofrom with the arguments swapped.
function copyset_fromto {
  copyset_tofrom "$2" "$1"
}

# set_add_new NEW A B: NEW = A union B.
function set_add_new {
  setopt local_options no_ksh_arrays
  local new="$1" a="$2" b="$3"
  typeset -gUa $new
  set -A $new "${(P@)a}" "${(P@)b}"
}

# set_add_in NAME B: NAME = NAME union B.
function set_add_in {
  setopt local_options no_ksh_arrays
  local name="$1" b="$2"
  set -A $name "${(P@)name}" "${(P@)b}"
}

# set_add_print A B: print A union B.
function set_add_print {
  setopt local_options no_ksh_arrays
  local a="$1" b="$2"
  typeset -Ua tmp
  tmp=("${(P@)a}" "${(P@)b}")
  print -r -- ${(q)tmp}
}

# set_subtract_new NEW A B: NEW = A minus B.
function set_subtract_new {
  setopt local_options no_ksh_arrays
  local new="$1" a="$2" b="$3"
  typeset -gUa $new
  typeset -Ua safesub
  # Escape \ | ? * in each element of B before using them as a pattern.
  safesub=("${(@)${(@)${(@)${(P@)b//\\/\\\\}//\|/\\|}//\?/\\?}//\*/\\*}")
  set -A $new "${(P@)a:#${(j,|,)~safesub}}"
}

# set_subtract_in NAME B: NAME = NAME minus B.
function set_subtract_in {
  setopt local_options no_ksh_arrays
  local name="$1" b="$2"
  typeset -Ua safesub
  safesub=("${(@)${(@)${(@)${(P@)b//\\/\\\\}//\|/\\|}//\?/\\?}//\*/\\*}")
  set -A $name "${(P@)name:#${(j,|,)~safesub}}"
}

# set_subtract_print A B: print A minus B.
function set_subtract_print {
  setopt local_options no_ksh_arrays
  local a="$1" b="$2"
  typeset -Ua tmp
  typeset -Ua safesub
  safesub=("${(@)${(@)${(@)${(P@)b//\\/\\\\}//\|/\\|}//\?/\\?}//\*/\\*}")
  tmp=("${(P@)a:#${(j,|,)~safesub}}")
  print -r -- ${(q)tmp}
}

# set_intersection_new NEW A B: NEW = elements of A that are also in B.
# The (M) flag makes :# keep the matching elements instead of removing them.
function set_intersection_new {
  setopt local_options no_ksh_arrays
  local new="$1" a="$2" b="$3"
  typeset -gUa $new
  typeset -Ua safesub
  safesub=("${(@)${(@)${(@)${(P@)b//\\/\\\\}//\|/\\|}//\?/\\?}//\*/\\*}")
  set -A $new "${(@PM)a:#${(j,|,)~safesub}}"
}

# Aliases: union == add, difference == subtract.
function set_union_new {
  set_add_new "$@"
}
function set_union_in {
  set_add_in "$@"
}
function set_union_print {
  set_add_print "$@"
}
function set_difference_new {
  set_subtract_new "$@"
}
function set_difference_in {
  set_subtract_in "$@"
}
function set_difference_print {
  set_subtract_print "$@"
}

# set_symmetric_difference_new NEW A B: NEW = (A minus B) union (B minus A).
function set_symmetric_difference_new {
  setopt local_options no_ksh_arrays
  local new="$1" a="$2" b="$3"
  typeset -gUa $new
  typeset -Ua safe_a safe_b
  # Fix: the escaped copy of A was previously assigned to safe_b and then
  # overwritten, leaving safe_a empty so "B minus A" removed nothing.
  safe_a=("${(@)${(@)${(@)${(P@)a//\\/\\\\}//\|/\\|}//\?/\\?}//\*/\\*}")
  safe_b=("${(@)${(@)${(@)${(P@)b//\\/\\\\}//\|/\\|}//\?/\\?}//\*/\\*}")
  set -A $new "${(P@)a:#${(j,|,)~safe_b}}" "${(P@)b:#${(j,|,)~safe_a}}"
}

# set_symmetric_difference_in NAME B: NAME = (NAME minus B) union (B minus NAME).
function set_symmetric_difference_in {
  setopt local_options no_ksh_arrays
  local name="$1" b="$2"
  typeset -Ua safe_name safe_b
  # Fix: build safe_name from the set called NAME; the previous code read
  # an undefined $a into safe_a and then used the never-assigned safe_name.
  safe_name=("${(@)${(@)${(@)${(P@)name//\\/\\\\}//\|/\\|}//\?/\\?}//\*/\\*}")
  safe_b=("${(@)${(@)${(@)${(P@)b//\\/\\\\}//\|/\\|}//\?/\\?}//\*/\\*}")
  # Both expansions are evaluated before set -A assigns, so the old value
  # of NAME is used on both sides.
  set -A $name "${(P@)name:#${(j,|,)~safe_b}}" "${(P@)b:#${(j,|,)~safe_name}}"
}

# set_symmetric_difference_print A B: print (A minus B) union (B minus A).
function set_symmetric_difference_print {
  setopt local_options no_ksh_arrays
  local a="$1" b="$2"
  typeset -Ua tmp
  typeset -Ua safe_a safe_b
  # Fix: same safe_a/safe_b mix-up as in set_symmetric_difference_new.
  safe_a=("${(@)${(@)${(@)${(P@)a//\\/\\\\}//\|/\\|}//\?/\\?}//\*/\\*}")
  safe_b=("${(@)${(@)${(@)${(P@)b//\\/\\\\}//\|/\\|}//\?/\\?}//\*/\\*}")
  tmp=("${(P@)a:#${(j,|,)~safe_b}}" "${(P@)b:#${(j,|,)~safe_a}}")
  print -r -- ${(q)tmp}
}

# set_insert_list NAME ELEMENT...: add the given elements to the set NAME.
function set_insert_list {
  setopt local_options no_ksh_arrays
  local name="$1"; shift
  set -A $name "${(P@)name}" "$@"
}

# set_remove_list NAME ELEMENT...: remove the given elements from the set NAME.
function set_remove_list {
  setopt local_options no_ksh_arrays
  local name="$1"; shift
  typeset -Ua safesub
  # Fix: the remaining arguments are element VALUES, so escape them
  # directly; the previous (P) flag treated each one as a parameter name.
  safesub=("${(@)${(@)${(@)${(@)@//\\/\\\\}//\|/\\|}//\?/\\?}//\*/\\*}")
  set -A $name "${(P@)name:#${(j,|,)~safesub}}"
}
#----------------------------8< cut here >8------------------------------
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: Set operations 2008-09-27 9:02 Set operations Phil Pennock @ 2008-09-27 20:48 ` Peter Stephenson 2008-09-28 1:50 ` Phil Pennock 0 siblings, 1 reply; 3+ messages in thread From: Peter Stephenson @ 2008-09-27 20:48 UTC (permalink / raw) To: Zsh hackers list On Sat, 27 Sep 2008 02:02:25 -0700 Phil Pennock <zsh-workers+phil.pennock@spodhuis.org> wrote: > So given 'b' naming the set variable with the elements to remove, I do a > first pass approximation like this: > > safesub=("${(@)${(@)${(@)${(P@)b//\\/\\\\}//\|/\\|}//\?/\\?}//\*/\\*}") > > instead of just, for set subtraction of b from a: > > "${(P@)a:#${(Pj,|,)~b}}" I appreciate there's a problem here, but I'm a little bit lost as to what exactly you're after. Do you mean you'd like the "|" in the (j,|,) to be treated as a pattern character, but the characters substituted from the parameter named by b not to be expanded? I *think*, in that case, the right thing to do might be a flag to cause the argument to j to be tokenized, rather than forcing the substitution of b to be quoted, since this is much simpler. Currently, I don't think we have any code that will quote a string so that future token expansion will leave it as it was at the start. (We do do this with quoting, but it's not the same thing---we're in too deep here for quoting and unquoting to be useful.) An obvious way to do this would be to allow '~' in the parentheses to have that meaning. It's easier to do this if that '~' only applies to arguments to its right, which I think is natural (and is true of the current (p) flag, although this doesn't seem to be documented). So, if I'm following, you want "${(P@)a:#${(P~j,|,)b}}" If I'm not following, you may want white chocolate mousse followed by a cappuccino. Anyway, I'm not sure why I didn't do this ages ago, it makes testing lists of alternatives vastly easier. I've also achieved a long-standing ambition of editing out a pointless and ugly macro.
Index: Doc/Zsh/expn.yo =================================================================== RCS file: /cvsroot/zsh/zsh/Doc/Zsh/expn.yo,v retrieving revision 1.93 diff -u -r1.93 expn.yo --- Doc/Zsh/expn.yo 27 Sep 2008 19:57:33 -0000 1.93 +++ Doc/Zsh/expn.yo 27 Sep 2008 20:46:18 -0000 @@ -910,7 +910,19 @@ startitem() item(tt(p))( Recognize the same escape sequences as the tt(print) builtin -in string arguments to any of the flags described below. +in string arguments to any of the flags described below that +follow this argument. +) +item(tt(~))( +Force string arguments to any of the flags below that follow within +the parentheses to be treated as patterns. Compare with a tt(~) +outside parentheses, which forces the entire substituted string to +be treated as a pattern. Hence, for example, +example([[ "?" = ${(~j.|.)array} ]]) +with the tt(EXTENDED_GLOB) option set succeeds if and only if tt($array) +contains the string `tt(?)' as an element. The argument may be +repeated to toggle the behaviour; its effect only lasts to the +end of the parenthesised group. ) item(tt(j:)var(string)tt(:))( Join the words of arrays together using var(string) as a separator. 
Index: Src/subst.c =================================================================== RCS file: /cvsroot/zsh/zsh/Src/subst.c,v retrieving revision 1.88 diff -u -r1.88 subst.c --- Src/subst.c 27 Sep 2008 19:57:33 -0000 1.88 +++ Src/subst.c 27 Sep 2008 20:46:20 -0000 @@ -1232,6 +1232,22 @@ return metafy(ptr, len, META_USEHEAP); } +static char * +untok_and_escape(char *s, int escapes, int tok_arg) +{ + int klen; + char *dst; + + untokenize(dst = dupstring(s)); + if (escapes) { + dst = getkeystring(dst, &klen, GETKEYS_SEP, NULL); + dst = metafy(dst, klen, META_HREALLOC); + } + if (tok_arg) + shtokenize(dst); + return dst; +} + /* parameter substitution */ #define isstring(c) ((c) == '$' || (char)(c) == String || (char)(c) == Qstring) @@ -1501,23 +1517,16 @@ int tt = 0; zlong num; /* - * The (p) flag is (uniquely) only remembered within + * The (p) flag is only remembered within * this block. It says we do print-style handling * on the values for flags, but only on those. - * This explains the ghastly macro, but why can't it - * be a function? UNTOK_AND_ESCAPE is defined - * so that the argument must be an lvalue. */ int escapes = 0; - int klen; -#define UNTOK(C) (itok(C) ? ztokens[(C) - Pound] : (C)) -#define UNTOK_AND_ESCAPE(X, S) {\ - untokenize(X = dupstring(S));\ - if (escapes) {\ - X = getkeystring(X, &klen, GETKEYS_SEP, NULL);\ - X = metafy(X, klen, META_HREALLOC);\ - }\ - } + /* + * '~' in parentheses caused tokenization of string arg: + * similar to (p). 
+ */ + int tok_arg = 0; for (s++; (c = *s) != ')' && c != Outpar; s++, tt = 0) { int arglen; /* length of modifier argument */ @@ -1528,6 +1537,10 @@ case ')': case Outpar: break; + case '~': + case Tilde: + tok_arg = !tok_arg; + break; case 'A': ++arrasg; break; @@ -1642,9 +1655,11 @@ sav = *t; *t = '\0'; if (tt) - UNTOK_AND_ESCAPE(spsep, s + arglen) + spsep = untok_and_escape(s + arglen, + escapes, tok_arg); else - UNTOK_AND_ESCAPE(sep, s + arglen) + sep = untok_and_escape(s + arglen, + escapes, tok_arg); *t = sav; s = t + arglen - 1; } else @@ -1677,9 +1692,11 @@ sav = *t; *t = '\0'; if (tt) - UNTOK_AND_ESCAPE(premul, s + arglen) + premul = untok_and_escape(s + arglen, escapes, + tok_arg); else - UNTOK_AND_ESCAPE(postmul, s + arglen) + postmul = untok_and_escape(s + arglen, escapes, + tok_arg); *t = sav; sav = *s; s = t + arglen; @@ -1695,9 +1712,11 @@ sav = *t; *t = '\0'; if (tt) - UNTOK_AND_ESCAPE(preone, s + arglen) + preone = untok_and_escape(s + arglen, + escapes, tok_arg); else - UNTOK_AND_ESCAPE(postone, s + arglen) + postone = untok_and_escape(s + arglen, + escapes, tok_arg); *t = sav; /* -1 since loop will increment */ s = t + arglen - 1; Index: Test/D04parameter.ztst =================================================================== RCS file: /cvsroot/zsh/zsh/Test/D04parameter.ztst,v retrieving revision 1.34 diff -u -r1.34 D04parameter.ztst --- Test/D04parameter.ztst 5 Sep 2008 21:02:16 -0000 1.34 +++ Test/D04parameter.ztst 27 Sep 2008 20:46:20 -0000 @@ -1,4 +1,5 @@ # Test parameter expansion. Phew. +# (By the way, did I say "phew"?) 
%prep @@ -1062,3 +1063,26 @@ >1 >1 >1 + + foo=("|" "?") + [[ "|" = ${(j.|.)foo} ]] && print yes || print no + [[ "|" = ${(j.|.)~foo} ]] && print yes || print no + [[ "|" = ${(~j.|.)foo} ]] && print yes || print no + [[ "|" = ${(~~j.|.)foo} ]] && print yes || print no + [[ "|" = ${(j.|.~)foo} ]] && print yes || print no + [[ "x" = ${(j.|.)foo} ]] && print yes || print no + [[ "x" = ${(j.|.)~foo} ]] && print yes || print no + [[ "x" = ${(~j.|.)foo} ]] && print yes || print no + [[ "x" = ${(~~j.|.)foo} ]] && print yes || print no + [[ "x" = ${(j.|.~)foo} ]] && print yes || print no +0:GLOBSUBST only on parameter substitution arguments +>no +>yes +>yes +>no +>no +>no +>yes +>no +>no +>no -- Peter Stephenson <p.w.stephenson@ntlworld.com> Web page now at http://homepage.ntlworld.com/p.w.stephenson/ ^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: Set operations 2008-09-27 20:48 ` Peter Stephenson @ 2008-09-28 1:50 ` Phil Pennock 0 siblings, 0 replies; 3+ messages in thread From: Phil Pennock @ 2008-09-28 1:50 UTC (permalink / raw) To: Peter Stephenson; +Cc: Zsh hackers list [-- Attachment #1: Type: text/plain, Size: 2127 bytes --] On 2008-09-27 at 21:48 +0100, Peter Stephenson wrote: > I appreciate there's a problem here, but I'm a little bit lost as to > what exactly you're after. Do you mean you'd like the "|" in the (j,|,) > to be treated as a pattern character, but the characters substituted > from the parameter named by b not to be expanded? What I was after wasn't anywhere near as good as what you provided. I was thinking that just as we have (q) to apply shell quoting, we should have a flag to escape pattern characters, which would have laboriously allowed me to do what you've just done much more easily by avoiding the whole problem. :) Escape+join+parse_that vs only-parse-the-join > If I'm not following, you may want white chocolate mousse followed by a > cappuccino. No, but my wife would love that. > Anyway, I'm not sure why I didn't do this ages ago, it makes testing > lists of alternatives vastly easier. Oh good, that's two different uses for it which means it's definitely more broadly helpful and it's not just me. *phew* Thanks. Okay, the two attachments provide the resulting clean set operations, with a stupid bug fixed and the intersection operations cleaned up, and the Test file for these. I'm not attached to any names, but I put the first file in Functions/Misc/load_dataset and the test file depends upon that. I just named load_dataset "load_dataset-post-20080927.zsh" to remind anyone using this that it uses your new (~) parameter expansion option. I'm also not attached to this code, since messing with 'let' to have zsh understand sets in arithmetic context is more appealing to me, every time I think about it.
But the two approaches should be mutually compatible and this is the one I opted for. And I *just* realised that I needed set_equal, which sets $?, to provide a clean way to test for equality, since not only are the sets not maintained in sorted order, but the need to deal with arbitrary data would lead to bugginess. *sigh* Arithmetic with no equality tests. What was I thinking? Done. Do people think superset/subset/strict_superset/strict_subset would be useful too?
-Phil
[-- Attachment #2: load_dataset-post-20080927.zsh --] [-- Type: text/plain, Size: 2941 bytes --]
# provide functions for manipulating unique lists as sets.
#
# Every function takes the NAMES of the set variables, not their values.
# Naming convention:
#   *_new   NAME A B   store the result in the global unique array NAME
#   *_in    NAME B     update the set NAME in place
#   *_print A B        print the result, shell-quoted with (q), on stdout
#
# Subtraction and intersection rely on the (~) parameter expansion flag
# inside the parentheses: ${(P~j,|,)b} joins the elements of the set named
# by $b with "|" and makes only that separator act as a pattern character,
# so the elements themselves match literally.
#
# Helper locals carry a "_set_" prefix.  With plain names such as a, b or
# name, a caller's set of the same name was shadowed by the local, so
# ${(P)...} and "set -A" operated on the local instead of the caller's set.

# newset NAME [ELEMENT...]: create the global unique array NAME.
function newset {
  setopt local_options no_ksh_arrays
  local _set_name="$1"; shift
  typeset -gUa $_set_name
  set -A $_set_name "$@"
}

# copyset_tofrom NEW OLD: copy the elements of OLD into the global set NEW.
function copyset_tofrom {
  setopt local_options no_ksh_arrays
  local _set_new="$1" _set_old="$2"
  typeset -gUa $_set_new
  set -A $_set_new "${(P@)_set_old}"
}

# copyset_fromto OLD NEW: copyset_tofrom with the arguments swapped.
function copyset_fromto {
  copyset_tofrom "$2" "$1"
}

# set_add_new NEW A B: NEW = A union B.
function set_add_new {
  setopt local_options no_ksh_arrays
  local _set_new="$1" _set_a="$2" _set_b="$3"
  typeset -gUa $_set_new
  set -A $_set_new "${(P@)_set_a}" "${(P@)_set_b}"
}

# set_add_in NAME B: NAME = NAME union B.
function set_add_in {
  setopt local_options no_ksh_arrays
  local _set_name="$1" _set_b="$2"
  set -A $_set_name "${(P@)_set_name}" "${(P@)_set_b}"
}

# set_add_print A B: print A union B.
function set_add_print {
  setopt local_options no_ksh_arrays
  local _set_a="$1" _set_b="$2"
  typeset -Ua _set_tmp
  _set_tmp=("${(P@)_set_a}" "${(P@)_set_b}")
  print -r -- ${(q)_set_tmp}
}

# set_subtract_new NEW A B: NEW = A minus B.
function set_subtract_new {
  setopt local_options no_ksh_arrays
  local _set_new="$1" _set_a="$2" _set_b="$3"
  typeset -gUa $_set_new
  set -A $_set_new "${(P@)_set_a:#${(P~j,|,)_set_b}}"
}

# set_subtract_in NAME B: NAME = NAME minus B.
function set_subtract_in {
  setopt local_options no_ksh_arrays
  local _set_name="$1" _set_b="$2"
  set -A $_set_name "${(P@)_set_name:#${(P~j,|,)_set_b}}"
}

# set_subtract_print A B: print A minus B.
function set_subtract_print {
  setopt local_options no_ksh_arrays
  local _set_a="$1" _set_b="$2"
  typeset -Ua _set_tmp
  _set_tmp=("${(P@)_set_a:#${(P~j,|,)_set_b}}")
  print -r -- ${(q)_set_tmp}
}

# set_intersection_new NEW A B: NEW = elements of A that are also in B.
# The (M) flag makes :# keep the matching elements instead of removing them.
function set_intersection_new {
  setopt local_options no_ksh_arrays
  local _set_new="$1" _set_a="$2" _set_b="$3"
  typeset -gUa $_set_new
  set -A $_set_new "${(@PM)_set_a:#${(P~j,|,)_set_b}}"
}

# set_intersection_in NAME B: NAME = elements of NAME that are also in B.
function set_intersection_in {
  setopt local_options no_ksh_arrays
  local _set_name="$1" _set_b="$2"
  set -A $_set_name "${(@PM)_set_name:#${(P~j,|,)_set_b}}"
}

# set_intersection_print A B: print the elements of A that are also in B.
function set_intersection_print {
  setopt local_options no_ksh_arrays
  local _set_a="$1" _set_b="$2"
  typeset -Ua _set_tmp
  _set_tmp=("${(@PM)_set_a:#${(P~j,|,)_set_b}}")
  print -r -- ${(q)_set_tmp}
}

# Aliases: union == add, difference == subtract.
function set_union_new {
  set_add_new "$@"
}
function set_union_in {
  set_add_in "$@"
}
function set_union_print {
  set_add_print "$@"
}
function set_difference_new {
  set_subtract_new "$@"
}
function set_difference_in {
  set_subtract_in "$@"
}
function set_difference_print {
  set_subtract_print "$@"
}

# set_symmetric_difference_new NEW A B: NEW = (A minus B) union (B minus A).
function set_symmetric_difference_new {
  setopt local_options no_ksh_arrays
  local _set_new="$1" _set_a="$2" _set_b="$3"
  typeset -gUa $_set_new
  set -A $_set_new "${(P@)_set_a:#${(P~j,|,)_set_b}}" "${(P@)_set_b:#${(P~j,|,)_set_a}}"
}

# set_symmetric_difference_in NAME B: NAME = (NAME minus B) union (B minus NAME).
function set_symmetric_difference_in {
  setopt local_options no_ksh_arrays
  local _set_name="$1" _set_b="$2"
  # Both expansions are evaluated before set -A assigns, so the old value
  # of NAME is used on both sides.
  set -A $_set_name "${(P@)_set_name:#${(P~j,|,)_set_b}}" "${(P@)_set_b:#${(P~j,|,)_set_name}}"
}

# set_symmetric_difference_print A B: print (A minus B) union (B minus A).
function set_symmetric_difference_print {
  setopt local_options no_ksh_arrays
  local _set_a="$1" _set_b="$2"
  typeset -Ua _set_tmp
  _set_tmp=("${(P@)_set_a:#${(P~j,|,)_set_b}}" "${(P@)_set_b:#${(P~j,|,)_set_a}}")
  print -r -- ${(q)_set_tmp}
}

# set_insert_list NAME ELEMENT...: add the given elements to the set NAME.
function set_insert_list {
  setopt local_options no_ksh_arrays
  local _set_name="$1"; shift
  set -A $_set_name "${(P@)_set_name}" "$@"
}

# set_remove_list NAME ELEMENT...: remove the given elements from the set NAME.
function set_remove_list {
  setopt local_options no_ksh_arrays
  local _set_name="$1"; shift
  set -A $_set_name "${(P@)_set_name:#${(~j,|,)@}}"
}

# set_equal A B: return 0 if both sets hold the same elements, else non-zero.
# Elements are sorted (o) and shell-quoted (q) before the joined strings are
# compared, so ordering and embedded whitespace do not affect the result.
function set_equal {
  setopt local_options no_ksh_arrays
  local _set_a="$1" _set_b="$2"
  [[ "${${(P@oq)_set_a}}" == "${${(P@oq)_set_b}}" ]]
}
[-- Attachment #3: Z01datasets.ztst --] [-- Type: text/plain, Size: 1583 bytes --]
# Is there a way to print the variable, in the style of "typeset -p",
# but showing the -U unique-flag?
%prep fpath=(../Functions/Misc) autoload load_dataset load_dataset %test newset x a b c d newset y c d e f copyset_tofrom z x print -l "$x" "$y" "$z" 0:Testing basic set creation >a b c d >c d e f >a b c d set_add_print x y set_add_in z y print $z set_add_new z x y print $z 0:Testing set addition (union) >a b c d e f >a b c d e f >a b c d e f set_subtract_new z x y print $z copyset_fromto x z set_subtract_in z y print $z set_subtract_print x y set_subtract_print y x 0:Testing set subtraction (asymmetric difference) >a b >a b >a b >e f set_intersection_new z x y print $z copyset_tofrom z x set_intersection_in z y print $z set_intersection_print x y 0:Testing set intersection >c d >c d >c d set_symmetric_difference_new z x y print $z copyset_tofrom z x set_symmetric_difference_in z y print $z set_symmetric_difference_print x y set_symmetric_difference_print y x 0:Testing set symmetric difference >a b e f >a b e f >a b e f >e f a b set_insert_list y 'a|b' '*' set_remove_list x d print -l "$x" "$y" 0:Testing basic set item addition and removal >a b c >c d e f a|b * set_intersection_print x y set_symmetric_difference_print x y set_union_print x y set_difference_print x y set_difference_print y x 0:Testing set resilience to meta characters >c >a b d e f a\|b \* >a b c d e f a\|b \* >a b >d e f a\|b \* newset z b c a set_equal x z 0:Testing set equality newset z a 'b c' set_equal x z 1:Testing set inequality ^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2008-09-28 1:51 UTC | newest] Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2008-09-27 9:02 Set operations Phil Pennock 2008-09-27 20:48 ` Peter Stephenson 2008-09-28 1:50 ` Phil Pennock
Code repositories for project(s) associated with this public inbox https://git.vuxu.org/mirror/zsh/ This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).