From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 6494 invoked from network); 25 Dec 1999 00:41:18 -0000 Received: from sunsite.auc.dk (130.225.51.30) by ns1.primenet.com.au with SMTP; 25 Dec 1999 00:41:18 -0000 Received: (qmail 23611 invoked by alias); 25 Dec 1999 00:41:13 -0000 Mailing-List: contact zsh-workers-help@sunsite.auc.dk; run by ezmlm Precedence: bulk X-No-Archive: yes X-Seq: 9154 Received: (qmail 23604 invoked from network); 25 Dec 1999 00:41:11 -0000 To: zsh-workers@sunsite.auc.dk Subject: PATCH: regexparse MIME-Version: 1.0 (generated by AKEMI 1.13.2 - =?ISO-2022-JP?B?Ig==?= =?ISO-2022-JP?B?GyRCQTA0Y0s8GyhCIg==?=) Content-Type: text/plain; charset=US-ASCII From: Tanaka Akira Date: 25 Dec 1999 09:41:08 +0900 Message-ID: User-Agent: Chao-gnus/6.12.5 AKEMI/1.13.2 (=?ISO-2022-JP?B?GyRCQTAbKEI=?= =?ISO-2022-JP?B?GyRCNGNLPBsoQg==?=) FLAM-DOODLE/1.12.6 (=?ISO-2022-JP?B?GyRCM3cbKEI=?= 10R4.0/5.0) Emacs/20.4 (sparc-sun-solaris2.6) MULE/4.0 (HANANOEN) Finally, I re-implemented most of `_regex_arguments' in C. This should accelerate `_apt'. This patch adds a builtin `regexparse' to the module `zsh/zutil' for internal use of `_regex_arguments'. I select `zsh/zutil' because `regexparse' doesn't depend on completion codes --- it just executes strings supplied by arguments which may have compadd, etc. Its own module is another option, though. Index: Completion/Base/_regex_arguments =================================================================== RCS file: /projects/zsh/zsh/Completion/Base/_regex_arguments,v retrieving revision 1.1.1.10 diff -u -r1.1.1.10 _regex_arguments --- Completion/Base/_regex_arguments 1999/12/10 14:47:55 1.1.1.10 +++ Completion/Base/_regex_arguments 1999/12/25 00:23:52 @@ -22,16 +22,17 @@ # pattern = "/" ( glob | "[]" ) "/" [ "+" | "-" ] # lookahead = "%" glob "%" # guard = "-" zsh-code-to-eval -# action = ":" zsh-code-to-eval +# caction = ":" zsh-code-to-eval +# action = "{" zsh-code-to-eval "}" ## regex word sequence definition: -# element = pattern [ lookahead ] [ guard ] [ action ] -# +# element = pattern [ lookahead ] [ guard ] [ caction ] +# # regex = element # | "(" regex ")" # | regex "#" -# | regex regex +# | ( regex | action ) # # | regex "|" regex # example: @@ -56,323 +57,38 @@ # 1 : parse error # 2 : fatal parse error -_ra_parse_elt () { - local state act - if (( $#regex < index )); then - return 1 - else - case "$regex[index]" in - /*/([-+]|)) state=$index - first=($state) - last=($state) - nullable= - case "$regex[index]" in - */+) cutoff[$state]=+;; - */) cutoff[$state]=/;; - */-) cutoff[$state]=-;; - esac - pattern[$state]="${${regex[index++]#/}%/([-+]|)}" - if [[ $pattern[$state] != "[]" ]]; then - pattern[$state]="(#b)((#B)$pattern[$state])" - fi - if [[ $index -le $#regex && $regex[index] = %*% ]]; then - lookahead[$state]="(#B)${regex[index++][2,-2]}" - else - lookahead[$state]="" - fi - if [[ $index -le $#regex && $regex[index] = -* ]]; then - guard[$state]="${regex[index++][2,-1]}" - else - guard[$state]="" - fi - if [[ $index -le $#regex && $regex[index] = :* ]]; then - act="${regex[index++][2,-1]}" - action[$state]="$act" - : ${actions[$act]::="${actions[$act]} $state"} - else - action[$state]="" - fi - ;; - \() (( index++ )) - _ra_parse_alt || return $? - [[ $index -le $#regex && "$regex[$index]" = \) ]] || return 2 - (( index++ )) - ;; - *) return 1 - ;; - esac - fi - - return 0 -} - -_ra_parse_clo () { - _ra_parse_elt || return $? - - if (( index <= $#regex )) && [[ "$regex[$index]" = \# ]]; then - (( index++ )) - nullable=yes - - for i in $last; do tbl[$i]="$tbl[$i] $first"; done - fi - - return 0 +_ra_comp () { + _ra_actions=("$_ra_actions[@]" "$1") } -_ra_parse_seq () { - local last_seq - local first_seq nullable_seq - first_seq=() - nullable_seq=yes - - _ra_parse_clo || { - if (( $? == 2 )); then - return 2 - else - first=() - last=() - nullable=yes - return 0 - fi - } - first_seq=($first) - last_seq=($last) - [[ -n "$nullable" ]] || nullable_seq= - - while :; do - _ra_parse_clo || { - if (( $? == 2 )); then - return 2 - else - break - fi - } - for i in $last_seq; do tbl[$i]="${tbl[$i]} $first"; done - [[ -n "$nullable_seq" ]] && first_seq=($first_seq $first) - [[ -n "$nullable" ]] || { nullable_seq= last_seq=() } - last_seq=($last_seq $last) - done - - first=($first_seq) - nullable=$nullable_seq - last=($last_seq) - return 0 -} - -_ra_parse_alt () { - local last_alt - local first_alt nullable_alt - first_alt=() - nullable_alt= - - _ra_parse_seq || return $? - first_alt=($first_alt $first) - last_alt=($last_alt $last) - [[ -n "$nullable" ]] && nullable_alt=yes - - while :; do - (( index <= $#regex )) || break - [[ "$regex[$index]" = \| ]] || break - (( index++ )) - - _ra_parse_seq || { - if (( $? == 2 )); then - return 2 - else - break - fi - } - first_alt=($first_alt $first) - last_alt=($last_alt $last) - [[ -n "$nullable" ]] && nullable_alt=yes - done - - first=($first_alt) - last=($last_alt) - nullable=$nullable_alt - return 0 -} - -## function generator - -_ra_gen_func () { - local old new - local state index - local test tmp - local start="0" - - old=() - new=($start) - - print -lr - \ - "$funcname () {" \ - 'local _ra_state _ra_left _ra_right _ra_actions' \ - "_ra_state=$start" \ - '_ra_left=' \ - '_ra_right="${(pj:\0:)${(@)words[1,CURRENT - 1]:Q}}"$'\''\0'\''"$PREFIX"' \ - '_ra_actions=()' \ - 'while :; do' \ - 'case "$_ra_state" in' - - while (( $#new )); do - state="$new[1]" - shift new - old=("$old[@]" "$state") - print -lr - \ - "$state)" - _ra_gen_parse_state - print -lr - \ - ';;' - done - - print -lr - \ - 'esac' \ - 'done' \ - 'while (( $#_ra_actions )); do' \ - 'case "$_ra_actions[1]" in' - - for tmp in "${(@k)actions}"; do - if [[ "$tmp" != '' ]]; then - print -lr - "${(j:);&:)${=actions[$tmp]}})" $tmp ';;' - fi - done - - print -lr - \ - 'esac' \ - 'shift _ra_actions' \ - 'done' \ - '}' -} - -_ra_gen_parse_state () { - local actions i p - test='if' - for index in $=tbl[$state]; do - if [[ "$pattern[$index]" != "[]" ]]; then - p="$pattern[$index]$lookahead[$index]*" - if [[ -z "$guard[$index]" ]]; then - print -lr - \ - "$test [[ \$_ra_right = \${~:-${(qqqq)p}} ]]" - else - print -lr - \ - "$test [[ \$_ra_right = \${~:-${(qqqq)p}} ]] && {" \ - "$guard[$index]" \ - "}" - fi - test='elif' - (( $old[(I)$index] || $new[(I)$index] )) || new=($index "$new[@]") - print -lr - \ - "then" \ - "_ra_state=$index" \ - '_ra_right="${_ra_right[mend[1] + 1, -1]}"' - actions=() - for i in $=tbl[$index]; do - if [[ -n $action[$i] ]]; then - actions=($actions $i) - fi - done - case "$cutoff[$index]" in - +) print -lr - \ - '_ra_left="$_ra_left$match[1]"' - if (( $#actions )); then - print -lr - \ - "_ra_actions=($actions \$_ra_actions)" - fi - ;; - /) print -lr - \ - '_ra_left=' - print -lr - \ - 'if (( mend[1] )); then' \ - "_ra_actions=($actions)" - if (( $#actions )); then - print -lr - \ - 'else' \ - "_ra_actions=($actions \$_ra_actions)" - fi - print -lr - \ - 'fi' - ;; - -) print -lr - \ - '_ra_left=' \ - "_ra_actions=($actions)" - ;; - esac - fi - done - - if [[ $test != 'if' ]]; then - # Some branchs are exists. But all of them are failed. - print -lr - \ - 'else' \ - 'if [[ "$_ra_left$_ra_right" = *$'\''\0'\''* ]]; then' \ - '_message "parse failed before current word"' \ - '_ra_actions=()' \ - 'else' \ - 'compset -p $(( $#PREFIX - $#_ra_right - $#_ra_left ))' \ - 'fi' \ - 'break' \ - 'fi' - else - # There are no branch. - print -lr - \ - '_message "no more arguments"' \ - '_ra_actions=()' \ - 'break' - fi -} - _regex_arguments () { - local funcname="_regex_arguments_tmp" - local funcdef - - typeset -A tbl cutoff pattern lookahead guard action actions - local regex index first last nullable - local i state next - - local cache_dir - zstyle -s ":completion${curcontext}:regex" cache-path cache_dir - [[ -z "$cache_dir" ]] && cache_dir="$HOME/.zsh/regex_arguments" - local cache_file="$cache_dir/$1" - local cache_test - - if ! [[ -f "$cache_file" ]] || ! source "$cache_file" "$@"; then - funcname="$1" - - regex=("${(@)argv[2,-1]}") - index=1 - tbl=() - pattern=() - lookahead=() - guard=() - action=() - actions=() - _ra_parse_alt - - if (( $? == 2 || index != $#regex + 1 )); then - if (( index != $#regex + 1 )); then - print "regex parse error at $index: $regex[index]" >&2 + local regex funcname="$1" + shift + regex=(${@/(#b):(*)/":_ra_comp ${(qqqq)match[1]}"}) + + eval \ + "$funcname"' () { + local _ra_p1 _ra_p2 _ra_left _ra_right _ra_com + local _ra_actions _ra_line="${(pj:\0:)${(@)words[1,CURRENT - 1]:Q}}"$'\''\0'\''"$PREFIX" + regexparse _ra_p1 _ra_p2 "$_ra_line" '"${(j: :)${(qqqq)regex[@]}}"' + case "$?" in + 0|2) _message "no more arguments";; + 1) + if [[ "$_ra_line[_ra_p1 + 1, -1]" = *$'\''\0'\''* ]]; then + _message "parse failed before current word" else - print "regex parse error at $index (end)" >&2 + : $#PREFIX - $#_ra_line + $_ra_p1 : $_ra_p2 + _ra_left="$_ra_line[_ra_p1 + 1, _ra_p2]" + _ra_right="$_ra_line[_ra_p2 + 1, -1]" + compset -p $(( $#PREFIX - $#_ra_line + $_ra_p1 )) + for _ra_com in "$_ra_actions[@]"; do + eval "$_ra_com" + done fi - return 1 - fi - - tbl[0]=" $first" - - unfunction "$funcname" 2>/dev/null - funcdef="$(_ra_gen_func)" - - if [[ -d "$cache_dir" && -w "$cache_dir" ]]; then - print -lr - \ - 'if [[ $# -eq '$#' && "$*" = '"${(qqqq)*}"' ]]; then' \ - "$funcdef" \ - 'true; else false; fi' > "${cache_file}.$HOST.$$" - source "${cache_file}.$HOST.$$" "$@" - mv "${cache_file}.$HOST.$$" "${cache_file}" - else - source =(print -lr - "$funcdef") - fi - fi + ;; + 3) _message "invalid regex";; + esac + }' } _regex_arguments "$@" Index: Src/Modules/zutil.c =================================================================== RCS file: /projects/zsh/zsh/Src/Modules/zutil.c,v retrieving revision 1.1.1.4 diff -u -r1.1.1.4 zutil.c --- Src/Modules/zutil.c 1999/12/16 14:26:37 1.1.1.4 +++ Src/Modules/zutil.c 1999/12/25 00:23:55 @@ -721,9 +721,438 @@ return 1; } +/* Regexparse stuff. */ + +typedef struct { + int cutoff; + char *pattern; + Patprog patprog; + char *guard; + char *action; + LinkList branches; +} RParseState; + +typedef struct { + RParseState *state; + LinkList actions; +} RParseBranch; + +typedef struct { + LinkList nullacts; + LinkList in; + LinkList out; +} RParseResult; + +static char **rparseargs; +static LinkList rparsestates; + +static int rparsealt(RParseResult *result, jmp_buf *perr); + +static void +connectstates(LinkList out, LinkList in) +{ + LinkNode outnode, innode, ln; + for(outnode = firstnode(out); outnode; outnode = nextnode(outnode)) { + RParseBranch *outbranch = getdata(outnode); + for(innode = firstnode(in); innode; innode = nextnode(innode)) { + RParseBranch *inbranch = getdata(innode); + RParseBranch *br = ncalloc(sizeof(*br)); + br->state = inbranch->state; + br->actions = newlinklist(); + for(ln = firstnode(outbranch->actions); ln; ln = nextnode(ln)) + addlinknode(br->actions, getdata(ln)); + for(ln = firstnode(inbranch->actions); ln; ln = nextnode(ln)) + addlinknode(br->actions, getdata(ln)); + addlinknode(outbranch->state->branches, br); + } + } +} + +static int +rparseelt(RParseResult *result, jmp_buf *perr) +{ + int l; + char *s = *rparseargs; + + if(!s) + return 1; + + switch(s[0]) { + case '/': { + RParseState *st; + RParseBranch *br; + char *pattern, *lookahead; + int patternlen, lookaheadlen; + l = strlen(s); + if(!((2 <= l && s[l - 1] == '/') || + (3 <= l && s[l - 2] == '/' && (s[l - 1] == '+' || + s[l - 1] == '-')))) + return 1; + st = ncalloc(sizeof(*st)); + st->branches = newlinklist(); + st->cutoff = s[l - 1]; + if(s[l - 1] == '/') { + pattern = s + 1; + patternlen = l - 2; + } + else { + pattern = s + 1; + patternlen = l - 3; + } + rparseargs++; + if((s = *rparseargs) && s[0] == '%' && + 2 <= (l = strlen(s)) && s[l - 1] == '%') { + rparseargs++; + lookahead = s + 1; + lookaheadlen = l - 2; + } + else { + lookahead = NULL; + } + if(patternlen == 2 && !strncmp(pattern, "[]", 2)) + st->pattern = NULL; + else { + char *cp; + int l = patternlen + 12; /* (#b)((#B)...)...* */ + if(lookahead) + l += lookaheadlen + 4; /* (#B)... */ + cp = st->pattern = ncalloc(l); + strcpy(cp, "(#b)((#B)"); + cp += 9; + strcpy(cp, pattern); + cp += patternlen; + strcpy(cp, ")"); + cp += 1; + if(lookahead) { + strcpy(cp, "(#B)"); + cp += 4; + strcpy(cp, lookahead); + cp += lookaheadlen; + } + strcpy(cp, "*"); + } + st->patprog = NULL; + if((s = *rparseargs) && *s == '-') { + rparseargs++; + l = strlen(s); + st->guard = ncalloc(l); + memcpy(st->guard, s + 1, l - 1); + st->guard[l - 1] = '\0'; + } + else + st->guard = NULL; + if((s = *rparseargs) && *s == ':') { + rparseargs++; + l = strlen(s); + st->action = ncalloc(l); + memcpy(st->action, s + 1, l - 1); + st->action[l - 1] = '\0'; + } + else + st->action = NULL; + result->nullacts = NULL; + result->in = newlinklist(); + br = ncalloc(sizeof(*br)); + br->state = st; + br->actions = newlinklist(); + addlinknode(result->in, br); + result->out = newlinklist(); + br = ncalloc(sizeof(*br)); + br->state = st; + br->actions = newlinklist(); + addlinknode(result->out, br); + break; + } + case '(': + if(s[1]) + return 1; + rparseargs++; + if(rparsealt(result, perr)) + longjmp(*perr, 2); + s = *rparseargs; + if(!s || s[0] != ')' || s[1] != '\0') + longjmp(*perr, 2); + rparseargs++; + break; + default: + return 1; + } + + return 0; +} + +static int +rparseclo(RParseResult *result, jmp_buf *perr) +{ + if(rparseelt(result, perr)) + return 1; + + if(*rparseargs && !strcmp(*rparseargs, "#")) { + rparseargs++; + while(*rparseargs && !strcmp(*rparseargs, "#")) + rparseargs++; + + connectstates(result->out, result->in); + result->nullacts = newlinklist(); + } + return 0; +} + +static void +prependactions(LinkList acts, LinkList branches) +{ + LinkNode aln, bln; + for(bln = firstnode(branches); bln; bln = nextnode(bln)) { + RParseBranch *br = getdata(bln); + for(aln = lastnode(acts); aln != (LinkNode)acts; aln = prevnode(aln)) + pushnode(br->actions, getdata(aln)); + } +} + +static void +appendactions(LinkList acts, LinkList branches) +{ + LinkNode aln, bln; + for(bln = firstnode(branches); bln; bln = nextnode(bln)) { + RParseBranch *br = getdata(bln); + for(aln = firstnode(acts); aln; aln = nextnode(aln)) + addlinknode(br->actions, getdata(aln)); + } +} + +static int +rparseseq(RParseResult *result, jmp_buf *perr) +{ + int l; + char *s; + RParseResult sub; + + result->nullacts = newlinklist(); + result->in = newlinklist(); + result->out = newlinklist(); + + while(1) { + if((s = *rparseargs) && s[0] == '{' && s[(l = strlen(s)) - 1] == '}') { + char *action = ncalloc(l - 1); + LinkNode ln; + rparseargs++; + memcpy(action, s + 1, l - 2); + action[l - 2] = '\0'; + if(result->nullacts) + addlinknode(result->nullacts, action); + for(ln = firstnode(result->out); ln; ln = nextnode(ln)) { + RParseBranch *br = getdata(ln); + addlinknode(br->actions, action); + } + } + else if(!rparseclo(&sub, perr)) { + connectstates(result->out, sub.in); + + if(result->nullacts) { + prependactions(result->nullacts, sub.in); + insertlinklist(sub.in, lastnode(result->in), result->in); + } + if(sub.nullacts) { + appendactions(sub.nullacts, result->out); + insertlinklist(sub.out, lastnode(result->out), result->out); + } + else + result->out = sub.out; + + if(result->nullacts && sub.nullacts) + insertlinklist(sub.nullacts, lastnode(result->nullacts), result->nullacts); + else + result->nullacts = NULL; + } + else + break; + } + return 0; +} + +static int +rparsealt(RParseResult *result, jmp_buf *perr) +{ + RParseResult sub; + + if(rparseseq(result, perr)) + return 1; + + while(*rparseargs && !strcmp(*rparseargs, "|")) { + rparseargs++; + if(rparseseq(&sub, perr)) + longjmp(*perr, 2); + if(!result->nullacts && sub.nullacts) { + result->nullacts = sub.nullacts; + } + insertlinklist(sub.in, lastnode(result->in), result->in); + insertlinklist(sub.out, lastnode(result->out), result->out); + } + return 0; +} + +static int +rmatch(RParseResult *sm, char *subj, char *var1, char *var2) +{ + LinkNode ln, lnn; + LinkList nexts; + LinkList nextslist; + RParseBranch *br; + RParseState *st = NULL; + int point1 = 0, point2 = 0; + + setiparam(var1, point1); + setiparam(var2, point2); + + if(!*subj) { + if(sm->nullacts) + for(ln = firstnode(sm->nullacts); ln; ln = nextnode(ln)) { + char *action = getdata(ln); + if(action) + execstring(action, 1, 0); + } + return 0; + } + + nextslist = newlinklist(); + nexts = sm->in; + addlinknode(nextslist, nexts); + do { + char **savematch1, **savembegin1, **savemend1; + char **savematch2, **savembegin2, **savemend2; + PERMALLOC { + savematch1 = duparray(getaparam("match"), (VFunc) dupstring); + savembegin1 = duparray(getaparam("mbegin"), (VFunc) dupstring); + savemend1 = duparray(getaparam("mend"), (VFunc) dupstring); + } LASTALLOC; + for(ln = firstnode(nexts); ln; ln = nextnode(ln)) { + int i; + RParseState *next; + br = getdata(ln); + next = br->state; + if(next->pattern && !next->patprog) { + tokenize(next->pattern); + if(!(next->patprog = patcompile(next->pattern, 0, NULL))) { + return 2; + } + } + if(next->pattern && pattry(next->patprog, subj) && + (!next->guard || (execstring(next->guard, 1, 0), !lastval))) { + LinkNode aln; + char **mend = getaparam("mend"); + int len = atoi(mend[0]); + for(i = len; i; i--) + if(*subj++ == Meta) + subj++; + PERMALLOC { + savematch2 = duparray(getaparam("match"), (VFunc) dupstring); + savembegin2 = duparray(getaparam("mbegin"), (VFunc) dupstring); + savemend2 = duparray(getaparam("mend"), (VFunc) dupstring); + } LASTALLOC; + if(savematch1) setaparam("match", savematch1); + if(savembegin1) setaparam("mbegin", savembegin1); + if(savemend1) setaparam("mend", savemend1); + for(aln = firstnode(br->actions); aln; aln = nextnode(aln)) { + char *action = getdata(aln); + if(action) + execstring(action, 1, 0); + } + if(savematch2) setaparam("match", savematch2); + if(savembegin2) setaparam("mbegin", savembegin2); + if(savemend2) setaparam("mend", savemend2); + point2 += len; + setiparam(var2, point2); + st = br->state; + nexts = st->branches; + if(next->cutoff == '-' || (next->cutoff == '/' && len)) { + nextslist = newlinklist(); + point1 = point2; + setiparam(var1, point1); + } + addlinknode(nextslist, nexts); + break; + } + } + if(!ln) { + if(savematch1) freearray(savematch1); + if(savembegin1) freearray(savembegin1); + if(savemend1) freearray(savemend1); + } + } while(ln); + + if(!*subj) + for(ln = firstnode(sm->out); ln; ln = nextnode(ln)) { + br = getdata(ln); + if(br->state == st) { + for(ln = firstnode(br->actions); ln; ln = nextnode(ln)) { + char *action = getdata(ln); + if(action) + execstring(action, 1, 0); + } + return 0; + } + } + + for(lnn = firstnode(nextslist); lnn; lnn = nextnode(lnn)) { + nexts = getdata(lnn); + for(ln = firstnode(nexts); ln; ln = nextnode(ln)) { + br = getdata(ln); + if(br->state->action) + execstring(br->state->action, 1, 0); + } + } + return empty(nexts) ? 2 : 1; +} + +/* + usage: regexparse string regex... + status: + 0: matched + 1: unmatched (all next state candidates are failed) + 2: unmatched (there is no next state candidates) + 3: regex parse error +*/ + +static int +bin_regexparse(char *nam, char **args, char *ops, int func) +{ + int oldextendedglob = opts[EXTENDEDGLOB]; + char *var1 = args[0]; + char *var2 = args[1]; + char *subj = args[2]; + int ret; + jmp_buf rparseerr; + RParseResult result; + + opts[EXTENDEDGLOB] = 1; + + rparseargs = args + 3; + HEAPALLOC { + pushheap(); + rparsestates = newlinklist(); + if(setjmp(rparseerr) || rparsealt(&result, &rparseerr) || *rparseargs) { + if(*rparseargs) + zwarnnam(nam, "invalid regex : %s", *rparseargs, 0); + else + zwarnnam(nam, "not enough regex arguments", NULL, 0); + ret = 3; + } + else + ret = 0; + + if(!ret) + ret = rmatch(&result, subj, var1, var2); + popheap(); + } LASTALLOC; + + opts[EXTENDEDGLOB] = oldextendedglob; + return ret; +} + static struct builtin bintab[] = { BUILTIN("zstyle", 0, bin_zstyle, 0, -1, 0, NULL, NULL), BUILTIN("zformat", 0, bin_zformat, 3, -1, 0, NULL, NULL), + BUILTIN("regexparse", 0, bin_regexparse, 3, -1, 0, NULL, NULL), }; -- Tanaka Akira