From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <zsh-users-return-20409-mason-zsh=primenet.com.au@zsh.org>
Received: (qmail 27940 invoked by alias); 14 Aug 2015 23:19:31 -0000
Mailing-List: contact zsh-users-help@zsh.org; run by ezmlm
Precedence: bulk
X-No-Archive: yes
List-Id: Zsh Users List <zsh-users.zsh.org>
List-Post: <mailto:zsh-users@zsh.org>
List-Help: <mailto:zsh-users-help@zsh.org>
X-Seq: 20409
Received: (qmail 25037 invoked from network); 14 Aug 2015 23:19:27 -0000
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on f.primenet.com.au
X-Spam-Level: 
X-Spam-Status: No, score=-1.9 required=5.0 tests=BAYES_00,SPF_HELO_PASS
	autolearn=ham autolearn_force=no version=3.4.0
X-Injected-Via-Gmane: http://gmane.org/
To: zsh-users@zsh.org
From: Joep van Delft <joepvandelft@xs4all.nl>
Subject: Tip of the day: grep2awk zle function
Date: Sat, 15 Aug 2015 01:01:57 +0200
Message-ID: <20150815010157.696d904b@xs4all.nl>
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7bit
X-Complaints-To: usenet@ger.gmane.org
X-Gmane-NNTP-Posting-Host: p5ddc4c32.dip0.t-ipconnect.de
X-Newsreader: Claws Mail 3.12.0 (GTK+ 2.24.28; i686-pc-linux-gnu)

Hi!

This summer's side project has resulted in grep2awk.

grep2awk is a zle function that turns the first grep command in the
current command line into its awk equivalent, leaving the cursor
right where you want it.  This operation includes a transformation of
Basic Regular Expression to Extended Regular Expression. 

Possibly also of interest is an adaptation of the zsh test suite,
with which zle functions can be tested against the currently
installed zsh, outside of the zsh source directory and its
makefile.

Code, installation instructions, and some details can be found at
https://github.com/joepvd/grep2awk. 

Kind regards,

Joep


# grep2awk:
# A zle convenience function that converts a `grep` command on the
# command line into the corresponding `awk` command.
# How to use:
# Drop this file somewhere in your $fpath. And configure your shell to
# make use of it:
#
#    autoload grep2awk
#    zle -N grep2awk
#    bindkey "^X^A" grep2awk
#
# This makes the widget available under <CTRL-X><CTRL-A>.
#
# The files t/comptest and t/ztst.zsh are taken (and modified) from the
# zsh project. This project can be used under the same terms as the zsh
# project.
#
# By Joep van Delft, 2015


# Switch to native zsh emulation; -L makes the option changes local to the
# function scope in which this file's body is executed.
# NOTE(review): presumably that scope is the autoload wrapper, i.e. these
# options are only in effect during the first invocation -- confirm.
emulate -L zsh
# Turn on the extendedglob option.
setopt extendedglob
# Make split-shell-arguments available; grep2awk calls it to split the
# command line into the `reply` array.
autoload -Uz split-shell-arguments

grep2awk() {
    # Needs to be in its own scope, to not have the variables leak to
    # the main shell process.
    local replacement_line REPLY REPLY2 flag cursor
    local -a reply
    typeset -A opt

    # Define some functions first:
    #  - regex2awkprog
    #  - fixed2ere
    #  - bre2ere
    #  - swapcommand

    # regex2awkprog WORD
    #
    # Build the awk program that takes the place of grep's pattern argument
    # and write it, single-quoted, to stdout.
    #
    # Arguments: $1  - the pattern as the shell word found on the command
    #                  line; one level of quoting is stripped here.
    # Reads:     opt - associative array of grep option letters set by the
    #                  caller (E, F, G, P, w, x, v, i, y, H, n, c, l, L).
    # Outputs:   the (qq)-quoted awk program on stdout.
    # Returns:   1, after reporting through `zle -M`, when mutually exclusive
    #            options are combined; otherwise the status of the final
    #            printf.
    regex2awkprog() {
        local re prelude action prog

        re=${(Q)1} # Remove one level of quoting, so we have what
                   # grep would receive.

        # -F: the pattern is a fixed string; escape it into a regex.
        (( opt[F] > 0 )) && {
            re="$(fixed2ere $re)"
        }

        # A sanity check:
        (( opt[E] + opt[F] + opt[G] + opt[P] > 1 )) && {
            zle -M "grep2awk:Options E, F, G and P are mutually exclusive."
            zle -M "Exiting."
            return 1
        }
        (( opt[G] == 1 || opt[E] + opt[F] + opt[P] == 0 )) && {
            # It is Basic Regular Expression
            re="$(bre2ere $re)"
        }

        # -w: match whole words only.
        (( opt[w] > 0 )) && {
            re="\\<($re)\\>"
        }
        # -x: match whole lines only.
        (( opt[x] > 0 )) && {
            re="^($re)$"
        }

        # Replace any forward slash with an escaped one, because the slash
        # delimits the regex in the awk program:
        re=${re//\//\\/}

        # Wrap in slashes, and replace any single quote with its
        # ASCII code:
        # NOTE(review): the result later passes through printf as a format
        # string; verify that the \47 sequence survives that pass as intended.
        re=/${re//\'/\\47}/

        # Negate the pattern if told so:
        (( opt[v] > 0 )) && {
            re="!$re"
        }

        (( opt[i] + opt[y] > 0 )) && {
            prelude+="BEGIN{IGNORECASE=1}; "
        }
        # The print action. Add some extra stuff if line count or file name
        # are enabled.
        action="{print ${opt[H]:+FILENAME\":\"}${opt[n]:+NR\":\"}\$0}"

        # c, H, l and L are mutually exclusive. One of them will win.
        (( opt[H] + opt[l] + opt[L] + opt[c] > 1 )) && {
            zle -M "grep2awk:Options c, H, l and L are mutually exclusive."
            zle -M "Exiting."
            return 1
        }

        (( opt[c] > 0 )) && {
            # This is incompatible with `grep -c`, as files where the count is 0
            # are not displayed.  This is saner behavior, IMHO.
            prelude+='BEGINFILE{c=0};ENDFILE{if(c>0)print FILENAME":"c}; '
            action='{c++}'
        }
        (( opt[l] > 0 )) && {
            action="{print FILENAME; nextfile}"
        }
        (( opt[L] > 0 )) && {
            prelude+="BEGINFILE{flag=0};ENDFILE{if (flag==0)print FILENAME}; "
            action="{flag=1; nextfile}"
        }

        prog="${prelude}${re} $action"
        prog=${(qq)prog}
        # The program is passed as printf's format string, so printf
        # interprets the backslash escapes in it.  A percent sign coming from
        # the user's pattern must not be read as a conversion specification,
        # so every % is doubled before printing.
        printf -- ${prog//\%/%%}
    }

    # fixed2ere STRING
    #
    # Escape a fixed string (as given to `grep -F`/fgrep) so that it can be
    # used as a regular expression matching the same literal text.
    #
    # The characters in the first line are turned into the sequences in the
    # second line:
    #
    #     +  |  {  }  [  ]  (  )  *  .  ?  $  ^
    #    \+ \| \{ \} \[ \] \( \) \* \. \? \$ \^
    #
    # A newline becomes `|` (alternation), and a backslash is emitted as
    # three backslashes.  Everything else is copied unchanged.
    # NOTE(review): three backslashes for one input backslash is the
    # long-standing output and is kept as is; confirm it is what the
    # consumers of this string expect.
    #
    # Arguments: $1 - the fixed string
    # Outputs:   the escaped expression on stdout, without trailing newline
    fixed2ere() {
        # All working variables are local so that calling this helper
        # directly does not clobber variables of the caller.
        local str ere ch pos
        local specials='+|{}[]()*.?$^'

        str=$1
        ere=""
        # Walk the string by 0-based offset; an empty string simply results
        # in zero iterations.
        for (( pos = 0; pos < ${#str}; pos++ )); do
            ch=${str:$pos:1}
            if [[ $specials == *"$ch"* ]]; then
                # Regex metacharacter: prefix with one backslash.
                ere+="\\$ch"
            elif [[ $ch == $'\n' ]]; then
                # This works after arrowing up into history, but not
                # directly after writing it. Is there a way to detect
                # if this is the case?
                ere+='|'
            elif [[ $ch == '\' ]]; then
                ere+="\\\\$ch"
            else
                ere+="$ch"
            fi
        done

        printf -- "%s" "$ere"
    }

    bre2ere() {
        # Basic Regular Expression to Extended Regular Expression
        #
        # Walks the pattern in $1 one character at a time, keeping a handful
        # of on/off state variables, and writes the converted expression to
        # stdout (no trailing newline).
        #
        # What needs to be done:
        # Translate BRE: \+,\{,\},\|,\(,\), +, {, }, |, (, )
        #        to ERE:  +, {, }, |, (, ),\+,\{,\},\|,\(,\)
        #
        # But only if the backslash is not escaped with a backslash.
        #
        # Arguments: $1 - the basic regular expression
        # Outputs:   the converted expression on stdout
        # Returns:   1, without printing anything, on the inconsistent states
        #            checked below (e.g. an escaped ^ at the start of an
        #            alternation).
        # Config:    zstyle ':grep2awk:bre2ere:' debug FILE -- when set, FILE
        #            is removed and a per-character state trace is appended.
        #
        # The escapes emitted for literal characters are written with doubled
        # backslashes.
        # NOTE(review): presumably because the callers pass the result through
        # echo / a printf format, which halves them -- confirm.


        local bre ere i debug
        # Variables tracking states:
        #   s_esc          - the previous character was an unescaped backslash
        #   s_altern       - we are at the start of the pattern, of an
        #                    alternation or of a subexpression (initially on)
        #   s_altern_start - \| or \( was consumed in this very iteration, so
        #                    s_altern must survive the bookkeeping at the end
        #                    of the loop body
        #   s_brack        - inside a bracket expression [...]
        #   s_brack_start  - right after the opening [ (or after [^)
        #   s_char_cls     - inside a nested [ ... ] within a bracket
        #                    expression
        local s_brack s_brack_start s_esc s_altern s_altern_start s_char_cls
        bre=$1
        ere=""
        s_esc=off
        s_altern=on
        s_altern_start=off
        s_brack=off
        s_brack_start=off
        s_char_cls=off
        # Optional tracing: the style value is the name of the trace file.
        zstyle -s ':grep2awk:bre2ere:' debug debug
        if [[ -n $debug ]]; then
            rm "$debug" 2>/dev/null
            # Appends one line with the current character and state to the
            # trace file.  The value labelled s_alter_now is s_altern_start.
            # NOTE(review): the pattern character is interpolated into the
            # printf format, so % or backslashes in the pattern affect the
            # trace output -- confirm this is acceptable.
            debug() {
                printf "bre[$i]=$bre[$i] s_esc=$s_esc "                   >> "$debug"
                printf "s_brack=$s_brack s_brack_start=$s_brack_start "   >> "$debug"
                printf "s_altern=$s_altern s_alter_now=$s_altern_start\n" >> "$debug"
            }
        fi
        for i in {1..$#bre}; do
            [[ -n $debug ]] && debug
            # Bracket expressions
            # -------------------
            # An unescaped [ outside a bracket expression opens one.
            if [[ $bre[$i] == '[' && $s_esc == off && $s_brack == off ]]; then
                ere+="$bre[$i]"
                s_brack=on
                s_brack_start=on
            # Character class accounting within bracket expressions, in order
            # to keep track of the end of a bracket expression.
            elif [[ $bre[$i] == '[' && $s_brack == on ]]; then
                ere+="$bre[$i]"
                s_char_cls=on
            elif [[ $bre[$i] == ']' && $s_char_cls == on ]]; then
                [[ $s_brack == on ]] || return 1
                ere+="$bre[$i]"
                s_char_cls=off
            # Special casing the start of bracket expressions.
            # s_brack_start allows for inclusion of ] as a character as per
            # POSIX.2.  A leading ^ keeps the "start" state alive for one
            # more character.
            elif [[ $s_brack == on && $s_brack_start == on ]]; then
                ere+="$bre[$i]"
                [[ $bre[$i] == '^' ]] \
                    && s_brack_start=on \
                    || s_brack_start=off
            # A ] that is not at the start closes the bracket expression.
            elif [[ $bre[$i] == ']' && $s_brack == on ]]; then
                [[ $s_char_cls == off ]] || return 1
                ere+="$bre[$i]"
                s_brack=off
            # Finally, the default bracket expansion action:
            # copy the character unchanged.
            elif [[ $s_brack == on ]]; then
                ere+="$bre[$i]"

            # Alternation and subexpressions
            # ------------------------------
            # Needed for special casing of *, ^ and $.
            elif [[ $bre[$i] =~ '\||\(' && $s_esc == on ]]; then
                s_altern=on
                s_altern_start=on
                ere+="$bre[$i]" # The escape sign is dropped now
                s_esc=off
            elif [[ $bre[$i] == '^' && $s_altern == on ]]; then
                # An anchor at the start of an alternation; an escaped ^ in
                # this position is rejected.
                [[ $s_esc == off ]] || return 1
                ere+='^'
            elif [[ $bre[$i] == "*" && $s_altern == on ]]; then
                # When * is at the start of an alternation/subexpression,
                # escape it:
                ere+='\\*'
            elif [[ $bre[$i] == '$' ]]; then
                # Dollar is a special character when it is at the end of
                # string, end of alternation, or end of subexpression.
                # Otherwise: it is not special.  As we do not need to know the
                # 'history' of the regex, we can use a lookahead to find out
                # with which thing we are dealing.
                #
                # The behavior of $ is not documented in man 1 grep; it is,
                # however, in `man 7 regex`. Some tests indicate that GNU grep
                # adheres to the POSIX.2 specification.
                #
                # ${bre:$i:2} is the two characters following the dollar
                # (the offset is 0-based, $i is 1-based).
                # NOTE(review): no branch here resets s_esc, so after \$ the
                # escape state carries over to the next character -- confirm
                # this is intended.
                if [[ $#bre -eq $i || ${bre:$i:2} =~ '\\(\||\))' ]]; then
                    [[ $s_esc == off ]] && ere+='$' || ere+='\\$'
                else
                    [[ $s_esc == off ]] && ere+='\\$' || ere+='$'
                fi

            # Normal BRE to ERE:
            # ------------------
            # Escaped in BRE means special: emit the bare ERE operator.
            elif [[ $bre[$i] =~ '\+|\{|\}|\)|\^' && $s_esc == on ]]; then
                ere+="$bre[$i]" # The escape sign is dropped now
                s_esc=off
            # Unescaped in BRE means literal: emit it escaped.
            elif [[ $bre[$i] =~ '\+|\||\{|\}|\(|\)|\^' && $s_esc == off ]]; then
                ere+="\\\\$bre[$i]"
            # NOTE(review): '\\' in single quotes is two backslash characters
            # while $bre[$i] is a single character -- verify that this branch
            # can actually match.
            elif [[ $bre[$i] == '\\' && $s_esc == on ]]; then
                ere+='\\\\\\\\'
                s_esc=off
            # An unescaped backslash only arms the escape state; nothing is
            # emitted yet.
            elif [[ $bre[$i] == "\\" && $s_esc == off ]]; then
                s_esc=on
            else
                # Any other character is copied unchanged.
                ere+="$bre[$i]"
                s_esc=off
            fi
            # Leave the "start of alternation" state once a character other
            # than ^ has been processed, unless the alternation was opened in
            # this very iteration.
            [[ $s_altern == on        \
            && $s_altern_start == off \
            && $bre[$i] != "^"        \
            ]] && s_altern=off
            s_altern_start=off
        done
        printf -- "%s" "$ere"
    }

    # Some debugging/testing convenience:
    # If grep2awk is autoloaded, the regex replacement
    # functions can be called directly:
    #
    #   grep2awk bre2ere 'a[^a-z]\(c$\|d\)'
    #
    # See the test suite for some examples.

    [[ -n $1 ]] && {
        echo $($1 "$2")
        return $?
    }

    # Scratch variables of the widget proper.  They are declared local so
    # that running the widget does not create or overwrite `awk`, `i`, `l`
    # or `o` in the interactive shell.
    local awk i l o

    # The awk command to insert; configurable through
    #   zstyle ':grep2awk:' awk 'gawk --'
    zstyle -s ':grep2awk:' awk awk
    : ${awk:=awk --}


    # Store line as $reply[@]:
    split-shell-arguments

    # `flag` keeps the program state while looping through the split
    # shell words. Its semantics:
    # flag==0:  No 'grep', 'egrep', 'fgrep' encountered yet.
    # flag==1:  Set when grep command is replaced by awk
    #           command. Options or regex.
    # flag==2:  Regex (End of options arrived).
    # flag==3:  The rest.
    flag=0
    cursor=0
    for (( i=1; i<=$#reply; i++ )); do
        if (( i%2==1 && flag < 3 )); then
            # Odd-numbered "words" are whitespace.  The cursor position only
            # advances as long as the regex has not been replaced by the awk
            # program yet (flag < 3).
            (( cursor += $#reply[$i] ))
        elif (( flag == 0 )); then
            # Still looking for the grep command; egrep/fgrep imply -E/-F.
            case $reply[$i] in
                (grep)
                    flag=1
                    reply[$i]="$awk"
                    ;;
                (egrep)
                    flag=1
                    reply[$i]="$awk"
                    typeset "opt[E]"=1
                    ;;
                (fgrep)
                    flag=1
                    reply[$i]="$awk"
                    typeset "opt[F]"=1
                    ;;
            esac
            (( cursor += $#reply[$i] ))
        elif [[ $flag == 1 && $reply[$i] == -* ]]; then
            # Is it the End of Options-marker?
            if [[ $reply[$i] == "--" ]]; then
                flag=2
            elif [[ $reply[$i] =~ ^--. ]]; then
                zle -M "grep2awk: Long options to grep are not supported"
                return 1
            else
                # A cluster of short options: record each letter in `opt`.
                # The offset starts at 1 to skip the leading dash.
                for ((l=1; l<$#reply[$i]; l++)); do
                    # Is it an option to grep? A regular expression of
                    # the options that are recognized:
                    o=${reply[$i]:$l:1}
                    if [[ $o =~ "c|E|e|F|G|H|i|l|L|n|v|w|x|y" ]]; then
                        typeset "opt[$o]"=1
                    else
                        zle -M "grep2awk: Option $o is not supported"
                        return 1
                    fi
                done
                # A cluster ending in -e means the next word is the regex.
                [[ $o == "e" ]] && flag=2
            fi

            # Delete the options, and the now superfluous whitespace:
            reply[$i]=""
            reply[$i+1]=""
            # And leave the cursor pointer where it was.

        elif (( flag==1 || flag==2 )); then
            # The fall-through for opt_or_reg OR regex:
            # This is the regex.
            # If the conversion fails, stop here: BUFFER has not been touched
            # yet, so the command line stays exactly as the user typed it
            # instead of losing its pattern.
            reply[$i]=$(regex2awkprog $reply[$i]) || return 1
            # Put the cursor two characters before the end of the inserted
            # program word.
            (( cursor += $#reply[$i] - 2 ))
            flag=3
        fi
        # Append the current version of $reply[$i] to replacement_line:
        replacement_line+=$reply[$i]
    done

    # Now we are done. Apply the changes to the BUFFER:
    BUFFER="$replacement_line"
    CURSOR=$cursor
}

# Invoke the grep2awk function defined above, handing over the arguments
# this file's body was called with.
grep2awk "$@"