123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273 |
- #!/bin/bash
-
- shellfu import pretty
-
-
uripecker() {
    #
    # Scan stdin for what looks like URI, ID or keyword
    #
    # Usage:
    #     <SOURCE_TEXT uripecker MAP
    #
    # Search through SOURCE_TEXT and output in the following order:
    #
    #  1. apparent URIs,
    #  2. hash IDs ("bug#1234"),
    #  3. equal sign expressions ("bug = 1234"),
    #  4. "tags" ("bug1234"),
    #  5. keyword expressions ("BUG 1234" or "g hello world"),
    #
    # all (except the first one, obviously) converted to URI using mappings
    # from MAP file.
    #
    # Note that keyword expressions (e.g. "bug 123") work only if they start
    # the line; rest of the line is taken as query argument and URL-quoted,
    # so that "g what is bmo" would work as expected (given 'g' is defined
    # as Google search).
    #
    # Apply this filter to args or clipboard, and either use head -1 or
    # if you are brave, open all URIs.
    #
    # The MAP file can contain any number of query mappings in format:
    #
    #     NAME = URI_PATTERN
    #
    # where NAME is any string without spaces, equal sign or dot, and
    # URI_PATTERN is a string with precisely one instance of '%s'.
    # Keywords found in the text are lower-cased before they are looked up,
    # so NAME should be written in lower case.
    #
    # For example, given this MAP file:
    #
    #     issue = http://gitlab.example.org/issue?id=%s
    #     faq = http://faq.example.org/faq/%s
    #
    # following text
    #
    #     issue = 1
    #     faq#225
    #     issue42
    #     http://other.example.com/page
    #     faq 14
    #     or faq 15
    #
    # would return
    #
    #     http://other.example.com/page
    #     http://faq.example.org/faq/225
    #     http://gitlab.example.org/issue?id=1
    #     http://gitlab.example.org/issue?id=42
    #     http://faq.example.org/faq/14
    #
    # Note that the URI_PATTERN can be any kind of URI, such as ftp:// URI,
    # (or any string, actually) but the '%s' is converted using HTTP URI rules.
    #
    # Exit status is 2 if MAP was not given and 3 if MAP could not be read
    # or the temporary pipes could not be created.
    #
    local MapFile=$1
    local MapBody   # content of MAP; read by __uripecker__map_lookup()
    local tmp       # temporary pipe storage
    test -n "$MapFile" || {
        warn "usage: uripecker MAP"
        return 2
    }
    # Read the map before touching the disk, so that a bad MAP path does
    # not leave a stale temporary directory behind.
    MapBody=$(<"$MapFile") || {
        warn "error reading map file: $MapFile"
        return 3
    }
    tmp=$(mktemp -d -t uripecker.XXXXXXXX) || {
        warn "error creating temporary directory"
        return 3
    }
    local maybe_uris="$tmp/maybe_uris"
    local maybe_ids="$tmp/maybe_ids"
    local maybe_exps="$tmp/maybe_exps"
    local maybe_tags="$tmp/maybe_tags"
    local maybe_kws="$tmp/maybe_kws"
    local uris="$tmp/uris"
    local uris_from_ids="$tmp/uris_from_ids"
    local uris_from_exps="$tmp/uris_from_exps"
    local uris_from_tags="$tmp/uris_from_tags"
    local uris_from_kws="$tmp/uris_from_kws"
    ##
    # heat up and fill pipes
    #
    # NOTE(review): all stages talk through FIFOs that are drained strictly
    # in output order below; input larger than the pipe buffers may stall
    # here -- confirm with a big input.
    #
    mkfifo "$maybe_uris" "$maybe_ids" "$maybe_exps" "$maybe_tags" "$maybe_kws" \
           "$uris" "$uris_from_ids" "$uris_from_exps" "$uris_from_tags" "$uris_from_kws" \
     || {
        warn "error creating pipes in: $tmp"
        rm -rf "$tmp"
        return 3
    }
    __uripecker__minimize \
      | tee "$maybe_uris" "$maybe_ids" "$maybe_exps" "$maybe_tags" "$maybe_kws" \
            >/dev/null &
    ##
    # process each pipe *async* by different filter
    #
    < "$maybe_uris" __uripecker__flt_uris                      > "$uris" &
    < "$maybe_ids"  __uripecker__flt_ids  | __uripecker__deref > "$uris_from_ids" &
    < "$maybe_exps" __uripecker__flt_exps | __uripecker__deref > "$uris_from_exps" &
    < "$maybe_tags" __uripecker__flt_tags | __uripecker__deref > "$uris_from_tags" &
    < "$maybe_kws"  __uripecker__flt_kws  | __uripecker__deref > "$uris_from_kws" &
    ##
    # print result *sync* in correct order
    #
    {
        cat "$uris"
        cat "$uris_from_ids"
        cat "$uris_from_exps"
        cat "$uris_from_tags"
        cat "$uris_from_kws"
    } | grep .  # throw away empties; add missing LF
    rm -rf "$tmp"
}
-
__uripecker__map_lookup() {
    #
    # Look up query keyword $1 in $MapBody
    #
    # $MapBody is not set here; it is expected to hold the text of the map
    # (lines in `NAME = URI_PATTERN` format) when this function is called.
    #
    # The keyword is lower-cased and compared *literally* to NAME, i.e. to
    # the text before the first equal sign with surrounding spaces removed.
    # URI_PATTERN of every matching line is printed with surrounding spaces
    # removed; it may itself contain further equal signs.
    #
    # Nothing is printed if the keyword is not found.  Exit status is zero
    # in either case.
    #
    local key=${1,,}
    local line      # one line of the map
    local name      # NAME part of the line
    local pattern   # URI_PATTERN part of the line
    while IFS= read -r line; do
        [[ $line == *=* ]] || continue
        name=${line%%=*}
        pattern=${line#*=}
        # strip leading, then trailing spaces off both parts
        name=${name#"${name%%[! ]*}"}
        name=${name%"${name##*[! ]}"}
        pattern=${pattern#"${pattern%%[! ]*}"}
        pattern=${pattern%"${pattern##*[! ]}"}
        [[ $name == "$key" ]] || continue
        printf '%s\n' "$pattern"
    done <<<"$MapBody"
}
-
__uripecker__deref() {
    #
    # Turn query (like "g hello" for google) to URI
    #
    # Read lines in format `KEYWORD QUERY` from stdin; for each of them
    # look up KEYWORD using __uripecker__map_lookup(), URL-quote QUERY
    # using __uripecker__urlquote() and print the resulting URI.
    #
    # Lines whose KEYWORD has no mapping are skipped without any output.
    #
    local kw        # keyword part, eg. "g" or "bug"
    local query     # query part, eg. "hello dolly" or "1234"
    local fmt       # format string (from map) to pass to printf
    # the `|| test` part keeps a last line that lacks the final newline
    while read -r kw query || test -n "$kw"; do
        debug -v kw query
        fmt=$(__uripecker__map_lookup "$kw") || return 1
        debug -v fmt
        # unknown keyword: empty format would only print a bare newline
        test -n "$fmt" || continue
        #shellcheck disable=SC2059
        printf "$fmt\n" "$(__uripecker__urlquote "$query")"
    done
}
-
- __uripecker__flt_exps() {
- #
- # Hack expressions like bug = 123 out of the text
- #
- sed -e 's/(\d)\</\n/;' \
- | perl -CS -ne '
- next unless m/\b([a-zA-Z]\w*\s*=\s*[[:alnum:]][[:alnum:]_#-]*)\b/;
- print "$1\n";
- ' \
- | sed -re 's/\s*=\s*/ /'
- }
-
- __uripecker__flt_ids() {
- #
- # Hack doer-like id's (ID#123) out of the text
- #
- tr ' ' '\n' \
- | perl -CS -ne '
- next unless m/\b([a-zA-Z]\w*#[[:alnum:]][[:alnum:]_#-]*)\b/;
- print "$1\n";
- ' \
- | tr '#' ' '
- }
-
__uripecker__flt_kws() {
    #
    # Hack out lines that look like kw expressions (word, space, text)
    #
    # Eg. 'wiki hello world'
    #
    # The line must start with the keyword (optionally preceded by
    # whitespace) and the text after it must not start with an equal sign.
    #
    local kw_line='^\s*[a-zA-Z][[:alpha:]_]*\s+[^=]'    # what a kw line looks like
    grep -E -e "$kw_line"
}
-
__uripecker__flt_tags() {
    #
    # Hack "tags" like bug123 out of the text
    #
    # Break stdin on every non-alphanumeric character, keep words made of
    # letters followed by digits and print them as `LETTERS DIGITS`.
    #
    local tag_re='^[a-zA-Z]+[0-9]+$'                    # whole-word tag
    local split_expr='s/([a-zA-Z]+)([0-9])/\1 \2/'      # put space before digits
    tr -c '[:alnum:]' '\n' \
      | grep -E "$tag_re" \
      | sed -r "$split_expr"
}
-
__uripecker__flt_uris() {
    #
    # Hack URIs out of the text.
    #
    # Read text from stdin and print every http://, https:// or ftp:// URI
    # found in it, one per line, in order of appearance.
    #
    # Uses URL parser regex found as SO answer[1] and adapted to include
    # comments from the original Imme's gist[2].
    #
    # Matching is done by Python's `re` module and ignores case.  The code
    # passed to Python is valid for both Python 2 and 3; python3 is used
    # if available, `python` otherwise.
    #
    #  [1]: https://stackoverflow.com/a/30408189/835945
    #  [2]: https://gist.github.com/imme-emosol/731338/810d83626a6d79f40a251f250ed4625cac0e731f
    #
    local uri_re    # Imme Emosol's URI pattern
    local py_code   # code to do the matching
    local python    # interpreter to run $py_code
    local -a re_parts
    re_parts=(
        '\b'
        ## protocol identifier
        #
        '(?:(?:https?|ftp)://)'
        ## user:pass authentication
        #
        '(?:\S+(?::\S*)?@)?'
        '(?:'
        ## IP address dotted notation octets
        #
        '(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])'
        '(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}'
        '(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))'
        '|'
        ## host name
        #
        '(?:(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)'
        ## domain name
        #
        '(?:\.(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)*'
        ## TLD identifier
        #
        '(?:\.(?:[a-z\u00a1-\uffff]{2,}))'
        '|'
        ## "localhost"
        #
        '(?:localhost)'
        ')'
        ## port number
        #
        '(?::\d{2,5})?'
        ## resource path
        #
        '(?:/[^\s]*)?'
        '\b'
    )
    # glue the parts together; %s keeps all backslashes intact
    printf -v uri_re '%s' "${re_parts[@]}"
    py_code=$(
        printf '%s\n' \
            'from __future__ import print_function' \
            'import os' \
            'import re' \
            'import sys' \
            '' \
            'uri_re = os.environ["URI_RE"]' \
            'for uri in re.findall(uri_re, sys.stdin.read(), re.IGNORECASE):' \
            '    print(uri)'
    )
    python=python3
    command -v python3 >/dev/null 2>&1 || python=python
    URI_RE=$uri_re "$python" -c "$py_code"
}
-
__uripecker__minimize() {
    #
    # Strip, squash spaces and replace tabs
    #
    # On every stdin line, turn each run of spaces and/or tabs into single
    # space, then remove the space from the start and the end of the line.
    #
    # The squashing expression requires at least one blank character so
    # that it can never match an empty string (which would insert spaces
    # between all characters).
    #
    sed -e 's/[[:blank:]][[:blank:]]*/ /g' \
        -e 's/^ //' \
        -e 's/ $//'
}
-
__uripecker__urlquote() {
    #
    # URL-quote query $1 and print result
    #
    # Quoting is done by Python's quote_plus(), ie. spaces become plus
    # signs.  The code passed to Python is valid for both Python 2 and 3;
    # python3 is used if available, `python` otherwise.
    #
    local query=$1
    local py_code   # code to do the quoting
    local python    # interpreter to run $py_code
    debug -v query
    py_code=$(
        printf '%s\n' \
            'import sys' \
            'try:' \
            '    from urllib.parse import quote_plus' \
            'except ImportError:' \
            '    from urllib import quote_plus' \
            'sys.stdout.write(quote_plus(sys.argv[1]) + "\n")'
    )
    python=python3
    command -v python3 >/dev/null 2>&1 || python=python
    "$python" -c "$py_code" "$query"
}
-
- #shellfu module-version=__MKIT_PROJ_VERSION__
|