Skip to content

Instantly share code, notes, and snippets.

@Artoria2e5
Last active February 3, 2016 12:45
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Artoria2e5/5694c113912a43fbdef1 to your computer and use it in GitHub Desktop.
Save Artoria2e5/5694c113912a43fbdef1 to your computer and use it in GitHub Desktop.
All those SE trash
#!/bin/bash
n=${1:-1024} TIMEFORMAT=${2:-$'real\t%3lR\nuser\t%3lU\nsys\t%3lS'}
export LC_ALL=C
sorted_lex(){ local prev; for i; do [[ $prev > $i ]] && return 1; prev=$i; done; }
sorted_int(){ local prev; for i; do (( prev <= i )) || return 1; prev=$i; done; }
chk(){
printf %b "${_sort_type}_chk\t"; local _suf=$'\t'
if (("${#arr[@]}" == "${#to_sort[@]}")); then
_suf+="LEN_EQ ${#arr[@]}"
else
_suf+="LEN_NE ${#to_sort[@]} -> ${#arr[@]}"
fi
"sorted_$_sort_type" "${arr[@]}" &&
echo "GOOD$_suf" ||
echo "BAD$_suf"
}
test_pass() {
for s in index alias fs bub # ext ext_44 newline_pipe
do
echo "$s:"
( time "${s}_sort" to_sort arr && chk || echo $'ERR:\t'$? )
echo
done
}
# http://unix.stackexchange.com/a/247659/73443
bub_sort () {
(( ${#arr[@]} <= 1024 )) || return 42
_sort_type=int
for ((i=0; i <= $((${#arr[@]} - 2)); ++i))
do
for ((j=((i + 1)); j <= ((${#arr[@]} - 1)); ++j))
do
if [[ ${arr[i]} -gt ${arr[j]} ]]
then
# echo $i $j ${arr[i]} ${arr[j]}
tmp=${arr[i]}
arr[i]=${arr[j]}
arr[j]=$tmp
fi
done
done
}
newline_pipe_sort() {
_sort_type=lex
IFS=$'\n'
arr=( $(sort <<< "${to_sort[*]}") )
}
fs_sort(){ # http://unix.stackexchange.com/a/247660/73443
_sort_type=lex
local OLDPWD # IFS=' /'
cd -- "$(mktemp -d)" || return
touch -- "${to_sort[@]}"; declare -ga arr=(*) &>/dev/null
cd - && rm -rf -- "$OLDPWD"
}
# http://unix.stackexchange.com/a/248137/73443
# index_sort <source_arr> [target_arr:-source_arr]
index_sort() {
_sort_type=int
# Not that surprising: using indirect expansions in a `for` loop is slow.
local _tmp=() _src="$1[@]" _sorted_nodup _sorted TIMEFORMAT=$'insert\t%3lR'; _src=("${!_src}")
time for i in "${_src[@]}"; do (( _tmp[i]++ )); done
# This eats duplicates.
_sorted_nodup=( "${!_tmp[@]}" )
# The numeric values in _sorted_nodup<int, int> gives us the occurrence of
# the element in the original sequence.. takes extra 1~4x time to expand.
# The extra time decreases as elems decreases, -> ~1.2x.
# CONSIDER SKIPPING THIS and use `_sorted_nodup` for the final eval instead.
TIMEFORMAT=$'expand\t%3lR'
time for i in "${_sorted_nodup[@]}"; do
j=${_tmp[i]}
while ((j--)); do _sorted+=("$i"); done
done
# Assign it back..
eval "${2:-$1}=(" '"${_sorted[@]}" )'
}
# alias_sort <source_arr> [target_arr:-source_arr]
# modified to fit in a function.
alias_sort(){
_sort_type=lex
local _s=() _e="$1[@]" IFS=$'\n' # does bash 1 support indirect expansion?
_s=($(
unalias -a && # clear all aliases
alias "${!_e/%/=}" && # (exp: map append '=') pass to alias
alias # sort (see src) and print the aliases
)) || return
_s=("${_s[@]#alias }") # strip off the `alias '
# strip the shortest trailing =* and assign back.
eval "${2:-$1}=("'"${_s[@]%=*}")'
}
# http://unix.stackexchange.com/a/248145/73443
ext_44_sort() {
_sort_type=lex
for array do
readarray -tf '' "$array" < <(
eval "printf '%s\0' \"\${$array[@]}\" | sort -z")
done
}
ext_sort() {
_sort_type=lex
for array do
tmp=() i=0
while IFS= read -rd '' "tmp[i++]"; do :; done < <(
eval "printf '%s\0' \"\${$array[@]}\" | sort -z")
eval "$array=("'"${tmp[@]}")'
done
}
echo "Testing performance on $n random elements... Filling array."
time while ((n--)); do to_sort+=($RANDOM); done
echo
echo "Testing array copy..."
time arr=("${to_sort[@]}")
echo
test_pass
echo
echo "Testing partly-sequential array {1..817} {9154..32768} {818..9153} {1..16384} {1..8192}{,,,}.."
to_sort=({1..817} {9154..32768} {818..9153} {1..16384} {1..4096}{,,,}) arr=("${to_sort[@]}")
test_pass
#!/bin/bash
((!EUID)) || exec sudo "$0" "$@" # Addicted to OR-logic
declare -A src_dst_pair src_dst_old; # dictionaries -- the older one not being used
oops(){ printf '%b\n' "oOpsS!\t$1">&2; exit "${2:-1}"; }
## oldlist file
## Gets the contents of a legacy list.
oldlist() {
local arr i s d;
awk '/^#BEGIN-log#/ {flag=1; next} /^#END-log#/ {flag=0} flag' "$1" | mapfile arr
for i in "${arr[@]}"; do
case "$i" in *'=('*=*=*')') :;; *) continue;; esac
i="${i#*(}"
i="${i%)}"
read s d <<< "$i"
remember "${s#*=}" "${d#*=}" src_dst_old
done
}
##remember key val <varname> Generates a basically safe-to-use line of script and appends it to self.
# TODO: write this back to a script.
remember() {
printf '%s\n' "${3:-src_dst_pair}[$(printf %q "$1")]=$(printf %q "$2")" >> "$0"
}
try_link(){
local SRC="$1" DST="$2" src_in_dst
[[ -e "$SRC" ]] || [[ -e "$DST" ]] || oops "$DST"
[[ -e "$SRC" ]] || mv "$DST" "$SRC"
[[ -e "$DST" ]] || ln -Tsf "$SRC" "$DST"
src_in_dst=("$DST"/*"$SRC"*)
((${#src_in_dst[@]})) || ln -Tsf "$SRC" "$DST"
}
# -> for thing in "$@"
for thing; do
[[ $thing == /* ]] || thing="$PWD/$thing" # relative path, hmm.
SRC=/.hd$thing DST=$thing
[[ "$SRC $DST" != *'.hd/.hd'* ]] || oops 'pathdup: .hd/.hd'
remember "$SRC" "$DST"
try_link "$SRC" "$DST"
done
# Another way to spell eval "$(blah)"
source <(awk '/^# __DATA_START__/ { flag=1; next } flag')
for those in "${!src_dst_pair[@]}"; do try_link "$those" "${src_dst_pair[$those]}"; done
exit
# __DATA_START__
@mikeserv
Copy link

mikeserv commented Dec 8, 2015

so what is this thing? is it possible you could just explain it?

@Artoria2e5
Copy link
Author

Re-tested on a DigitalOcean Ubuntu 15.04 instance (bash 4.3.30, -DCONF_MACHTYPE='x86_64-pc-linux-gnu' -D_FORTIFY_SOURCE=2 -g -O2 -fstack-protector-strong)

Interesting parts to see:

  1. index_sort takes a considerable of non-insert/expand time. This might be caused by the three array copy actions done. Therefore, writing it inline (or make it a big block of eval) might still be the best solution…
    • The initial copy meant to improve the performance, at least for Cygwin. Too lazy to check if it helps on Ubuntu for now.
    • Great to see that the adaptive part of it helps.
  2. alias_sort performs poorly with lots of duplicate keys. Bash's internal hashtable must be unhappy.
  3. fs_sort works great on Linux. On Cygwin it's like half a minute of sys.

(The 42 retcode is assigned for 'array too big, I am not gonna waste my time' in bub.)

nobody@A2cheap:/tmp$ bash sorttime.bash  65536
Testing performance on 65536 random elements... Filling array.
real    0m0.554s
user    0m0.544s
sys     0m0.008s

Testing array copy...
real    0m1.635s
user    0m1.604s
sys     0m0.028s

index:
insert  0m16.048s
expand  0m8.635s
real    0m31.998s
user    0m31.852s
sys     0m0.088s
int_chk GOOD    LEN_EQ 65536

alias:
real    0m18.202s
user    0m18.020s
sys     0m0.144s
lex_chk GOOD    LEN_NE 65536 -> 28141

fs:
/tmp
real    0m1.275s
user    0m0.196s
sys     0m1.056s
lex_chk GOOD    LEN_NE 65536 -> 28141

bub:
real    0m0.000s
user    0m0.000s
sys     0m0.000s
ERR:    42


Testing partly-sequential array {1..817} {9154..32768} {818..9153} {1..16384} {1..8192}{,,,}..
index:
insert  0m1.803s
expand  0m4.104s
real    0m15.670s
user    0m15.540s
sys     0m0.084s
int_chk GOOD    LEN_EQ 65536

alias:
real    1m55.814s
user    1m55.308s
sys     0m0.252s
lex_chk GOOD    LEN_NE 65536 -> 32768

fs:
/tmp
real    0m1.232s
user    0m0.192s
sys     0m1.020s
lex_chk GOOD    LEN_NE 65536 -> 32768

bub:
real    0m0.000s
user    0m0.000s
sys     0m0.000s
ERR:    42

Time for ifs_nl_arr_sep on Cygwin_x64:

eat ()
{
    local n=$1 to_sort=() arr IFS=$'\n';
    echo eat $n
    while ((n--)); do
        to_sort+=($RANDOM);
    done;
    time arr=($(sort -g <<< "${to_sort[*]}"))
}
$ eat 128; eat 1024; eat 65536
eat 128

real    0m0.034s
user    0m0.062s
sys     0m0.030s
eat 1024

real    0m0.050s
user    0m0.015s
sys     0m0.000s
eat 65536

real    0m1.486s
user    0m1.406s
sys     0m0.031s

@mikeserv
Copy link

mikeserv commented Dec 9, 2015

my bad. I used declare -A and not -a -- that was totally stupid.

#!/bin/bash

bub_sort(){     # http://unix.stackexchange.com/a/247659/73443
        arr=( "$@" )
        for     ((i=0; i <= $((${#arr[@]} - 2)); ++i))
        do      for     ((j=((i + 1)); j <= ((${#arr[@]} - 1)); ++j))
                do      if      [[ ${arr[i]} -gt ${arr[j]} ]]
                        then    # echo $i $j ${arr[i]} ${arr[j]}
                                tmp=${arr[i]}
                                arr[i]=${arr[j]}
                                arr[j]=$tmp         
                        fi
                done
        done
}

fs_sort(){      # http://unix.stackexchange.com/a/247660/73443
        local OLDPWD IFS=' /'
        cd -- "$(mktemp -d)" || return
        touch -- $*; declare -ga arr=(*)
        cd - && rm -rf -- "$OLDPWD"
}       >/dev/null

index_sort(){   # http://unix.stackexchange.com/a/248137/73443
        declare -a tmp
        for     i
        do      tmp[$i]+=' "$i"'
        done
        for     i in    "${!tmp[@]}"
        do      eval "  arr+=( ${tmp[$i]} )"
        done
}

ext_sort_44()   # http://unix.stackexchange.com/a/248145/73443
        for     array
        do      readarray -tf '' "$array" < <(
                eval '  printf "%s\0" "${'"$array"'[@]}" | sort -zg')
        done

ext_sort()
        for     array
        do      tmp=()  i=0
                while   IFS= read -rd '' "tmp[i++]"
                do :;   done < <(eval "
                        printf '%s\0' \"\${$array[@]}\" | sort -zg")
                eval "  $array"'=( "${tmp[@]}" )'
        done

urand(){
        local IFS
        unset IFS   "$1[@]" &&
        declare -ga "$1=( $(dd count="${2:-1}" </dev/urandom|od -An -vtu1) )"
}


TIMEFORMAT=$'real\t%3lR\nuser\t%3lU\nsys\t%3lS\n'
echo "Testing performance on $((n=${1:-1}))*512 random elements... Filling array."
time    urand rand "$n"

for     sort in index fs bub
do      unset   arr[@]
        declare -a arr
        echo    "$sort"_sort:
        time {  "$sort"_sort "${rand[@]}"
                eval "$sort"'_arr=( "${arr[@]}" )'
        }
        eval '  echo "${'"$sort"'_arr[@]}"'
done    >&2

@mikeserv
Copy link

2+0 records in
2+0 records out
1024 bytes (1.0 kB) copied, 0.000311307 s, 3.3 MB/s
real 0m0.011s
user 0m0.007s
sys 0m0.003s

index_sort:
real 0m0.022s
user 0m0.020s
sys 0m0.000s

fs_sort:
real 0m0.020s
user 0m0.007s
sys 0m0.003s

bub_sort:
real 0m7.317s
user 0m7.323s
sys 0m0.000s

@Artoria2e5
Copy link
Author

@mikeserv
A raw test on yours over 65536 elems gives

real    0m43.785s
user    0m43.235s
sys     0m0.031s

Let me lint this by a bit, perhaps without eval it will be faster..

index_sort(){   # http://unix.stackexchange.com/a/248137/73443
        declare -a tmp
        for     i
        do      tmp[i]+=' "$i"' # so does the $
        done
        for     i in    "${!tmp[@]}"
        do      arr+=( ${tmp[i]} ) # eval not necessary here
        done
}
time index_sort "${to_sort[@]}"
# real    0m33.592s
# user    0m33.047s
# sys     0m0.016s

Indeed faster.

Overhead test:

time : "${to_sort[@]}"
# real    0m0.285s
# user    0m0.281s
# sys     0m0.000s

@mikeserv
Copy link

Yeah! i didnt even think of that. so you can drop the dollar sign, too, right?

but... when i take out eval i just get a huge stack of i's
.

@Artoria2e5
Copy link
Author

@mikeserv Yes. It was only a leftover from the old -A one where [i] will be interpreted as a literal i. Will edit it tomorrow -- time for doing homework.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment