-
-
Save Artoria2e5/5694c113912a43fbdef1 to your computer and use it in GitHub Desktop.
#!/bin/bash | |
n=${1:-1024} TIMEFORMAT=${2:-$'real\t%3lR\nuser\t%3lU\nsys\t%3lS'} | |
export LC_ALL=C | |
sorted_lex(){ local prev; for i; do [[ $prev > $i ]] && return 1; prev=$i; done; } | |
sorted_int(){ local prev; for i; do (( prev <= i )) || return 1; prev=$i; done; } | |
chk(){ | |
printf %b "${_sort_type}_chk\t"; local _suf=$'\t' | |
if (("${#arr[@]}" == "${#to_sort[@]}")); then | |
_suf+="LEN_EQ ${#arr[@]}" | |
else | |
_suf+="LEN_NE ${#to_sort[@]} -> ${#arr[@]}" | |
fi | |
"sorted_$_sort_type" "${arr[@]}" && | |
echo "GOOD$_suf" || | |
echo "BAD$_suf" | |
} | |
test_pass() { | |
for s in index alias fs bub # ext ext_44 newline_pipe | |
do | |
echo "$s:" | |
( time "${s}_sort" to_sort arr && chk || echo $'ERR:\t'$? ) | |
echo | |
done | |
} | |
# http://unix.stackexchange.com/a/247659/73443 | |
bub_sort () { | |
(( ${#arr[@]} <= 1024 )) || return 42 | |
_sort_type=int | |
for ((i=0; i <= $((${#arr[@]} - 2)); ++i)) | |
do | |
for ((j=((i + 1)); j <= ((${#arr[@]} - 1)); ++j)) | |
do | |
if [[ ${arr[i]} -gt ${arr[j]} ]] | |
then | |
# echo $i $j ${arr[i]} ${arr[j]} | |
tmp=${arr[i]} | |
arr[i]=${arr[j]} | |
arr[j]=$tmp | |
fi | |
done | |
done | |
} | |
newline_pipe_sort() { | |
_sort_type=lex | |
IFS=$'\n' | |
arr=( $(sort <<< "${to_sort[*]}") ) | |
} | |
fs_sort(){ # http://unix.stackexchange.com/a/247660/73443 | |
_sort_type=lex | |
local OLDPWD # IFS=' /' | |
cd -- "$(mktemp -d)" || return | |
touch -- "${to_sort[@]}"; declare -ga arr=(*) &>/dev/null | |
cd - && rm -rf -- "$OLDPWD" | |
} | |
# http://unix.stackexchange.com/a/248137/73443 | |
# index_sort <source_arr> [target_arr:-source_arr] | |
index_sort() { | |
_sort_type=int | |
# Not that surprising: using indirect expansions in a `for` loop is slow. | |
local _tmp=() _src="$1[@]" _sorted_nodup _sorted TIMEFORMAT=$'insert\t%3lR'; _src=("${!_src}") | |
time for i in "${_src[@]}"; do (( _tmp[i]++ )); done | |
# This eats duplicates. | |
_sorted_nodup=( "${!_tmp[@]}" ) | |
# The numeric values in _sorted_nodup<int, int> gives us the occurrence of | |
# the element in the original sequence.. takes extra 1~4x time to expand. | |
# The extra time decreases as elems decreases, -> ~1.2x. | |
# CONSIDER SKIPPING THIS and use `_sorted_nodup` for the final eval instead. | |
TIMEFORMAT=$'expand\t%3lR' | |
time for i in "${_sorted_nodup[@]}"; do | |
j=${_tmp[i]} | |
while ((j--)); do _sorted+=("$i"); done | |
done | |
# Assign it back.. | |
eval "${2:-$1}=(" '"${_sorted[@]}" )' | |
} | |
# alias_sort <source_arr> [target_arr:-source_arr] | |
# modified to fit in a function. | |
alias_sort(){ | |
_sort_type=lex | |
local _s=() _e="$1[@]" IFS=$'\n' # does bash 1 support indirect expansion? | |
_s=($( | |
unalias -a && # clear all aliases | |
alias "${!_e/%/=}" && # (exp: map append '=') pass to alias | |
alias # sort (see src) and print the aliases | |
)) || return | |
_s=("${_s[@]#alias }") # strip off the `alias ' | |
# strip the shortest trailing =* and assign back. | |
eval "${2:-$1}=("'"${_s[@]%=*}")' | |
} | |
# http://unix.stackexchange.com/a/248145/73443 | |
ext_44_sort() { | |
_sort_type=lex | |
for array do | |
readarray -tf '' "$array" < <( | |
eval "printf '%s\0' \"\${$array[@]}\" | sort -z") | |
done | |
} | |
ext_sort() { | |
_sort_type=lex | |
for array do | |
tmp=() i=0 | |
while IFS= read -rd '' "tmp[i++]"; do :; done < <( | |
eval "printf '%s\0' \"\${$array[@]}\" | sort -z") | |
eval "$array=("'"${tmp[@]}")' | |
done | |
} | |
echo "Testing performance on $n random elements... Filling array." | |
time while ((n--)); do to_sort+=($RANDOM); done | |
echo | |
echo "Testing array copy..." | |
time arr=("${to_sort[@]}") | |
echo | |
test_pass | |
echo | |
echo "Testing partly-sequential array {1..817} {9154..32768} {818..9153} {1..16384} {1..8192}{,,,}.." | |
to_sort=({1..817} {9154..32768} {818..9153} {1..16384} {1..4096}{,,,}) arr=("${to_sort[@]}") | |
test_pass |
#!/bin/bash | |
((!EUID)) || exec sudo "$0" "$@" # Addicted to OR-logic | |
declare -A src_dst_pair src_dst_old; # dictionaries -- the older one not being used | |
oops(){ printf '%b\n' "oOpsS!\t$1">&2; exit "${2:-1}"; } | |
## oldlist file | |
## Gets the contents of a legacy list. | |
oldlist() { | |
local arr i s d; | |
awk '/^#BEGIN-log#/ {flag=1; next} /^#END-log#/ {flag=0} flag' "$1" | mapfile arr | |
for i in "${arr[@]}"; do | |
case "$i" in *'=('*=*=*')') :;; *) continue;; esac | |
i="${i#*(}" | |
i="${i%)}" | |
read s d <<< "$i" | |
remember "${s#*=}" "${d#*=}" src_dst_old | |
done | |
} | |
##remember key val <varname> Generates a basically safe-to-use line of script and appends it to self. | |
# TODO: write this back to a script. | |
remember() { | |
printf '%s\n' "${3:-src_dst_pair}[$(printf %q "$1")]=$(printf %q "$2")" >> "$0" | |
} | |
try_link(){ | |
local SRC="$1" DST="$2" src_in_dst | |
[[ -e "$SRC" ]] || [[ -e "$DST" ]] || oops "$DST" | |
[[ -e "$SRC" ]] || mv "$DST" "$SRC" | |
[[ -e "$DST" ]] || ln -Tsf "$SRC" "$DST" | |
src_in_dst=("$DST"/*"$SRC"*) | |
((${#src_in_dst[@]})) || ln -Tsf "$SRC" "$DST" | |
} | |
# -> for thing in "$@" | |
for thing; do | |
[[ $thing == /* ]] || thing="$PWD/$thing" # relative path, hmm. | |
SRC=/.hd$thing DST=$thing | |
[[ "$SRC $DST" != *'.hd/.hd'* ]] || oops 'pathdup: .hd/.hd' | |
remember "$SRC" "$DST" | |
try_link "$SRC" "$DST" | |
done | |
# Another way to spell eval "$(blah)" | |
source <(awk '/^# __DATA_START__/ { flag=1; next } flag') | |
for those in "${!src_dst_pair[@]}"; do try_link "$those" "${src_dst_pair[$those]}"; done | |
exit | |
# __DATA_START__ |
Re-tested on a DigitalOcean Ubuntu 15.04 instance (bash 4.3.30, -DCONF_MACHTYPE='x86_64-pc-linux-gnu' -D_FORTIFY_SOURCE=2 -g -O2 -fstack-protector-strong
)
Interesting parts to see:
index_sort
takes a considerable of non-insert/expand time. This might be caused by the three array copy actions done. Therefore, writing it inline (or make it a big block of eval) might still be the best solution…- The initial copy meant to improve the performance, at least for Cygwin. Too lazy to check if it helps on Ubuntu for now.
- Great to see that the adaptive part of it helps.
alias_sort
performs poorly with lots of duplicate keys. Bash's internal hashtable must be unhappy.fs_sort
works great on Linux. On Cygwin it's like half a minute ofsys
.
(The 42
retcode is assigned for 'array too big, I am not gonna waste my time' in bub
.)
nobody@A2cheap:/tmp$ bash sorttime.bash 65536
Testing performance on 65536 random elements... Filling array.
real 0m0.554s
user 0m0.544s
sys 0m0.008s
Testing array copy...
real 0m1.635s
user 0m1.604s
sys 0m0.028s
index:
insert 0m16.048s
expand 0m8.635s
real 0m31.998s
user 0m31.852s
sys 0m0.088s
int_chk GOOD LEN_EQ 65536
alias:
real 0m18.202s
user 0m18.020s
sys 0m0.144s
lex_chk GOOD LEN_NE 65536 -> 28141
fs:
/tmp
real 0m1.275s
user 0m0.196s
sys 0m1.056s
lex_chk GOOD LEN_NE 65536 -> 28141
bub:
real 0m0.000s
user 0m0.000s
sys 0m0.000s
ERR: 42
Testing partly-sequential array {1..817} {9154..32768} {818..9153} {1..16384} {1..8192}{,,,}..
index:
insert 0m1.803s
expand 0m4.104s
real 0m15.670s
user 0m15.540s
sys 0m0.084s
int_chk GOOD LEN_EQ 65536
alias:
real 1m55.814s
user 1m55.308s
sys 0m0.252s
lex_chk GOOD LEN_NE 65536 -> 32768
fs:
/tmp
real 0m1.232s
user 0m0.192s
sys 0m1.020s
lex_chk GOOD LEN_NE 65536 -> 32768
bub:
real 0m0.000s
user 0m0.000s
sys 0m0.000s
ERR: 42
Time for ifs_nl_arr_sep
on Cygwin_x64:
eat ()
{
local n=$1 to_sort=() arr IFS=$'\n';
echo eat $n
while ((n--)); do
to_sort+=($RANDOM);
done;
time arr=($(sort -g <<< "${to_sort[*]}"))
}
$ eat 128; eat 1024; eat 65536
eat 128
real 0m0.034s
user 0m0.062s
sys 0m0.030s
eat 1024
real 0m0.050s
user 0m0.015s
sys 0m0.000s
eat 65536
real 0m1.486s
user 0m1.406s
sys 0m0.031s
my bad. I used declare -A
and not -a
-- that was totally stupid.
#!/bin/bash
bub_sort(){ # http://unix.stackexchange.com/a/247659/73443
arr=( "$@" )
for ((i=0; i <= $((${#arr[@]} - 2)); ++i))
do for ((j=((i + 1)); j <= ((${#arr[@]} - 1)); ++j))
do if [[ ${arr[i]} -gt ${arr[j]} ]]
then # echo $i $j ${arr[i]} ${arr[j]}
tmp=${arr[i]}
arr[i]=${arr[j]}
arr[j]=$tmp
fi
done
done
}
fs_sort(){ # http://unix.stackexchange.com/a/247660/73443
local OLDPWD IFS=' /'
cd -- "$(mktemp -d)" || return
touch -- $*; declare -ga arr=(*)
cd - && rm -rf -- "$OLDPWD"
} >/dev/null
index_sort(){ # http://unix.stackexchange.com/a/248137/73443
declare -a tmp
for i
do tmp[$i]+=' "$i"'
done
for i in "${!tmp[@]}"
do eval " arr+=( ${tmp[$i]} )"
done
}
ext_sort_44() # http://unix.stackexchange.com/a/248145/73443
for array
do readarray -tf '' "$array" < <(
eval ' printf "%s\0" "${'"$array"'[@]}" | sort -zg')
done
ext_sort()
for array
do tmp=() i=0
while IFS= read -rd '' "tmp[i++]"
do :; done < <(eval "
printf '%s\0' \"\${$array[@]}\" | sort -zg")
eval " $array"'=( "${tmp[@]}" )'
done
urand(){
local IFS
unset IFS "$1[@]" &&
declare -ga "$1=( $(dd count="${2:-1}" </dev/urandom|od -An -vtu1) )"
}
TIMEFORMAT=$'real\t%3lR\nuser\t%3lU\nsys\t%3lS\n'
echo "Testing performance on $((n=${1:-1}))*512 random elements... Filling array."
time urand rand "$n"
for sort in index fs bub
do unset arr[@]
declare -a arr
echo "$sort"_sort:
time { "$sort"_sort "${rand[@]}"
eval "$sort"'_arr=( "${arr[@]}" )'
}
eval ' echo "${'"$sort"'_arr[@]}"'
done >&2
2+0 records in
2+0 records out
1024 bytes (1.0 kB) copied, 0.000311307 s, 3.3 MB/s
real 0m0.011s
user 0m0.007s
sys 0m0.003s
index_sort:
real 0m0.022s
user 0m0.020s
sys 0m0.000s
fs_sort:
real 0m0.020s
user 0m0.007s
sys 0m0.003s
bub_sort:
real 0m7.317s
user 0m7.323s
sys 0m0.000s
@mikeserv
A raw test on yours over 65536 elems gives
real 0m43.785s
user 0m43.235s
sys 0m0.031s
Let me lint this by a bit, perhaps without eval
it will be faster..
index_sort(){ # http://unix.stackexchange.com/a/248137/73443
declare -a tmp
for i
do tmp[i]+=' "$i"' # so does the $
done
for i in "${!tmp[@]}"
do arr+=( ${tmp[i]} ) # eval not necessary here
done
}
time index_sort "${to_sort[@]}"
# real 0m33.592s
# user 0m33.047s
# sys 0m0.016s
Indeed faster.
Overhead test:
time : "${to_sort[@]}"
# real 0m0.285s
# user 0m0.281s
# sys 0m0.000s
Yeah! i didnt even think of that. so you can drop the dollar sign, too, right?
but... when i take out eval
i just get a huge stack of i's
.
@mikeserv Yes. It was only a leftover from the old -A
one where [i]
will be interpreted as a literal i
. Will edit it tomorrow -- time for doing homework.
so what is this thing? is it possible you could just explain it?