Skip to content

Instantly share code, notes, and snippets.

@devtin
Last active May 23, 2022 23:48
Show Gist options
  • Save devtin/fdad0023115ca3dcdc0989fa6e9c0d06 to your computer and use it in GitHub Desktop.
Save devtin/fdad0023115ca3dcdc0989fa6e9c0d06 to your computer and use it in GitHub Desktop.
Fast process to sort long CSV files from the command line
#!/bin/bash
# SPEEDS UP CSV SORTING (~2X) BY DISTRIBUTING THE LOAD INTO 8 DIFFERENT PROCESSES
# USAGE: $ < <input-csv-file> ./csv-tr-sort.sh -c <column-name-to-sort> -o <sort-direction=1,-1> > sorted.csv
# AVERAGES ~22 SECONDS SORTING A 300MB CSV FILE WITH 3MM ENTRIES ON AN M1
# Default sort direction; overridden by the -o option.
ORDER=1

# Prints the command-line synopsis to stderr and terminates the script
# with exit status 1.
usage() {
  echo "Usage: $0 [-c <column-name>] [-o <order=1,-1>]" >&2
  exit 1
}
# Parse the command line:
#   -c <column-name>  column to sort by (stored in COLUMN)
#   -o <order>        sort direction (stored in ORDER)
# Any unknown option, or an option missing its argument, prints the usage.
while getopts ":c:o:" opt; do
  case "${opt}" in
    c) COLUMN=${OPTARG} ;;
    o) ORDER=${OPTARG} ;;
    *) usage ;;
  esac
done
# Drop the options that were consumed, leaving only positional arguments.
shift $((OPTIND-1))

# Both settings must be non-empty before any work starts.
if [[ -z "${COLUMN}" || -z "${ORDER}" ]]; then
  usage
fi
# One bucket per worker process: each entry is the inside of a regex character
# class matched against the first character of the sort column.
PROCESS_RANGE="a-d e-h i-l m-p q-t u-x y-z ^a-z" # 8 cores
if [[ "$ORDER" == "-1" ]]; then
  # Reverse the bucket order so the per-bucket results are concatenated
  # back to front.
  reversed_ranges=""
  for bucket in $PROCESS_RANGE; do
    reversed_ranges="$bucket${reversed_ranges:+ $reversed_ranges}"
  done
  PROCESS_RANGE=$reversed_ranges
  unset reversed_ranges bucket
fi
#######################################
# Builds the command line that fans stdin out, via tee, to one worker
# pipeline per bucket in PROCESS_RANGE. Each worker filters the rows whose
# COLUMN value starts with a character of its bucket, sorts them with csv-tr
# and writes the result to ./.<bucket>.csv.
# Globals:   PROCESS_RANGE, COLUMN, ORDER (read)
# Arguments: none
# Outputs:   the command string on stdout (meant to be run with `bash -c`)
#######################################
function get-tee-cmd () {
  local cmd="tee"
  # Only the first bucket keeps line 1 of its worker output; every later
  # bucket starts at line 2 so that line is not repeated in the final output.
  local first_line=1
  local range

  # NOTE(review): COLUMN and ORDER are pasted verbatim into a shell command
  # and a JavaScript expression; a value containing a quote will break (or
  # inject into) the generated command. Only pass trusted values.
  # NOTE(review): every bucket pattern requires at least one character, so
  # rows whose COLUMN value is empty appear to match no bucket and would be
  # dropped - confirm against csv-tr's handling of empty fields.
  for range in $PROCESS_RANGE; do
    cmd="$cmd >(csv-tr --filter \"/^[$range]/i.test(row['$COLUMN'])\" | csv-tr --sort '$COLUMN:$ORDER' | tail -n +$first_line > .$range.csv)"
    first_line=2
  done

  # Quoted on purpose: an unquoted `echo $cmd` word-splits and glob-expands
  # the command, which e.g. collapses consecutive spaces in a column name.
  printf '%s\n' "$cmd"
}
#######################################
# Blocks until every per-bucket result file exists and none of them is
# reported as open by lsof, polling every 0.1 seconds.
# Globals:   PROCESS_RANGE (read)
# Arguments: none
#######################################
function wait-til-files-processed () {
  local -a bucket_files=()
  local bucket probe

  for bucket in $PROCESS_RANGE; do
    bucket_files+=(".$bucket.csv")
  done

  # The probe is empty only when lsof lists nothing AND ls finds every file:
  #   - lsof prints the files that are still open,
  #   - when lsof fails, ls (stdout discarded) checks that the files exist,
  #   - when ls fails too, the "not ready" marker keeps the loop going.
  until
    probe=$(lsof "${bucket_files[@]}" || ls "${bucket_files[@]}" 1>/dev/null || echo "not ready")
    [ -z "$probe" ]
  do
    sleep .1
  done
}
# Streams the per-bucket result files to stdout, in PROCESS_RANGE order.
write-output () {
  local bucket
  for bucket in $PROCESS_RANGE; do
    cat ".${bucket}.csv"
  done
}
# Deletes the per-bucket result file of every entry in PROCESS_RANGE.
function clean-tmp-files () {
  local bucket
  for bucket in $PROCESS_RANGE; do
    unlink ".${bucket}.csv"
  done
}
# Remove the per-bucket temp files on every exit path (including an
# interrupt or an early failure), not only after a successful run. Errors are
# silenced because on an early abort some files may never have been created.
trap 'clean-tmp-files 2>/dev/null' EXIT

# Fan stdin out to the per-bucket workers; tee's own copy of the stream is
# discarded.
TEE_CMD=$(get-tee-cmd)
bash -c "$TEE_CMD" > /dev/null

# The workers run inside process substitutions, which the shell does not wait
# for, so poll until all result files are complete.
wait-til-files-processed

# Emit the buckets in order; cleanup happens in the EXIT trap.
write-output
#!/bin/bash
# Positional arguments:
#   $1 - path of the CSV file to sort
#   $3 - sort direction, defaults to 1
#   $4 - output file, defaults to .result.csv
# NOTE(review): $2 is never read - presumably reserved for a column name;
# confirm.
INPUT=$1
# `:-` (not `-`) so an argument passed as an empty string falls back to the
# default as well; the end of the script treats an empty $4 as "not given".
# SORT_DIRECTION is not referenced anywhere else in this script yet.
SORT_DIRECTION="${3:-1}"
OUTPUT_FILE="${4:-.result.csv}"
# !!! WORK IN PROGRESS !!!
# CURRENTLY CAN ONLY SORT THE FIRST COLUMN OF A CSV FILE
# AVERAGES ~20 SECONDS SORTING A 300MB CSV FILE WITH 3MM ENTRIES ON AN M1
# One bucket per worker process: each entry is the inside of a regex character
# class matched against the first character of a line.
PROCESS_RANGE="a-d e-h i-l m-p q-t u-x y-z ^a-z" # 8 cores lol
#######################################
# Builds the command line that skips the header line of the input file and
# fans the remaining lines out, via tee, to one worker pipeline per bucket in
# PROCESS_RANGE. Each worker keeps the lines starting with a character of its
# bucket, sorts them and writes the result to ./.<bucket>.csv.
# Globals:   PROCESS_RANGE (read)
# Arguments: $1 - path of the input CSV file
# Outputs:   the command string on stdout (meant to be run with `bash -c`)
#######################################
function get-tee-cmd () {
  local cmd range quoted_input

  # The path ends up inside a `bash -c` string, so shell-quote it; otherwise
  # a path containing whitespace or metacharacters breaks the redirection.
  printf -v quoted_input '%q' "$1"
  cmd="< $quoted_input tail -n +2 | tee -a"

  for range in $PROCESS_RANGE; do
    # TODO: IN ORDER TO BE ABLE TO SORT BY ANY COLUMN WE HAVE TO FIX THAT PERL REGEX
    # ADD -r FLAG FOR REVERSE
    # -k1,1 restricts the sort key to the first comma-separated field; a bare
    # -k1 extends the key to the end of the line, so the separator and later
    # columns take part in the comparison.
    cmd="$cmd >(perl -n -e'/^([$range].*)/msi && print \$1' | sort -t, -k1,1 > .$range.csv &)"
  done
  cmd="$cmd; wait"

  # Quoted on purpose: an unquoted `echo $cmd` would word-split and
  # glob-expand the generated command.
  printf '%s\n' "$cmd"
}
#######################################
# Blocks until every per-bucket result file exists and lsof no longer reports
# any of them as open, polling every 0.1 seconds.
#
# Completion is detected from the current state (files present, none open)
# rather than from having observed the files open at least once: if the
# workers finish before the first poll, the files are never seen open and a
# "seen open first" condition would wait forever.
# Globals:   PROCESS_RANGE (read)
# Arguments: none
#######################################
function wait-til-files-processed () {
  local -a files=()
  local range file open_report all_present

  for range in $PROCESS_RANGE; do
    files+=(".$range.csv")
  done

  while true; do
    # lsof prints one line per open file; empty output means nothing is open.
    open_report=$(lsof "${files[@]}")

    all_present=1
    for file in "${files[@]}"; do
      if [ ! -e "$file" ]; then
        all_present=0
        break
      fi
    done

    if [ -z "$open_report" ] && [ "$all_present" -eq 1 ]; then
      break
    fi
    # Always pause between polls so waiting never turns into a busy loop.
    sleep .1
  done
}
#######################################
# Writes the final result: the first line of the input file followed by the
# per-bucket result files in PROCESS_RANGE order.
# Globals:   PROCESS_RANGE, INPUT, OUTPUT_FILE (read)
# Arguments: none
# Outputs:   creates/overwrites OUTPUT_FILE
#######################################
function write-output () {
  local -a parts=()
  local range

  for range in $PROCESS_RANGE; do
    parts+=(".$range.csv")
  done

  # Paths are quoted so input/output names containing whitespace work; an
  # unquoted redirect target with spaces fails as an "ambiguous redirect".
  {
    head -n 1 "$INPUT"
    cat "${parts[@]}"
  } > "$OUTPUT_FILE"
}
# Removes the intermediate ./.<bucket>.csv file of each PROCESS_RANGE entry.
function clean-tmp-files () {
  local range_name
  for range_name in $PROCESS_RANGE; do
    unlink ".${range_name}.csv"
  done
}
# Remove the per-bucket temp files on every exit path (including an
# interrupt or an early failure), not only after a successful run. Errors are
# silenced because on an early abort some files may never have been created.
trap 'clean-tmp-files 2>/dev/null' EXIT

# get-tee-cmd "$1"
# Split the input into per-bucket sorted files; tee's own copy of the stream
# is discarded.
bash -c "$(get-tee-cmd "$1")" > /dev/null
wait-til-files-processed
write-output

# Without an explicit output file ($4), print the result to stdout and remove
# the file that held it.
if [ "$4" == "" ]; then
  # Quoted so an empty or whitespace-containing value cannot turn into a bare
  # `cat`, which would block reading stdin.
  cat "$OUTPUT_FILE"
  unlink "$OUTPUT_FILE"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment