Last active
May 23, 2022 23:48
-
-
Save devtin/fdad0023115ca3dcdc0989fa6e9c0d06 to your computer and use it in GitHub Desktop.
Fast process to sort long CSV files from the command line
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Speeds up CSV sorting (~2x) by distributing the load into 8 different processes.
#
# Usage:
#   $ < <input-csv-file> ./csv-tr-sort.sh -c <column-name-to-sort> -o <sort-direction=1,-1> > sorted.csv
#
# Averages ~22 seconds sorting a 300MB CSV file with 3MM entries on an M1.
#
# External commands invoked by this script: csv-tr, lsof, tee, tail.

# Default sort direction; overridden by the -o option.
ORDER=1
# Prints the command-line synopsis to stderr and terminates the script with
# exit status 1. -c is shown as mandatory because the script refuses to run
# without a column name.
usage() {
  printf 'Usage: %s -c <column-name> [-o <order=1,-1>]\n' "$0" >&2
  exit 1
}
# Parse the command-line options:
#   -c <column-name>  column to sort by (required)
#   -o <order>        sort direction, 1 or -1
while getopts ":c:o:" opt; do
  case "${opt}" in
    c) COLUMN=${OPTARG} ;;
    o) ORDER=${OPTARG} ;;
    # Unknown option or missing option argument.
    *) usage ;;
  esac
done
shift $((OPTIND - 1))

# A column name is mandatory.
if [[ -z "${COLUMN:-}" ]]; then
  usage
fi

# Only the literal value -1 reverses the bucket order further down, so any
# other spelling of "descending" would silently yield mis-ordered output.
# Reject everything except the two documented values.
if [[ "${ORDER:-}" != "1" && "${ORDER:-}" != "-1" ]]; then
  usage
fi
# Each entry is the inside of a regex bracket expression selecting rows by the
# first character of the sort column; one worker pipeline is started per entry.
PROCESS_RANGE="a-d e-h i-l m-p q-t u-x y-z ^a-z" # 8 cores

# Prints the whitespace-separated words of $1 in reverse order, joined by
# single spaces. Prints an empty line when $1 contains no words.
reverse-words() {
  local -a words
  local reversed=""
  local i
  read -r -a words <<< "$1"
  for ((i = ${#words[@]} - 1; i >= 0; i--)); do
    reversed+="${reversed:+ }${words[i]}"
  done
  printf '%s\n' "$reversed"
}

# For a descending sort the buckets have to be emitted in the opposite order.
if [[ "$ORDER" == "-1" ]]; then
  PROCESS_RANGE=$(reverse-words "$PROCESS_RANGE")
fi
#######################################
# Builds the shell command that fans stdin out to one worker per bucket.
# Each worker filters the rows whose sort column starts with a character of
# its bucket, sorts them with csv-tr and writes the result to .<bucket>.csv.
# Globals:   PROCESS_RANGE, COLUMN, ORDER (all read)
# Outputs:   the command, as a single line on stdout
#######################################
get-tee-cmd() {
  local cmd="tee"
  # tail -n +1 keeps every line of the first bucket; +2 drops the first line of
  # all later buckets - presumably the CSV header csv-tr re-emits (TODO confirm).
  local start_line=1
  local range

  # NOTE(review): COLUMN and ORDER are spliced unescaped into a string that the
  # caller runs through `bash -c`; a column name containing quotes, `$` or
  # backticks will break the command or be interpreted by the shell.

  # Word splitting of PROCESS_RANGE is intentional: one bucket per word.
  for range in $PROCESS_RANGE; do
    cmd+=" >(csv-tr --filter \"/^[$range]/i.test(row['$COLUMN'])\" | csv-tr --sort '$COLUMN:$ORDER' | tail -n +$start_line > .$range.csv)"
    start_line=2
  done

  # Quoted on purpose: an unquoted expansion would collapse runs of whitespace
  # inside the column name and expose the text to pathname expansion.
  printf '%s\n' "$cmd"
}
#######################################
# Blocks until every per-bucket result file exists and is no longer held open
# by any process, polling every 0.1 s.
# Globals:   PROCESS_RANGE (read)
#######################################
wait-til-files-processed() {
  local range
  local pending
  local -a files=()

  # Word splitting of PROCESS_RANGE is intentional: one bucket per word.
  for range in $PROCESS_RANGE; do
    files+=(".$range.csv")
  done

  while true; do
    # pending is non-empty while work is outstanding:
    #  - lsof prints a listing when at least one file is still open;
    #  - if lsof fails, ls (stdout discarded) checks that all files exist;
    #  - if that fails too, the marker "not ready" is captured.
    pending=$(lsof "${files[@]}" || ls "${files[@]}" 1>/dev/null || echo "not ready")
    if [[ -z "$pending" ]]; then
      break
    fi
    sleep .1
  done
}
#######################################
# Writes the per-bucket result files to stdout, in PROCESS_RANGE order.
# Globals:   PROCESS_RANGE (read)
# Outputs:   concatenated contents of .<bucket>.csv on stdout
#######################################
write-output() {
  local range
  # Word splitting of PROCESS_RANGE is intentional: one bucket per word.
  for range in $PROCESS_RANGE; do
    cat -- ".$range.csv"
  done
}
#######################################
# Removes the per-bucket result files from the current directory.
# Files that are already gone are ignored, so it is safe to call repeatedly.
# Globals:   PROCESS_RANGE (read)
#######################################
clean-tmp-files() {
  local range
  # Word splitting of PROCESS_RANGE is intentional: one bucket per word.
  for range in $PROCESS_RANGE; do
    rm -f -- ".$range.csv"
  done
}
# Remove the per-bucket temp files on every exit path (normal completion,
# failure or interruption) instead of only after a fully successful run.
trap clean-tmp-files EXIT

# Build the fan-out command and run it; tee's own copy of stdin is discarded.
TEE_CMD=$(get-tee-cmd)
bash -c "$TEE_CMD" > /dev/null

# The worker pipelines may still be writing after bash -c returns, so wait for
# the result files to be complete before concatenating them to stdout.
wait-til-files-processed
write-output
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# !!! WORK IN PROGRESS !!!
# Currently can only sort by the first column of a CSV file.
# Averages ~20 seconds sorting a 300MB CSV file with 3MM entries on an M1.
#
# Positional arguments:
#   $1  input CSV file
#   $2  not read anywhere in this script
#   $3  sort direction (default 1); stored but not applied by the visible code
#   $4  output file (default .result.csv)
INPUT=$1
SORT_DIRECTION="${3-1}"
OUTPUT_FILE="${4-.result.csv}"

# Each entry is the inside of a regex bracket expression selecting lines by
# their first character; one worker pipeline is started per entry.
PROCESS_RANGE="a-d e-h i-l m-p q-t u-x y-z ^a-z" # 8 cores lol
#######################################
# Builds the shell command that skips the header line of the input file and
# fans the remaining lines out to one perl|sort worker per bucket. Each worker
# keeps the lines starting with a character of its bucket (case-insensitive),
# sorts them by the first comma-separated field and writes .<bucket>.csv.
# Globals:   PROCESS_RANGE (read)
# Arguments: $1 - path of the input CSV file
# Outputs:   the command, as a single line on stdout
#######################################
get-tee-cmd() {
  local input=$1
  local quoted_input
  local range
  local cmd

  # Shell-quote the path so names with spaces or metacharacters survive the
  # `bash -c` evaluation done by the caller.
  printf -v quoted_input '%q' "$input"
  cmd="< $quoted_input tail -n +2 | tee -a"

  # Word splitting of PROCESS_RANGE is intentional: one bucket per word.
  for range in $PROCESS_RANGE; do
    # TODO: in order to be able to sort by any column we have to fix that perl regex
    # TODO: add -r flag for reverse
    # -k1,1 limits the sort key to the first field; a bare -k1 would extend the
    # key to the end of the line.
    cmd+=" >(perl -n -e'/^([$range].*)/msi && print \$1' | sort -t, -k1,1 > .$range.csv &)"
  done

  # NOTE(review): the workers are backgrounded inside their own process
  # substitutions, so this `wait` probably does not block on them - confirm.
  cmd+="; wait"

  printf '%s\n' "$cmd"
}
#######################################
# Blocks until every per-bucket result file exists and lsof no longer reports
# any of them as open, polling every 0.1 s.
#
# Completion is detected from "all files exist and none is open" rather than
# from having first observed an open file: with small inputs the workers can
# finish before the first poll, and waiting for an open file would never end.
# Globals:   PROCESS_RANGE (read)
#######################################
wait-til-files-processed() {
  local range
  local file
  local all_present
  local -a files=()

  # Word splitting of PROCESS_RANGE is intentional: one bucket per word.
  for range in $PROCESS_RANGE; do
    files+=(".$range.csv")
  done

  while true; do
    all_present=1
    for file in "${files[@]}"; do
      if [[ ! -e "$file" ]]; then
        all_present=0
        break
      fi
    done

    # lsof prints nothing on stdout when none of the files is open.
    if ((all_present)) && [[ -z "$(lsof "${files[@]}")" ]]; then
      break
    fi
    sleep .1
  done
}
#######################################
# Writes the final result: the first line of the input file (the header)
# followed by the per-bucket files in PROCESS_RANGE order.
# Globals:   PROCESS_RANGE, INPUT, OUTPUT_FILE (all read)
# Outputs:   overwrites OUTPUT_FILE
#######################################
write-output() {
  local range
  local -a parts=()

  # Word splitting of PROCESS_RANGE is intentional: one bucket per word.
  for range in $PROCESS_RANGE; do
    parts+=(".$range.csv")
  done

  # Paths are quoted so input/output names containing spaces work.
  {
    head -n 1 -- "$INPUT"
    cat -- "${parts[@]}"
  } > "$OUTPUT_FILE"
}
#######################################
# Removes the per-bucket result files from the current directory.
# Files that are already gone are ignored, so it is safe to call repeatedly.
# Globals:   PROCESS_RANGE (read)
#######################################
clean-tmp-files() {
  local range
  # Word splitting of PROCESS_RANGE is intentional: one bucket per word.
  for range in $PROCESS_RANGE; do
    rm -f -- ".$range.csv"
  done
}
# Fail early with a clear message instead of letting the generated pipeline
# break on a missing or unreadable input file.
if [[ -z "${1:-}" || ! -r "$1" ]]; then
  printf '%s: input CSV file (first argument) is missing or not readable\n' "$0" >&2
  exit 1
fi

# Remove the per-bucket temp files on every exit path (normal completion,
# failure or interruption).
trap clean-tmp-files EXIT

# Uncomment to print the generated pipeline for debugging:
# get-tee-cmd "$1"
bash -c "$(get-tee-cmd "$1")" > /dev/null
wait-til-files-processed
write-output

# No output file was requested: stream the result to stdout and remove the
# intermediate result file.
if [[ -z "${4:-}" ]]; then
  cat -- "$OUTPUT_FILE"
  rm -f -- "$OUTPUT_FILE"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment