Created
December 31, 2019 17:09
-
-
Save meowsbits/b44f02eab9b56fc2d061ffe95a278828 to your computer and use it in GitHub Desktop.
Script that sorts a big (90GB+) CSV file via an external merge sort: split into chunks, sort each chunk, then merge the sorted chunks.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Sorts a big CSV file in place by breaking it into small chunk files,
# sorting those, then merging (while sorting) them back into one file.
# Largely (get it?) taken from https://stackoverflow.com/a/34092506/4401322
#
# Usage: big-csv-sort.sh <csv-file>
# Env:   CHUNK_LINES    lines per chunk file       (default 1000000)
#        SORT_PARALLEL  concurrent sort processes  (default 6)
#
# NOTE: the original file is deleted as soon as it has been split, so the
# job needs ~2x the file size of free disk rather than 3x. If the script
# dies mid-way, the data still exists in the chunk-* / sorted-chunk-*
# files in the current directory.
set -euo pipefail

sort_big_csv() {
  local input=$1
  local chunk_lines=${CHUNK_LINES:-1000000}
  local parallel=${SORT_PARALLEL:-6}

  # Sort numerically on CSV columns 4 then 5. LC_ALL=C (set at each call
  # site) makes the comparison byte-wise and locale-independent, which is
  # both reproducible and faster.
  local sort_keys=(-t',' -k4,4n -k5,5n)

  # Break the big file into chunk files. -a 4 allows 26^4 chunks; the
  # default suffix width (2) runs out after only 676 chunks, which a
  # 90GB+ file at 1M lines per chunk easily exceeds.
  split -l "$chunk_lines" -a 4 -- "$input" chunk-

  # The original file is now redundant to its chunks; drop it to save space.
  rm -- "$input"

  local f
  for f in chunk-*; do
    printf 'Sorting %s to sorted-%s\n' "$f" "$f"
    LC_ALL=C sort --parallel="$parallel" "${sort_keys[@]}" -- "$f" > "sorted-$f"
    # The unsorted chunk is redundant to its sorted copy; drop it too.
    rm -- "$f"
  done

  # Merge (-m) the pre-sorted chunks back into one sorted file at the
  # original path, then discard the now-redundant sorted chunks.
  LC_ALL=C sort --parallel="$parallel" "${sort_keys[@]}" -m -- sorted-chunk-* > "$input"
  rm -- sorted-chunk-*
}

if (( $# >= 1 )); then
  sort_big_csv "$1"
else
  printf 'usage: %s <csv-file>\n' "${0##*/}" >&2
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment