#!/usr/bin/env bash
#
# Benchmark notes: item count per store-yearmo combo, computed from a
# transactions file with a plain pipeline and with two `parallel --pipe`
# variants.
#
# Usage: bash SCRIPT [input-file]      (input-file defaults to transactions.csv)
#
# Serial method uses 40-50% of all available CPU prior to the `sort` step.
# Assuming linear scaling, the best we could achieve is halving the time.
#
# Grand Assertion: this pipeline actually gives the correct answer! This is a
# very complex way to calculate this, SQL would be so much easier...
#
# Pipeline stages:
#   cut -d ' ' -f 2,3,5
#       Take fields 2, 3, and 5 (store, timestamp, transaction).
#   tr -d '[A-Za-z\"/\- ]'
#       Strip out all the letters, quotes, slashes, hyphens and spaces, to just
#       leave the store number, timestamp, and commas to represent the number
#       of items. (tr treats the surrounding [ ] as literal characters, so
#       bracket characters are deleted as well.)
#   awk '{print (substr($1,1,5)"-"substr($1,6,6)), length(substr($1,14))+1}'
#       Split the string at the store / yearmo boundary, then count the number
#       of commas + 1 (since 3 commas = 4 items).
#   awk '{a[$1]+=$2;}END{for(i in a)print i" "a[i];}'
#       Sum by store-yearmo combo.
#   sort
#       Sort such that each store's rows are together, then by month.
#
# Each run is wrapped in `time`; the timings originally observed are kept as
# comments next to the run they belong to.

#######################################
# Turn each stripped record into "<store>-<yearmo> <item count>".
# Characters 1-5 are the store, 6-11 the yearmo, 12-13 are skipped
# (presumably the day of month - confirm against the data), and everything
# from character 14 on is counted, plus one.
# Inputs:  stripped records on stdin, one per line
# Outputs: one "<store>-<yearmo> <count>" line per input line on stdout
#######################################
awksub2 () {
  awk '{print (substr($1,1,5)"-"substr($1,6,6)), length(substr($1,14))+1}'
}
export -f awksub2   # exported so the command started by `parallel` can call it

#######################################
# Sum the second column grouped by the first column.
# Inputs:  "<key> <number>" lines on stdin
# Outputs: one "<key> <sum>" line per distinct key on stdout; awk's `for (i in a)`
#          order is unspecified, which is why callers pipe the result to sort.
#          Summing its own output again is harmless, so it can be used both
#          per-chunk under parallel and once more to merge the partial sums.
#######################################
awksub3 () {
  awk '{a[$1]+=$2;}END{for(i in a)print i" "a[i];}'
}
export -f awksub3   # exported so the command started by `parallel` can call it

#######################################
# Emit fields 2, 3 and 5 of every line of a file, stripped down to digits and
# commas (see the stage notes in the file header).
# Arguments: $1 - path of the transactions file
# Outputs:   stripped records on stdout
#######################################
strip_fields () {
  cut -d ' ' -f 2,3,5 -- "$1" | tr -d '[A-Za-z\"/\- ]'
}

#######################################
# Run the serial baseline and both parallel variants against one input file.
# Arguments: $1 - input file (optional, defaults to transactions.csv)
# Outputs:   each variant's sorted result on stdout, `time` reports on stderr
# Returns:   1 if the input file is not readable, otherwise the status of the
#            last pipeline
#######################################
main () {
  local input=${1:-transactions.csv}

  if [[ ! -r "$input" ]]; then
    printf 'cannot read input file: %s\n' "$input" >&2
    return 1
  fi

  # Serial baseline.
  # Recorded: real 14m5.657s
  time strip_fields "$input" | awksub2 | awksub3 | sort

  # Parallelize the substring awk step.
  # Actually lowers processor utilization!
  # Recorded: real 19m27.407s (worse!)
  time strip_fields "$input" | parallel --pipe -m awksub2 | awksub3 | sort

  # Move parallel to the aggregation step; the trailing awksub3 merges the
  # per-chunk partial sums.
  # Recorded: real 19m24.851s (same as the other parallel run)
  time strip_fields "$input" | awksub2 | parallel --pipe awksub3 | awksub3 | sort
}

main "$@"