# soaxelbrooke/parallel_word_frequency_count.sh

## parallel_word_frequency_count.sh
# Count word frequencies in a large text file: split it into chunks, count
# each chunk in parallel with `wf`, then merge the per-chunk counts.
#
# Usage:    parallel_word_frequency_count.sh INPUT_FILE
# Output:   wfs.txt in the current directory, one "word count" pair per line,
#           sorted by descending count.
# Requires: GNU parallel, python with tqdm, and wf - install with
#           `cargo install wf`.

if [ "$#" -lt 1 ]; then
  echo "Usage: $0 INPUT_FILE" >&2
  exit 2
fi

# Start from empty scratch directories: chunks left behind by an interrupted
# run would otherwise be counted again and inflate the totals.
rm -rf -- splits wfs
mkdir splits wfs || exit 1
# Remove the scratch directories if any later step makes the script exit.
trap 'rm -rf -- splits wfs' EXIT

echo 'Splitting file into parts...'
# Quote the input path so names containing spaces or glob characters work.
split -a 5 -l 200000 -- "$1" splits/split || exit 1

# Feed NUL-delimited chunk paths to parallel instead of parsing `ls`.
# {} is the chunk path and {/} its basename, so the per-chunk output names
# stay "<chunk>_wf.txt". parallel exits non-zero if any job failed.
find splits -type f -print0 \
  | parallel -0 'echo "Counting {/}..."; wf < {} > wfs/{/}_wf.txt' \
  || exit 1

echo 'Combining split counts...'
# Merge the per-chunk counts. Each input line is parsed as "word count";
# any fields after the second are ignored.
python - <<'PY' || exit 1
from collections import Counter
from glob import glob

from tqdm import tqdm

totals = Counter()
# Sorted so that the order of equal-count words is reproducible.
for fpath in tqdm(sorted(glob("wfs/*"))):
    with open(fpath) as chunk:
        for line in chunk:
            fields = line.split()
            if len(fields) < 2:
                continue  # blank or malformed line: nothing to count
            # Accumulate in place; adding whole Counters would copy the
            # full vocabulary once per chunk.
            totals[fields[0]] += int(fields[1])

with open("wfs.txt", "w") as out:
    for word, count in sorted(totals.items(), key=lambda p: -p[1]):
        out.write("{} {}\n".format(word, count))
PY

rm -rf -- splits wfs
trap - EXIT
echo 'Word frequencies written to wfs.txt.'
	# Count word frequencies in a large text file: split it into chunks, count
	# each chunk in parallel with `wf`, then merge the per-chunk counts.
	#
	# Usage:    parallel_word_frequency_count.sh INPUT_FILE
	# Output:   wfs.txt in the current directory, one "word count" pair per
	#           line, sorted by descending count.
	# Requires: GNU parallel, python with tqdm, and wf - install with
	#           `cargo install wf`.

	if [ "$#" -lt 1 ]; then
		echo "Usage: $0 INPUT_FILE" >&2
		exit 2
	fi

	# Start from empty scratch directories: chunks left behind by an
	# interrupted run would otherwise be counted again and inflate the totals.
	rm -rf -- splits wfs
	mkdir splits wfs || exit 1
	# Remove the scratch directories if any later step makes the script exit.
	trap 'rm -rf -- splits wfs' EXIT

	echo 'Splitting file into parts...'
	# Quote the input path so names containing spaces or glob characters work.
	split -a 5 -l 200000 -- "$1" splits/split || exit 1

	# Real pipes here: an escaped `\|` is passed to the command as a literal
	# argument, so no chunk would ever be counted.
	# {} is the chunk path and {/} its basename, so the per-chunk output names
	# stay "<chunk>_wf.txt". parallel exits non-zero if any job failed.
	find splits -type f -print0 \
		| parallel -0 'echo "Counting {/}..."; wf < {} > wfs/{/}_wf.txt' \
		|| exit 1

	echo 'Combining split counts...'
	# Merge the per-chunk counts. Each input line is parsed as "word count";
	# any fields after the second are ignored. `<<-` strips the leading tabs,
	# so the Python code below must indent with spaces after the tab.
	python - <<-'PY' || exit 1
	from collections import Counter
	from glob import glob

	from tqdm import tqdm

	totals = Counter()
	# Sorted so that the order of equal-count words is reproducible.
	for fpath in tqdm(sorted(glob("wfs/*"))):
	    with open(fpath) as chunk:
	        for line in chunk:
	            fields = line.split()
	            if len(fields) < 2:
	                continue  # blank or malformed line: nothing to count
	            # Accumulate in place; adding whole Counters would copy the
	            # full vocabulary once per chunk.
	            totals[fields[0]] += int(fields[1])

	with open("wfs.txt", "w") as out:
	    for word, count in sorted(totals.items(), key=lambda p: -p[1]):
	        out.write("{} {}\n".format(word, count))
	PY

	rm -rf -- splits wfs
	trap - EXIT
	echo 'Word frequencies written to wfs.txt.'