@twolfe18 · created March 14, 2013
A hack I've been considering for compressing log files: sort them first.
time gzip -c <orig >orig.gz
real 0m0.311s
user 0m0.289s
sys 0m0.005s
time bzip2 -c <orig >orig.bz2
real 0m3.358s
user 0m2.981s
sys 0m0.367s
time sort orig | gzip -c >orig.sorted.gz
real 0m2.358s
user 0m2.345s
sys 0m0.047s
time sort orig | bzip2 -c >orig.sorted.bz2
real 0m6.839s
user 0m6.823s
sys 0m0.046s
du -sh orig*
4.3M orig
1.4M orig.gz
1010K orig.bz2
667K orig.sorted.gz
596K orig.sorted.bz2
# to reconstruct the file, we need a permutation
# that tells us how to get back the unsorted log file.
# in this case there aren't many lines, but even a compact
# index (2^18 = 262,144 > 201,100, so 18 bits per line)
# comes out to ~450K, which basically wipes out the benefit!
wc -l orig
201100 orig
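# a minimal sketch of round-tripping without a separate index
# (hypothetical filenames): tag each line with its line number,
# sort by content, then sort back on the tag to reconstruct.
nl -ba orig | sort -k2 | gzip -c >orig.tagged.gz
zcat orig.tagged.gz | sort -k1,1n | cut -f2- >orig.restored
cmp orig orig.restored   # no output means the round trip worked
# the tags ride along inside the compressed file, so they cost
# roughly the index overhead estimated above.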
# how about a bigger file:
wc -l big
48244770 big
# 2^26 = 67,108,864 > 48,244,770
# 26 * 48244770 / 8 ≈ 150M
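# quick sanity check of that arithmetic (hypothetical awk one-liner):
awk 'BEGIN { n = 48244770; b = 1;
             while (2^b < n) b++;            # bits needed to index n lines
             printf "%d bits/line, %.0fM\n", b, b * n / 8 / 2^20 }'
# prints: 26 bits/line, 150M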
time sort -o big.sorted -S 64M big
# note that sorting takes by far the longest,
# except for maybe bzip2 -9
du -sh big*
8.7G big
817M big.bz2
1.1G big.gz
1.1G big.gz9
8.7G big.sorted
722M big.sorted.bz2
911M big.sorted.gz
894M big.sorted.gz9
# big.sorted.gz (911M) + 150M index ≈ 1061M ≈ 1.1G = big.gz
# ... tl;dr we didn't save any space and we wasted a lot of time
# TODO look at better ways of compressing similar lines:
# e.g. cluster the input lines, then delta-encode each line
# against its cluster centroid (see the sketch below).
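# a simpler cousin of that idea is front coding: after sorting,
# store only each line's common-prefix length with the previous
# line plus the differing suffix (hypothetical sketch; assumes
# the log lines contain no tabs):
sort orig | awk '{
  m = length(prev) < length($0) ? length(prev) : length($0)
  p = 0
  while (p < m && substr(prev, p+1, 1) == substr($0, p+1, 1)) p++
  print p "\t" substr($0, p+1)   # prefix length, then suffix
  prev = $0
}' | gzip -c >orig.frontcoded.gz
# decoding reverses this: keep the first p chars of the
# previously decoded line, then append the stored suffix.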