Results for long list is long various sorting demonstrations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Results for long list is long various sorting demonstrations. | |
## Summary | |
## https://www.perlmonks.org/?node_id=11150872 | |
## https://www.perlmonks.org/?node_id=11149907 | |
## https://www.perlmonks.org/?node_id=11150293 | |
A) Preparation | |
B) 276 files (3 * 92), fixed string length = 12 | |
llil4vec: 0m18.918s 27.8 GB std::vector | |
llil4map: 0m21.048s 8.9 GB phmap::parallel_flat_hash_map | |
llil4hmap: 0m19.302s 8.4 GB phmap::flat_hash_map | |
llil4emh: 0m16.908s 9.5 GB emhash7::HashMap | |
C) 552 files (6 * 92), fixed string length = 12 | |
llil4vec: 0m32.568s 53.9 GB | |
llil4map: 0m35.329s 8.8 GB | |
llil4hmap: 0m31.339s 8.3 GB | |
llil4emh: 0m26.155s 9.4 GB | |
D) 276 files (3 * 92), fixed string length = 20 | |
llil4vec: 0m31.066s 55.2 GB | |
llil4map: 0m22.718s 12.4 GB | |
llil4hmap: 0m21.192s 11.9 GB | |
llil4emh: 0m18.786s 13.4 GB | |
E) 552 files (6 * 92), fixed string length = 20 | |
llil4vec: cannot run, exceeds memory capacity > 62 GB | |
llil4map: 0m37.205s 12.5 GB | |
llil4hmap: 0m33.268s 11.8 GB | |
llil4emh: 0m28.283s 13.5 GB | |
F) 552 files (6 * 92), fixed string length = 30 | |
llil4vec: cannot run, exceeds memory capacity > 62 GB | |
llil4map: 0m38.096s 18.3 GB | |
llil4hmap: 0m34.185s 17.6 GB | |
llil4emh: 0m30.457s 18.9 GB | |
G) 1104 files (12 * 92), fixed string length = 30 | |
llil4vec: cannot run, exceeds memory capacity > 62 GB | |
llil4map: 1m06.543s 18.4 GB | |
llil4hmap: 0m58.355s 17.5 GB | |
llil4emh: 0m50.414s 18.8 GB | |
H) 1104 files (12 * 92), MAX_STR_LEN_L undefined | |
llil4vec: cannot run, exceeds memory capacity > 62 GB | |
llil4map: 1m09.710s 21.3 GB | |
llil4hmap: 1m01.599s 20.0 GB | |
llil4emh: 0m53.278s 20.4 GB | |
I) 1104 files (12 * 92), Unix sort command | |
GNU parallel parsort, mcesort, and tally-count | |
parsort: 4m25.392s GNU Parallel | |
mcesort: 3m39.777s MCE variant | |
J) 1104 files (12 * 92), Unix sort command | |
mcesort with --tally="tallycmd [options]" | |
mcesort: 1m48.626s MCE variant | |
for comparison, llil4map ran faster in 1m09.710s | |
####################################################################### | |
# A) Preparation | |
# This requires a data partition with ample storage ~ 60 GB. | |
# Obtain gen-llil.pl, shuffle.pl, and llil4 cmds from PerlMonks. | |
# https://perlmonks.org/?node_id=11149907 | |
# | |
####################################################################### | |
cp gen-llil.pl shuffle.pl /data/. | |
cd /data | |
# 26 random files | |
for n in $(perl -le "print for 'aa'..'az'"); do | |
perl gen-llil.pl big$n 200 3 1 | |
perl shuffle.pl big$n >1; mv 1 big$n | |
done & | |
# 26 random files | |
for n in $(perl -le "print for 'ba'..'bz'"); do | |
perl gen-llil.pl big$n 200 3 1 | |
perl shuffle.pl big$n >2; mv 2 big$n | |
done & | |
# 26 random files | |
for n in $(perl -le "print for 'ca'..'cz'"); do | |
perl gen-llil.pl big$n 200 3 1 | |
perl shuffle.pl big$n >3; mv 3 big$n | |
done & | |
# 14 random files (total 92 files) | |
for n in $(perl -le "print for 'da'..'dn'"); do | |
perl gen-llil.pl big$n 200 3 1 | |
perl shuffle.pl big$n >4; mv 4 big$n | |
done & | |
wait | |
# llil4vec: https://www.perlmonks.org/?node_id=11149545 | |
# llil4map: https://www.perlmonks.org/?node_id=11149643 | |
# llil4hmap: https://gist.github.com/marioroy/3924c48e140f8330f25f67cd98a815ef | |
# llil4emh: https://gist.github.com/marioroy/7a69a422d88c0314f215e612ba607d2a | |
ln -s /path/to/llil4vec . | |
ln -s /path/to/llil4map . | |
ln -s /path/to/llil4hmap . | |
ln -s /path/to/llil4emh . | |
####################################################################### | |
# B) 276 files (3 * 92), fixed string length = 12 | |
# | |
####################################################################### | |
# Memory 27.8 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4vec big* big* big* | cksum | |
llil4vec (fixed string length=12) start | |
use OpenMP | |
use boost sort | |
get properties 5.823 secs | |
sort properties 6.496 secs | |
vector reduce 1.652 secs | |
vector stable sort 1.150 secs | |
write stdout 3.794 secs | |
total time 18.918 secs | |
2057246516 1811140689 | |
# Memory 8.9 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4map big* big* big* | cksum | |
llil4map (fixed string length=12) start | |
use OpenMP | |
use boost sort | |
get properties 15.404 secs | |
phmap to vector 0.514 secs | |
vector stable sort 1.283 secs | |
write stdout 3.846 secs | |
total time 21.048 secs | |
2057246516 1811140689 | |
# Memory 8.4 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4hmap big* big* big* | cksum | |
llil4hmap (fixed string length=12) start | |
use OpenMP | |
use boost sort | |
get properties 13.738 secs | |
hmap to vector 0.446 secs | |
vector stable sort 1.276 secs | |
write stdout 3.840 secs | |
total time 19.302 secs | |
2057246516 1811140689 | |
# Memory 9.5 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4emh big* big* big* | cksum | |
llil4emh (fixed string length=12) start | |
use OpenMP | |
use boost sort | |
get properties 11.297 secs | |
emhash to vector 0.422 secs | |
vector stable sort 1.263 secs | |
write stdout 3.835 secs | |
total time 16.908 secs | |
2057246516 1811140689 | |
####################################################################### | |
# C) 552 files (6 * 92), fixed string length = 12 | |
# | |
####################################################################### | |
# Memory 53.9 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4vec \ | |
big* big* big* big* big* big* | cksum | |
llil4vec (fixed string length=12) start | |
use OpenMP | |
use boost sort | |
get properties 11.963 secs | |
sort properties 13.466 secs | |
vector reduce 2.273 secs | |
vector stable sort 1.133 secs | |
write stdout 3.731 secs | |
total time 32.568 secs | |
2511908988 1891299111 | |
# Memory 8.8 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4map \ | |
big* big* big* big* big* big* | cksum | |
llil4map (fixed string length=12) start | |
use OpenMP | |
use boost sort | |
get properties 29.869 secs | |
phmap to vector 0.501 secs | |
vector stable sort 1.260 secs | |
write stdout 3.697 secs | |
total time 35.329 secs | |
2511908988 1891299111 | |
# Memory 8.3 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4hmap \ | |
big* big* big* big* big* big* | cksum | |
llil4hmap (fixed string length=12) start | |
use OpenMP | |
use boost sort | |
get properties 25.893 secs | |
hmap to vector 0.448 secs | |
vector stable sort 1.276 secs | |
write stdout 3.721 secs | |
total time 31.339 secs | |
2511908988 1891299111 | |
# Memory 9.4 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4emh \ | |
big* big* big* big* big* big* | cksum | |
llil4emh (fixed string length=12) start | |
use OpenMP | |
use boost sort | |
get properties 20.733 secs | |
emhash to vector 0.424 secs | |
vector stable sort 1.257 secs | |
write stdout 3.653 secs | |
total time 26.155 secs | |
2511908988 1891299111 | |
####################################################################### | |
# D) 276 files (3 * 92), fixed string length = 20 | |
# | |
####################################################################### | |
# Memory 55.2 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4vec big* big* big* | cksum | |
llil4vec (fixed string length=20) start | |
use OpenMP | |
use boost sort | |
get properties 8.902 secs | |
sort properties 13.090 secs | |
vector reduce 3.061 secs | |
vector stable sort 2.247 secs | |
write stdout 3.765 secs | |
total time 31.066 secs | |
2057246516 1811140689 | |
# Memory 12.4 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4map big* big* big* | cksum | |
llil4map (fixed string length=20) start | |
use OpenMP | |
use boost sort | |
get properties 15.534 secs | |
phmap to vector 0.659 secs | |
vector stable sort 2.496 secs | |
write stdout 4.029 secs | |
total time 22.718 secs | |
2057246516 1811140689 | |
# Memory 11.9 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4hmap big* big* big* | cksum | |
llil4hmap (fixed string length=20) start | |
use OpenMP | |
use boost sort | |
get properties 14.084 secs | |
hmap to vector 0.594 secs | |
vector stable sort 2.487 secs | |
write stdout 4.026 secs | |
total time 21.192 secs | |
2057246516 1811140689 | |
# Memory 13.4 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4emh big* big* big* | cksum | |
llil4emh (fixed string length=20) start | |
use OpenMP | |
use boost sort | |
get properties 11.758 secs | |
emhash to vector 0.592 secs | |
vector stable sort 2.473 secs | |
write stdout 3.877 secs | |
total time 18.786 secs | |
2057246516 1811140689 | |
####################################################################### | |
# E) 552 files (6 * 92), fixed string length = 20 | |
# | |
####################################################################### | |
# llil4vec (cannot run, exceeds memory capacity > 62 GB) | |
# Memory 12.5 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4map \ | |
big* big* big* big* big* big* | cksum | |
llil4map (fixed string length=20) start | |
use OpenMP | |
use boost sort | |
get properties 30.132 secs | |
phmap to vector 0.659 secs | |
vector stable sort 2.509 secs | |
write stdout 3.904 secs | |
total time 37.205 secs | |
2511908988 1891299111 | |
# Memory 11.8 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4hmap \ | |
big* big* big* big* big* big* | cksum | |
llil4hmap (fixed string length=20) start | |
use OpenMP | |
use boost sort | |
get properties 26.330 secs | |
hmap to vector 0.618 secs | |
vector stable sort 2.469 secs | |
write stdout 3.850 secs | |
total time 33.268 secs | |
2511908988 1891299111 | |
# Memory 13.5 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4emh \ | |
big* big* big* big* big* big* | cksum | |
llil4emh (fixed string length=20) start | |
use OpenMP | |
use boost sort | |
get properties 21.275 secs | |
emhash to vector 0.588 secs | |
vector stable sort 2.474 secs | |
write stdout 3.863 secs | |
total time 28.283 secs | |
2511908988 1891299111 | |
####################################################################### | |
# F) 552 files (6 * 92), fixed string length = 30 | |
# | |
####################################################################### | |
# llil4vec (cannot run, exceeds memory capacity > 62 GB) | |
# Memory 18.3 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4map \ | |
big* big* big* big* big* big* | cksum | |
llil4map (fixed string length=30) start | |
use OpenMP | |
use boost sort | |
get properties 30.188 secs | |
phmap to vector 1.035 secs | |
vector stable sort 2.989 secs | |
write stdout 3.883 secs | |
total time 38.096 secs | |
2511908988 1891299111 | |
# Memory 17.6 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4hmap \ | |
big* big* big* big* big* big* | cksum | |
llil4hmap (fixed string length=30) start | |
use OpenMP | |
use boost sort | |
get properties 26.372 secs | |
hmap to vector 0.968 secs | |
vector stable sort 2.962 secs | |
write stdout 3.881 secs | |
total time 34.185 secs | |
2511908988 1891299111 | |
# Memory 18.9 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4emh \ | |
big* big* big* big* big* big* | cksum | |
llil4emh (fixed string length=30) start | |
use OpenMP | |
use boost sort | |
get properties 22.495 secs | |
emhash to vector 0.963 secs | |
vector stable sort 3.029 secs | |
write stdout 3.879 secs | |
total time 30.457 secs | |
2511908988 1891299111 | |
####################################################################### | |
# G) 1104 files (12 * 92), fixed string length = 30 | |
# | |
####################################################################### | |
# llil4vec (cannot run, exceeds memory capacity > 62 GB) | |
# Memory 18.4 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4map \ | |
big* big* big* big* big* big* \ | |
big* big* big* big* big* big* | cksum | |
llil4map (fixed string length=30) start | |
use OpenMP | |
use boost sort | |
get properties 58.765 secs | |
phmap to vector 1.043 secs | |
vector stable sort 2.977 secs | |
write stdout 3.756 secs | |
total time 66.543 secs | |
2652563698 2004830900 | |
# Memory 17.5 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4hmap \ | |
big* big* big* big* big* big* \ | |
big* big* big* big* big* big* | cksum | |
llil4hmap (fixed string length=30) start | |
use OpenMP | |
use boost sort | |
get properties 50.659 secs | |
hmap to vector 0.973 secs | |
vector stable sort 2.954 secs | |
write stdout 3.768 secs | |
total time 58.355 secs | |
2652563698 2004830900 | |
# Memory 18.8 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4emh \ | |
big* big* big* big* big* big* \ | |
big* big* big* big* big* big* | cksum | |
llil4emh (fixed string length=30) start | |
use OpenMP | |
use boost sort | |
get properties 42.569 secs | |
emhash to vector 0.956 secs | |
vector stable sort 3.013 secs | |
write stdout 3.785 secs | |
total time 50.414 secs | |
2652563698 2004830900 | |
####################################################################### | |
# H) 1104 files (12 * 92), MAX_STR_LEN_L undefined | |
# | |
####################################################################### | |
# llil4vec (cannot run, exceeds memory capacity > 62 GB) | |
# Memory 21.3 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4map \ | |
big* big* big* big* big* big* \ | |
big* big* big* big* big* big* | cksum | |
llil4map start | |
use OpenMP | |
use boost sort | |
get properties 59.725 secs | |
phmap to vector 2.209 secs | |
vector stable sort 3.723 secs | |
write stdout 4.051 secs | |
total time 69.710 secs | |
2652563698 2004830900 | |
# Memory 20.0 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4hmap \ | |
big* big* big* big* big* big* \ | |
big* big* big* big* big* big* | cksum | |
llil4hmap start | |
use OpenMP | |
use boost sort | |
get properties 52.307 secs | |
hmap to vector 1.533 secs | |
vector stable sort 3.710 secs | |
write stdout 4.047 secs | |
total time 61.599 secs | |
2652563698 2004830900 | |
# Memory 20.4 GB | |
$ NUM_THREADS=24 taskset -c 0-31 ./llil4emh \ | |
big* big* big* big* big* big* \ | |
big* big* big* big* big* big* | cksum | |
llil4emh start | |
use OpenMP | |
use boost sort | |
get properties 43.731 secs | |
emhash to vector 1.601 secs | |
vector stable sort 3.734 secs | |
write stdout 4.121 secs | |
total time 53.278 secs | |
2652563698 2004830900 | |
####################################################################### | |
# I) 1104 files (12 * 92), Unix sort command | |
# GNU parallel parsort, mcesort, and tally-count | |
# https://gist.github.com/marioroy/d30a3408474612dc1d289acdc6fbf19a | |
# https://perlmonks.org/?node_id=11150254 | |
# | |
####################################################################### | |
mkdir -p /data/tmp | |
cd /data | |
ln -s /path/to/parsort . | |
ln -s /path/to/mcesort . | |
ln -s /path/to/tally-count . | |
# parsort | |
# cat files into parsort to prevent it spawning 1104 * 2 processes | |
time \ | |
cat big* big* big* big* big* big* big* big* big* big* big* big* | \ | |
LC_ALL=C ./parsort --parallel=32 -T /data/tmp | ./tally-count | \ | |
LC_ALL=C ./parsort --parallel=32 -T /data/tmp -k2nr | cksum | |
2652563698 2004830900 | |
real 4m25.392s | |
user 4m24.118s | |
sys 0m51.291s | |
# mcesort does not spawn more workers than requested, | |
# but running similarly for comparison. | |
time \ | |
cat big* big* big* big* big* big* big* big* big* big* big* big* | \ | |
LC_ALL=C ./mcesort --parallel=32 -T /data/tmp | ./tally-count | \ | |
LC_ALL=C ./mcesort --parallel=32 -T /data/tmp -k2nr | cksum | |
2652563698 2004830900 | |
real 3m39.777s | |
user 42m14.480s | |
sys 3m15.780s | |
####################################################################### | |
# J) 1104 files (12 * 92), Unix sort command | |
# mcesort with -A ; same as LC_ALL=C | |
# with -j32 ; same as --parallel=32 | |
# with --tally="tallycmd [options]" | |
# | |
####################################################################### | |
time \ | |
./mcesort -A -j32 -T /data/tmp --tally="/data/tally-count" \ | |
big* big* big* big* big* big* big* big* big* big* big* big* | \ | |
./mcesort -A -j32 -T /data/tmp -k2nr | cksum | |
2652563698 2004830900 | |
real 1m48.626s | |
user 38m35.256s | |
sys 2m23.022s |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment