Last active
August 29, 2018 07:15
-
-
Save d136o/c46153410a4f0a874a1cdd77d4476216 to your computer and use it in GitHub Desktop.
Comparing mmap performance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''Compares the time to scan through a large tar.gz file when it is memory mapped vs when it isn't.

In this example train_0.tar.gz is a 46 GB tar.gz file that decompresses to 47 GB. In other words, it's a file that
hasn't compressed much, in this case because it contains a bunch of already compressed files.

$ python3 ./mmap_test.py
seconds to execute w/o mmap: 134.61383328400552
seconds to execute mmpa: 43.660543900041375
'''
import tarfile | |
import timeit | |
import mmap | |
def iterate_over_tgz(fname):
    """Count the entries of a gzip-compressed tar archive opened by path.

    Args:
        fname: Path of the ``.tar.gz`` file to scan.

    Returns:
        The number of entries produced by iterating over the archive
        (0 for an archive with no members).
    """
    with tarfile.open(fname, 'r:gz') as archive:
        # Iterating the TarFile yields one item per archive member; only the
        # count matters here, so the items themselves are discarded.
        return sum(1 for _ in archive)
def iterate_over_tgz_mmap(fname):
    """Count the entries of a gzip-compressed tar archive read through mmap.

    The file is memory mapped read-only and the mapping is handed to
    ``tarfile`` as its file object, so the archive bytes are read from the
    mapping rather than through the regular file object.

    Args:
        fname: Path of the ``.tar.gz`` file to scan.

    Returns:
        The number of entries produced by iterating over the archive.

    Raises:
        ValueError: If the file is empty (``mmap`` refuses to map a
            zero-length file).
    """
    with open(fname, 'rb') as raw_f:
        # ACCESS_READ is the portable spelling of a read-only mapping
        # (prot=PROT_READ is only accepted on Unix). Using the mapping as a
        # context manager ensures it is closed once the scan finishes.
        with mmap.mmap(raw_f.fileno(), 0, access=mmap.ACCESS_READ) as mapped_f:
            with tarfile.open(fileobj=mapped_f, mode='r:gz') as tar_f:
                entry_count = 0
                # Iterate the TarFile, not the raw file handle: looping over
                # the raw handle would only count newline-delimited chunks of
                # the compressed bytes and never walk the archive at all.
                for _entry in tar_f:
                    entry_count += 1
    return entry_count
if __name__ == '__main__':
    # Benchmark driver: time one full scan of the archive with each strategy
    # and print the elapsed seconds.
    import sys

    # Archive to scan: optional first command-line argument, falling back to
    # the file name the script has always used.
    archive_path = sys.argv[1] if len(sys.argv) > 1 else 'train_0.tar.gz'

    benchmarks = (
        ('w/o mmap', iterate_over_tgz),
        ('mmap', iterate_over_tgz_mmap),
    )
    for label, scan in benchmarks:
        # number=1: a single pass over the archive is timed per strategy.
        elapsed = timeit.timeit(lambda: scan(archive_path), number=1)
        print('seconds to execute {}: {}'.format(label, elapsed))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment