Last active
June 4, 2022 18:02
-
-
Save xflr6/49a7be702939b8467310c42fdc44643e to your computer and use it in GitHub Desktop.
Compare while-loop with break to for-loop with two-argument iter() for iterating over a large file in chunks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Compare four ways to hash a large file: three chunked-read loops and one mmap."""
import functools | |
import hashlib | |
import mmap | |
import os | |
import pathlib | |
import shutil | |
import time | |
import types | |
# Public names; the benchmark loop at the bottom of the file iterates
# over this list, so its order is also the order of the timing runs.
__all__ = [
    'sha256sum_while',
    'sha256sum_iter',
    'sha256sum_copyfileobj',
    'sha256sum_mmap',
]
# Sample file that is hashed by every implementation.
PATH = pathlib.Path('spam.bin')

# Size of the sample file in bytes (1024 ** 3).
SIZE = 1_073_741_824

# Size in bytes of each block written when the sample file is created
# (1024 ** 2).
CHUNK = 1_048_576

# Hex digest each implementation's result is compared against.
EXPECTED_RESULT = '49bc20df15e412a64472421e13fe86ff1c5165e18b2afccf160d4dc19fe68a14'
def sha256sum_while(path: os.PathLike | str, *, | |
bufsize: int = 32_768) -> str: | |
s = hashlib.sha256() | |
with open(path, 'rb') as f: | |
while True: | |
data = f.read(bufsize) | |
if not data: | |
break | |
s.update(data) | |
return s.hexdigest() | |
def sha256sum_iter(path: os.PathLike | str, *, | |
bufsize: int = 32_768) -> str: | |
s = hashlib.sha256() | |
with open(path, 'rb') as f: | |
for data in iter(functools.partial(f.read, bufsize), b''): | |
s.update(data) | |
return s.hexdigest() | |
def sha256sum_copyfileobj(path: os.PathLike | str, *, | |
bufsize: int = 32_768) -> str: | |
s = hashlib.sha256() | |
dest = types.SimpleNamespace(write=s.update) | |
with open(path, 'rb') as f: | |
shutil.copyfileobj(f, dest, length=bufsize) | |
return s.hexdigest() | |
def sha256sum_mmap(path: os.PathLike | str) -> str: | |
# poor performance under PY3? | |
s = hashlib.sha256() | |
with open(path, 'rb')as f, mmap.mmap(f.fileno(), 0, | |
access=mmap.ACCESS_READ) as data: | |
s.update(data) | |
return s.hexdigest() | |
# Create the sample file only when it is not there yet: SIZE // CHUNK
# blocks of CHUNK zero bytes each.
if not PATH.exists():
    with PATH.open('wb') as out:
        zero_block = bytes(CHUNK)
        block_count = SIZE // CHUNK
        for _ in range(block_count):
            out.write(zero_block)
# Run every function listed in __all__ once against the sample file and
# report whether its digest matches together with the elapsed time.
for func_name in __all__:
    print(func_name)
    # Look the function up by name in the module namespace; a plain dict
    # lookup is enough here and avoids evaluating a string as code.
    func = globals()[func_name]
    print(func)
    start = time.perf_counter_ns()
    result = func(PATH)
    # perf_counter_ns() counts nanoseconds; report seconds.
    duration = (time.perf_counter_ns() - start) / 1_000_000_000
    # Print the digest itself only when it does not match.
    print('result:', 'OK' if result == EXPECTED_RESULT else result)
    print('duration:', duration)
    print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment