Skip to content

Instantly share code, notes, and snippets.

@xflr6
Last active June 4, 2022 18:02
Show Gist options
  • Save xflr6/49a7be702939b8467310c42fdc44643e to your computer and use it in GitHub Desktop.
Save xflr6/49a7be702939b8467310c42fdc44643e to your computer and use it in GitHub Desktop.
Compare while-loop with break to for-loop with two-argument iter() for iterating over a large file in chunks
"""Compare four ways to compute the SHA-256 digest of a large file."""
import functools
import hashlib
import mmap
import os
import pathlib
import shutil
import time
import types
# Names of the benchmarked functions, in the order they are run.
__all__ = [
    'sha256sum_while',
    'sha256sum_iter',
    'sha256sum_copyfileobj',
    'sha256sum_mmap',
]

# File that every benchmarked function is asked to hash.
PATH = pathlib.Path('spam.bin')

# Total number of bytes in the benchmark file (1024 ** 3).
SIZE = 1_073_741_824

# Number of bytes per write when the benchmark file is generated (1024 ** 2).
CHUNK = 1_048_576

# Hex digest each function's result is compared against.
EXPECTED_RESULT = '49bc20df15e412a64472421e13fe86ff1c5165e18b2afccf160d4dc19fe68a14'
def sha256sum_while(path: os.PathLike | str, *,
bufsize: int = 32_768) -> str:
s = hashlib.sha256()
with open(path, 'rb') as f:
while True:
data = f.read(bufsize)
if not data:
break
s.update(data)
return s.hexdigest()
def sha256sum_iter(path: os.PathLike | str, *,
bufsize: int = 32_768) -> str:
s = hashlib.sha256()
with open(path, 'rb') as f:
for data in iter(functools.partial(f.read, bufsize), b''):
s.update(data)
return s.hexdigest()
def sha256sum_copyfileobj(path: os.PathLike | str, *,
bufsize: int = 32_768) -> str:
s = hashlib.sha256()
dest = types.SimpleNamespace(write=s.update)
with open(path, 'rb') as f:
shutil.copyfileobj(f, dest, length=bufsize)
return s.hexdigest()
def sha256sum_mmap(path: os.PathLike | str) -> str:
# poor performance under PY3?
s = hashlib.sha256()
with open(path, 'rb')as f, mmap.mmap(f.fileno(), 0,
access=mmap.ACCESS_READ) as data:
s.update(data)
return s.hexdigest()
# Generate the benchmark file on first run: SIZE // CHUNK zero-filled blocks
# of CHUNK bytes each. An existing file is left untouched.
if not PATH.exists():
    zero_block = b'\x00' * CHUNK
    n_blocks = SIZE // CHUNK
    with PATH.open('wb') as out:
        for _ in range(n_blocks):
            out.write(zero_block)
# Run every function listed in __all__ once against PATH, printing whether
# its digest matches EXPECTED_RESULT and how long the call took.
for func_name in __all__:
    print(func_name)
    # Resolve the name through the module namespace rather than eval():
    # a plain lookup cannot execute arbitrary expressions.
    func = globals()[func_name]
    print(func)
    start = time.perf_counter_ns()
    result = func(PATH)
    # perf_counter_ns() counts nanoseconds; report the elapsed time in seconds.
    duration = (time.perf_counter_ns() - start) / 1_000_000_000
    # Print the actual digest on mismatch so it can be inspected.
    print('result:', 'OK' if result == EXPECTED_RESULT else result)
    print('duration:', duration)
    print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment