Skip to content

Instantly share code, notes, and snippets.

@braingram
Last active April 28, 2023 21:43
Show Gist options
  • Save braingram/ba388b166bc08b2c42ce1abee317f1b0 to your computer and use it in GitHub Desktop.
Save braingram/ba388b166bc08b2c42ce1abee317f1b0 to your computer and use it in GitHub Desktop.
Performance of memmapping vs chunking (zarr)
import os
import shutil
import time
import numpy
import zarr
# Number of timed repetitions of each accessor on each array.
n_times = 10
# Shape of the test image. The last axis is the one the chunk-shape comments
# and accessor labels call "z". At one byte per element this is
# 4000 * 4000 * 50 = 800,000,000 bytes.
image_size = (4000, 4000, 50)
# Element type of the test data.
dtype = 'uint8'
# Path of the raw binary file that backs the memmap.
mm_fn = 'test_memmap.bin'
# Directory that holds the zarr store.
zarr_dir = 'test_zarr'
def setup_memmap():
    """Write the zero-filled backing file and return a memmap over it.

    The file at ``mm_fn`` is (re)created with ``image_size`` elements of
    ``dtype``, all zero, stored as raw bytes with no header.

    Returns:
        A ``numpy.memmap`` of shape ``image_size`` over ``mm_fn``, opened
        with numpy's default mode.
    """
    arr = numpy.zeros(image_size, dtype)
    # tofile() writes the array buffer straight to disk; tobytes() would
    # first build a second full-size copy of the image in memory.
    arr.tofile(mm_fn)
    # Use the module-level dtype so the mapping always matches what was
    # written, even if the constant is changed.
    return numpy.memmap(mm_fn, shape=image_size, dtype=dtype)
def re_memmap():
    """Return a fresh memmap over the existing backing file.

    The file at ``mm_fn`` must already exist (see ``setup_memmap``).

    Returns:
        A new ``numpy.memmap`` of shape ``image_size`` and element type
        ``dtype`` over ``mm_fn``, opened with numpy's default mode.
    """
    # Use the module-level dtype rather than a literal so the mapping stays
    # consistent with how the file was generated.
    return numpy.memmap(mm_fn, shape=image_size, dtype=dtype)
def setup_zarr(chunk_shape):
    """Build a zero-filled zarr array on disk with the given chunking.

    Any existing directory at ``zarr_dir`` is removed first, so every call
    starts from an empty store.

    Args:
        chunk_shape: Chunk shape handed to ``zarr.open`` as ``chunks``.

    Returns:
        The opened zarr array of shape ``image_size`` and type ``dtype``.
    """
    store_exists = os.path.exists(zarr_dir)
    if store_exists:
        shutil.rmtree(zarr_dir)
    array = zarr.open(
        zarr_dir,
        mode='w',
        shape=image_size,
        chunks=chunk_shape,
        dtype=dtype,
    )
    # NOTE(review): depending on the zarr version and its
    # write_empty_chunks setting, chunks equal to the fill value may not be
    # stored at all - confirm the store really contains chunk files,
    # otherwise the reads being timed never touch the disk.
    array[:] = 0
    return array
# Chunk layouts to benchmark, one zarr store per entry.
chunk_shapes = [
    # a single chunk spanning the entire image
    image_size,
    # split along the first two axes, full depth in z
    [1000, 1000, 50],
    # split along z only
    [4000, 4000, 10],
    # split along all three axes
    [1000, 1000, 10],
    # many very small chunks, one z-plane deep
    [300, 300, 1],
]
# Read patterns to benchmark. Each entry is (selector, label): the selector
# takes an array and returns the selection to read, and the label is the
# text printed next to the timings.
accessors = [
    # full read
    (lambda arr: arr[:], "whole array"),
    # 1-D lines along each axis
    (lambda arr: arr[:, 0, 0], "first dimension slice"),
    (lambda arr: arr[0, :, 0], "second dimension slice"),
    (lambda arr: arr[0, 0, :], "third dimension slice"),
    # 30x30 cutouts
    (lambda arr: arr[10:40, 20:50, 0], "cutout for 1 z"),
    (lambda arr: arr[:30, :30, 0], "cutout aligned to one chunk"),
    (lambda arr: arr[10:40, 20:50, :], "chutout for all z"),
    # 300x300 cutouts
    (lambda arr: arr[100:400, 200:500, 0], "larger cutout for 1 z"),
    (lambda arr: arr[:300, :300, 0], "larger cutout aligned to one chunk"),
    (lambda arr: arr[100:400, 200:500, :], "larger cutout for all z"),
]
def time_it(arr, accessor, n_times, mean=True):
    """Time reading and summing a selection of ``arr``.

    Args:
        arr: Array-like object handed to ``accessor``.
        accessor: Callable that takes ``arr`` and returns the selection to
            read.
        n_times: Number of timed repetitions.
        mean: If true (the default) return the mean duration; otherwise
            return the individual durations.

    Returns:
        The mean elapsed time in seconds as computed by ``numpy.mean``, or,
        when ``mean`` is false, a list of ``n_times`` elapsed times in
        seconds. With ``n_times == 0`` the list is empty and the mean is
        NaN.
    """
    durations = []
    for _ in range(n_times):
        # perf_counter is the highest-resolution clock available; the
        # monotonic clock can be too coarse for sub-millisecond reads.
        start = time.perf_counter()
        # Summing forces the selected data to actually be read rather than
        # merely creating a lazy view; the value itself is not needed.
        numpy.sum(accessor(arr))
        durations.append(time.perf_counter() - start)
    if not mean:
        return durations
    return numpy.mean(durations)
if __name__ == '__main__':
    print(f"Running each test {n_times} times")
    print(f"Testing image shape: {image_size}")
    # Called to write the backing file; the returned memmap is replaced by
    # a fresh one before every measurement below.
    mm = setup_memmap()
    for chunk_shape in chunk_shapes:
        print(f"\tchunk shape: {chunk_shape}")
        zarr_array = setup_zarr(chunk_shape)
        for accessor, label in accessors:
            # Re-open the memmap for each accessor so every comparison
            # starts from a new mapping object.
            mm = re_memmap()
            zarr_time = time_it(zarr_array, accessor, n_times)
            memmap_time = time_it(mm, accessor, n_times)
            if memmap_time < zarr_time:
                winner = 'memmap'
            else:
                winner = 'chunking'
            print(f"\t\t{winner} win! {label}: chunking={zarr_time}, memmap={memmap_time}")
@braingram
Copy link
Author

> Is the mmap accessor actually loading any data into memory?

Good point! I updated it to sum the array (and updated the results above).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment