Skip to content

Instantly share code, notes, and snippets.

@FrancescAlted
Created August 27, 2022 08:26
Show Gist options
  • Save FrancescAlted/e4d186404f4c87d9620cb6f89a03ba0d to your computer and use it in GitHub Desktop.
Save FrancescAlted/e4d186404f4c87d9620cb6f89a03ba0d to your computer and use it in GitHub Desktop.
Benchmark comparing npy, npz, jdb and blosc2 storage formats
# Benchmark comparing npy, npz, jdb and blosc2 storage formats
import sys
import numpy as np
import jdata as jd
import blosc2
from time import time
# Problem size: an N x N identity matrix cut into `nsplits` horizontal slabs.
N = 10_000
nsplits = 50
# Bigger array
# N = 20_000
# nsplits = 200
nchunks = N // nsplits  # number of rows held by each slab

# Build the matrix and its row-wise splits, timing both together.
start = time()
x = np.eye(N)
y = np.vsplit(x, nsplits)  # split into smaller chunks
elapsed = time() - start
throughput = N * N * 8 / (elapsed * 2**30)
print(f"time for creating big array (and splits): {elapsed:.3f}s ({throughput:.3g} GB/s)")
# Do not save unless we are passing a parameter: any extra command-line
# argument enables the (re)writing of the data files below.
save = len(sys.argv) > 1
if save:
    print("\n** Saving data **")

    # --- npy: np.save receives the list of splits ---
    t0 = time()
    np.save('eye5chunk.npy', y)
    t = time() - t0
    print(f"time for saving with npy: {t:.3f}s"
          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

    # --- np.memmap: write the full array through a memory-mapped file ---
    t0 = time()
    fp = np.memmap('eye5chunk-memmap.npy', dtype=x.dtype, mode='w+', shape=x.shape)
    fp[:] = x
    fp.flush()
    # Release the write mapping here. Otherwise it stays alive until `fp` is
    # rebound in the load phase, so its teardown would be charged to the
    # memmap *read* timing and two full-size mappings would briefly coexist.
    del fp
    t = time() - t0
    print(f"time for saving with np.memmap: {t:.3f}s"
          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

    # --- npz (compressed) ---
    t0 = time()
    np.savez_compressed('eye5chunk.npz', y)
    t = time() - t0
    print(f"time for saving with npz: {t:.3f}s"
          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

    # t0 = time()
    # jd.save(y, 'eye5chunk_bjd_raw.jdb') # save as uncompressed bjd
    # t = time() - t0
    # print(f"time for saving with jdb (raw): {t:.3f}s"
    # f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

    # --- jdb with zlib compression ---
    t0 = time()
    jd.save(y, 'eye5chunk_bjd_zlib.jdb', {'compression':'zlib'})  # zlib-compressed bjd
    t = time() - t0
    print(f"time for saving with jdb (zlib): {t:.3f}s"
          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

    # --- jdb with lzma compression ---
    t0 = time()
    jd.save(y, 'eye5chunk_bjd_lzma.jdb', {'compression':'lzma'})  # lzma-compressed bjd
    t = time() - t0
    print(f"time for saving with jdb (lzma): {t:.3f}s"
          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

    # --- blosc2 (BLOSCLZ codec): append one split per chunk ---
    t0 = time()
    chunksize = y[0].size * y[0].itemsize  # bytes in one split
    # NOTE(review): newer python-blosc2 releases may expect the key "codec"
    # instead of "compcode" -- confirm against the installed version.
    cparams = {"compcode": blosc2.Codec.BLOSCLZ, "typesize": 8, "nthreads": 8}
    storage = {"contiguous": True, "urlpath": "eye5_blosc2_blosclz.b2frame"}
    schunk = blosc2.SChunk(chunksize=chunksize, mode="w", cparams=cparams, **storage)
    for z in y:
        schunk.append_data(z)
    # The next is equivalent to the loop above
    # schunk = blosc2.SChunk(data=x, mode="w", chunksize=chunksize, cparams=cparams, **storage)
    t = time() - t0
    print(f"time for saving with blosc2 (blosclz): {t:.3f}s"
          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

    # --- blosc2 (ZSTD codec): hand the whole array to the constructor ---
    t0 = time()
    chunksize = y[0].size * y[0].itemsize
    cparams = {"compcode": blosc2.Codec.ZSTD, "typesize": 8, "nthreads": 8}
    storage = {"contiguous": True, "urlpath": "eye5_blosc2_zstd.b2frame"}
    schunk = blosc2.SChunk(data=x, mode="w", chunksize=chunksize, cparams=cparams, **storage)
    t = time() - t0
    print(f"time for saving with blosc2 (zstd): {t:.3f}s"
          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")
# Load phase: for every storage format, read the data back and reduce it with
# a sum, timing the load and the reduction together. The files are expected to
# exist already (written by the save phase of this or an earlier run).
print("\n** Load and operate **")

# --- baseline: reduce the in-memory splits, no file access ---
t0 = time()
total = 0
for a in y:
    total += a.sum()
t = time() - t0
print(f"time for reducing with plain numpy (memory): {t:.3f}s"
      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# --- npy, fully loaded by np.load ---
t0 = time()
total = 0
for a in np.load('eye5chunk.npy'):
    total += a.sum()
t = time() - t0
print(f"time for reducing with npy (np.load, no mmap): {t:.3f}s"
      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# t0 = time()
# total = 0
# for a in np.load('eye5chunk.npy', mmap_mode='r'):
# total += a.sum()
# t = time() - t0
# print(f"time for reducing with npy (np.load, mmap_mode): {t:.3f}s"
# f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# --- np.memmap opened read-only ---
t0 = time()
total = 0
fp = np.memmap('eye5chunk-memmap.npy', dtype=x.dtype, mode='r', shape=x.shape)
total += fp[:].sum()
t = time() - t0
print(f"time for reducing with np.memmap: {t:.3f}s"
      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# --- npz (compressed) ---
t0 = time()
total = 0
# np.load returns an NpzFile for .npz archives; the context manager closes
# the underlying file instead of leaking its descriptor.
with np.load('eye5chunk.npz') as npz:
    for a in npz.values():
        total += a.sum()
t = time() - t0
print(f"time for reducing with npz: {t:.3f}s"
      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# t0 = time()
# total = 0
# for a in jd.load('eye5chunk_bjd_raw.jdb'):
# total += a.sum()
# t = time() - t0
# print(f"time for reducing with jdb (raw): {t:.3f}s"
# f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# --- jdb (zlib) ---
t0 = time()
total = 0
for a in jd.load('eye5chunk_bjd_zlib.jdb'):
    total += a.sum()
t = time() - t0
print(f"time for reducing with jdb (zlib): {t:.3f}s"
      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# --- jdb (lzma) ---
t0 = time()
total = 0
for a in jd.load('eye5chunk_bjd_lzma.jdb'):
    total += a.sum()
t = time() - t0
print(f"time for reducing with jdb (lzma): {t:.3f}s"
      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# --- blosc2 (blosclz): decompress chunk by chunk into one reusable buffer ---
t0 = time()
schunk = blosc2.open("eye5_blosc2_blosclz.b2frame")
c = np.empty((nchunks, N), dtype=np.float64)  # holds exactly one split
total = 0
for nchunk in range(nsplits):
    schunk.decompress_chunk(nchunk, c)
    total += c.sum()
t = time() - t0
print(f"time for reducing with blosc2 (blosclz): {t:.3f}s"
      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# --- blosc2 (zstd): same procedure on the zstd frame ---
t0 = time()
schunk = blosc2.open("eye5_blosc2_zstd.b2frame")
c = np.empty((nchunks, N), dtype=np.float64)
total = 0
for nchunk in range(nsplits):
    schunk.decompress_chunk(nchunk, c)
    total += c.sum()
t = time() - t0
print(f"time for reducing with blosc2 (zstd): {t:.3f}s"
      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# Only the total of the last benchmark is reported.
print("Total sum:", total)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment