# FrancescAlted/read-binary-data.py

## read-binary-data.py
# Benchmark comparing npy, npz, jdb and blosc2 storage formats

import sys

import numpy as np
import jdata as jd
import blosc2
from time import time

# Problem size: an N x N identity matrix, cut row-wise into `nsplits` pieces.
N, nsplits = 10_000, 50
# Bigger array
# N, nsplits = 20_000, 200

# Number of rows in each piece (integer division of N by nsplits)
nchunks = N // nsplits

# Time the creation of the full matrix together with its row-wise split.
t0 = time()
x = np.eye(N)
y = np.split(x, nsplits, axis=0)  # split into smaller chunks
t = time() - t0
print(f"time for creating big array (and splits): {t:.3f}s"
      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# Saving is opt-in: the data files are only (re)written when the script is
# given at least one command-line argument (its value is not inspected).
save = len(sys.argv) != 1
if save:
    print("\n** Saving data **")
    # Plain .npy via np.save
    t0 = time()
    np.save('eye5chunk.npy', y)
    t = time() - t0
    print(f"time for saving with npy: {t:.3f}s"
          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

    # Raw file written through a writable memory map of the full matrix
    t0 = time()
    fp = np.memmap('eye5chunk-memmap.npy', dtype=x.dtype, mode='w+', shape=x.shape)
    fp[:] = x
    fp.flush()
    t = time() - t0
    print(f"time for saving with np.memmap: {t:.3f}s"
          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")
    # Release the writable mapping (outside the timed region) so it is not
    # kept alive while the remaining benchmarks run.
    del fp

    # Compressed .npz archive
    t0 = time()
    np.savez_compressed('eye5chunk.npz', y)
    t = time() - t0
    print(f"time for saving with npz: {t:.3f}s"
          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

    # t0 = time()
    # jd.save(y, 'eye5chunk_bjd_raw.jdb')    # save as uncompressed bjd
    # t = time() - t0
    # print(f"time for saving with jdb (raw): {t:.3f}s"
    #       f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

    # zlib- and lzma-compressed bjd files; same procedure for both codecs
    for codec in ('zlib', 'lzma'):
        t0 = time()
        jd.save(y, f'eye5chunk_bjd_{codec}.jdb', {'compression': codec})
        t = time() - t0
        print(f"time for saving with jdb ({codec}): {t:.3f}s"
              f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

    # blosc2 frame (BLOSCLZ codec), filled by appending one split at a time
    t0 = time()
    chunksize = y[0].size * y[0].itemsize  # bytes in one split
    cparams = {"compcode": blosc2.Codec.BLOSCLZ, "typesize": 8, "nthreads": 8}
    storage = {"contiguous": True, "urlpath": "eye5_blosc2_blosclz.b2frame"}
    schunk = blosc2.SChunk(chunksize=chunksize, mode="w", cparams=cparams, **storage)
    for z in y:
        schunk.append_data(z)
    # The next is equivalent to the loop above
    # schunk = blosc2.SChunk(data=x, mode="w", chunksize=chunksize, cparams=cparams, **storage)
    t = time() - t0
    print(f"time for saving with blosc2 (blosclz): {t:.3f}s"
          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

    # blosc2 frame (ZSTD codec), built in one call from the full matrix
    t0 = time()
    chunksize = y[0].size * y[0].itemsize  # bytes in one split
    cparams = {"compcode": blosc2.Codec.ZSTD, "typesize": 8, "nthreads": 8}
    storage = {"contiguous": True, "urlpath": "eye5_blosc2_zstd.b2frame"}
    schunk = blosc2.SChunk(data=x, mode="w", chunksize=chunksize, cparams=cparams, **storage)
    t = time() - t0
    print(f"time for saving with blosc2 (zstd): {t:.3f}s"
          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")


# Each benchmark below sums every element of the data and reports the elapsed
# time and throughput. The on-disk variants read files that this script only
# writes when saving is enabled, so those files must already exist.
print("\n** Load and operate **")
# Baseline: reduce the in-memory splits
t0 = time()
total = 0
for a in y:
    total += a.sum()
t = time() - t0
print(f"time for reducing with plain numpy (memory): {t:.3f}s"
      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

t0 = time()
total = 0
for a in np.load('eye5chunk.npy'):
    total += a.sum()
t = time() - t0
print(f"time for reducing with npy (np.load, no mmap): {t:.3f}s"
      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# t0 = time()
# total = 0
# for a in np.load('eye5chunk.npy', mmap_mode='r'):
#     total += a.sum()
# t = time() - t0
# print(f"time for reducing with npy (np.load, mmap_mode): {t:.3f}s"
#       f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

t0 = time()
total = 0
fp = np.memmap('eye5chunk-memmap.npy', dtype=x.dtype, mode='r', shape=x.shape)
total += fp[:].sum()
t = time() - t0
print(f"time for reducing with np.memmap: {t:.3f}s"
      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

t0 = time()
total = 0
# The object returned for an .npz archive holds the file open; the `with`
# block closes it as soon as the arrays have been reduced.
with np.load('eye5chunk.npz') as npz:
    for a in npz.values():
        total += a.sum()
t = time() - t0
print(f"time for reducing with npz: {t:.3f}s"
      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# t0 = time()
# total = 0
# for a in jd.load('eye5chunk_bjd_raw.jdb'):
#     total += a.sum()
# t = time() - t0
# print(f"time for reducing with jdb (raw): {t:.3f}s"
#       f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# zlib- and lzma-compressed bjd files; same procedure for both codecs
for codec in ('zlib', 'lzma'):
    t0 = time()
    total = 0
    for a in jd.load(f'eye5chunk_bjd_{codec}.jdb'):
        total += a.sum()
    t = time() - t0
    print(f"time for reducing with jdb ({codec}): {t:.3f}s"
          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# blosc2 frames: decompress one chunk at a time into a reusable buffer
for codec in ('blosclz', 'zstd'):
    t0 = time()
    schunk = blosc2.open(f"eye5_blosc2_{codec}.b2frame")
    # Destination buffer shaped like one split: (nchunks, N) float64
    c = np.empty((nchunks, N), dtype=np.float64)
    total = 0
    for nchunk in range(nsplits):
        schunk.decompress_chunk(nchunk, c)
        total += c.sum()
    t = time() - t0
    print(f"time for reducing with blosc2 ({codec}): {t:.3f}s"
          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

# Only the total from the last benchmark is still bound here
print("Total sum:", total)
	# Benchmark comparing npy, npz, jdb and blosc2 storage formats

	import sys

	import numpy as np
	import jdata as jd
	import blosc2
	from time import time

	N = 10_000
	nsplits = 50
	# Bigger array
	# N = 20_000
	# nsplits = 200

	# Number of rows in each split
	nchunks = N // nsplits

	t0 = time()
	x = np.eye(N)
	y = np.vsplit(x, nsplits) # split into smaller chunks
	t = time() - t0
	print(f"time for creating big array (and splits): {t:.3f}s"
	      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	# Do not save unless we are passing a parameter
	save = False if len(sys.argv) == 1 else True
	if save:
	    print("\n Saving data ")
	    t0 = time()
	    np.save('eye5chunk.npy', y)
	    t = time() - t0
	    print(f"time for saving with npy: {t:.3f}s"
	          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	    t0 = time()
	    fp = np.memmap('eye5chunk-memmap.npy', dtype=x.dtype, mode='w+', shape=x.shape)
	    fp[:] = x
	    fp.flush()
	    t = time() - t0
	    print(f"time for saving with np.memmap: {t:.3f}s"
	          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	    t0 = time()
	    np.savez_compressed('eye5chunk.npz', y)
	    t = time() - t0
	    print(f"time for saving with npz: {t:.3f}s"
	          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	    # t0 = time()
	    # jd.save(y, 'eye5chunk_bjd_raw.jdb') # save as uncompressed bjd
	    # t = time() - t0
	    # print(f"time for saving with jdb (raw): {t:.3f}s"
	    #       f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	    t0 = time()
	    jd.save(y, 'eye5chunk_bjd_zlib.jdb', {'compression':'zlib'}) # zlib-compressed bjd
	    t = time() - t0
	    print(f"time for saving with jdb (zlib): {t:.3f}s"
	          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	    t0 = time()
	    jd.save(y, 'eye5chunk_bjd_lzma.jdb', {'compression':'lzma'}) # lzma-compressed bjd
	    t = time() - t0
	    print(f"time for saving with jdb (lzma): {t:.3f}s"
	          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	    t0 = time()
	    chunksize = y[0].size * y[0].itemsize
	    cparams = {"compcode": blosc2.Codec.BLOSCLZ, "typesize": 8, "nthreads": 8}
	    storage = {"contiguous": True, "urlpath": "eye5_blosc2_blosclz.b2frame"}
	    schunk = blosc2.SChunk(chunksize=chunksize, mode="w", cparams=cparams, **storage)
	    for z in y:
	        schunk.append_data(z)
	    # The next is equivalent to the loop above
	    # schunk = blosc2.SChunk(data=x, mode="w", chunksize=chunksize, cparams=cparams, **storage)
	    t = time() - t0
	    print(f"time for saving with blosc2 (blosclz): {t:.3f}s"
	          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	    t0 = time()
	    chunksize = y[0].size * y[0].itemsize
	    cparams = {"compcode": blosc2.Codec.ZSTD, "typesize": 8, "nthreads": 8}
	    storage = {"contiguous": True, "urlpath": "eye5_blosc2_zstd.b2frame"}
	    schunk = blosc2.SChunk(data=x, mode="w", chunksize=chunksize, cparams=cparams, **storage)
	    t = time() - t0
	    print(f"time for saving with blosc2 (zstd): {t:.3f}s"
	          f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")


	# Each benchmark below sums every element and reports time and throughput
	print("\n Load and operate ")
	t0 = time()
	total = 0
	for a in y:
	    total += a.sum()
	t = time() - t0
	print(f"time for reducing with plain numpy (memory): {t:.3f}s"
	      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	t0 = time()
	total = 0
	for a in np.load('eye5chunk.npy'):
	    total += a.sum()
	t = time() - t0
	print(f"time for reducing with npy (np.load, no mmap): {t:.3f}s"
	      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	# t0 = time()
	# total = 0
	# for a in np.load('eye5chunk.npy', mmap_mode='r'):
	#     total += a.sum()
	# t = time() - t0
	# print(f"time for reducing with npy (np.load, mmap_mode): {t:.3f}s"
	#       f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	t0 = time()
	total = 0
	fp = np.memmap('eye5chunk-memmap.npy', dtype=x.dtype, mode='r', shape=x.shape)
	total += fp[:].sum()
	t = time() - t0
	print(f"time for reducing with np.memmap: {t:.3f}s"
	      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	t0 = time()
	total = 0
	for a in np.load('eye5chunk.npz').values():
	    total += a.sum()
	t = time() - t0
	print(f"time for reducing with npz: {t:.3f}s"
	      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	# t0 = time()
	# total = 0
	# for a in jd.load('eye5chunk_bjd_raw.jdb'):
	#     total += a.sum()
	# t = time() - t0
	# print(f"time for reducing with jdb (raw): {t:.3f}s"
	#       f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	t0 = time()
	total = 0
	for a in jd.load('eye5chunk_bjd_zlib.jdb'):
	    total += a.sum()
	t = time() - t0
	print(f"time for reducing with jdb (zlib): {t:.3f}s"
	      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	t0 = time()
	total = 0
	for a in jd.load('eye5chunk_bjd_lzma.jdb'):
	    total += a.sum()
	t = time() - t0
	print(f"time for reducing with jdb (lzma): {t:.3f}s"
	      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	t0 = time()
	schunk = blosc2.open("eye5_blosc2_blosclz.b2frame")
	# Reusable destination buffer shaped like one split
	c = np.empty(nchunks * N, dtype=np.float64).reshape(nchunks, N)
	total = 0
	for nchunk in range(nsplits):
	    schunk.decompress_chunk(nchunk, c)
	    total += c.sum()
	t = time() - t0
	print(f"time for reducing with blosc2 (blosclz): {t:.3f}s"
	      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	t0 = time()
	schunk = blosc2.open("eye5_blosc2_zstd.b2frame")
	# Reusable destination buffer shaped like one split
	c = np.empty(nchunks * N, dtype=np.float64).reshape(nchunks, N)
	total = 0
	for nchunk in range(nsplits):
	    schunk.decompress_chunk(nchunk, c)
	    total += c.sum()
	t = time() - t0
	print(f"time for reducing with blosc2 (zstd): {t:.3f}s"
	      f" ({N * N * 8 / (t * 2**30):.3g} GB/s)")

	print("Total sum:", total)