Skip to content

Instantly share code, notes, and snippets.

@scottstanie
Created June 28, 2023 22:57
Show Gist options
  • Save scottstanie/10a86cff8c1b978e8134e918ba5207ce to your computer and use it in GitHub Desktop.
Save scottstanie/10a86cff8c1b978e8134e918ba5207ce to your computer and use it in GitHub Desktop.
Compares HDF5 3d stack loading time to loading chunks of flat binary files
import subprocess
import time
from pathlib import Path
import h5py
import numpy as np
from dolphin import io
class GDALStackReader:
def __init__(self, filenames: list[Filename]):
self.filenames = filenames
def __getitem__(self, index):
if isinstance(index, int):
return io.load_gdal(self.filenames[index], masked=self._masked)
n, rows, cols = index
if isinstance(n, int):
n = slice(n, n + 1)
if isinstance(rows, int):
rows = slice(rows, rows + 1)
if isinstance(cols, int):
cols = slice(cols, cols + 1)
# add threading option?
data = [
io.load_gdal(f, rows=rows, cols=cols, masked=self._masked)
for f in self.filenames[n]
]
return np.stack(data, axis=0).squeeze()
def make_binary_files(n=1000, shape=(1000, 1000), dtype="float32"):
slopes = np.arange(shape[0]).reshape(-1, 1) * np.arange(shape[1]).reshape(1, -1)
files = []
file_h5 = "stack.h5"
data = np.random.rand(n, *shape)
with h5py.File(file_h5, "w") as hf:
dset = hf.create_dataset("data", data=data, chunks=(n, 64, 64))
for i in range(n):
fname = f"file_{i:03d}.bin"
data = (np.random.rand() * slopes).astype(dtype)
io.write_arr(arr=data, output_name=fname, driver="ENVI")
files.append(fname)
dset[i] = data
if i % 50 == 0:
print(f"done with {i}")
return file_h5, files
if __name__ == "__main__":
file_h5, files = make_binary_files()
# remove from cache
subprocess.run("vmtouch .")
slices = list(io._slice_iterator((1000, 1000), (3 * 64, 3 * 64)))
files = sorted(Path(".").glob("file*bin"))
ss = GDALStackReader(files)
t0 = time.time()
cumsum = 0
for slice_ in slices:
d = ss[:, slice_[0], slice_[1]]
cumsum += d.sum()
print(f"For flat binary, time={time.time() - t0:.2f}, {cumsum = }")
hf = h5py.File(file_h5)
t0 = time.time()
cumsum = 0
for slice_ in slices:
d = hf['data'][:, slice_[0], slice_[1]]
cumsum += d.sum()
print(f"HDF5 stack, time={time.time() - t0:.2f}, {cumsum = }")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment