Created
June 28, 2023 22:57
-
-
Save scottstanie/10a86cff8c1b978e8134e918ba5207ce to your computer and use it in GitHub Desktop.
Compares HDF5 3d stack loading time to loading chunks of flat binary files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import subprocess | |
import time | |
from pathlib import Path | |
import h5py | |
import numpy as np | |
from dolphin import io | |
class GDALStackReader: | |
def __init__(self, filenames: list[Filename]): | |
self.filenames = filenames | |
def __getitem__(self, index): | |
if isinstance(index, int): | |
return io.load_gdal(self.filenames[index], masked=self._masked) | |
n, rows, cols = index | |
if isinstance(n, int): | |
n = slice(n, n + 1) | |
if isinstance(rows, int): | |
rows = slice(rows, rows + 1) | |
if isinstance(cols, int): | |
cols = slice(cols, cols + 1) | |
# add threading option? | |
data = [ | |
io.load_gdal(f, rows=rows, cols=cols, masked=self._masked) | |
for f in self.filenames[n] | |
] | |
return np.stack(data, axis=0).squeeze() | |
def make_binary_files(n=1000, shape=(1000, 1000), dtype="float32"): | |
slopes = np.arange(shape[0]).reshape(-1, 1) * np.arange(shape[1]).reshape(1, -1) | |
files = [] | |
file_h5 = "stack.h5" | |
data = np.random.rand(n, *shape) | |
with h5py.File(file_h5, "w") as hf: | |
dset = hf.create_dataset("data", data=data, chunks=(n, 64, 64)) | |
for i in range(n): | |
fname = f"file_{i:03d}.bin" | |
data = (np.random.rand() * slopes).astype(dtype) | |
io.write_arr(arr=data, output_name=fname, driver="ENVI") | |
files.append(fname) | |
dset[i] = data | |
if i % 50 == 0: | |
print(f"done with {i}") | |
return file_h5, files | |
if __name__ == "__main__": | |
file_h5, files = make_binary_files() | |
# remove from cache | |
subprocess.run("vmtouch .") | |
slices = list(io._slice_iterator((1000, 1000), (3 * 64, 3 * 64))) | |
files = sorted(Path(".").glob("file*bin")) | |
ss = GDALStackReader(files) | |
t0 = time.time() | |
cumsum = 0 | |
for slice_ in slices: | |
d = ss[:, slice_[0], slice_[1]] | |
cumsum += d.sum() | |
print(f"For flat binary, time={time.time() - t0:.2f}, {cumsum = }") | |
hf = h5py.File(file_h5) | |
t0 = time.time() | |
cumsum = 0 | |
for slice_ in slices: | |
d = hf['data'][:, slice_[0], slice_[1]] | |
cumsum += d.sum() | |
print(f"HDF5 stack, time={time.time() - t0:.2f}, {cumsum = }") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment