@rly · Created June 20, 2022 16:36
Demonstration of how to trim and repack (i.e., change compression, chunking, etc.) a very large HDF5 dataset in an NWB file
import pynwb
from pynwb import NWBHDF5IO
from hdmf.data_utils import GenericDataChunkIterator
from hdmf.backends.hdf5.h5_utils import H5DataIO

filepath = r"D:\GiocomoData_dandiset53\000053\sub-npI1\sub-npI1_ses-20190413_behavior+ecephys.nwb"
class H5DatasetDataChunkIterator(GenericDataChunkIterator):
    """A data chunk iterator that reads chunks over the 0th dimension of an HDF5 dataset up to a max length."""

    def __init__(self, dataset, max_length, **kwargs):
        self.dataset = dataset
        self.max_length = max_length  # in the time (0th) dimension
        super().__init__(**kwargs)

    def _get_data(self, selection):
        # read the requested selection directly from the HDF5 dataset
        return self.dataset[selection]

    def _get_maxshape(self):
        # report the trimmed shape: at most max_length rows, all columns
        return (self.max_length, self.dataset.shape[1])

    def _get_dtype(self):
        return self.dataset.dtype
with NWBHDF5IO(filepath, "r", load_namespaces=True) as io:
    nwbfile = io.read()
    orig_eseries = nwbfile.acquisition["ElectricalSeries"]

    # re-create the electrode table region so the new ElectricalSeries points to the same electrodes
    electrodes = nwbfile.create_electrode_table_region(
        region=orig_eseries.electrodes.data[:].tolist(),
        name=orig_eseries.electrodes.name,
        description=orig_eseries.electrodes.description,
    )

    num_electrodes = orig_eseries.data.shape[1]
    max_timestamps = int(2e6)  # TODO: set this to the maximum number of timestamps to be read

    # the original dataset is already chunked. for optimal read performance, read one chunk
    # at a time by aligning the read chunk shape with the dataset's chunk shape
    assert orig_eseries.data.chunks
    selection_size_time = orig_eseries.data.chunks[0]

    # read the ElectricalSeries data iteratively in chunks because it is too big to fit into RAM
    data_iterator = H5DatasetDataChunkIterator(
        dataset=orig_eseries.data,
        max_length=max_timestamps,
        chunk_shape=(selection_size_time, num_electrodes),  # this chunk shape is for reading
        buffer_gb=4,  # TODO: set this to a little under the amount of free RAM available in GB
    )
    # create an H5DataIO object, which sets HDF5-specific filters and other write options
    data = H5DataIO(
        data=data_iterator,
        compression="gzip",
        compression_opts=4,
        chunks=(100, 100),  # this chunk shape is for writing; TODO: set this accordingly
        # TODO: pass other options to H5DataIO as needed
    )
    # create the new ElectricalSeries with the same parameters as the original,
    # except with the repacked dataset
    new_eseries = pynwb.ecephys.ElectricalSeries(
        name=orig_eseries.name,
        description=orig_eseries.description,
        data=data,
        electrodes=electrodes,
        starting_time=orig_eseries.starting_time,
        rate=orig_eseries.rate,
        conversion=orig_eseries.conversion,
        resolution=orig_eseries.resolution,
        comments=orig_eseries.comments,
    )

    nwbfile.acquisition.pop("ElectricalSeries")  # remove the existing ElectricalSeries
    nwbfile.add_acquisition(new_eseries)  # add the newly chunked ElectricalSeries
    nwbfile.processing.pop("ecephys")  # remove the ecephys processing module
    # export the modified NWB file; the export IO shares the source IO's build manager
    with pynwb.NWBHDF5IO("dandiset53_trim_iterator.nwb", "w", manager=io.manager) as export_io:
        export_io.export(src_io=io, nwbfile=nwbfile)
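
To sanity-check the result, the repacked dataset's chunking and compression can be inspected with h5py. A minimal sketch, assuming the new ElectricalSeries is stored at the standard NWB path /acquisition/ElectricalSeries/data (adjust the path if your file differs):

import h5py

with h5py.File("dandiset53_trim_iterator.nwb", "r") as f:
    dset = f["/acquisition/ElectricalSeries/data"]
    print("shape:", dset.shape)  # at most (max_timestamps, num_electrodes) after the trim
    print("chunks:", dset.chunks)  # should match the write chunk shape, e.g., (100, 100)
    print("compression:", dset.compression)  # should be "gzip"
    print("compression_opts:", dset.compression_opts)  # should be 4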