Skip to content

Instantly share code, notes, and snippets.

@rsignell-usgs
Last active May 15, 2020 12:40
Show Gist options
  • Save rsignell-usgs/6b26e1593ae38ea2047e9be6d3ac885b to your computer and use it in GitHub Desktop.
Code to extract hdf5 chunk locations and add them to zarr metadata
from urllib.parse import urlparse, urlunparse
import numpy as np
import h5py
import zarr
from zarr.storage import FileChunkStore
from numcodecs import Zlib
import fsspec
class Hdf5ToZarr:
    """Translate the content of one HDF5 file into Zarr metadata.

    HDF5 groups become Zarr groups. HDF5 datasets become Zarr arrays. Zarr
    array chunks remain in the HDF5 file — only metadata (plus per-chunk
    byte offset/size information) is written to the Zarr store.

    Parameters
    ----------
    h5f : file-like or str
        Input HDF5 file as a string or file-like Python object.
    store : MutableMapping
        Zarr store.
    xarray : bool, optional
        Produce attributes required by the `xarray <http://xarray.pydata.org>`_
        package to correctly identify dimensions (HDF5 dimension scales) of a
        Zarr array. Default is ``False``.
    """

    def __init__(self, h5f, store, xarray=False):
        # Open HDF5 file in read mode...
        self._h5f = h5py.File(h5f, mode='r')
        self._xr = xarray

        # Create Zarr store's root group (overwrite=True wipes any
        # previous content of the store)...
        self._zroot = zarr.group(store=store, overwrite=True)

        # Figure out HDF5 file's URI, recorded later alongside chunk
        # locations so a FileChunkStore can find the bytes again...
        if hasattr(h5f, 'name'):
            # Plain file path or an object exposing .name (e.g. open file).
            self._uri = h5f.name
        elif hasattr(h5f, 'url'):
            # fsspec-style file objects expose url(); keep only scheme,
            # netloc and path — drop params/query/fragment.
            parts = urlparse(h5f.url())
            self._uri = urlunparse(parts[:3] + ('',) * 3)
        else:
            self._uri = None

    def translate(self):
        """Translate content of one HDF5 file into Zarr storage format.

        No data is copied out of the HDF5 file.
        """
        self.transfer_attrs(self._h5f, self._zroot)
        # Walk every group/dataset below the root group...
        self._h5f.visititems(self._translator)

    def transfer_attrs(self, h5obj, zobj):
        """Transfer attributes from an HDF5 object to its equivalent Zarr object.

        Parameters
        ----------
        h5obj : h5py.Group or h5py.Dataset
            An HDF5 group or dataset.
        zobj : zarr.hierarchy.Group or zarr.core.Array
            An equivalent Zarr group or array to the HDF5 group or dataset with
            attributes.
        """
        for n, v in h5obj.attrs.items():
            # These dimension-scale bookkeeping attributes hold HDF5 object
            # references that cannot be JSON-encoded into Zarr attributes.
            if n in ('REFERENCE_LIST', 'DIMENSION_LIST'):
                continue

            # Fix some attribute values to avoid JSON encoding exceptions...
            if isinstance(v, bytes):
                v = v.decode('utf-8')
            elif isinstance(v, (np.ndarray, np.number)):
                if v.size == 1:
                    # Collapse a one-element array/scalar to a plain Python
                    # scalar.
                    v = v.flatten()[0].tolist()
                else:
                    v = v.tolist()
            if self._xr and v == 'DIMENSION_SCALE':
                # In xarray mode, dimension information is conveyed via
                # _ARRAY_DIMENSIONS instead of the HDF5 CLASS attribute.
                continue
            try:
                zobj.attrs[n] = v
            except TypeError:
                # Best effort: report unserializable attributes and skip
                # them rather than aborting the whole translation.
                print(f'Caught TypeError: {n}@{h5obj.name} = {v} ({type(v)})')

    def _translator(self, name, h5obj):
        """Produce Zarr metadata for all groups and datasets in the HDF5 file.

        Callback for :meth:`h5py.File.visititems`; ``name`` is supplied by
        h5py but the absolute ``h5obj.name`` path is used instead.
        """
        if isinstance(h5obj, h5py.Dataset):
            # Only uncompressed or gzip-compressed chunks can be decoded by
            # Zarr from their raw file bytes; anything else is fatal.
            if (h5obj.scaleoffset or h5obj.fletcher32 or h5obj.shuffle or
                    h5obj.compression in ('szip', 'lzf')):
                raise RuntimeError(
                    f'{h5obj.name} uses unsupported HDF5 filters')
            if h5obj.compression == 'gzip':
                # HDF5 gzip == zlib, so chunks can be decoded with numcodecs.
                compression = Zlib(level=h5obj.compression_opts)
            else:
                compression = None

            # Get storage info of this HDF5 dataset...
            cinfo = self.storage_info(h5obj)

            # In xarray mode, skip dimension-scale datasets that hold no
            # actual data (nothing was ever written to them).
            if self._xr and h5py.h5ds.is_scale(h5obj.id) and not cinfo:
                return

            # Create a Zarr array equivalent to this HDF5 dataset...
            za = self._zroot.create_dataset(h5obj.name, shape=h5obj.shape,
                                            dtype=h5obj.dtype,
                                            chunks=h5obj.chunks or False,
                                            fill_value=h5obj.fillvalue,
                                            compression=compression,
                                            overwrite=True)
            self.transfer_attrs(h5obj, za)

            if self._xr:
                # Do this for xarray...
                za.attrs['_ARRAY_DIMENSIONS'] = self._get_array_dims(h5obj)

            # Store chunk location metadata...
            if cinfo:
                cinfo['source'] = {'uri': self._uri,
                                   'array_name': h5obj.name}
                FileChunkStore.chunks_info(za, cinfo)

        elif isinstance(h5obj, h5py.Group):
            zgrp = self._zroot.create_group(h5obj.name)
            self.transfer_attrs(h5obj, zgrp)

    def _get_array_dims(self, dset):
        """Get a list of dimension scale names attached to input HDF5 dataset.

        This is required by the xarray package to work with Zarr arrays. Only
        one dimension scale per dataset dimension is allowed. If dataset is
        dimension scale, it will be considered as the dimension to itself.

        Parameters
        ----------
        dset : h5py.Dataset
            HDF5 dataset.

        Returns
        -------
        list
            List with HDF5 path names of dimension scales attached to input
            dataset.
        """
        dims = list()
        rank = len(dset.shape)
        if rank:
            for n in range(rank):
                num_scales = len(dset.dims[n])
                if num_scales == 1:
                    # [1:] strips the leading '/' from the HDF5 path name.
                    dims.append(dset.dims[n][0].name[1:])
                elif h5py.h5ds.is_scale(dset.id):
                    # A dimension scale is its own dimension.
                    dims.append(dset.name[1:])
                elif num_scales > 1:
                    raise RuntimeError(
                        f'{dset.name}: {len(dset.dims[n])} '
                        f'dimension scales attached to dimension #{n}')
                # NOTE(review): if num_scales == 0 and dset is not itself a
                # scale, no entry is appended for this dimension, so dims may
                # be shorter than rank — confirm whether xarray tolerates
                # that for the intended input files.
        return dims

    def storage_info(self, dset):
        """Get storage information of an HDF5 dataset in the HDF5 file.

        Storage information consists of file offset and size (length) for every
        chunk of the HDF5 dataset.

        Parameters
        ----------
        dset : h5py.Dataset
            HDF5 dataset for which to collect storage information.

        Returns
        -------
        dict
            HDF5 dataset storage information. Dict keys are chunk array offsets
            as tuples. Dict values are pairs with chunk file offset and size
            integers.
        """
        # Empty (null) dataset...
        if dset.shape is None:
            return dict()

        dsid = dset.id
        if dset.chunks is None:
            # Contiguous dataset...
            if dsid.get_offset() is None:
                # No data ever written...
                return dict()
            else:
                # Treat the whole dataset as one chunk at index (0, ..., 0);
                # a scalar dataset (rank 0) still gets a 1-tuple key.
                key = (0,) * (len(dset.shape) or 1)
                return {key: {'offset': dsid.get_offset(),
                              'size': dsid.get_storage_size()}}
        else:
            # Chunked dataset...
            num_chunks = dsid.get_num_chunks()
            if num_chunks == 0:
                # No data ever written...
                return dict()

            # Go over all the dataset chunks...
            stinfo = dict()
            for index in range(num_chunks):
                blob = dsid.get_chunk_info(index)
                chunk_size = dset.chunks
                # Convert the chunk's element offset in the array into a
                # chunk-grid index, the key format Zarr uses.
                key = tuple(
                    [a // b for a, b in zip(blob.chunk_offset, chunk_size)])
                stinfo[key] = {'offset': blob.byte_offset,
                               'size': blob.size}
            return stinfo
if __name__ == '__main__':
    # Stream the HDF5 file straight from S3 — nothing is downloaded in full.
    src_url = 's3://pangeo-data-uswest2/esip/adcirc/adcirc_01d.nc'
    with fsspec.open(src_url,
                     mode='rb', anon=False, requester_pays=True,
                     default_fill_cache=False) as h5file:
        zstore = zarr.DirectoryStore('../foobar.zarr')
        translator = Hdf5ToZarr(h5file, zstore, xarray=True)
        translator.translate()
        # Consolidate Zarr metadata...
        zarr.convenience.consolidate_metadata(zstore)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment