Skip to content

Instantly share code, notes, and snippets.

@janpipek
Last active June 16, 2021 06:13
Show Gist options
  • Save janpipek/9762602 to your computer and use it in GitHub Desktop.
Save janpipek/9762602 to your computer and use it in GitHub Desktop.
Create a copy of an HDF5 file in which everything that can be compressed is compressed.
import h5py
import os
def _report(operation, key, obj):
type_str = type(obj).__name__.split(".")[-1].lower()
print "%s %s: %s." % (operation, type_str, key)
def h5py_compatible_attributes(in_object):
    '''Are all attributes of an object readable in h5py?

    :param in_object: Object exposing an ``attrs`` mapping
    :returns: True if every attribute could be read, False if reading any
        of them raised an exception
    '''
    try:
        # Materialise all (name, value) pairs so that every attribute value
        # is actually read and any read error surfaces here.
        # ``items()`` is used instead of ``iteritems()`` because the latter
        # does not exist on Python 3 mappings.
        list(in_object.attrs.items())
    except Exception:
        # Deliberately broad: any read failure means "not compatible".
        # KeyboardInterrupt / SystemExit are not swallowed.
        return False
    return True
def copy_attributes(in_object, out_object):
    '''Copy attributes between 2 HDF5 objects.

    Every (name, value) pair in ``in_object.attrs`` is assigned to
    ``out_object.attrs``; attributes already present on the target under
    other names are left alone.

    :param in_object: Source object exposing an ``attrs`` mapping
    :param out_object: Target object exposing a writable ``attrs`` mapping
    '''
    # ``items()`` rather than ``iteritems()``: the latter is Python 2 only.
    for key, value in in_object.attrs.items():
        out_object.attrs[key] = value
def walk(in_object, out_object, log=False, compression='gzip'):
    '''Recursively copy&compress the tree.

    Groups are re-created and descended into, datasets are re-created with
    the requested compression, and attributes are transferred separately.
    Named datatypes, objects whose attributes cannot be read, and datasets
    without a non-empty shape are copied verbatim instead.

    :param in_object: Source group (or file) to read from
    :param out_object: Target group (or file) to write into
    :param log: Whether to print one line per processed object
    :param compression: Compression filter passed to ``create_dataset``
    :raises TypeError: If a child is neither a group, a dataset nor a datatype
    '''
    # ``items()`` rather than ``iteritems()``: the latter is Python 2 only.
    for key, in_obj in in_object.items():
        if isinstance(in_obj, h5py.Datatype) or not h5py_compatible_attributes(in_obj):
            # We copy datatypes and objects with non-understandable attributes
            # identically.
            if log:
                _report("Copied", key, in_obj)
            in_object.copy(key, out_object)
            continue

        if isinstance(in_obj, h5py.Group):
            out_obj = out_object.create_group(key)
            walk(in_obj, out_obj, log, compression)
            if log:
                _report("Copied", key, in_obj)
        elif isinstance(in_obj, h5py.Dataset):
            if not in_obj.shape:
                # Scalar (shape ``()``) and empty (shape ``None``) datasets
                # cannot be chunked, and h5py rejects filter options for
                # them, so they are copied verbatim (attributes included).
                if log:
                    _report("Copied", key, in_obj)
                in_object.copy(key, out_object)
                continue
            out_obj = out_object.create_dataset(key, data=in_obj, compression=compression)
            if log:
                _report("Compressed", key, in_obj)
        else:
            # Raising a plain string is not a valid exception; use a real one.
            raise TypeError("Invalid object type %s" % type(in_obj))

        copy_attributes(in_obj, out_obj)
def recompress(path1, path2, log=False, compression='gzip'):
    '''Write a compressed copy of a HDF5 file and report both file sizes.

    :param path1: Input path (opened read-only)
    :param path2: Output path (created or overwritten)
    :param log: Whether to print results of operations
    :param compression: Compression filter forwarded to ``walk``
    :returns: A tuple(original_size, new_size)
    '''
    with h5py.File(path1, "r") as in_file:
        with h5py.File(path2, "w") as out_file:
            walk(in_file, out_file, log=log, compression=compression)

    # Sizes are taken only after both files have been closed.
    original_size = os.path.getsize(path1)
    new_size = os.path.getsize(path2)
    return original_size, new_size
@janpipek
Copy link
Author

There is a utility for that named h5repack :-(

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment