Last active
June 16, 2021 06:13
-
-
Save janpipek/9762602 to your computer and use it in GitHub Desktop.
Create a copy of an HDF5 file in which everything that can be compressed is compressed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import h5py | |
import os | |
def _report(operation, key, obj):
    '''Print a one-line log message about an operation on an object.

    The message has the form ``"<operation> <type>: <key>."`` where
    ``<type>`` is the lower-cased class name of ``obj``.

    :param operation: Verb describing what was done (e.g. "Copied")
    :param key: Name of the object the operation was applied to
    :param obj: The object itself; only its type name is used
    '''
    type_str = type(obj).__name__.split(".")[-1].lower()
    # Single-argument call form: prints identically on Python 2 and 3.
    print("%s %s: %s." % (operation, type_str, key))
def h5py_compatible_attributes(in_object):
    '''Are all attributes of an object readable in h5py?

    :param in_object: Any object exposing a mapping-like ``attrs``
    :returns: True if every (name, value) pair of ``in_object.attrs``
        could be iterated without an exception, False otherwise
    '''
    try:
        # Force obtaining every attribute so that a read error, if any,
        # surfaces here. ``items()`` (rather than ``iteritems()``) works on
        # both Python 2 and Python 3.
        for _ in in_object.attrs.items():
            pass
    except Exception:
        # Any read failure means "not compatible". KeyboardInterrupt and
        # SystemExit are deliberately not swallowed.
        return False
    return True
def copy_attributes(in_object, out_object):
    '''Copy attributes between 2 HDF5 objects.

    Every (name, value) pair of ``in_object.attrs`` is assigned to
    ``out_object.attrs``. Attributes already present on ``out_object``
    under other names are left untouched.

    :param in_object: Source object exposing a mapping-like ``attrs``
    :param out_object: Destination object exposing a mapping-like ``attrs``
    '''
    # ``items()`` is available on both Python 2 and Python 3.
    for key, value in in_object.attrs.items():
        out_object.attrs[key] = value
def walk(in_object, out_object, log=False, compression='gzip'):
    '''Recursively copy&compress the tree.

    For every child of ``in_object``:

    * named datatypes and objects whose attributes cannot be read are
      copied verbatim with ``in_object.copy``;
    * groups are re-created in ``out_object`` and walked recursively;
    * datasets are re-created with the requested compression, except
      scalar/empty ones, which are copied verbatim;
    * attributes of re-created groups and datasets are copied afterwards.

    :param in_object: Source group (or file) whose children are walked
    :param out_object: Destination group (or file) receiving the copies
    :param log: Whether to print results of operations
    :param compression: Compression filter passed to ``create_dataset``
    :raises TypeError: If a child is neither a group, a dataset nor a
        named datatype
    '''
    for key, in_obj in in_object.items():
        # We copy datatypes and objects with non-understandable attributes
        # identically.
        if isinstance(in_obj, h5py.Datatype) or not h5py_compatible_attributes(in_obj):
            if log:
                _report("Copied", key, in_obj)
            in_object.copy(key, out_object)
            continue

        if isinstance(in_obj, h5py.Group):
            out_obj = out_object.create_group(key)
            walk(in_obj, out_obj, log, compression)
            if log:
                _report("Copied", key, in_obj)
        elif isinstance(in_obj, h5py.Dataset):
            if not in_obj.shape:
                # Shape () (scalar) or None (empty dataspace): h5py rejects
                # chunk/filter options for such datasets, so they cannot be
                # compressed. Copy them as they are; the copy already
                # carries the attributes.
                if log:
                    _report("Copied", key, in_obj)
                in_object.copy(key, out_object)
                continue
            out_obj = out_object.create_dataset(key, data=in_obj, compression=compression)
            if log:
                _report("Compressed", key, in_obj)
        else:
            # Raising a plain string is not a valid raise statement;
            # use a real exception carrying the same message.
            raise TypeError("Invalid object type %s" % type(in_obj))
        copy_attributes(in_obj, out_obj)
def recompress(path1, path2, log=False, compression='gzip'):
    '''Compress a HDF5 file.

    The input file is opened read-only, the output file is created
    (overwriting any existing file) and the whole tree is copied into it
    with datasets compressed where possible.

    :param path1: Input path
    :param path2: Output path
    :param log: Whether to print results of operations
    :param compression: Compression filter applied to datasets
    :returns: A tuple(original_size, new_size)
    '''
    with h5py.File(path1, "r") as in_file, h5py.File(path2, "w") as out_file:
        # walk() only handles the children of a group, so attributes set
        # directly on the root group have to be transferred here.
        if h5py_compatible_attributes(in_file):
            copy_attributes(in_file, out_file)
        walk(in_file, out_file, log=log, compression=compression)
    # Sizes are read after both files are closed so the output is flushed.
    return os.stat(path1).st_size, os.stat(path2).st_size
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
There is a utility for that named h5repack :-(