Skip to content

Instantly share code, notes, and snippets.

@ssomnath
Created August 30, 2018 18:31
Show Gist options
  • Save ssomnath/e3cb824b76d6837ea0f95054e6189574 to your computer and use it in GitHub Desktop.
Save ssomnath/e3cb824b76d6837ea0f95054e6189574 to your computer and use it in GitHub Desktop.
Extract JSON metadata from all HDF5 groups
from __future__ import print_function, division, unicode_literals
import os
import numpy as np
import h5py
import json
import sys
# Python 3 has no separate ``unicode`` type; alias it to ``str`` so that the
# ``isinstance(..., (str, unicode))`` checks in this module work on both
# Python 2 and Python 3.
if sys.version_info[0] == 3:
    unicode = str
def get_attr(h5_object, attr_name):
    """
    Returns the attribute from the h5py object

    Parameters
    ----------
    h5_object : h5py.Dataset, h5py.Group or h5py.File object
        object whose attribute is desired
    attr_name : str
        Name of the attribute of interest

    Returns
    -------
    att_val : object
        value of attribute. Byte strings are decoded to unicode strings and,
        on Python 3, arrays holding byte strings are returned as arrays of
        decoded strings. All other values are returned as read from the file.

    Raises
    ------
    TypeError
        If h5_object is not a h5py.Dataset, h5py.Group or h5py.File, or if
        attr_name is not a string
    KeyError
        If attr_name is not an attribute of h5_object
    """
    if not isinstance(h5_object, (h5py.Dataset, h5py.Group, h5py.File)):
        raise TypeError('h5_object should be a h5py.Dataset, h5py.Group or h5py.File object')
    if not isinstance(attr_name, (str, unicode)):
        raise TypeError('attr_name should be a string')
    if attr_name not in h5_object.attrs.keys():
        raise KeyError("'{}' is not an attribute in '{}'".format(attr_name, h5_object.name))

    att_val = h5_object.attrs.get(attr_name)

    if isinstance(att_val, (np.bytes_, bytes)):
        att_val = att_val.decode('utf-8')
    elif isinstance(att_val, np.ndarray):
        if sys.version_info.major == 3 and att_val.dtype.type in (np.bytes_, np.object_):
            # Object arrays may already hold ``str`` elements (or other
            # non-bytes objects). Decoding those would raise a TypeError, so
            # only bytes elements are decoded and an array without any bytes
            # element is returned untouched.
            if any(isinstance(x, bytes) for x in att_val):
                att_val = np.array([x.decode('utf-8') if isinstance(x, bytes) else x
                                    for x in att_val])
    return att_val
def get_attributes(h5_object, attr_names=None):
    """
    Collects one, several or all attributes of a h5py object into a dictionary.

    Parameters
    ----------
    h5_object : h5py.Dataset, h5py.Group or h5py.File object
        Object whose attributes are desired
    attr_names : string or list / tuple of strings, optional
        Name(s) of the attribute(s) to return. Default = every attribute
        attached to h5_object

    Returns
    -------
    att_dict : dict
        Dictionary containing (name, value) pairs of attributes, with each
        value read via get_attr()

    Raises
    ------
    TypeError
        If h5_object or attr_names are not of the types listed above
    KeyError
        If a requested name is not an attribute of h5_object
    """
    valid_h5_types = (h5py.Dataset, h5py.Group, h5py.File)
    if not isinstance(h5_object, valid_h5_types):
        raise TypeError('h5_object should be a h5py.Dataset, h5py.Group or h5py.File object')

    string_types = (str, unicode)
    if attr_names is None:
        attr_names = h5_object.attrs.keys()
    else:
        # A lone name is treated as a one-element list
        if isinstance(attr_names, string_types):
            attr_names = [attr_names]
        if not isinstance(attr_names, (list, tuple)):
            raise TypeError('attr_names should be a string or list / tuple of strings')
        for name in attr_names:
            if not isinstance(name, string_types):
                raise TypeError('attr_names should be a string or list / tuple of strings')

    att_dict = dict()
    for name in attr_names:
        try:
            value = get_attr(h5_object, name)
        except KeyError:
            raise KeyError('%s is not an attribute of %s' % (str(name), h5_object.name))
        att_dict[name] = value
    return att_dict
def clean_attributes(metadata):
    """
    Converts attribute values, in place, into JSON-serializable python types.

    Parameters
    ----------
    metadata : dict
        Dictionary of (name, value) attribute pairs. Modified in place.

    Returns
    -------
    metadata : dict
        The same dictionary object that was passed in, where:
        * numpy boolean / integer / floating scalars have been replaced by
          python bool / int / float
        * numpy arrays have been replaced by (nested) lists
        * h5py.Reference values have been removed
        All other values are left untouched.
    """
    attrs_to_delete = []
    for key, val in metadata.items():
        # The abstract numpy scalar base classes cover every bit-width and
        # do not rely on the np.int / np.float / np.bool aliases, which were
        # removed from NumPy in version 1.24.
        if isinstance(val, np.bool_):
            metadata[key] = bool(val)
        elif isinstance(val, np.integer):
            metadata[key] = int(val)
        elif isinstance(val, np.floating):
            metadata[key] = float(val)
        elif isinstance(val, np.ndarray):
            metadata[key] = val.tolist()
        elif isinstance(val, h5py.Reference):
            # Deletion is deferred: the dictionary cannot change size while
            # it is being iterated over.
            attrs_to_delete.append(key)
    for key in attrs_to_delete:
        del metadata[key]
    return metadata
def get_attrs_from_groups(parent):
    """
    Recursively gathers the attributes of every group underneath parent.

    Parameters
    ----------
    parent : h5py.Group or h5py.File object
        Object whose child groups are traversed

    Returns
    -------
    tree : dict
        Nested dictionary keyed by the name of each child group. Each value
        holds that group's cleaned attributes, updated with the entries
        (keyed by sub-group name) gathered from its own child groups.
        Children that are not groups are ignored.
    """
    tree = {}
    for name, child in parent.items():
        if not isinstance(child, h5py.Group):
            continue
        node = clean_attributes(get_attributes(child))
        node.update(get_attrs_from_groups(child))
        tree[name] = node
    return tree
def h5_atts_to_json(h5_path, root_only=False):
    """
    Writes the attributes in a HDF5 file to a JSON file next to it.

    The JSON file has the same path as the HDF5 file, with the '.h5'
    extension replaced by '.json'.

    Parameters
    ----------
    h5_path : str
        Path to the HDF5 file. Must end with '.h5'; otherwise a message is
        printed and nothing is written.
    root_only : bool, optional. Default = False
        If True, only the attributes attached to the file (root) itself are
        written. If False, the attributes of all groups in the file are
        written as a nested dictionary.

    Returns
    -------
    None
    """
    if not h5_path.endswith('.h5'):
        print('Provided file did not have an h5 extension.')
        return

    with h5py.File(h5_path, mode='r') as h5_f:
        if root_only:
            metadata = get_attributes(h5_f)
        else:
            metadata = get_attrs_from_groups(h5_f)
        metadata = clean_attributes(metadata)

    # Swap only the trailing extension. str.replace would also rewrite any
    # '.h5' that appears earlier in the path (e.g. in a directory name).
    json_path = os.path.splitext(h5_path)[0] + '.json'
    with open(json_path, mode='w') as json_handle:
        json.dump(metadata, json_handle)
def extract_json_from_all_h5(root_dir, root_only=False):
    """
    Generates a JSON metadata file for every '.h5' file under a directory.

    The directory is traversed recursively. Each file whose name ends with
    '.h5' is handed to h5_atts_to_json().

    Parameters
    ----------
    root_dir : str
        Directory to search
    root_only : bool, optional. Default = False
        Passed on to h5_atts_to_json() for every file found, including those
        in sub-directories

    Returns
    -------
    None
    """
    root_dir = os.path.abspath(root_dir)
    for item in os.listdir(root_dir):
        item_path = os.path.join(root_dir, item)
        if os.path.isdir(item_path):
            # Forward root_only so that nested directories honor the
            # caller's choice instead of silently falling back to False.
            extract_json_from_all_h5(item_path, root_only=root_only)
        elif item.endswith('.h5'):
            h5_atts_to_json(item_path, root_only=root_only)
            print('Finished generating JSON for ' + item)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment