Skip to content

Instantly share code, notes, and snippets.

@krosaen
Created June 5, 2018 14:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save krosaen/64e2a3012fb4a9c65a6ccb8697e2779f to your computer and use it in GitHub Desktop.
Save krosaen/64e2a3012fb4a9c65a6ccb8697e2779f to your computer and use it in GitHub Desktop.
def summarize_data_shape(example):
    """
    Given some (json serializeable) example, provide a concise summary of its
    structure by pruning it down to e.g. one item per list.

    Like https://github.com/krosaen/data-shapy/blob/master/data_shapy/data_shape.py
    but also makes a best-effort summary of class instances and tuples.

    Summary rules, applied recursively in this order:

    * ``None``, ``bool``, ``int``, ``str`` and ``float`` are returned unchanged.
    * A numpy array becomes the string ``"np <d0>x<d1>..."`` built from its shape.
    * ``bytes`` becomes the string ``"bytes[<length>]"``.
    * A named tuple is summarized as the dict returned by its ``_asdict()``.
    * A list or tuple:
        - of length 2 whose first item is a ground value is treated as a
          pair: ``[first, summary_of_second]``;
        - of fewer than 10 items that are all ground values is returned as is;
        - whose first 20 items are all dicts is reduced to a one-item list
          holding the summary of the dict with the most non-``None`` values;
        - otherwise is reduced to a one-item list holding the summary of its
          first item.
    * A ``set`` or ``frozenset`` is summarized as a list of its items.
    * A dict whose only key is ``'matrix'`` is summarized as a numpy array
      built from that value (only when numpy is importable).
    * Any other dict keeps its keys, with every value summarized.
    * Any other object that has a ``__dict__`` becomes a dict of its instance
      attributes (those whose names are not also in the class ``__dict__``),
      plus a ``'cls'`` entry holding the object's class.

    Raises:
        ValueError: if ``example`` (or something nested in it) matches none
            of the rules above, e.g. an object without a ``__dict__``.
    """
    # Imported here so the function does not depend on a module-level ``np``.
    # numpy is optional: without it, the ndarray and 'matrix' rules are skipped.
    try:
        import numpy as np
    except ImportError:
        np = None

    def is_ground(item):
        """Return True for leaf values that are reported verbatim."""
        return item is None or isinstance(item, (bool, int, str, float))

    def non_none_kv_count(d):
        """Return how many values of dict ``d`` are not None."""
        return sum(1 for v in d.values() if v is not None)

    def summarize(item):
        """Recursive worker; the helpers above are built once per top-level call."""
        if is_ground(item):
            return item
        if np is not None and isinstance(item, np.ndarray):
            return "np {}".format('x'.join(map(str, item.shape)))
        if isinstance(item, bytes):
            return "bytes[{}]".format(len(item))
        if isinstance(item, tuple) and hasattr(item, '_fields'):
            # named tuple
            return summarize(item._asdict())
        if isinstance(item, (list, tuple)):
            if len(item) == 2 and is_ground(item[0]):
                return [item[0], summarize(item[1])]
            # Note: an empty sequence also takes this branch (all() of nothing
            # is True), so the indexing below never sees an empty sequence.
            if len(item) < 10 and all(map(is_ground, item)):
                return item
            head = item[:20]
            if all(isinstance(el, dict) for el in head):
                # A list of dicts: summarize the one with the most non-null
                # values (looking 20 items out max). max() returns the first
                # maximal element, so ties resolve to the earliest dict.
                return [summarize(max(head, key=non_none_kv_count))]
            return [summarize(item[0])]
        if isinstance(item, (set, frozenset)):
            return summarize(list(item))
        if isinstance(item, dict):
            if np is not None and len(item) == 1 and 'matrix' in item:
                return summarize(np.array(item['matrix']))
            return {k: summarize(v) for k, v in item.items()}
        # Everything is an instance of ``object``, so test for what is actually
        # needed: a __dict__ to read the instance attributes from.
        instance_attrs = getattr(item, '__dict__', None)
        if instance_attrs is not None:
            class_attrs = item.__class__.__dict__
            names = [k for k in instance_attrs if k not in class_attrs]
            result = {k: summarize(getattr(item, k)) for k in names}
            result['cls'] = item.__class__
            return result
        raise ValueError("dunno how to deal with type {}".format(type(item)))

    return summarize(example)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment