Skip to content

Instantly share code, notes, and snippets.

@edsu
Last active November 17, 2023 23:22
Show Gist options
  • Save edsu/d2936e6786b0764b5181539fe3ee869a to your computer and use it in GitHub Desktop.
Save edsu/d2936e6786b0764b5181539fe3ee869a to your computer and use it in GitHub Desktop.
Feed in some JSONL and get a report of the patterns present in the data.
#!/usr/bin/env python3
import csv
import json
from collections import OrderedDict
from collections import Counter
def trace(data, shape=None):
    """Reduce a decoded JSON value to a skeleton describing only its structure.

    Args:
        data: Any value produced by ``json.loads`` (dict, list, scalar, None).
        shape: Unused; kept so existing callers that pass it keep working.

    Returns:
        * dict  -> ``OrderedDict`` with the same keys and traced values;
          keys whose traced value is ``None`` are dropped.
        * non-empty list -> list of traced elements, ``None`` results dropped.
        * ``None`` -> ``None``.
        * anything else -> ``""``. This includes every scalar and also an
          empty list, because the list branch requires at least one element.
    """
    if isinstance(data, dict):
        new_dict = OrderedDict()
        for key, value in data.items():
            traced = trace(value)
            # Null-valued keys are omitted so they do not create a new shape.
            if traced is not None:
                new_dict[key] = traced
        return new_dict

    if isinstance(data, list) and len(data) > 0:
        traced_items = (trace(element) for element in data)
        return [item for item in traced_items if item is not None]

    if data is None:
        return None

    # Scalars (and empty lists) are blanked out: only structure matters.
    return ""
# Tally how often each structural "shape" occurs in the identifier_json
# column of orcids.csv, remembering one example identifier per shape.
count = Counter()
examples = {}

# newline='' lets the csv module handle line endings itself (including
# newlines embedded in quoted fields); the with-block closes the file.
with open('orcids.csv', newline='') as csv_file:
    for row in csv.DictReader(csv_file):
        data = json.loads(row['identifier_json'])
        # Serialise with sorted keys so key order does not split a shape.
        shape = json.dumps(trace(data), indent=2, sort_keys=True)

        # Remove the literal "druid:" prefix. str.strip('druid:') would
        # instead strip any of the characters d, r, u, i, ':' from both
        # ends and could eat the leading letters of the identifier itself.
        identifier = row['external_identifier']
        if identifier.startswith('druid:'):
            identifier = identifier[len('druid:'):]

        # The last row seen for a shape becomes its example.
        examples[shape] = identifier
        count[shape] += 1

# Report shapes from most to least common. A separate name is used for the
# rank so the Counter bound to `count` is not overwritten.
for rank, (shape, total) in enumerate(count.most_common(), start=1):
    print(f"Shape {rank} total={total} example=https://purl.stanford.edu/{examples[shape]}.json")
    print()
    print(shape)
    print()
    print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment