Last active
November 17, 2023 23:22
-
-
Save edsu/d2936e6786b0764b5181539fe3ee869a to your computer and use it in GitHub Desktop.
Feed in some JSONL and get a report of the patterns present in the data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import csv | |
import json | |
from collections import OrderedDict | |
from collections import Counter | |
def trace(data, shape=None):
    """Reduce a parsed JSON value to a skeleton describing its structure.

    The skeleton keeps the nesting of dicts and lists but discards the
    actual leaf values, so two documents with the same layout produce
    equal skeletons.

    Args:
        data: A value as produced by ``json.loads`` (dict, list, scalar
            or ``None``).
        shape: Unused; kept so existing callers that pass it still work.

    Returns:
        * dict  -> ``OrderedDict`` of the traced values, with keys whose
          traced value is ``None`` left out.
        * non-empty list -> list of the traced elements, with ``None``
          results left out (so the result may be an empty list).
        * ``None`` -> ``None``.
        * anything else -> ``""``. This includes an *empty* list, which
          falls through to this branch.
    """
    if isinstance(data, dict):
        new_dict = OrderedDict()
        for k, v in data.items():
            new_v = trace(v)
            # Null-valued keys are treated as absent from the shape.
            if new_v is not None:
                new_dict[k] = new_v
        return new_dict
    elif isinstance(data, list) and len(data) > 0:
        traced = (trace(e) for e in data)
        return [t for t in traced if t is not None]
    elif data is None:
        return None
    else:
        # Scalars (and empty lists) collapse to a single placeholder.
        return ""
# Tally how often each JSON shape occurs and remember one example id per shape.
count = Counter()
examples = {}

# The csv module asks for newline='' so that newlines embedded in quoted
# fields are handled by the reader; the `with` block closes the file.
with open('orcids.csv', newline='') as csv_file:
    for row in csv.DictReader(csv_file):
        data = json.loads(row['identifier_json'])
        # Serialise the traced skeleton so it can be used as a dict key.
        shape = json.dumps(trace(data), indent=2, sort_keys=True)

        # Remove the literal "druid:" prefix. str.strip('druid:') would
        # instead strip any of the characters d, r, u, i, ':' from both
        # ends and could eat the start of the identifier itself.
        druid = row['external_identifier']
        if druid.startswith('druid:'):
            druid = druid[len('druid:'):]

        # The last row seen for a shape wins as its example.
        examples[shape] = druid
        count[shape] += 1

# Report shapes from most to least common, numbered from 1. A separate
# name is used for the index so the `count` Counter is not overwritten.
for rank, (shape, total) in enumerate(count.most_common(), start=1):
    print(f"Shape {rank} total={total} example=https://purl.stanford.edu/{examples[shape]}.json")
    print()
    print(shape)
    print()
    print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment