Last active
April 5, 2024 17:54
-
-
Save mbafford/5a0ff6675b92ea1a6d80 to your computer and use it in GitHub Desktop.
summarizejson - Summarize JSON Structure (keys and data types)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Gives a quick summary of the structure of a JSON file, including the keys, object types, and | |
# leaf node types. Provides a count of each data type so you can quickly tell which data points | |
# are common. | |
# | |
# Example: | |
# | |
# $ curl -sS 'https://raw.githubusercontent.com/johan/world.geo.json/master/countries.geo.json' | python summarizejson | |
# 9191 {features}.[].{geometry}.{coordinates}.[].[].[].[].float | |
# 41 {features}.[].{geometry}.{coordinates}.[].[].[].[].int | |
# 12171 {features}.[].{geometry}.{coordinates}.[].[].[].float | |
# 25 {features}.[].{geometry}.{coordinates}.[].[].[].int | |
# 180 {features}.[].{geometry}.{type}.unicode | |
# 180 {features}.[].{id}.unicode | |
# 180 {features}.[].{properties}.{name}.unicode | |
# 180 {features}.[].{type}.unicode | |
# 1 {type}.unicode | |
# | |
# From which I can see that I can extract data like this: | |
# | |
# $ curl -sS 'https://raw.githubusercontent.com/johan/world.geo.json/master/countries.geo.json' | jq '.features[0] | [ .id, .geometry.coordinates[0][0][0:2] ]' | |
# [ | |
# "AFG", | |
# [ | |
# 61.210817, | |
# 35.650072 | |
# ] | |
# ] | |
import json | |
import fileinput | |
import re | |
from collections import defaultdict | |
# Look for JSONP wrapped JSON - JSON wrapped in a JavaScript function call | |
# like: | |
# my_callback1 ( [ 1, 2, 3, 4 ] ) ; | |
# | |
# This doesn't really respect every possible function name, but it'll catch most | |
# common ones. | |
def strip_jsonp(raw):
    """Return *raw* with a surrounding JSONP function call removed.

    A JSONP payload looks like ``my_callback1 ( [ 1, 2, 3, 4 ] ) ;``.  When
    both the opening ``name(`` and the closing ``)`` (optionally followed by
    spaces/semicolons) are found around a JSON array or object, only the text
    between them is returned.  Otherwise *raw* is returned unchanged.

    Only the first 50 and last 10 characters (ignoring trailing whitespace)
    are inspected, so very long callback names are not recognised.
    """
    # The bracket is escaped inside the character class to avoid the
    # "possible nested set" FutureWarning on newer Python versions.
    start = re.match(r'^( *[$a-z0-9A-Z_]+ *[(] *)[\[{]', raw[0:50])
    if not start:
        return raw

    # Input read from a file or pipe almost always ends with a newline.
    # Drop trailing whitespace before looking for the closing parenthesis so
    # the suffix length is measured against the real end of the payload.
    trimmed = raw.rstrip()
    end = re.search(r'[\]}] *([)][ ;]*)$', trimmed[-10:])
    if not end:
        return raw

    return trimmed[len(start.group(1)):len(trimmed) - len(end.group(1))]
# Read every input file (or stdin when no file arguments are given).
# fileinput yields each line with its line terminator still attached, so the
# pieces are concatenated with an empty separator; joining on "\n" would
# double every newline and skew the line numbers in JSON parse errors.
raw = "".join(fileinput.input())
raw = strip_jsonp(raw)
data = json.loads(raw)
def yieldkeys(data, parent_key=None):
    """Yield one dotted path string per leaf value found in *data*.

    Lists contribute a ``[]`` segment, dicts contribute a ``{key}`` segment,
    and each leaf ends the path with the name of its Python type, e.g.
    ``{features}.[].{id}.str``.  Empty lists and dicts yield nothing.

    *parent_key* is the path accumulated so far (used by the recursion).
    """
    if parent_key:
        prefix = '%s.' % parent_key
    else:
        prefix = ''

    if isinstance(data, dict):
        children = [('%s{%s}' % (prefix, name), value)
                    for name, value in data.items()]
    elif isinstance(data, list):
        children = [('%s[]' % prefix, element) for element in data]
    else:
        # Leaf node: finish the path with the value's type name.
        yield '%s%s' % (prefix, type(data).__name__)
        return

    for child_path, child in children:
        for path in yieldkeys(child, child_path):
            yield path
# Tally how many times each structural path occurs, then print the paths in
# sorted order with their counts right-aligned in a 4-character column.
keycount = defaultdict(int)
for path in yieldkeys(data):
    keycount[path] = keycount[path] + 1

for path in sorted(keycount):
    occurrences = keycount[path]
    print("%4d %s" % (occurrences, path))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment