Skip to content

Instantly share code, notes, and snippets.

@mbafford
Last active April 5, 2024 17:54
Show Gist options
  • Save mbafford/5a0ff6675b92ea1a6d80 to your computer and use it in GitHub Desktop.
Save mbafford/5a0ff6675b92ea1a6d80 to your computer and use it in GitHub Desktop.
summarizejson - Summarize JSON Structure (keys and data types)
#!/usr/bin/env python
# Gives a quick summary of the structure of a JSON file, including the keys, object types, and
# leaf node types. Provides a count of each data type so you can quickly tell which data points
# are common.
#
# Example:
#
# $ curl -sS 'https://raw.githubusercontent.com/johan/world.geo.json/master/countries.geo.json' | python summarizejson
# 9191 {features}.[].{geometry}.{coordinates}.[].[].[].[].float
# 41 {features}.[].{geometry}.{coordinates}.[].[].[].[].int
# 12171 {features}.[].{geometry}.{coordinates}.[].[].[].float
# 25 {features}.[].{geometry}.{coordinates}.[].[].[].int
# 180 {features}.[].{geometry}.{type}.unicode
# 180 {features}.[].{id}.unicode
# 180 {features}.[].{properties}.{name}.unicode
# 180 {features}.[].{type}.unicode
# 1 {type}.unicode
#
# From which I can see that I can extract data like this:
#
# $ curl -sS 'https://raw.githubusercontent.com/johan/world.geo.json/master/countries.geo.json' | jq '.features[0] | [ .id, .geometry.coordinates[0][0][0:2] ]'
# [
# "AFG",
# [
# 61.210817,
# 35.650072
# ]
# ]
import json
import fileinput
import re
from collections import defaultdict
# Look for JSONP wrapped JSON - JSON wrapped in a JavaScript function call
# like:
# my_callback1 ( [ 1, 2, 3, 4 ] ) ;
#
# This doesn't really respect every possible function name, but it'll catch most
# common ones.
def strip_jsonp(raw):
    """Remove a JSONP callback wrapper from *raw*, if one is present.

    A JSONP payload looks like ``my_callback1 ( [ 1, 2, 3 ] ) ;``. When both
    the opening ``name(`` and the closing ``)`` (optionally followed by
    semicolons) are found, the text between them is returned. Otherwise
    *raw* is returned unchanged.

    Only the first 50 and last 10 characters of the whitespace-trimmed text
    are inspected, so the callback name and the closing suffix must fit in
    those windows.

    Args:
        raw: The text that may contain JSONP-wrapped JSON.

    Returns:
        The unwrapped JSON text, or *raw* itself when no wrapper is detected.
    """
    # Work on a trimmed copy: input read from a file normally ends with a
    # newline, which must not be counted as part of the payload.
    text = raw.strip()

    # Callback name followed by "(" and then the start of an array/object.
    start = re.match(r'^(\s*[$a-z0-9A-Z_]+\s*[(]\s*)[\[{]', text[0:50])
    # End of an array/object followed by ")" and optional ";"/whitespace.
    # \Z (not $) is used so the captured suffix always reaches the true end
    # of the string and its length can be used for slicing.
    end = re.search(r'[\]}]\s*([)][\s;]*)\Z', text[-10:])

    if start and end:
        return text[len(start.group(1)):len(text) - len(end.group(1))]
    return raw
# Read the whole input (files named on the command line, or stdin).
# fileinput yields each line with its line terminator still attached, so the
# pieces are concatenated as-is; joining on "\n" would double every newline
# and shift the line numbers reported in JSON parse errors.
raw = "".join( fileinput.input() )
# Unwrap a JSONP callback, if there is one, before parsing.
raw = strip_jsonp( raw )
data = json.loads( raw )
def yieldkeys( data, parent_key = None ):
    """Yield one path string for every leaf value found inside *data*.

    Lists contribute a ``[]`` path segment, dicts contribute ``{key}``, and
    each leaf ends the path with the name of its Python type. Segments are
    joined with dots, e.g. ``{features}.[].{id}.str``. Empty lists and dicts
    have no leaves and therefore yield nothing.

    Args:
        data: A decoded JSON value (list, dict, or scalar).
        parent_key: Path of the enclosing container; falsy at the root.
    """
    # Everything produced at this level is prefixed with "<parent>." unless
    # we are at the root.
    if parent_key:
        prefix = '%s.' % ( parent_key )
    else:
        prefix = ''

    if isinstance( data, list ):
        children = ( ( '%s[]' % ( prefix ), element ) for element in data )
    elif isinstance( data, dict ):
        children = ( ( '%s{%s}' % ( prefix, name ), value ) for name, value in data.items() )
    else:
        # Leaf: report its type name and stop.
        yield '%s%s' % ( prefix, type(data).__name__ )
        return

    # Container: recurse into each child under its extended path.
    for child_path, child in children:
        for leaf_path in yieldkeys( child, child_path ):
            yield leaf_path
# Tally how many times each distinct leaf path occurs in the parsed data.
keycount = defaultdict(int)
for path in yieldkeys(data):
    keycount[path] += 1

# Print the summary ordered by path, with the occurrence count first.
for key, total in sorted( keycount.items() ):
    print("%4d %s" % ( total, key ))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment