Skip to content

Instantly share code, notes, and snippets.

@tyrannosaur
Created December 1, 2012 07:28
Show Gist options
  • Select an option

  • Save tyrannosaur/4180979 to your computer and use it in GitHub Desktop.

Select an option

Save tyrannosaur/4180979 to your computer and use it in GitHub Desktop.
Convert nested JSON data to *-separated format
#!/usr/bin/env python
import os
import sys
import json
import re
def iter_obj(obj):
    """\
    Build a list of (object, namespace) tuples.

    An object is a non-dict value, and a namespace is the list of keys
    (in order) required to reach that value from the root object.

    Nested dicts are descended into; every other value (lists included)
    is treated as a leaf. An empty dict contributes no entries.

    `obj` itself must be a dict (anything with an ``items()`` method).
    """
    def make_work(mapping, namespace):
        # Pair each value with the key path leading to it. A real list is
        # built on purpose: zip() is lazy on Python 3, and the queue below
        # needs truth-testing, pop() and list concatenation.
        return [(value, namespace + [key]) for key, value in mapping.items()]

    # Queue of (value, namespace) pairs that still have to be examined
    work_queue = make_work(obj, [])
    results = []
    while work_queue:
        value, namespace = work_queue.pop()
        if isinstance(value, dict):
            # Children are put at the front of the queue, so they are
            # visited after everything that is already waiting.
            work_queue = make_work(value, namespace) + work_queue
        else:
            results.append((value, namespace))
    return results
def val_from_namespace(obj, namespace):
    """\
    Look up the value that `namespace` points to inside `obj`.

    `namespace` is a sequence of keys (or indexes). They are applied one
    after another with ``[]``, starting at `obj`. An empty namespace
    returns `obj` itself.
    """
    node = obj
    for step in namespace:
        # Descend one level per key
        node = node[step]
    return node
def convert(name):
    """\
    Convert the JSON file `name` to a tab-separated (.tsv) file.

    The input is expected to hold a JSON array of (possibly nested)
    objects. The column headers are taken from the first object: one
    column per non-dict value, named by joining its key path with '.'.
    Every object in the array -- the first one included -- is then
    written as one row, looking up the same key paths.

    The output is written next to the input, with the extension
    replaced by 'tsv'. An empty array yields an empty output file.

    Raises ValueError if the output path would be the input path, or if
    the input is not valid JSON. Errors raised while looking up a header
    path in a later object (e.g. a missing key) are not caught.
    """
    SEPARATOR = u'\t'
    EXTENSION = 'tsv'

    path, _ = os.path.splitext(name)
    out_name = '{0}.{1}'.format(path, EXTENSION)
    if out_name == name:
        # Opening the output would truncate the input before it is read.
        raise ValueError(
            'refusing to overwrite input file {0}'.format(name))

    def clean(text):
        # Drop separator characters so a value cannot split its column
        return re.sub(SEPARATOR + '+', u'', u'{0}'.format(text))

    def write_out(outfile, vals):
        line = SEPARATOR.join(vals)
        outfile.write(u'{0}\n'.format(line).encode('utf-8'))

    # Read and parse everything before the output file is created, so a
    # parse failure does not leave a stray empty output behind. Read-only
    # mode is enough for the input.
    with open(name, 'rb') as infile:
        data = json.loads(infile.read().decode('utf-8'))

    with open(out_name, 'wb') as outfile:
        if len(data) > 0:
            headers = [namespace for _, namespace in iter_obj(data[0])]
            write_out(outfile, [u'.'.join(header) for header in headers])
            # Start at the first record: it supplies the headers, but it
            # is also a data row of its own.
            for obj in data:
                values = [clean(val_from_namespace(obj, header))
                          for header in headers]
                write_out(outfile, values)
if __name__ == '__main__':
    # Without any file arguments there is nothing to convert: show usage.
    if not sys.argv[1:]:
        print('usage {0} [file 1] [file 2] ...'.format(sys.argv[0]))
        sys.exit(0)

    # Convert every file named on the command line, in order.
    for name in sys.argv[1:]:
        convert(name)
#!/usr/bin/env python
import os
import sys
import json
import re
def iter_obj(obj):
    """\
    Build a list of (object, namespace) tuples.

    An object is a non-dict value, and a namespace is the list of keys
    (in order) required to reach that value from the root object.

    Nested dicts are descended into; every other value (lists included)
    is treated as a leaf. An empty dict contributes no entries.

    `obj` itself must be a dict (anything with an ``items()`` method).
    """
    def make_work(mapping, namespace):
        # Pair each value with the key path leading to it. A real list is
        # built on purpose: zip() is lazy on Python 3, and the queue below
        # needs truth-testing, pop() and list concatenation.
        return [(value, namespace + [key]) for key, value in mapping.items()]

    # Queue of (value, namespace) pairs that still have to be examined
    work_queue = make_work(obj, [])
    results = []
    while work_queue:
        value, namespace = work_queue.pop()
        if isinstance(value, dict):
            # Children are put at the front of the queue, so they are
            # visited after everything that is already waiting.
            work_queue = make_work(value, namespace) + work_queue
        else:
            results.append((value, namespace))
    return results
def val_from_namespace(obj, namespace):
    """\
    Look up the value that `namespace` points to inside `obj`.

    `namespace` is a sequence of keys (or indexes). They are applied one
    after another with ``[]``, starting at `obj`. An empty namespace
    returns `obj` itself.
    """
    node = obj
    for step in namespace:
        # Descend one level per key
        node = node[step]
    return node
def convert(name):
    """\
    Convert the JSON file `name` to a tab-separated (.tsv) file.

    The input is expected to hold a JSON array of (possibly nested)
    objects. The column headers are taken from the first object: one
    column per non-dict value, named by joining its key path with '.'.
    Every object in the array -- the first one included -- is then
    written as one row, looking up the same key paths.

    The output is written next to the input, with the extension
    replaced by 'tsv'. An empty array yields an empty output file.

    Raises ValueError if the output path would be the input path, or if
    the input is not valid JSON. Errors raised while looking up a header
    path in a later object (e.g. a missing key) are not caught.
    """
    SEPARATOR = u'\t'
    EXTENSION = 'tsv'

    path, _ = os.path.splitext(name)
    out_name = '{0}.{1}'.format(path, EXTENSION)
    if out_name == name:
        # Opening the output would truncate the input before it is read.
        raise ValueError(
            'refusing to overwrite input file {0}'.format(name))

    def clean(text):
        # Drop separator characters so a value cannot split its column
        return re.sub(SEPARATOR + '+', u'', u'{0}'.format(text))

    def write_out(outfile, vals):
        line = SEPARATOR.join(vals)
        outfile.write(u'{0}\n'.format(line).encode('utf-8'))

    # Read and parse everything before the output file is created, so a
    # parse failure does not leave a stray empty output behind. Read-only
    # mode is enough for the input.
    with open(name, 'rb') as infile:
        data = json.loads(infile.read().decode('utf-8'))

    with open(out_name, 'wb') as outfile:
        if len(data) > 0:
            headers = [namespace for _, namespace in iter_obj(data[0])]
            write_out(outfile, [u'.'.join(header) for header in headers])
            # Start at the first record: it supplies the headers, but it
            # is also a data row of its own.
            for obj in data:
                values = [clean(val_from_namespace(obj, header))
                          for header in headers]
                write_out(outfile, values)
if __name__ == '__main__':
    # Without any file arguments there is nothing to convert: show usage.
    if not sys.argv[1:]:
        print('usage {0} [file 1] [file 2] ...'.format(sys.argv[0]))
        sys.exit(0)

    # Convert every file named on the command line, in order.
    for name in sys.argv[1:]:
        convert(name)
[
{ "_id" : { "$oid" : "507dc7905ef6dc0006e4cf83"} , "data" : { "power" : 1 } , "uploader" : "alice" , "created" : { "$date" : "2012-10-16T20:46:08.154Z"}}
,{ "_id" : { "$oid" : "507dc7905ef6dc0006e4cf84"} , "data" : { "power" : 10} , "uploader" : "bob" , "created" : { "$date" : "2012-10-16T20:46:08.164Z"}}
,{ "_id" : { "$oid" : "507dc7905ef6dc0006e4cf85"} , "data" : { "power" : 100} , "uploader" : "carol" , "created" : { "$date" : "2012-10-16T20:46:08.174Z"}}
,{ "_id" : { "$oid" : "507dc7905ef6dc0006e4cf86"} , "data" : { "power" : 1000} , "uploader" : "david" , "created" : { "$date" : "2012-10-16T20:46:08.184Z"}}
]
uploader created.$date data.power _id.$oid
bob 2012-10-16T20:46:08.164Z 10 507dc7905ef6dc0006e4cf84
carol 2012-10-16T20:46:08.174Z 100 507dc7905ef6dc0006e4cf85
david 2012-10-16T20:46:08.184Z 1000 507dc7905ef6dc0006e4cf86
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment