Created
December 1, 2012 07:28
-
-
Save tyrannosaur/4180979 to your computer and use it in GitHub Desktop.
Convert nested JSON data to *-separated format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import os | |
| import sys | |
| import json | |
| import re | |
def iter_obj(obj):
    """\
    Build a list of (object, namespace) tuples.

    An object is a non-dict value (a leaf), and a namespace is the list of
    keys (in order) required to reach that value from the root object.

    ``obj`` must be a dict.  Nested dicts are descended; every other value
    (lists included) is reported as a leaf.  Empty nested dicts contribute
    no entries.
    """
    def make_work(mapping, namespace):
        # Build a real list: on Python 3 ``zip`` returns a lazy iterator,
        # which supports neither len(), pop() nor list concatenation.
        return [(value, namespace + [key]) for key, value in mapping.items()]

    # Create a queue of (value, namespace) pairs still to be examined
    work_queue = make_work(obj, [])
    results = []

    while work_queue:
        current, namespace = work_queue.pop()

        if isinstance(current, dict):
            # Children go to the front of the queue, so they are visited
            # only after everything that was already waiting.
            work_queue = make_work(current, namespace) + work_queue
        else:
            results.append((current, namespace))

    return results
def val_from_namespace(obj, namespace):
    """\
    Look up the value found by following ``namespace`` through ``obj``.

    Each element of ``namespace`` is used, in order, to index into the
    result of the previous lookup.  An empty namespace returns ``obj``
    itself.
    """
    node = obj
    for step in namespace:
        # Descend one level per namespace element
        node = node[step]
    return node
def convert(name):
    """\
    Convert a JSON file holding a list of nested objects to a TSV file.

    The output is written next to the input, with the extension replaced
    by ``.tsv``.  The column headers are the dot-joined key paths of the
    leaf values of the first object; every object (the first one included)
    then contributes one row holding its values at those paths.

    An empty list produces an empty output file.  A KeyError propagates if
    an object lacks a key path that is present in the first object.
    """
    SEPARATOR = u'\t'
    EXTENSION = 'tsv'

    path = os.path.splitext(name)[0]
    out_name = '{0}.{1}'.format(path, EXTENSION)

    def clean(text):
        # Drop separator characters so a value cannot spill into the
        # neighbouring column.
        return u'{0}'.format(text).replace(SEPARATOR, u'')

    def write_out(outfile, vals):
        line = SEPARATOR.join(vals)
        outfile.write(u'{0}\n'.format(line).encode('utf-8'))

    # The input is only read, so do not ask for write access.  Parse it
    # before touching the output so a malformed input does not leave a
    # truncated output file behind.
    with open(name, 'rb') as infile:
        data = json.loads(infile.read())

    with open(out_name, 'wb') as outfile:
        if len(data) > 0:
            headers = [x[1] for x in iter_obj(data[0])]
            write_out(outfile, [u'.'.join(header) for header in headers])

            # Include data[0] as well: it defines the headers but is also
            # a record in its own right.
            for obj in data:
                values = [clean(val_from_namespace(obj, header)) for header in headers]
                write_out(outfile, values)
# Script entry point: convert every file named on the command line.
if __name__ == '__main__':
    targets = sys.argv[1:]

    # Without any file arguments, show the usage line and stop.
    if not targets:
        print('usage {0} [file 1] [file 2] ...'.format(sys.argv[0]))
        sys.exit(0)

    for target in targets:
        convert(target)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import os | |
| import sys | |
| import json | |
| import re | |
def iter_obj(obj):
    """\
    Build a list of (object, namespace) tuples.

    An object is a non-dict value (a leaf), and a namespace is the list of
    keys (in order) required to reach that value from the root object.

    ``obj`` must be a dict.  Nested dicts are descended; every other value
    (lists included) is reported as a leaf.  Empty nested dicts contribute
    no entries.
    """
    def make_work(mapping, namespace):
        # Build a real list: on Python 3 ``zip`` returns a lazy iterator,
        # which supports neither len(), pop() nor list concatenation.
        return [(value, namespace + [key]) for key, value in mapping.items()]

    # Create a queue of (value, namespace) pairs still to be examined
    work_queue = make_work(obj, [])
    results = []

    while work_queue:
        current, namespace = work_queue.pop()

        if isinstance(current, dict):
            # Children go to the front of the queue, so they are visited
            # only after everything that was already waiting.
            work_queue = make_work(current, namespace) + work_queue
        else:
            results.append((current, namespace))

    return results
def val_from_namespace(obj, namespace):
    """\
    Look up the value found by following ``namespace`` through ``obj``.

    Each element of ``namespace`` is used, in order, to index into the
    result of the previous lookup.  An empty namespace returns ``obj``
    itself.
    """
    node = obj
    for step in namespace:
        # Descend one level per namespace element
        node = node[step]
    return node
def convert(name):
    """\
    Convert a JSON file holding a list of nested objects to a TSV file.

    The output is written next to the input, with the extension replaced
    by ``.tsv``.  The column headers are the dot-joined key paths of the
    leaf values of the first object; every object (the first one included)
    then contributes one row holding its values at those paths.

    An empty list produces an empty output file.  A KeyError propagates if
    an object lacks a key path that is present in the first object.
    """
    SEPARATOR = u'\t'
    EXTENSION = 'tsv'

    path = os.path.splitext(name)[0]
    out_name = '{0}.{1}'.format(path, EXTENSION)

    def clean(text):
        # Drop separator characters so a value cannot spill into the
        # neighbouring column.
        return u'{0}'.format(text).replace(SEPARATOR, u'')

    def write_out(outfile, vals):
        line = SEPARATOR.join(vals)
        outfile.write(u'{0}\n'.format(line).encode('utf-8'))

    # The input is only read, so do not ask for write access.  Parse it
    # before touching the output so a malformed input does not leave a
    # truncated output file behind.
    with open(name, 'rb') as infile:
        data = json.loads(infile.read())

    with open(out_name, 'wb') as outfile:
        if len(data) > 0:
            headers = [x[1] for x in iter_obj(data[0])]
            write_out(outfile, [u'.'.join(header) for header in headers])

            # Include data[0] as well: it defines the headers but is also
            # a record in its own right.
            for obj in data:
                values = [clean(val_from_namespace(obj, header)) for header in headers]
                write_out(outfile, values)
# Script entry point: convert every file named on the command line.
if __name__ == '__main__':
    targets = sys.argv[1:]

    # Without any file arguments, show the usage line and stop.
    if not targets:
        print('usage {0} [file 1] [file 2] ...'.format(sys.argv[0]))
        sys.exit(0)

    for target in targets:
        convert(target)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| [ | |
| { "_id" : { "$oid" : "507dc7905ef6dc0006e4cf83"} , "data" : { "power" : 1 } , "uploader" : "alice" , "created" : { "$date" : "2012-10-16T20:46:08.154Z"}} | |
| ,{ "_id" : { "$oid" : "507dc7905ef6dc0006e4cf84"} , "data" : { "power" : 10} , "uploader" : "bob" , "created" : { "$date" : "2012-10-16T20:46:08.164Z"}} | |
| ,{ "_id" : { "$oid" : "507dc7905ef6dc0006e4cf85"} , "data" : { "power" : 100} , "uploader" : "carol" , "created" : { "$date" : "2012-10-16T20:46:08.174Z"}} | |
| ,{ "_id" : { "$oid" : "507dc7905ef6dc0006e4cf86"} , "data" : { "power" : 1000} , "uploader" : "david" , "created" : { "$date" : "2012-10-16T20:46:08.184Z"}} | |
| ] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| uploader | created.$date | data.power | _id.$oid | |
|---|---|---|---|---|
| bob | 2012-10-16T20:46:08.164Z | 10 | 507dc7905ef6dc0006e4cf84 | |
| carol | 2012-10-16T20:46:08.174Z | 100 | 507dc7905ef6dc0006e4cf85 | |
| david | 2012-10-16T20:46:08.184Z | 1000 | 507dc7905ef6dc0006e4cf86 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment