Skip to content

Instantly share code, notes, and snippets.

@jpmckinney
Created July 20, 2019 02:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jpmckinney/decf9ffc6c7498ad1381c2bbb4545736 to your computer and use it in GitHub Desktop.
Save jpmckinney/decf9ffc6c7498ad1381c2bbb4545736 to your computer and use it in GitHub Desktop.
Test the performance of approaches to flattening JSON to CSV.

simple-one-table.py

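Filling a single dict that is passed by reference through the recursion is the fastest approach. Recursively returning and merging dicts is the slowest; recursively returning lists of tuples falls in between.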

dict:

return {key: obj}
…
d = {}
…
d.update(parse(value, path + '/' + str(key)))
…
row = parse(entry)

tuples:

return [(key, obj)]
…
d = []
…
d.extend(parse(value, path + '/' + str(key)))
…
row = dict(parse(entry))
import argparse
import csv
from collections import OrderedDict
from tempfile import TemporaryFile
# Requires yajl and cffi. On MacOS: brew install yajl; pip install cffi
# See https://github.com/ICRAR/ijson/pull/1
import ijson.backends.yajl2_cffi as ijson
def build_row(obj, row, path):
    """
    Flatten a parsed JSON value into ``row``, in place.

    Every leaf (any value that is neither a dict nor a list) is stored in
    ``row`` under a key made by joining the path components that lead to it
    with '/'. Dict keys and list indices are converted with ``str`` to form
    the path components. Empty dicts and empty lists contribute no keys.

    The traversal uses an explicit stack rather than recursion, so deeply
    nested input cannot raise ``RecursionError``. Leaves are stored in the
    same depth-first, document order a recursive walk would produce.

    :param obj: the value to flatten
    :param dict row: the mapping that receives the flattened keys
    :param tuple path: the path components leading to ``obj``
    :returns: None; ``row`` is modified in place
    """
    pending = [(obj, path)]
    while pending:
        node, trail = pending.pop()
        if isinstance(node, dict):
            children = list(node.items())
        elif isinstance(node, list):
            children = list(enumerate(node))
        else:
            row['/'.join(trail)] = node
            continue
        # Push in reverse so that the first child is the next one popped,
        # which keeps leaves in document order.
        for name, child in reversed(children):
            pending.append((child, trail + (str(name),)))
def main():
    """
    Convert a JSON array into a CSV file, writing one CSV row per array item.

    Items are read one at a time from ``ijson.items`` and flattened with
    ``build_row``. The full set of columns is only known once the last item
    has been read, so rows are first spooled to a temporary CSV file and then
    copied to the output file with the columns sorted by name.

    Command-line arguments: the input JSON file and the output CSV file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file', help='the input JSON file')
    parser.add_argument('output_file', help='the output CSV file')
    args = parser.parse_args()

    # Used as an ordered set: the keys are the column names, in order of first
    # appearance. Only keys are stored, so no row values are kept alive.
    fieldnames = OrderedDict()

    # The scratch file is private to this function, so its encoding is pinned
    # to UTF-8 instead of depending on the locale.
    with TemporaryFile('w+', newline='', encoding='utf-8') as tmpfile:
        spool = csv.writer(tmpfile)
        with open(args.input_file, 'rb') as source:
            for entry in ijson.items(source, 'item'):
                row = {}
                build_row(entry, row, ())
                fieldnames.update(dict.fromkeys(row))
                # Spooled rows are positional. New columns are only ever
                # appended, so position i always belongs to the i-th name in
                # `fieldnames`, even though early rows are shorter.
                spool.writerow([row.get(fieldname) for fieldname in fieldnames])

        tmpfile.seek(0)
        columns = list(fieldnames)
        # newline='' is required by the csv module; without it, line endings
        # are translated a second time on platforms that use \r\n.
        with open(args.output_file, 'w', newline='') as target:
            writer = csv.DictWriter(target, fieldnames=sorted(columns))
            writer.writeheader()
            # csv.reader is used rather than csv.DictReader because DictReader
            # skips blank lines, which would silently drop items that had no
            # values when no column was known yet. Columns missing from a
            # short row are written as empty strings by DictWriter.
            for values in csv.reader(tmpfile):
                writer.writerow(dict(zip(columns, values)))


if __name__ == '__main__':
    main()
import argparse
import csv
import json
import sys
from decimal import Decimal
def build_row(obj, row, path):
    """
    Recursively flatten a parsed JSON value into ``row``, in place.

    A value that is neither a dict nor a list is a leaf: it is stored in
    ``row`` under the '/'-joined ``path``. For a dict or a list, each member
    is flattened in turn, with its key or index (converted with ``str``)
    appended to ``path``. Empty dicts and empty lists contribute no keys.

    :param obj: the value to flatten
    :param dict row: the mapping that receives the flattened keys
    :param tuple path: the path components leading to ``obj``
    :returns: None; ``row`` is modified in place
    """
    is_mapping = isinstance(obj, dict)

    # Leaf: record the value and stop descending.
    if not is_mapping and not isinstance(obj, list):
        row['/'.join(path)] = obj
        return

    members = obj.items() if is_mapping else enumerate(obj)
    for name, member in members:
        build_row(member, row, path + (str(name),))
def main():
    """
    Convert a JSON array into CSV on standard output, one row per array item.

    The whole file is loaded with ``json.load`` (floats are parsed as
    ``Decimal``), each item is flattened with ``build_row``, and the rows are
    written with the columns sorted by name.

    Command-line argument: the JSON file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('file', help='the JSON file')
    args = parser.parse_args()

    # Binary mode lets the json module detect UTF-8, UTF-16 or UTF-32 itself,
    # rather than decoding the file with the locale's default encoding.
    with open(args.file, 'rb') as f:
        data = json.load(f, parse_float=Decimal)

    fieldnames = set()
    rows = []
    for entry in data:
        row = {}
        build_row(entry, row, ())
        fieldnames.update(row)
        rows.append(row)

    writer = csv.DictWriter(sys.stdout, fieldnames=sorted(fieldnames))
    writer.writeheader()
    writer.writerows(rows)


if __name__ == '__main__':
    main()
import csv
from glob import glob
# Rewrite every CSV file under flattened/ in place: parse it with csv.reader
# and write the same rows back with csv.writer's default dialect.
for filename in glob('flattened/*.csv'):
    # newline='' is required by the csv module on both sides: on read, it
    # keeps newlines embedded in quoted fields intact; on write, it prevents
    # the writer's \r\n from being translated again (\r\r\n on Windows).
    with open(filename, newline='') as f:
        rows = list(csv.reader(f))
    # The file is fully read and closed before it is reopened for writing.
    with open(filename, 'w', newline='') as f:
        csv.writer(f).writerows(rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment