Skip to content

Instantly share code, notes, and snippets.

@mgaitan
Last active July 22, 2021 22:53
Show Gist options
  • Save mgaitan/2557f205fad9db5b460a26557781f242 to your computer and use it in GitHub Desktop.
Save mgaitan/2557f205fad9db5b460a26557781f242 to your computer and use it in GitHub Desktop.
Given a JSON Lines file, write the same content in CSV format
from pathlib import Path
import argparse
import jsonlines
import csv
def _select_headers(first, include=None, exclude=None):
    """Return the ordered list of CSV columns derived from the first item.

    Columns follow the order of ``include`` when it is given, otherwise the
    key order of ``first``; names listed in ``exclude`` are then removed.

    Raises:
        ValueError: if ``include`` or ``exclude`` names a key that ``first``
            does not have.
    """
    if include:
        # dict.fromkeys drops duplicates while keeping the requested order.
        headers = list(dict.fromkeys(include))
        unknown = [name for name in headers if name not in first]
        if unknown:
            raise ValueError(
                'unknown headers to include: {}'.format(unknown)
            )
    else:
        headers = list(first.keys())

    if exclude:
        unknown = [name for name in exclude if name not in first]
        if unknown:
            raise ValueError(
                'unknown headers to exclude: {}'.format(unknown)
            )
        dropped = set(exclude)
        headers = [name for name in headers if name not in dropped]

    return headers


def jsonl2csv(filepath, type_=None, include=None, exclude=None):
    """Convert a JSON Lines file into a CSV file in the current directory.

    The output file takes the input's base name with a ``.csv`` suffix. The
    available columns are the keys of the first item that passes the type
    filter; later items are written with any extra keys ignored.

    Args:
        filepath: path of the JSON Lines file to read.
        type_: when truthy, only items whose ``'_type'`` value equals it
            are converted.
        include: column names to keep, in output order. Falsy means all.
        exclude: column names to drop from the selected columns.

    Raises:
        EOFError: if the input is exhausted before any item passes the
            type filter.
        ValueError: if ``include`` or ``exclude`` names a key that the
            first matching item does not have.
    """
    def valid_type(item):
        return not type_ or item.get('_type') == type_

    with jsonlines.open(filepath) as reader:
        # Find the first matching item; its keys define the known columns.
        while True:
            try:
                first = reader.read()
            except EOFError:
                raise EOFError(
                    'no matching item found in {}'.format(filepath)
                ) from None
            if valid_type(first):
                break

        # An ordered list (not a set) keeps the column order stable.
        headers = _select_headers(first, include, exclude)

        out_path = Path(Path(filepath).with_suffix('.csv').name)
        # newline='' is what the csv module requires to avoid blank rows on
        # Windows; an explicit encoding keeps non-ASCII data from depending
        # on the platform's default codec.
        with out_path.open('w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(
                f, extrasaction='ignore', fieldnames=headers
            )
            writer.writeheader()
            writer.writerow(first)
            # The reader continues from the item after ``first``.
            writer.writerows(obj for obj in reader if valid_type(obj))
def main(argv=None):
    """Parse command-line arguments and run the JSON Lines to CSV conversion.

    Args:
        argv: list of argument strings to parse. ``None`` (the default)
            makes argparse read them from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(
        description='Given a JSON Lines file, write the same content in CSV format'
    )
    parser.add_argument('infile')
    # The help text used to say "include" here, which described the wrong option.
    parser.add_argument('--exclude', help='Headers to exclude. Default to none', nargs='*')
    parser.add_argument('--include', help='Headers to include. Default to all', nargs='*')
    parser.add_argument('--type', help='Limit item to type')
    args = parser.parse_args(argv)
    jsonl2csv(args.infile, args.type, args.include, args.exclude)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
@boukeversteegh
Copy link

Awesome script, thank you!

To support Unicode characters in the JSON Lines input, I changed the following:

- with Path(Path(filepath).with_suffix('.csv').name).open('w') as f:
+ with Path(Path(filepath).with_suffix('.csv').name).open('w', encoding='utf-8-sig') as f:

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment