Last active
July 22, 2021 22:53
-
-
Save mgaitan/2557f205fad9db5b460a26557781f242 to your computer and use it in GitHub Desktop.
Given a JSON Lines file, write the same content to a file in CSV format.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
import argparse | |
import jsonlines | |
import csv | |
def jsonl2csv(filepath, type_=None, include=None, exclude=None):
    """Convert a JSON Lines file into a CSV file in the current directory.

    The output file takes the input's base name with a ``.csv`` suffix and
    is created in the current working directory. The CSV columns are derived
    from the first record that passes the type filter; keys that only appear
    in later records are ignored.

    Args:
        filepath: Path of the JSON Lines input file.
        type_: If truthy, only records whose ``'_type'`` value equals this
            are written.
        include: Optional iterable of header names to keep, in the order
            the columns should appear. Falsy means "all headers of the
            first matching record", in that record's key order.
        exclude: Optional iterable of header names to drop.

    Raises:
        EOFError: If the file contains no record passing the type filter.
        ValueError: If ``include`` or ``exclude`` names a header that is
            not a key of the first matching record.
    """
    def valid_type(item):
        """Return True when *item* passes the optional ``_type`` filter."""
        if type_:
            return item.get('_type') == type_
        return True

    with jsonlines.open(filepath) as reader:
        # Find the first record that passes the filter; its keys define
        # the set of known headers.
        try:
            first = reader.read()
            while not valid_type(first):
                first = reader.read()
        except EOFError:
            raise EOFError(
                'no matching record found in {!r}'.format(str(filepath))
            ) from None

        # Use ordered lists rather than sets so the column order is
        # deterministic: the caller's order for ``include``, otherwise the
        # key order of the first record.
        if include:
            headers = list(dict.fromkeys(include))
            unknown = [name for name in headers if name not in first]
            if unknown:
                raise ValueError(
                    'unknown headers in include: {}'.format(unknown)
                )
        else:
            headers = list(first.keys())

        if exclude:
            dropped = set(exclude)
            unknown = sorted(name for name in dropped if name not in first)
            if unknown:
                raise ValueError(
                    'unknown headers in exclude: {}'.format(unknown)
                )
            headers = [name for name in headers if name not in dropped]

        out_path = Path(Path(filepath).with_suffix('.csv').name)
        # newline='' is required by the csv module to avoid doubled line
        # endings on some platforms; an explicit UTF-8 encoding keeps
        # non-ASCII values from failing on non-UTF-8 locales.
        with out_path.open('w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, extrasaction='ignore', fieldnames=headers)
            writer.writeheader()
            writer.writerow(first)
            for obj in reader:
                if valid_type(obj):
                    writer.writerow(obj)
def main(argv=None):
    """Parse command-line arguments and run the JSON Lines to CSV conversion.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default), argparse reads the arguments from ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('infile')
    # The help text previously said "include" for this option as well.
    parser.add_argument('--exclude', help='Headers to exclude. Default to none', nargs='*')
    parser.add_argument('--include', help='Headers to include. Default to all', nargs='*')
    parser.add_argument('--type', help='Limit item to type')
    args = parser.parse_args(argv)
    jsonl2csv(args.infile, args.type, args.include, args.exclude)
# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Awesome script, thank you!
To support unicode characters in the JsonLines input, I changed the following: