Skip to content

Instantly share code, notes, and snippets.

Forked from emredjan/
Created December 5, 2019 12:27
What would you like to do?
Yelp Dataset Challenge JSON to CSV conversion
Load Yelp JSON files and spit out CSV files
Does not try to reinvent the wheel and uses pandas json_normalize
Somewhat hacky and memory-hungry (each file is read fully into RAM), but it works, albeit naively.
Tested with the Yelp JSON files from dataset challenge round 12.
import json
from pathlib import Path
from time import perf_counter
from typing import Dict, List

import click
import pandas as pd

try:
    # ``time.clock`` was removed in Python 3.8; fall back to the modern
    # timer under the same name so existing ``clock()`` callers keep working.
    from time import clock
except ImportError:
    from time import perf_counter as clock

try:
    # pandas >= 1.0 exposes json_normalize at the package top level.
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize
def read_json_as_array(json_file: Path) -> str:
    """Read a line-delimited JSON file and return it as a JSON array string.

    The file is expected to hold one JSON object per line. The lines are
    joined with commas and wrapped in brackets so that JSON-aware libraries
    can parse the result as a single array.

    Parameters
    ----------
    json_file: path-like
        Path of the file to read (opened as UTF-8 text).

    Returns
    -------
    json_data: str
        String representation of a JSON array; ``'[]\\n'`` when the file
        contains no records.
    """
    with open(json_file, 'r', encoding='utf-8') as in_file:
        # Whitespace-only lines carry no record; keeping them would emit
        # empty array elements and make the result invalid JSON.
        records = [line for line in in_file if line.strip()]

    # Each record keeps its own trailing newline, so the layout for a
    # well-formed file is the same as concatenating line by line.
    return '[' + ','.join(records) + ']\n'
def load_json(json_data: str) -> pd.DataFrame:
    """Parse a JSON array string and normalize it into a pandas DataFrame.

    Parameters
    ----------
    json_data: str
        String representation of a JSON array.

    Returns
    -------
    df: pandas.DataFrame
        DataFrame containing the normalized JSON data, as produced by
        ``json_normalize`` on the decoded value.

    Raises
    ------
    json.JSONDecodeError
        If ``json_data`` is not valid JSON.
    """
    data = json.loads(json_data)
    df = json_normalize(data)
    return df
def write_csv(df: pd.DataFrame, out_file: Path) -> None:
    """Write a DataFrame to a CSV file without the index column.

    Parameters
    ----------
    df: pandas.DataFrame
        DataFrame containing the normalized JSON data.
    out_file: pathlib.Path
        Path of the CSV file to write. Missing parent directories are
        created first.
    """
    # to_csv does not create directories; without this the write fails
    # whenever the output folder has not been created yet.
    Path(out_file).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_file, index=False)
@click.command()
@click.argument('json-dir', type=click.Path(exists=True, dir_okay=True))
def main(json_dir):
    """Convert every ``*.json`` file in JSON_DIR to CSV.

    The CSV files are written under 'csv_out' in the same directory, one
    per input file and named after the input file's stem.

    Parameters
    ----------
    json_dir: str
        Directory containing the Yelp JSON data files.
    """
    # time.clock() no longer exists on Python 3.8+; perf_counter is the
    # documented replacement for measuring elapsed time.
    t0 = perf_counter()

    json_dir = Path(json_dir)
    csv_dir = json_dir / 'csv_out'
    # Nothing else creates the output folder, so make sure it exists
    # before the first CSV is written.
    csv_dir.mkdir(exist_ok=True)

    # glob() yields lazily; a concrete list matches the annotation and
    # gives the progress bar a length to report against. Sorting makes the
    # processing order deterministic.
    file_list: List[Path] = sorted(json_dir.glob('*.json'))

    with click.progressbar(file_list, label='Processing files..') as bar:
        for file in bar:
            csv_file = csv_dir / (file.stem + '.csv')
            data = read_json_as_array(file)
            df = load_json(data)
            write_csv(df, csv_file)

    # Whole seconds only, so the message reads "2 minutes", not "2.0 minutes".
    mins, secs = divmod(int(perf_counter() - t0), 60)

    timing = f'Conversion finished in {mins} minutes and {secs} seconds'
    click.secho(timing, fg='green')
# Run the converter only when executed as a script, not when imported.
if __name__ == '__main__':
    # The command-line framework supplies the argument, hence the
    # suppressed "no value for parameter" warning.
    main()  # pylint: disable=E1120
Copy link

Nov05 commented Dec 5, 2019

2019-12-05 labs 19 project

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment