# Created December 5, 2019 12:27
"""
Yelp Dataset Challenge JSON to CSV conversion

Load Yelp JSON files and write them out as CSV files.
Does not try to reinvent the wheel and uses pandas json_normalize.
Kinda hacky and requires a fair amount of RAM, but it works, albeit naively.
Tested with the Yelp JSON files from dataset challenge round 12.
"""
import json
from pathlib import Path
from time import perf_counter
from typing import Dict, List

import click
import pandas as pd

try:
    from time import clock
except ImportError:  # time.clock was removed in Python 3.8
    from time import perf_counter as clock

try:
    from pandas import json_normalize
except ImportError:  # pandas < 1.0 only exposes it under pandas.io.json
    from pandas.io.json import json_normalize
def read_json_as_array(json_file: Path) -> str:
    """
    Read a given Yelp JSON file as a string holding one JSON array.

    The input holds one JSON object per line. Opening / closing brackets
    and separating commas are added so the separate objects become an
    array of JSON objects that JSON aware libraries can properly read.
    Blank or whitespace-only lines are skipped, and a file without any
    objects produces an empty array (``[]``), so the result is always a
    syntactically complete array.

    Parameters
    ----------
    json_file: path-like
        Path of the line-delimited JSON file, read as UTF-8

    Returns
    -------
    json_data: str
        String representation of JSON array
    """
    with open(json_file, 'r', encoding='utf-8') as in_file:
        # A whitespace-only line carries no object; keeping it would leave
        # a dangling comma in the array.
        records = [line for line in in_file if line.strip()]

    # Every kept line still ends with its own newline, so joining on a bare
    # comma keeps one object per line in the output.
    return '[' + ','.join(records) + ']\n'
def load_json(json_data: str) -> pd.DataFrame:
    """
    Parse a JSON array string and flatten it into a pandas DataFrame.

    Parameters
    ----------
    json_data: str
        String representation of JSON array

    Returns
    -------
    df: pandas.DataFrame
        DataFrame containing the normalized JSON data
    """
    # Decode with the standard json module, then hand the decoded objects
    # to json_normalize to build the table.
    return json_normalize(json.loads(json_data))
def write_csv(df: pd.DataFrame, out_file: Path) -> None:
    """
    Save a DataFrame as a CSV file, leaving out the index column.

    Parameters
    ----------
    df: pandas.DataFrame
        DataFrame containing the normalized JSON data
    out_file: pathlib.Path
        A proper path of CSV file name
    """
    df.to_csv(path_or_buf=out_file, index=False)
@click.command()
@click.argument('json-dir',
                type=click.Path(exists=True, file_okay=False, dir_okay=True))
def main(json_dir):
    """
    Read a given directory containing Yelp JSON data and convert those
    files to CSV under 'csv_out' in the same directory.

    Parameters
    ----------
    json_dir: path-like
        Existing directory whose '*.json' files are converted; each one is
        written to 'csv_out/<stem>.csv' inside that directory
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # supported way to measure an elapsed interval.
    t0 = perf_counter()
    json_dir = Path(json_dir)
    csv_dir = json_dir / 'csv_out'
    # The output directory has to exist before any CSV can be written.
    csv_dir.mkdir(exist_ok=True)
    # glob() yields lazily; a sorted list gives the progress bar a known
    # length and makes the processing order deterministic.
    file_list: List[Path] = sorted(json_dir.glob('*.json'))
    with click.progressbar(file_list, label='Processing files..') as bar:
        for file in bar:
            csv_file = csv_dir / (file.stem + '.csv')
            data = read_json_as_array(file)
            df = load_json(data)
            write_csv(df, csv_file)
    # Whole minutes and seconds, so the message never shows "1.0 minutes".
    mins, secs = divmod(int(perf_counter() - t0), 60)
    timing = f'Conversion finished in {mins} minutes and {secs} seconds'
    click.secho(timing, fg='green')


if __name__ == '__main__':
    main()  # pylint: disable=E1120
Nov05 commented Dec 5, 2019

2019-12-05 labs 19 project

