Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Yelp Dataset Challenge JSON to CSV conversion
'''
Load Yelp JSON files and spit out CSV files
Does not try to reinvent the wheel and uses pandas json_normalize
Kinda hacky and requires a bit of RAM. But works, albeit naively.
Tested with Yelp JSON files in dataset challenge round 12:
https://www.yelp.com/dataset/challenge
'''
import json
from pathlib import Path
from time import clock
from typing import Dict, List
import click
import pandas as pd
from pandas.io.json import json_normalize
def read_json_as_array(json_file: Path) -> str:
'''
Read a given Yelp JSON file as string, adding opening / closing
brackets and commas to convert from separate JSON objects to
an array of JSON objects, so JSON aware libraries can properly read
Parameters
-----------
json_file: path-like
Returns
-------
json_data: str
String representation of JSON array
'''
json_data = ''
with open(json_file, 'r', encoding='utf-8') as in_file:
for i, line in enumerate(in_file):
if i == 0 and line:
json_data += '[' + line
elif line:
json_data += ',' + line
else:
pass
json_data += ']\n'
return json_data
def load_json(json_data: str) -> pd.DataFrame:
'''
Read and normalize a given JSON array into a pandas DataFrame
Parameters
-----------
json_data: str
String representation of JSON array
Returns
-------
df: pandas.DataFrame
DataFrame containing the normalized JSON data
'''
data = json.loads(json_data)
df = json_normalize(data)
return df
def write_csv(df: pd.DataFrame, out_file: Path) -> None:
'''
Write a given DataFrame to csv without index
Parameters
-----------
df: pandas.DataFrame
DataFrame containing the normalized JSON data
out_file: pathlib.Path
A proper path of CSV file name
'''
df.to_csv(out_file, index=False)
@click.command()
@click.argument('json-dir', type=click.Path(exists=True, dir_okay=True))
def main(json_dir):
'''
Read a given directory containing Yelp JSON data and convert those
files to CSV under 'csv_out' in the same directory
'''
t0 = clock()
json_dir = Path(json_dir)
csv_dir = json_dir / 'csv_out'
csv_dir.mkdir(exist_ok=True)
file_list: List[Path] = json_dir.glob('*.json')
with click.progressbar(file_list, label='Processing files..') as bar:
for file in bar:
csv_file = csv_dir / (file.stem + '.csv')
data = read_json_as_array(file)
df = load_json(data)
write_csv(df, csv_file)
t1 = clock()
mins = (t1 - t0) // 60
secs = int((t1 - t0) % 60)
timing = f'Conversion finished in {mins} minutes and {secs} seconds'
click.secho(timing, fg='green')
if __name__ == '__main__':
main() # pylint: disable=E1120
@sangprk
Copy link

sangprk commented Sep 18, 2018

I'm getting an error:
File "./json_to_csv.py", line 20
def read_json_as_array(json_file: Path) -> str:
^
SyntaxError: invalid syntax

with Python 3.7. Can you help with this?

@emredjan
Copy link
Author

emredjan commented Oct 7, 2018

Hmm, it works fine on my setup with Python 3.6, there shouldn't be any syntactical difference with Python 3.7. Can you give a bit more details? Do you have pandas library installed?

@robertosannazzaro
Copy link

robertosannazzaro commented Jan 21, 2019

Hi, I am getting this error:
json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 2 column 1 (char 3)
How can I solve it?

@eirtech
Copy link

eirtech commented Mar 5, 2019

Hi,
I got this error:
line 86:
file_list: List[Path] = json_dir.glob('*.json')
^
SyntaxError: invalid syntax

May you please help me what it refers to?

@news1537
Copy link

news1537 commented Feb 2, 2020

another abandoned github

@Slowly
Copy link

Slowly commented Feb 17, 2020

<;( ...conversion of any JSON data source, no matter how deep or complex, to a relational dataset with named columns must be an extremely difficult problem. I've researched it for weeks and can't find a solution other than the usual easy and simple.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment