''' | |
Load Yelp JSON files and spit out CSV files | |
Does not try to reinvent the wheel and uses pandas json_normalize | |
Kinda hacky and requires a bit of RAM. But works, albeit naively. | |
Tested with Yelp JSON files in dataset challenge round 12: | |
https://www.yelp.com/dataset/challenge | |
''' | |
import json | |
from pathlib import Path | |
from time import clock | |
from typing import Dict, List | |
import click | |
import pandas as pd | |
from pandas.io.json import json_normalize | |
def read_json_as_array(json_file: Path) -> str: | |
''' | |
Read a given Yelp JSON file as string, adding opening / closing | |
brackets and commas to convert from separate JSON objects to | |
an array of JSON objects, so JSON aware libraries can properly read | |
Parameters | |
----------- | |
json_file: path-like | |
Returns | |
------- | |
json_data: str | |
String representation of JSON array | |
''' | |
json_data = '' | |
with open(json_file, 'r', encoding='utf-8') as in_file: | |
for i, line in enumerate(in_file): | |
if i == 0 and line: | |
json_data += '[' + line | |
elif line: | |
json_data += ',' + line | |
else: | |
pass | |
json_data += ']\n' | |
return json_data | |
def load_json(json_data: str) -> pd.DataFrame: | |
''' | |
Read and normalize a given JSON array into a pandas DataFrame | |
Parameters | |
----------- | |
json_data: str | |
String representation of JSON array | |
Returns | |
------- | |
df: pandas.DataFrame | |
DataFrame containing the normalized JSON data | |
''' | |
data = json.loads(json_data) | |
df = json_normalize(data) | |
return df | |
def write_csv(df: pd.DataFrame, out_file: Path) -> None: | |
''' | |
Write a given DataFrame to csv without index | |
Parameters | |
----------- | |
df: pandas.DataFrame | |
DataFrame containing the normalized JSON data | |
out_file: pathlib.Path | |
A proper path of CSV file name | |
''' | |
df.to_csv(out_file, index=False) | |
@click.command() | |
@click.argument('json-dir', type=click.Path(exists=True, dir_okay=True)) | |
def main(json_dir): | |
''' | |
Read a given directory containing Yelp JSON data and convert those | |
files to CSV under 'csv_out' in the same directory | |
''' | |
t0 = clock() | |
json_dir = Path(json_dir) | |
csv_dir = json_dir / 'csv_out' | |
csv_dir.mkdir(exist_ok=True) | |
file_list: List[Path] = json_dir.glob('*.json') | |
with click.progressbar(file_list, label='Processing files..') as bar: | |
for file in bar: | |
csv_file = csv_dir / (file.stem + '.csv') | |
data = read_json_as_array(file) | |
df = load_json(data) | |
write_csv(df, csv_file) | |
t1 = clock() | |
mins = (t1 - t0) // 60 | |
secs = int((t1 - t0) % 60) | |
timing = f'Conversion finished in {mins} minutes and {secs} seconds' | |
click.secho(timing, fg='green') | |
if __name__ == '__main__': | |
main() # pylint: disable=E1120 |
Hmm, it works fine on my setup with Python 3.6, there shouldn't be any syntactical difference with Python 3.7. Can you give a bit more details? Do you have pandas
library installed?
Hi, I am getting this error:
json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 2 column 1 (char 3)
How can I solve it?
Hi,
I got this error:
line 86:
file_list: List[Path] = json_dir.glob('*.json')
^
SyntaxError: invalid syntax
May you please help me what it refers to?
another abandoned github
<;( ...conversion of any JSON data source, no matter how deep or complex, to a relational dataset with named columns must be an extremely difficult problem. I've researched it for weeks and can't find a solution other than the usual easy and simple.
I'm getting an error:
File "./json_to_csv.py", line 20
def read_json_as_array(json_file: Path) -> str:
^
SyntaxError: invalid syntax
with Python 3.7. Can you help with this?