Skip to content

Instantly share code, notes, and snippets.

@wesleyegberto
Last active May 28, 2022 05:55
Show Gist options
  • Save wesleyegberto/f7df5b38b31c23daba112cb4e62e08d2 to your computer and use it in GitHub Desktop.
Save wesleyegberto/f7df5b38b31c23daba112cb4e62e08d2 to your computer and use it in GitHub Desktop.
Simple class to exporter a JSON file to DataFrame and CSV
# This was created to export any JSON format to CSV.
# I've created to help extract MongoDb collection to one or more CSV, [hopefully] in a normalized tables.
import pandas as pd
class JsonToDataFrameCsvExporter(object):
""" Simple exporter to help transform a JSON into a DataFrame CSV file.
The main purpose of this exporter is to export a JSON as a collection of normalized
table in CSV file.
- the `default_id_field_name` will be used to get the ID of the received object (and its nested objects);
- each attribute of type object will be prefixed with `{field_name}_{nested_field_name}`;
- each attribute of type list will be exported in another CSV file (each nested list will also generate another CSV file);
- each CSV file from attribute of type list will have the name: `{root_field}_{nested_field?}.csv`;
- when exporting a new CSV file, the ID field from its parent will be also exported as `{root_field}_id`;
"""
def __init__(self, default_id_field_name, debug=False):
self.dataframes = {}
self.default_id_field_name = default_id_field_name
self.debug = debug
def get_dataframe_by_table_name(self, table_name) -> pd.DataFrame:
if table_name in self.dataframes:
return self.dataframes[table_name]
return pd.DataFrame()
def append_rows_to_table(self, table_name, rows):
df = self.get_dataframe_by_table_name(table_name)
df = df.append(rows, ignore_index=True)
self.dataframes[table_name] = df
if (self.debug):
print('=== File ' + table_name)
print(df)
def export_all_dataframes(self):
for k, df in self.dataframes.items():
df.to_csv(k + '.csv', sep=';')
def normalize_json(self, data: dict) -> dict:
new_data = dict()
for key, value in data.items():
if isinstance(value, dict):
for k, v in value.items():
new_data[key + "_" + k] = v
else:
new_data[key] = value
return new_data
def _normalize_field_list(self, data_list: list) -> list:
obj_items = [data for data in data_list if isinstance(data, dict)]
if len(obj_items) == 0:
return data_list
return [self._normalize_field_obj(data) for data in data_list]
def _normalize_field_obj(self, data: dict) -> dict:
for k, val in data.items():
if isinstance(val, list):
data[k] = self._normalize_field_list(val)
elif isinstance(val, dict):
data[k] = self._normalize_field_obj(val)
return self.normalize_json(data)
def _export_field_list(self, prefix: str, parent_id: str, key: str, data_list: list):
attr_table_name = prefix + '_' + key
obj_items = [data for data in data_list if isinstance(data, dict)]
if len(obj_items) == 0:
rows = [{prefix + '_id': parent_id, key: v } for v in data_list]
self.append_rows_to_table(attr_table_name, rows)
return
for v in data_list:
if isinstance(v, dict):
v[prefix + '_id'] = parent_id
id = parent_id
if self.default_id_field_name in v:
id = v[self.default_id_field_name]
self._export_obj_with_id(attr_table_name, id, v)
def _export_obj_with_id(self, table_name: str, id: str, obj: dict):
""" Export the JSON object to CSV (one or more CSV). """
norm_obj = self._normalize_field_obj(obj)
keys_list = [k for k,v in norm_obj.items() if isinstance(v, list)]
if len(keys_list) > 0:
# exporta os atributos de listas separadamente
for k in keys_list:
val = norm_obj[k]
self._export_field_list(table_name, id, k, val)
del norm_obj[k]
self.append_rows_to_table(table_name, norm_obj)
def export_obj(self, table_name: str, obj):
""" Export the JSON object to CSV (one or more CSV). """
self._export_obj_with_id(table_name, obj[self.default_id_field_name], obj)
def export_list_objs(self, table_name: str, list_objs: list):
""" Export the list of JSON object to CSV (one or more CSV). """
for obj in list_objs:
self._export_obj_with_id(table_name, obj[self.default_id_field_name], obj)
json = {
'_id': '42',
'name': 'Odair',
'age': '30',
'langs': ['pt', 'en'],
'info': {
'game': 'mario',
'country': 'BR',
'geo': {
'lat': 100,
'lon': 199
}
},
'consoles': [
{
'code': 'PS4',
'name': 'PlayStation 4',
'attr': {
'stars': 5,
'manufacturer': 'Sony'
},
'comm': [
'very good',
'awesome'
]
},
{
'code': 'XOX',
'name': 'Xbox One X',
'attr': {
'stars': 4,
'manufacturer': 'Microsoft'
},
'comm': []
}
]
}
exporter = JsonToDataFrameExporter('_id')
exporter.export_obj('persons', json)
exporter.export_obj('persons', json)
exporter.export_all_dataframes()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment