Last active
May 28, 2022 05:55
-
-
Save wesleyegberto/f7df5b38b31c23daba112cb4e62e08d2 to your computer and use it in GitHub Desktop.
Simple class to exporter a JSON file to DataFrame and CSV
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This was created to export any JSON format to CSV. | |
# I've created to help extract MongoDb collection to one or more CSV, [hopefully] in a normalized tables. | |
import pandas as pd | |
class JsonToDataFrameCsvExporter(object): | |
""" Simple exporter to help transform a JSON into a DataFrame CSV file. | |
The main purpose of this exporter is to export a JSON as a collection of normalized | |
table in CSV file. | |
- the `default_id_field_name` will be used to get the ID of the received object (and its nested objects); | |
- each attribute of type object will be prefixed with `{field_name}_{nested_field_name}`; | |
- each attribute of type list will be exported in another CSV file (each nested list will also generate another CSV file); | |
- each CSV file from attribute of type list will have the name: `{root_field}_{nested_field?}.csv`; | |
- when exporting a new CSV file, the ID field from its parent will be also exported as `{root_field}_id`; | |
""" | |
def __init__(self, default_id_field_name, debug=False): | |
self.dataframes = {} | |
self.default_id_field_name = default_id_field_name | |
self.debug = debug | |
def get_dataframe_by_table_name(self, table_name) -> pd.DataFrame: | |
if table_name in self.dataframes: | |
return self.dataframes[table_name] | |
return pd.DataFrame() | |
def append_rows_to_table(self, table_name, rows): | |
df = self.get_dataframe_by_table_name(table_name) | |
df = df.append(rows, ignore_index=True) | |
self.dataframes[table_name] = df | |
if (self.debug): | |
print('=== File ' + table_name) | |
print(df) | |
def export_all_dataframes(self): | |
for k, df in self.dataframes.items(): | |
df.to_csv(k + '.csv', sep=';') | |
def normalize_json(self, data: dict) -> dict: | |
new_data = dict() | |
for key, value in data.items(): | |
if isinstance(value, dict): | |
for k, v in value.items(): | |
new_data[key + "_" + k] = v | |
else: | |
new_data[key] = value | |
return new_data | |
def _normalize_field_list(self, data_list: list) -> list: | |
obj_items = [data for data in data_list if isinstance(data, dict)] | |
if len(obj_items) == 0: | |
return data_list | |
return [self._normalize_field_obj(data) for data in data_list] | |
def _normalize_field_obj(self, data: dict) -> dict: | |
for k, val in data.items(): | |
if isinstance(val, list): | |
data[k] = self._normalize_field_list(val) | |
elif isinstance(val, dict): | |
data[k] = self._normalize_field_obj(val) | |
return self.normalize_json(data) | |
def _export_field_list(self, prefix: str, parent_id: str, key: str, data_list: list): | |
attr_table_name = prefix + '_' + key | |
obj_items = [data for data in data_list if isinstance(data, dict)] | |
if len(obj_items) == 0: | |
rows = [{prefix + '_id': parent_id, key: v } for v in data_list] | |
self.append_rows_to_table(attr_table_name, rows) | |
return | |
for v in data_list: | |
if isinstance(v, dict): | |
v[prefix + '_id'] = parent_id | |
id = parent_id | |
if self.default_id_field_name in v: | |
id = v[self.default_id_field_name] | |
self._export_obj_with_id(attr_table_name, id, v) | |
def _export_obj_with_id(self, table_name: str, id: str, obj: dict): | |
""" Export the JSON object to CSV (one or more CSV). """ | |
norm_obj = self._normalize_field_obj(obj) | |
keys_list = [k for k,v in norm_obj.items() if isinstance(v, list)] | |
if len(keys_list) > 0: | |
# exporta os atributos de listas separadamente | |
for k in keys_list: | |
val = norm_obj[k] | |
self._export_field_list(table_name, id, k, val) | |
del norm_obj[k] | |
self.append_rows_to_table(table_name, norm_obj) | |
def export_obj(self, table_name: str, obj): | |
""" Export the JSON object to CSV (one or more CSV). """ | |
self._export_obj_with_id(table_name, obj[self.default_id_field_name], obj) | |
def export_list_objs(self, table_name: str, list_objs: list): | |
""" Export the list of JSON object to CSV (one or more CSV). """ | |
for obj in list_objs: | |
self._export_obj_with_id(table_name, obj[self.default_id_field_name], obj) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
json = { | |
'_id': '42', | |
'name': 'Odair', | |
'age': '30', | |
'langs': ['pt', 'en'], | |
'info': { | |
'game': 'mario', | |
'country': 'BR', | |
'geo': { | |
'lat': 100, | |
'lon': 199 | |
} | |
}, | |
'consoles': [ | |
{ | |
'code': 'PS4', | |
'name': 'PlayStation 4', | |
'attr': { | |
'stars': 5, | |
'manufacturer': 'Sony' | |
}, | |
'comm': [ | |
'very good', | |
'awesome' | |
] | |
}, | |
{ | |
'code': 'XOX', | |
'name': 'Xbox One X', | |
'attr': { | |
'stars': 4, | |
'manufacturer': 'Microsoft' | |
}, | |
'comm': [] | |
} | |
] | |
} | |
exporter = JsonToDataFrameExporter('_id') | |
exporter.export_obj('persons', json) | |
exporter.export_obj('persons', json) | |
exporter.export_all_dataframes() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment