Skip to content

Instantly share code, notes, and snippets.

@Attumm
Created September 11, 2023 11:54
Show Gist options
  • Save Attumm/84431f7cc507d2673965497c883f468a to your computer and use it in GitHub Desktop.
Save Attumm/84431f7cc507d2673965497c883f468a to your computer and use it in GitHub Desktop.
CSV writer for handling datasets that contain one JSON object per line.
import csv
import os
class CSVWriter:
    """
    A context manager for writing formatted data to CSV files.

    This class handles the creation and writing of data to CSV files. It
    infers the CSV headers from the first data entry for each table. The CSV
    files are saved in the specified directory, which is created on demand
    if it does not exist yet.

    Attributes:
        base_dir_data (str, optional): The directory where CSV files will be
            saved. Defaults to 'files'.
        file_handlers (dict): Open file handlers, keyed by table name.
        csv_writers (dict): ``csv.DictWriter`` objects, keyed by table name.

    Example:
        with CSVWriter() as csv_writer:
            for item in data_set():
                table_name = item["name"]
                formatted_data = create_formatted_data(item)
                csv_writer.write(table_name, formatted_data)

    Usage:
        >>> data = [{'id': 1, 'name': 'John', 'age': 25}, {'id': 2, 'name': 'Jane', 'age': 28}]
        >>> with CSVWriter() as csv_writer:
        ...     for item in data:
        ...         csv_writer.write('users', item)
        >>> # This will create a 'users.csv' file in the 'files' directory with headers inferred from data.
    """

    def __init__(self, base_dir_data="files"):
        self.file_handlers = {}
        self.csv_writers = {}
        self.base_dir_data = base_dir_data

    def write(self, table_name, formatted_data):
        """
        Write the formatted data to the specified table's CSV file.

        If this is the first data entry for the table, the file is created
        and the headers are inferred from the formatted_data.

        Args:
            table_name (str): The name of the table (corresponds to the CSV
                file name).
            formatted_data (dict): The data to be written as a dictionary.

        Raises:
            ValueError: If formatted_data contains keys that are not part of
                the headers inferred from the table's first entry (raised by
                ``csv.DictWriter``).
        """
        if table_name not in self.file_handlers:
            self.setup(table_name, formatted_data)
        self.csv_writers[table_name].writerow(formatted_data)

    def setup(self, table_name, formatted_data):
        """
        Set up the file handler and CSV writer for a table.

        Creates the target directory if needed, opens (and truncates) the
        table's CSV file, and writes the header row.

        Args:
            table_name (str): The name of the table.
            formatted_data (dict): The data whose keys are used as headers.

        Raises:
            OSError: If the directory or file cannot be created.
        """
        path = f"{self.base_dir_data}/{table_name}.csv"
        parent_dir = os.path.dirname(path)
        if parent_dir:
            # The output directory is not guaranteed to exist on first use.
            os.makedirs(parent_dir, exist_ok=True)

        file_handler = open(path, 'w', newline='')
        try:
            # Snapshot the keys: a live ``dict.keys()`` view would change the
            # column set if the caller later mutates or reuses the first row.
            csv_writer = csv.DictWriter(file_handler, fieldnames=list(formatted_data))
            csv_writer.writeheader()
        except Exception:
            # The handle is not registered yet, so nobody else would close it.
            file_handler.close()
            raise

        self.file_handlers[table_name] = file_handler
        self.csv_writers[table_name] = csv_writer

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Close every open file, even if closing one of them fails.

        The first close error is re-raised afterwards, unless an exception is
        already propagating out of the ``with`` block (which is never masked
        or suppressed).
        """
        first_error = None
        for file_handler in self.file_handlers.values():
            try:
                file_handler.close()
            except OSError as err:
                if first_error is None:
                    first_error = err
        if first_error is not None and exc_type is None:
            raise first_error
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment