Skip to content

Instantly share code, notes, and snippets.

@ryansmccoy
Last active September 8, 2020 01:53
Show Gist options
  • Save ryansmccoy/5a85720e6f12ea9e3829b6b84741c3a0 to your computer and use it in GitHub Desktop.
Save ryansmccoy/5a85720e6f12ea9e3829b6b84741c3a0 to your computer and use it in GitHub Desktop.
Converts CSV to Parquet
#!/usr/bin/env python
"""Converts CSV file to Parquet

This module reads a CSV file with pandas and writes it out as a Parquet file.

Example:
    $ python csv2parquet.py --input-filepath example.csv --output-filepath example.parquet
"""
import importlib
import logging
import subprocess
import sys
import time

import pkg_resources
# Configure logging once at import time: records at INFO and above are
# rendered as "<date time> - <level> - <logger name> - <message>".
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
)

# Logger used by the rest of this script, named after the module.
logger = logging.getLogger(__name__)
# Bootstrap: make sure the third-party packages imported further down are
# available, installing whatever is absent with pip for the running interpreter.
required = {'pandas', 'click'}
installed = {pkg.key for pkg in pkg_resources.working_set}

# DataFrame.to_parquet needs a Parquet engine. pandas accepts either pyarrow
# or fastparquet, so only request pyarrow when neither one is present.
if not installed & {'pyarrow', 'fastparquet'}:
    required.add('pyarrow')

missing = required - installed
if missing:
    python = sys.executable
    # check_call blocks until pip exits and raises CalledProcessError on a
    # non-zero exit status, so no extra waiting is needed afterwards.
    subprocess.check_call([python, '-m', 'pip', 'install', *sorted(missing)],
                          stdout=subprocess.DEVNULL)
    # Packages installed after interpreter start-up may be hidden by the
    # import system's directory caches; drop them so the imports below succeed.
    importlib.invalidate_caches()
import csv
import click
import pandas as pd
class DataProcessor(object):
    """Helper for loading CSV data files into memory."""

    @classmethod
    def read_csv(cls, input_file):
        """Read a comma-separated values file and return all of its rows.

        Args:
            input_file: Path of the CSV file to read.

        Returns:
            A list with one entry per CSV record, each entry being the list
            of string fields in that record. An empty file yields ``[]``.
        """
        # newline="" leaves line-ending handling to the csv module, as its
        # documentation requires; otherwise "\r\n" inside a quoted field is
        # silently rewritten to "\n" by universal-newline translation.
        with open(input_file, "r", newline="") as f:
            return list(csv.reader(f, delimiter=","))
# Command-line entry point. click validates both paths before the body runs:
# the input must be an existing, readable regular file; the output must not be
# a directory and, if it already exists, must be writable.
@click.command()
@click.option('--input-filepath', required=True,
              type=click.Path(resolve_path=True, readable=True, exists=True,
                              file_okay=True, dir_okay=False),
              help='Filepath to input CSV file')
@click.option('--output-filepath', required=True,
              type=click.Path(resolve_path=True, writable=True, exists=False,
                              file_okay=True, dir_okay=False),
              help='Filepath to Output Parquet file')
def main(input_filepath, output_filepath):
    """Converts CSV file to Parquet file

    Args:
        input_filepath (str): CSV file
        output_filepath (str): Parquet file
    """
    # NOTE: the docstring above doubles as the --help text click prints, so
    # its wording is user-facing.
    try:
        df = pd.read_csv(input_filepath)
        df.to_parquet(output_filepath)
    except Exception as e:
        # Top-level boundary of the CLI: report any failure from pandas and
        # exit with status 1 instead of surfacing a traceback.
        logger.error(f"Error! Could not Complete with Pandas\t{e}")
        sys.exit(1)
    logger.info(f"Success! Output Filepath:\t{output_filepath}")
    return 0
# Run the click command only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment