Last active
September 8, 2020 01:53
-
-
Save ryansmccoy/5a85720e6f12ea9e3829b6b84741c3a0 to your computer and use it in GitHub Desktop.
Converts CSV to Parquet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Converts CSV file to Parquet | |
This module converts a CSV file to a Parquet file | |
Example: | |
$ python csv2parquet.py --input-filepath example.csv --output-filepath example.parquet | |
""" | |
import logging | |
import subprocess | |
import sys | |
import time | |
import pkg_resources | |
# Root logging configuration for the script: INFO and above, each record
# prefixed with a month/day/year timestamp, the level and the logger name.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
_LOG_DATE_FORMAT = '%m/%d/%Y %H:%M:%S'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, datefmt=_LOG_DATE_FORMAT)

# Module-level logger used by the rest of this file.
logger = logging.getLogger(__name__)
# Bootstrap: make sure the third-party packages this script imports are
# installed for the running interpreter, installing any that are absent.
required = {'pandas', 'click'}
installed = {pkg.key for pkg in pkg_resources.working_set}
missing = required - installed

# DataFrame.to_parquet needs a Parquet engine. Either pyarrow or fastparquet
# satisfies pandas, so pyarrow is only requested when neither is present.
if not installed & {'pyarrow', 'fastparquet'}:
    missing.add('pyarrow')

if missing:
    import importlib

    # Use the current interpreter's pip so packages land in the environment
    # that is actually running this script.
    python = sys.executable
    logger.info("Installing missing packages: %s", ", ".join(sorted(missing)))
    subprocess.check_call([python, '-m', 'pip', 'install', *sorted(missing)],
                          stdout=subprocess.DEVNULL)
    # check_call blocks until pip has exited, so no fixed sleep is needed;
    # refreshing the import system's finder caches is what lets this process
    # see the packages that were just written to disk.
    importlib.invalidate_caches()
import csv | |
import click | |
import pandas as pd | |
class DataProcessor(object):
    """Example CSV class to read data files."""

    @classmethod
    def read_csv(cls, input_file):
        """Read a comma-separated file and return all of its rows.

        Args:
            input_file: Path of the CSV file to read.

        Returns:
            list: One list of field strings per record, in file order.
            An empty file yields an empty list.
        """
        # newline="" hands line-ending handling to the csv module, so line
        # breaks embedded in quoted fields are returned exactly as written
        # instead of being rewritten by universal-newline translation.
        with open(input_file, "r", newline="") as f:
            return list(csv.reader(f, delimiter=","))
@click.command()
@click.option('--input-filepath', required=True,
              type=click.Path(resolve_path=True, readable=True, exists=True,
                              file_okay=True, dir_okay=False),
              help='Filepath to input CSV file')
@click.option('--output-filepath', required=True,
              type=click.Path(resolve_path=True, readable=False, writable=True,
                              exists=False, file_okay=True, dir_okay=False),
              help='Filepath to Output Parquet file')
def main(input_filepath, output_filepath):
    """Converts CSV file to Parquet file

    The input must be an existing, readable file. The output may or may not
    exist yet; if it does exist it must be a writable file, not a directory.

    Args:
        input_filepath (str): CSV file
        output_filepath (str): Parquet file

    Returns:
        int: 0 on success. On any conversion failure the error is logged and
        the process exits with status 1.
    """
    try:
        df = pd.read_csv(input_filepath)
        df.to_parquet(output_filepath)
    except Exception as e:
        # Top-level CLI boundary: report any pandas/engine failure as a log
        # line and a non-zero exit status instead of a raw traceback.
        logger.error("Error! Could not Complete with Pandas\t%s", e)
        sys.exit(1)
    logger.info("Success! Output Filepath:\t%s", output_filepath)
    return 0
# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment