Created
March 7, 2017 09:40
-
-
Save garethsenior/d67e6c7553461ab2f56f5036a0ff3ec8 to your computer and use it in GitHub Desktop.
Trim GZipped CSV
We can make this file beautiful and searchable if this error is corrected: Illegal quoting in line 15.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
import sys | |
import gzip | |
import os | |
""" | |
Usage:
    csv_trimmer.py <directory> <in_file_name|ALL> <search_token>
    csv_trimmer.py processed ALL ,8088,
Script that was used to slim mahoosive CSV files down to a more manageable size.
Keeps only the lines that contain the search token, writing them to <directory>/trimmed/.
Pass ALL as the file name to process every .csv and .csv.gz file in the directory.
kudos to @rbnbradford
""" | |
def get_cli_args():
    """Read the three positional command-line arguments from ``sys.argv``.

    Any arguments after the third are ignored.

    Returns:
        A ``(directory, in_file_name, search_token)`` tuple taken from
        ``sys.argv[1:4]``.

    Raises:
        SystemExit: with status 1, after printing a usage hint to stderr,
            when fewer than three arguments were supplied.
    """
    try:
        # Slicing never raises; unpacking does when fewer than 3 are present.
        directory, in_file_name, search_token = sys.argv[1:4]
    except ValueError:
        print("bad usage please provide space separated arguments for: " +
              "directory, in_file_name/'ALL', search_token\n" +
              "./csv_trimmer.py './im_a_directory' 'ALL' '2205'",
              file=sys.stderr)
        # sys.exit is always available (unlike the site-provided exit()) and
        # a non-zero status lets calling shells/scripts detect the failure.
        sys.exit(1)
    return directory, in_file_name, search_token
def csv_trim(directory, in_file_name, search_token):
    """Trim one file, or every CSV file, in ``directory``.

    Ensures the ``<directory>/trimmed`` output folder exists, then delegates
    to ``csv_trim_all`` when ``in_file_name`` is the literal ``'ALL'`` and to
    ``csv_trim_single`` otherwise.

    Args:
        directory: Folder holding the input file(s); must already exist.
        in_file_name: Name of a file inside ``directory``, or ``'ALL'``.
        search_token: Substring a line must contain to be kept.

    Raises:
        OSError: if the output folder cannot be created for any reason other
            than already existing (e.g. ``directory`` is missing or not
            writable).
    """
    out_directory = '{}/{}'.format(directory, 'trimmed')
    try:
        # Create-and-catch instead of stat-then-create: no race window, and
        # only the "already there" case is tolerated; real errors surface.
        os.mkdir(out_directory)
    except FileExistsError:
        pass

    if in_file_name == 'ALL':
        csv_trim_all(directory, search_token)
    else:
        csv_trim_single(directory, in_file_name, search_token)
def csv_trim_single(directory, file_name, search_token):
    """Copy the lines of one file that contain ``search_token``.

    Reads ``<directory>/<file_name>`` and writes the matching lines, in
    order, to ``<directory>/trimmed/<file_name>``. Files whose name ends in
    ``.gz`` are read and written through ``gzip``; all others are handled as
    plain text. The ``trimmed`` folder must already exist.

    Args:
        directory: Folder containing the input file.
        file_name: Name of the input file; reused for the output file.
        search_token: Substring a line must contain to be kept.
    """
    in_filepath = '{}/{}'.format(directory, file_name)
    out_filepath = '{}/{}/{}'.format(directory, 'trimmed', file_name)
    # Decide by suffix, not substring: a name like 'plain.gz.csv' is plain
    # text and must not be opened with gzip.
    opn = gzip.open if file_name.endswith('.gz') else open
    with opn(in_filepath, 'rt') as infile, opn(out_filepath, 'wt') as outfile:
        # Stream line by line so very large inputs never sit in memory.
        outfile.writelines(line for line in infile if search_token in line)
def csv_trim_all(directory, search_token):
    """Trim every ``.csv`` and ``.csv.gz`` entry found in ``directory``.

    Each matching entry name is handed to ``csv_trim_single`` together with
    ``directory`` and ``search_token``; all other entries are skipped.
    """
    wanted_suffixes = ('.csv', '.csv.gz')
    for entry in os.listdir(directory):
        if not entry.endswith(wanted_suffixes):
            continue
        csv_trim_single(directory, entry, search_token)
# Script entry: read the CLI arguments, then run the trim with them.
cli_arguments = get_cli_args()
csv_trim(*cli_arguments)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment