Skip to content

Instantly share code, notes, and snippets.

@garethsenior
Created March 7, 2017 09:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save garethsenior/d67e6c7553461ab2f56f5036a0ff3ec8 to your computer and use it in GitHub Desktop.
Save garethsenior/d67e6c7553461ab2f56f5036a0ff3ec8 to your computer and use it in GitHub Desktop.
Trim GZipped CSV
We can make this file beautiful and searchable if this error is corrected: Illegal quoting in line 15.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import gzip
import os
"""
Usage:
    ./csv_trimmer.py <directory> <in_file_name | ALL> <search_token>
    ./csv_trimmer.py processed ALL ,8088,

Script that was used to slim mahoosive CSV files down to a more manageable size.
Trims each file down to only the lines that contain the search token and
writes the result to <directory>/trimmed/.
Kudos to @rbnbradford
"""
def get_cli_args():
    """Read the three positional command-line arguments from ``sys.argv``.

    Returns:
        A ``(directory, in_file_name, search_token)`` tuple taken from the
        first three arguments after the program name. Extra arguments are
        ignored.

    Raises:
        SystemExit: with status 1, after printing a usage message to stderr,
            when fewer than three arguments were supplied.
    """
    try:
        # Slicing never raises; the unpack does when fewer than three remain.
        directory, in_file_name, search_token = sys.argv[1:4]
    except ValueError:
        print("bad usage please provide space separated arguments for: " +
              "directory, in_file_name/'ALL', search_token\n" +
              "./csv_trimmer.py './im_a_directory' 'ALL' '2205'",
              file=sys.stderr)
        # Non-zero status so shells and calling scripts can detect the failure.
        sys.exit(1)
    return directory, in_file_name, search_token
def csv_trim(directory, in_file_name, search_token):
    """Trim one file, or every CSV file, in ``directory``.

    Ensures the ``trimmed`` sub-directory of ``directory`` exists, then
    dispatches to ``csv_trim_all`` when ``in_file_name`` is the literal
    string ``'ALL'`` and to ``csv_trim_single`` otherwise.

    Args:
        directory: Directory holding the input file(s).
        in_file_name: Name of a file inside ``directory``, or ``'ALL'``.
        search_token: Substring a line must contain to be kept.

    Raises:
        OSError: if the output directory cannot be created, or if its path
            already exists but is not a directory.
    """
    out_directory = os.path.join(directory, 'trimmed')
    try:
        os.mkdir(out_directory)
    except FileExistsError:
        # An existing directory is fine; anything else at that path is not.
        if not os.path.isdir(out_directory):
            raise
    if in_file_name == 'ALL':
        csv_trim_all(directory, search_token)
    else:
        csv_trim_single(directory, in_file_name, search_token)
def csv_trim_single(directory, file_name, search_token):
    """Copy the lines of one file that contain ``search_token``.

    Reads ``<directory>/<file_name>`` and writes the matching lines to
    ``<directory>/trimmed/<file_name>``. Files whose name ends in ``.gz``
    are read and written through ``gzip``; all others as plain text. The
    ``trimmed`` directory must already exist.

    Args:
        directory: Directory holding the input file.
        file_name: Name of the input file; reused as the output file name.
        search_token: Substring a line must contain to be kept.
    """
    in_filepath = os.path.join(directory, file_name)
    out_filepath = os.path.join(directory, 'trimmed', file_name)
    # Decide by suffix: a substring test would push plain files such as
    # 'a.gzlog.csv' through gzip and fail on them.
    opener = gzip.open if file_name.endswith('.gz') else open
    # newline='' disables newline translation on both sides, so the kept
    # lines retain their original line endings (e.g. CRLF).
    with opener(in_filepath, 'rt', newline='') as infile, \
            opener(out_filepath, 'wt', newline='') as outfile:
        outfile.writelines(line for line in infile if search_token in line)
def csv_trim_all(directory, search_token):
    """Trim every ``.csv`` / ``.csv.gz`` entry found directly in ``directory``.

    Each matching entry name is handed to ``csv_trim_single`` together with
    ``directory`` and ``search_token``; other entries are skipped.
    """
    wanted_suffixes = ('.csv', '.csv.gz')
    for entry in os.listdir(directory):
        if entry.endswith(wanted_suffixes):
            csv_trim_single(directory, entry, search_token)
if __name__ == '__main__':
    # Run only when executed as a script, so importing this module does not
    # parse sys.argv or touch the filesystem.
    csv_trim(*get_cli_args())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment