Last active
December 9, 2020 20:11
-
-
Save JimHaughwout/9722807 to your computer and use it in GitHub Desktop.
Remove a series of columns from a CSV file. When you need to strip a huge and sparse CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
""" | |
Remove a series of columns from a CSV file. | |
Recommended for files saved in Windows CSV format. | |
Useful for situations where you need to strip a huge and sparse CSV file | |
(e.g., logs, sensor data, etc.) | |
Can easily be converted to function for real-time use. | |
: param source : csv_file you want to strip. Must end in .csv | |
: param first_column_number_to_strip : Must be >= 1 | |
: param nth_column_number_to_strip : Must be >= 1 | |
Result is a stripped destination csv of name format source_stripped.csv. | |
Usage: | |
drop_csv_columns foo.csv 2 4 9 => | |
Remove the 2nd, 4th and 9th columns from foo.csv | |
Save output to foo_stripped.csv | |
""" | |
import sys | |
import csv | |
from sys import argv | |
num_arguments = len(argv) | |
# Check usage and provide help. Apologies for PEP8 violoations here | |
if argv[1] in ('-h', '-help'): | |
print "Usage: %s input_file.csv 1st_column_to delete ... nth_column_to_delete" % argv[0] | |
sys.exit() | |
if num_arguments < 3: # Guidance on arguments to pass | |
usage = "Usage: %s input_file.csv 1st_column_to delete ... nth_column_to_delete" % argv[0] | |
error = "You passed only %d arguments" % num_arguments | |
sys.exit("%s -- %s" % (usage, error)) | |
if '.csv' not in argv[1]: # Ensure using a CSV file | |
usage = "Usage: %s input_file.csv 1st_column_to delete ... nth_column_to_delete" % argv[0] | |
error = "You passed %r for input_file.csv" % argv[1] | |
sys.exit("%s -- %s" % (usage, error)) | |
# Create the output file as input with _stripped before .csv extension | |
input_file = argv[1] | |
output_file = input_file.replace('.csv', '_stripped.csv') | |
# Ensure you can open the source and target files | |
try: | |
source = open(input_file, 'r') | |
except: | |
e = sys.exc_info()[0] | |
sys.exit("Error - Could not open input file %r: %s" % (input_file, e)) | |
try: | |
target = open(output_file, 'w') | |
except: | |
e = sys.exc_info()[0] | |
sys.exit("Error - Could not open output file %r: %s" % (output_file, e)) | |
print "\nStripping data from file %r into file %r" % (input_file, output_file) | |
reader = csv.reader(source) | |
writer = csv.writer(target) | |
# Now create a list of columns to delete | |
deleted_cols = [] | |
for i in range (2, num_arguments): # Skip script name and input filename | |
# Ensure you are passed column numbers | |
try: | |
col_to_delete = int(argv[i]) - 1 # Convert from human to computer | |
except: | |
e = sys.exc_info()[0] | |
sys.exit("Error - Column to delete %r is not an integer: %s." % \ | |
(argv[i], e)) | |
if col_to_delete < 0: | |
sys.exit("Error - Use column numbers from 1 to N") | |
else: # Create a tuple for each as described above | |
deleted_cols.append(col_to_delete) # Add to list of columns to delete | |
# Let the user know what you are going to delete | |
header = reader.next() | |
print 'Deleting these columns:' | |
for column in deleted_cols: | |
print ' Column: %3d (%r)' % (column, header[column]) | |
source.seek(0) | |
# Now loop through each row and remove the desired columns | |
num_rows = 0 | |
for row in reader: | |
num_rows += 1 | |
num_cols_deleted = 0 # Reset number cols deleted per row loop | |
for column in deleted_cols: | |
# Every time you pass through this loop, the number of columns | |
# is reduced by one. Factor this into the index to delete | |
column_to_delete = int(column) - num_cols_deleted | |
del row[column_to_delete] | |
num_cols_deleted += 1 | |
writer.writerow(row) # Say that three times quickly! | |
# Let the user know you are done | |
print 'SUCCESS: Removed %d columns from %d rows\n' % \ | |
(len(deleted_cols), num_rows) | |
# Be good and close the files | |
source.closed | |
target.closed |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment