Skip to content

Instantly share code, notes, and snippets.

@JimHaughwout
Last active December 9, 2020 20:11
Show Gist options
  • Save JimHaughwout/9722807 to your computer and use it in GitHub Desktop.
Save JimHaughwout/9722807 to your computer and use it in GitHub Desktop.
Remove a series of columns from a CSV file. When you need to strip a huge and sparse CSV file
#! /usr/bin/env python
"""
Remove a series of columns from a CSV file.
Recommended for files saved in Windows CSV format.
Useful for situations where you need to strip a huge and sparse CSV file
(e.g., logs, sensor data, etc.)
Can easily be converted to function for real-time use.
: param source : csv_file you want to strip. Must end in .csv
: param first_column_number_to_strip : Must be >= 1
: param nth_column_number_to_strip : Must be >= 1
Result is a stripped destination csv of name format source_stripped.csv.
Usage:
drop_csv_columns foo.csv 2 4 9 =>
Remove the 2nd, 4th and 9th columns from foo.csv
Save output to foo_stripped.csv
"""
import sys
import csv
from sys import argv
num_arguments = len(argv)
# Check usage and provide help. Apologies for PEP8 violoations here
if argv[1] in ('-h', '-help'):
print "Usage: %s input_file.csv 1st_column_to delete ... nth_column_to_delete" % argv[0]
sys.exit()
if num_arguments < 3: # Guidance on arguments to pass
usage = "Usage: %s input_file.csv 1st_column_to delete ... nth_column_to_delete" % argv[0]
error = "You passed only %d arguments" % num_arguments
sys.exit("%s -- %s" % (usage, error))
if '.csv' not in argv[1]: # Ensure using a CSV file
usage = "Usage: %s input_file.csv 1st_column_to delete ... nth_column_to_delete" % argv[0]
error = "You passed %r for input_file.csv" % argv[1]
sys.exit("%s -- %s" % (usage, error))
# Create the output file as input with _stripped before .csv extension
input_file = argv[1]
output_file = input_file.replace('.csv', '_stripped.csv')
# Ensure you can open the source and target files
try:
source = open(input_file, 'r')
except:
e = sys.exc_info()[0]
sys.exit("Error - Could not open input file %r: %s" % (input_file, e))
try:
target = open(output_file, 'w')
except:
e = sys.exc_info()[0]
sys.exit("Error - Could not open output file %r: %s" % (output_file, e))
print "\nStripping data from file %r into file %r" % (input_file, output_file)
reader = csv.reader(source)
writer = csv.writer(target)
# Now create a list of columns to delete
deleted_cols = []
for i in range (2, num_arguments): # Skip script name and input filename
# Ensure you are passed column numbers
try:
col_to_delete = int(argv[i]) - 1 # Convert from human to computer
except:
e = sys.exc_info()[0]
sys.exit("Error - Column to delete %r is not an integer: %s." % \
(argv[i], e))
if col_to_delete < 0:
sys.exit("Error - Use column numbers from 1 to N")
else: # Create a tuple for each as described above
deleted_cols.append(col_to_delete) # Add to list of columns to delete
# Let the user know what you are going to delete
header = reader.next()
print 'Deleting these columns:'
for column in deleted_cols:
print ' Column: %3d (%r)' % (column, header[column])
source.seek(0)
# Now loop through each row and remove the desired columns
num_rows = 0
for row in reader:
num_rows += 1
num_cols_deleted = 0 # Reset number cols deleted per row loop
for column in deleted_cols:
# Every time you pass through this loop, the number of columns
# is reduced by one. Factor this into the index to delete
column_to_delete = int(column) - num_cols_deleted
del row[column_to_delete]
num_cols_deleted += 1
writer.writerow(row) # Say that three times quickly!
# Let the user know you are done
print 'SUCCESS: Removed %d columns from %d rows\n' % \
(len(deleted_cols), num_rows)
# Be good and close the files
source.closed
target.closed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment