Skip to content

Instantly share code, notes, and snippets.

@Morasta
Created November 20, 2014 23:52
Show Gist options
  • Save Morasta/68c2b3ba969685b3394d to your computer and use it in GitHub Desktop.
Save Morasta/68c2b3ba969685b3394d to your computer and use it in GitHub Desktop.
CSVToLookups
#Separates columns in a CSV into individual files with unique values per field
# Useful for generating lookup tables with unique values from the original csv's field
#
# Room for implementation improvement, but this works as a quick and dirty solution
#!/usr/bin/python
import csv
from distutils.util import strtobool
input_filename = raw_input('Enter input filename: ')
insert_ids = None
while insert_ids is None:
try:
insert_ids = strtobool(raw_input('Include ID column in output? (y/n) '))
except ValueError:
print "Error: Invalid selection."
include_header = None
while include_header is None:
try:
include_header = strtobool(raw_input('Include column/field names in output? (y/n) '))
except ValueError:
print "Error: Invalid selection."
delimiter = ','
quote_char = '"'
escape_char = '\\'
csv_fp = open(input_filename, 'rU')
csv_reader = csv.DictReader(csv_fp, fieldnames=[], restkey='undefined-fieldnames', delimiter=delimiter, quotechar=quote_char)
results = dict() #output dict format (cola: [val1, val2, ..., valn], colb: [val1, val2, ...], ...)
header_row = True
for row in csv_reader:
if header_row == True:
header_row = False
csv_reader.fieldnames = row['undefined-fieldnames']
for field in csv_reader.fieldnames:
results.setdefault(field)
else:
for key in results:
if results[key] == None:
results[key] = [row[key]]
else:
if row[key] not in results[key]:
results[key].append(row[key])
for key in results:
if insert_ids == True:
initial_id = 1
with open("output/"+key+'_output.csv', 'wb') as csvfile:
output = csv.writer(csvfile, delimiter = delimiter, quotechar=quote_char, quoting=csv.QUOTE_NONE, escapechar=escape_char)
if include_header == True:
if insert_ids == True:
output.writerow(['id', u''.join(key).encode('utf8').strip()])
else:
output.writerow([u''.join(key).encode('utf8').strip()])
results[key].sort()
for record in results[key]:
if record is not "":
if insert_ids == True:
output.writerow([initial_id, u''.join(record).encode('utf8').strip()])
initial_id += 1
else:
output.writerow([u''.join(record).encode('utf8').strip()])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment