Skip to content

Instantly share code, notes, and snippets.

@a-r-d
Created November 6, 2012 22:15
Show Gist options
  • Save a-r-d/4027994 to your computer and use it in GitHub Desktop.
Save a-r-d/4027994 to your computer and use it in GitHub Desktop.
Takes a big long CSV, splits in more CSVs by sorting uniques in a column
#windows :)
#
# split_csv_to_files.py
# ard - me@a-r-d.me
#
# Example usage:
# python .\csv_sort.py -i test.csv -o testing --col 12
#
# Why?
# -Lets say you have a dump of clients or contacts and want to organize them all by state city or company
# -With this script you just specify column to sort by and it creates a dictionary grouping rows with the same
# value for that column and spits each item in the dictionary out into a CSV file, directory via -o option.
#
import sys
import os
import getopt
import csv
MASTER_ROW_LIST = []
MASTER_UNIQUE_LIST = []
MASTER_DICT = {}
HEADER = []
COL_SPLIT = 0
INPUT_FILE = ''
output_dir_default = 'dumps'
OUTPUT_DIR = ''
def proc_args(argv):
out_dir = ""
global OUTPUT_DIR
global INPUT_FILE
global COL_SPLIT
try:
opts, args = getopt.getopt(argv,"hi:o:c",["ifile=","ofile=","col="])
except getopt.GetoptError:
print 'csv_sort.py -i <inputfile> -o <outputdir> --col <coltosortandsplit>'
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print 'csv_sort.py -i <inputfile> -o <outputdir> --col <coltosortandsplit>'
sys.exit()
elif opt in ("-i", "--ifile"):
INPUT_FILE = arg
elif opt in ("-o", "--ofile"):
out_dir = arg
OUTPUT_DIR = os.path.join(os.getcwd(),out_dir)
elif opt in ("-c", "--col"):
#print "arg: ",arg
#print opts
COL_SPLIT = int(arg)
if out_dir == "" or out_dir == None:
path = os.path.join(os.getcwd(),output_dir_default)
OUTPUT_DIR = path
def do_split():
global MASTER_ROW_LIST
global MASTER_UNIQUE_LIST
global MASTER_DICT
global HEADER
try:
f = open(INPUT_FILE, "r")
#Using CSV module
reader = csv.reader(f)
rownum = 0
for row in reader:
# Save header row.
if rownum == 0:
HEADER = row
else:
MASTER_ROW_LIST.append(row)
colnum = 0
for col in row:
if colnum == COL_SPLIT:
MASTER_UNIQUE_LIST.append(col)
colnum += 1
rownum += 1
my_set = set(MASTER_UNIQUE_LIST)
print my_set
MASTER_UNIQUE_LIST = list(my_set)
print "length of the set: %s" % MASTER_UNIQUE_LIST
for unique in MASTER_UNIQUE_LIST:
MASTER_DICT[unique] = []
## Formaulate tree: MASTER_DICT = ["AK": [row, row, row], "OH": [row, row]]
# for row in master list
f = open(INPUT_FILE, "r")
reader = csv.reader(f) # start at begin again
rownum = 0
for row in reader:
#print "on row: ", rownum
# Save header row.
if rownum == 0:
header = row
else:
colnum = 0
for col in row:
if colnum == COL_SPLIT:
#for each key in the master dict
for key, value in MASTER_DICT.items():
#add a row to the value for the key that matches
#print key, value
if key == col:
#print row
value.append(row)
break # break after adding that row.
colnum += 1
rownum += 1
except Exception, e:
print e
sys.exit()
def sep_rows():
try:
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
print "created directory"
i = 0
for key, value in MASTER_DICT.items():
s = key.strip() #no filenames with spaces in from
fname = "row_" + str(COL_SPLIT) + "_" + s + ".csv"
fname = fname.replace('"', "")
fname = fname.replace("'", "")
fname = fname.replace("\\", "")
fname = fname.replace("/", "")
fpath = os.path.join(OUTPUT_DIR,fname)
f = open(fpath,'w')
for col in HEADER:
f.write(col + ",")
f.write("\n")
for row in value:
rowStr = ""
for item in row:
rowStr += item + ","
f.write(rowStr + "\n")
f.close()
i += 1
print "%s file written to %s" % (i, OUTPUT_DIR)
except Exception, e:
print e
sys.exit()
def main(argv):
proc_args(argv)
print 'Input file is: ', INPUT_FILE
print 'Output file is: ', OUTPUT_DIR
print 'Col to sort on is: ', COL_SPLIT
do_split()
sep_rows()
if __name__ == "__main__":
main(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment