Created
November 6, 2012 22:15
-
-
Save a-r-d/4027994 to your computer and use it in GitHub Desktop.
Takes a big long CSV, splits in more CSVs by sorting uniques in a column
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#windows :) | |
# | |
# split_csv_to_files.py | |
# ard - me@a-r-d.me | |
# | |
# Example usage: | |
# python .\csv_sort.py -i test.csv -o testing --col 12 | |
# | |
# Why? | |
# -Lets say you have a dump of clients or contacts and want to organize them all by state city or company | |
# -With this script you just specify column to sort by and it creates a dictionary grouping rows with the same | |
# value for that column and spits each item in the dictionary out into a CSV file, directory via -o option. | |
# | |
import sys | |
import os | |
import getopt | |
import csv | |
MASTER_ROW_LIST = [] | |
MASTER_UNIQUE_LIST = [] | |
MASTER_DICT = {} | |
HEADER = [] | |
COL_SPLIT = 0 | |
INPUT_FILE = '' | |
output_dir_default = 'dumps' | |
OUTPUT_DIR = '' | |
def proc_args(argv): | |
out_dir = "" | |
global OUTPUT_DIR | |
global INPUT_FILE | |
global COL_SPLIT | |
try: | |
opts, args = getopt.getopt(argv,"hi:o:c",["ifile=","ofile=","col="]) | |
except getopt.GetoptError: | |
print 'csv_sort.py -i <inputfile> -o <outputdir> --col <coltosortandsplit>' | |
sys.exit(2) | |
for opt, arg in opts: | |
if opt == '-h': | |
print 'csv_sort.py -i <inputfile> -o <outputdir> --col <coltosortandsplit>' | |
sys.exit() | |
elif opt in ("-i", "--ifile"): | |
INPUT_FILE = arg | |
elif opt in ("-o", "--ofile"): | |
out_dir = arg | |
OUTPUT_DIR = os.path.join(os.getcwd(),out_dir) | |
elif opt in ("-c", "--col"): | |
#print "arg: ",arg | |
#print opts | |
COL_SPLIT = int(arg) | |
if out_dir == "" or out_dir == None: | |
path = os.path.join(os.getcwd(),output_dir_default) | |
OUTPUT_DIR = path | |
def do_split(): | |
global MASTER_ROW_LIST | |
global MASTER_UNIQUE_LIST | |
global MASTER_DICT | |
global HEADER | |
try: | |
f = open(INPUT_FILE, "r") | |
#Using CSV module | |
reader = csv.reader(f) | |
rownum = 0 | |
for row in reader: | |
# Save header row. | |
if rownum == 0: | |
HEADER = row | |
else: | |
MASTER_ROW_LIST.append(row) | |
colnum = 0 | |
for col in row: | |
if colnum == COL_SPLIT: | |
MASTER_UNIQUE_LIST.append(col) | |
colnum += 1 | |
rownum += 1 | |
my_set = set(MASTER_UNIQUE_LIST) | |
print my_set | |
MASTER_UNIQUE_LIST = list(my_set) | |
print "length of the set: %s" % MASTER_UNIQUE_LIST | |
for unique in MASTER_UNIQUE_LIST: | |
MASTER_DICT[unique] = [] | |
## Formaulate tree: MASTER_DICT = ["AK": [row, row, row], "OH": [row, row]] | |
# for row in master list | |
f = open(INPUT_FILE, "r") | |
reader = csv.reader(f) # start at begin again | |
rownum = 0 | |
for row in reader: | |
#print "on row: ", rownum | |
# Save header row. | |
if rownum == 0: | |
header = row | |
else: | |
colnum = 0 | |
for col in row: | |
if colnum == COL_SPLIT: | |
#for each key in the master dict | |
for key, value in MASTER_DICT.items(): | |
#add a row to the value for the key that matches | |
#print key, value | |
if key == col: | |
#print row | |
value.append(row) | |
break # break after adding that row. | |
colnum += 1 | |
rownum += 1 | |
except Exception, e: | |
print e | |
sys.exit() | |
def sep_rows(): | |
try: | |
if not os.path.exists(OUTPUT_DIR): | |
os.makedirs(OUTPUT_DIR) | |
print "created directory" | |
i = 0 | |
for key, value in MASTER_DICT.items(): | |
s = key.strip() #no filenames with spaces in from | |
fname = "row_" + str(COL_SPLIT) + "_" + s + ".csv" | |
fname = fname.replace('"', "") | |
fname = fname.replace("'", "") | |
fname = fname.replace("\\", "") | |
fname = fname.replace("/", "") | |
fpath = os.path.join(OUTPUT_DIR,fname) | |
f = open(fpath,'w') | |
for col in HEADER: | |
f.write(col + ",") | |
f.write("\n") | |
for row in value: | |
rowStr = "" | |
for item in row: | |
rowStr += item + "," | |
f.write(rowStr + "\n") | |
f.close() | |
i += 1 | |
print "%s file written to %s" % (i, OUTPUT_DIR) | |
except Exception, e: | |
print e | |
sys.exit() | |
def main(argv): | |
proc_args(argv) | |
print 'Input file is: ', INPUT_FILE | |
print 'Output file is: ', OUTPUT_DIR | |
print 'Col to sort on is: ', COL_SPLIT | |
do_split() | |
sep_rows() | |
if __name__ == "__main__": | |
main(sys.argv[1:]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment