a-r-d/split_csv_to_files.py

## split_csv_to_files.py
#windows :)
#
# split_csv_to_files.py
# ard - me@a-r-d.me
#
# Example usage:
# python .\csv_sort.py -i test.csv -o testing --col 12
#
# Why?
# -Lets say you have a dump of clients or contacts and want to organize them all by state city or company
# -With this script you just specify column to sort by and it creates a dictionary grouping rows with the same
#  value for that column and spits each item in the dictionary out into a CSV file, directory via -o option.
#
import sys
import os
import getopt
import csv

MASTER_ROW_LIST = []
MASTER_UNIQUE_LIST = []
MASTER_DICT = {}
HEADER = []

COL_SPLIT = 0

INPUT_FILE = ''
output_dir_default = 'dumps'
OUTPUT_DIR = ''


def proc_args(argv):
    out_dir = ""
    global OUTPUT_DIR
    global INPUT_FILE
    global COL_SPLIT
    try:
        opts, args = getopt.getopt(argv,"hi:o:c",["ifile=","ofile=","col="])
    except getopt.GetoptError:
        print 'csv_sort.py -i <inputfile> -o <outputdir> --col <coltosortandsplit>'
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print 'csv_sort.py -i <inputfile> -o <outputdir> --col <coltosortandsplit>'
            sys.exit()
        elif opt in ("-i", "--ifile"):
            INPUT_FILE = arg
        elif opt in ("-o", "--ofile"):
            out_dir = arg
            OUTPUT_DIR = os.path.join(os.getcwd(),out_dir)
        elif opt in ("-c", "--col"):
            #print "arg: ",arg
            #print opts
            COL_SPLIT = int(arg)

    if out_dir == "" or out_dir == None:
        path = os.path.join(os.getcwd(),output_dir_default)
        OUTPUT_DIR = path


def do_split():
    global MASTER_ROW_LIST
    global MASTER_UNIQUE_LIST
    global MASTER_DICT
    global HEADER
    try:
        f = open(INPUT_FILE, "r")
        #Using CSV module
        reader = csv.reader(f)
        rownum = 0
        for row in reader:
            # Save header row.
            if rownum == 0:
                HEADER = row
            else:
                MASTER_ROW_LIST.append(row)
                colnum = 0
                for col in row:
                    if colnum == COL_SPLIT:
                        MASTER_UNIQUE_LIST.append(col)
                    colnum += 1
            rownum += 1

        my_set = set(MASTER_UNIQUE_LIST)
        print my_set
        MASTER_UNIQUE_LIST = list(my_set)
        print "length of the set: %s" % MASTER_UNIQUE_LIST
        for unique in MASTER_UNIQUE_LIST:
            MASTER_DICT[unique] = []

        ## Formaulate tree: MASTER_DICT = ["AK": [row, row, row], "OH": [row, row]]
        # for row in master list
        f = open(INPUT_FILE, "r")
        reader = csv.reader(f) # start at begin again
        rownum = 0
        for row in reader:
            #print "on row: ", rownum
            # Save header row.
            if rownum == 0:
                header = row
            else:
                colnum = 0
                for col in row:
                    if colnum == COL_SPLIT:
                        #for each key in the master dict
                        for key, value in MASTER_DICT.items():
                            #add a row to the value for the key that matches
                            #print key, value
                            if key == col:
                                #print row
                                value.append(row)
                                break # break after adding that row.
                    colnum += 1
            rownum += 1

    except Exception, e:
        print e
        sys.exit()

def sep_rows():
    try:
        if not os.path.exists(OUTPUT_DIR):
            os.makedirs(OUTPUT_DIR)
            print "created directory"

        i = 0
        for key, value in MASTER_DICT.items():
            s = key.strip() #no filenames with spaces in from
            fname = "row_" + str(COL_SPLIT) + "_" + s + ".csv"
            fname = fname.replace('"', "")
            fname = fname.replace("'", "")
            fname = fname.replace("\\", "")
            fname = fname.replace("/", "")
            fpath = os.path.join(OUTPUT_DIR,fname)
            f = open(fpath,'w')

            for col in HEADER:
                f.write(col + ",")
            f.write("\n")

            for row in value:
                rowStr = ""
                for item in row:
                    rowStr += item + ","
                f.write(rowStr + "\n")
            f.close()

            i += 1
            print "%s file written to %s" % (i, OUTPUT_DIR)
    except Exception, e:
        print e
        sys.exit()

def main(argv):
    proc_args(argv)
    print 'Input file is: ', INPUT_FILE
    print 'Output file is: ', OUTPUT_DIR
    print 'Col to sort on is: ', COL_SPLIT
    do_split()
    sep_rows()


if __name__ == "__main__":
   main(sys.argv[1:])
	#windows :)
	#
	# split_csv_to_files.py
	# ard - me@a-r-d.me
	#
	# Example usage:
	# python .\csv_sort.py -i test.csv -o testing --col 12
	#
	# Why?
	# -Lets say you have a dump of clients or contacts and want to organize them all by state city or company
	# -With this script you just specify column to sort by and it creates a dictionary grouping rows with the same
	# value for that column and spits each item in the dictionary out into a CSV file, directory via -o option.
	#
	import sys
	import os
	import getopt
	import csv

	MASTER_ROW_LIST = []
	MASTER_UNIQUE_LIST = []
	MASTER_DICT = {}
	HEADER = []

	COL_SPLIT = 0

	INPUT_FILE = ''
	output_dir_default = 'dumps'
	OUTPUT_DIR = ''


	def proc_args(argv):
	out_dir = ""
	global OUTPUT_DIR
	global INPUT_FILE
	global COL_SPLIT
	try:
	opts, args = getopt.getopt(argv,"hi:o:c",["ifile=","ofile=","col="])
	except getopt.GetoptError:
	print 'csv_sort.py -i <inputfile> -o <outputdir> --col <coltosortandsplit>'
	sys.exit(2)

	for opt, arg in opts:
	if opt == '-h':
	print 'csv_sort.py -i <inputfile> -o <outputdir> --col <coltosortandsplit>'
	sys.exit()
	elif opt in ("-i", "--ifile"):
	INPUT_FILE = arg
	elif opt in ("-o", "--ofile"):
	out_dir = arg
	OUTPUT_DIR = os.path.join(os.getcwd(),out_dir)
	elif opt in ("-c", "--col"):
	#print "arg: ",arg
	#print opts
	COL_SPLIT = int(arg)

	if out_dir == "" or out_dir == None:
	path = os.path.join(os.getcwd(),output_dir_default)
	OUTPUT_DIR = path


	def do_split():
	global MASTER_ROW_LIST
	global MASTER_UNIQUE_LIST
	global MASTER_DICT
	global HEADER
	try:
	f = open(INPUT_FILE, "r")
	#Using CSV module
	reader = csv.reader(f)
	rownum = 0
	for row in reader:
	# Save header row.
	if rownum == 0:
	HEADER = row
	else:
	MASTER_ROW_LIST.append(row)
	colnum = 0
	for col in row:
	if colnum == COL_SPLIT:
	MASTER_UNIQUE_LIST.append(col)
	colnum += 1
	rownum += 1

	my_set = set(MASTER_UNIQUE_LIST)
	print my_set
	MASTER_UNIQUE_LIST = list(my_set)
	print "length of the set: %s" % MASTER_UNIQUE_LIST
	for unique in MASTER_UNIQUE_LIST:
	MASTER_DICT[unique] = []

	## Formaulate tree: MASTER_DICT = ["AK": [row, row, row], "OH": [row, row]]
	# for row in master list
	f = open(INPUT_FILE, "r")
	reader = csv.reader(f) # start at begin again
	rownum = 0
	for row in reader:
	#print "on row: ", rownum
	# Save header row.
	if rownum == 0:
	header = row
	else:
	colnum = 0
	for col in row:
	if colnum == COL_SPLIT:
	#for each key in the master dict
	for key, value in MASTER_DICT.items():
	#add a row to the value for the key that matches
	#print key, value
	if key == col:
	#print row
	value.append(row)
	break # break after adding that row.
	colnum += 1
	rownum += 1

	except Exception, e:
	print e
	sys.exit()

	def sep_rows():
	try:
	if not os.path.exists(OUTPUT_DIR):
	os.makedirs(OUTPUT_DIR)
	print "created directory"

	i = 0
	for key, value in MASTER_DICT.items():
	s = key.strip() #no filenames with spaces in from
	fname = "row_" + str(COL_SPLIT) + "_" + s + ".csv"
	fname = fname.replace('"', "")
	fname = fname.replace("'", "")
	fname = fname.replace("\\", "")
	fname = fname.replace("/", "")
	fpath = os.path.join(OUTPUT_DIR,fname)
	f = open(fpath,'w')

	for col in HEADER:
	f.write(col + ",")
	f.write("\n")

	for row in value:
	rowStr = ""
	for item in row:
	rowStr += item + ","
	f.write(rowStr + "\n")
	f.close()

	i += 1
	print "%s file written to %s" % (i, OUTPUT_DIR)
	except Exception, e:
	print e
	sys.exit()

	def main(argv):
	proc_args(argv)
	print 'Input file is: ', INPUT_FILE
	print 'Output file is: ', OUTPUT_DIR
	print 'Col to sort on is: ', COL_SPLIT
	do_split()
	sep_rows()


	if __name__ == "__main__":
	main(sys.argv[1:])