rwev/csvc.py

## csvc.py
'''
CSVC.PY: Comma-Separated Value Combine

Combines all  .csv files in the current directory into a single file if of same format. Preserves chronology of data as provided by filenames.

Author: rwev (https://github.com/rwev)

Built-in dependencies only.

USAGE:
	To use CSVC.PY, place all relevant data files of same format in their own folder, and run
    		>python csvc.py
	from that directory.

Procedure:
    	1. Recognize all .csv files in the current directory.
	2. Check for a significant common substring in the filenames discovered. Continue if present, stop if not.
	3. For each file
	- Open and check column labels
	    - concatenate in memory if column labels are consistent with previous; quit if there are any variations.
	4. Save file as the base stem discovered.

NOTE:
    - To preserve chronological order of the data set, ensure that the files are named in an order correspondent to their data.
    - For combination process to succeed, all CSV files must have the exact same column names and order.
'''

import sys
import os
import itertools
from time import time

cwdcsvfiles = [f for f in os.listdir('.') if f.endswith('.csv')]
if len(cwdcsvfiles) < 2:
    print 'ERROR: Not enough CSV files found'
    sys.exit()
print 'Found CSV files:'
for fn in cwdcsvfiles:
    print '\t', fn

# find the common part of filenames
# requirements to define a common part:
#   - must consist of letters only
#   - must be more than 4 characters long
#   - must occupy the same indices of the string
#   - cannot be the file extension (.csv)

def findLongestCommonSubstring(string1, string2):
    answer = ""
    len1, len2 = len(string1), len(string2)
    for i in range(len1):
        match = ""
        for j in range(len2):
            if (i + j < len1 and string1[i + j] == string2[j]) and string2[j].isalpha():
                match += string2[j]
            else:
                if (len(match) > len(answer)):
                    answer = match
                match = ""
    return answer

subs = findLongestCommonSubstring(cwdcsvfiles[0], cwdcsvfiles[1])
for f in cwdcsvfiles[2:]:
    subs = findLongestCommonSubstring(subs, f)
    if not subs or subs == '.csv': #
        print 'ERROR: Unable to find base filename'
        sys.exit()

print 'Found common filename substring "' + subs + '"'

ofn = cwdcsvfiles[0]

starttime = time()

print 'Reading ' + ofn + '...\t',
of = open(ofn, mode='r')
# this script is for CSV only -> comma delimited
ofieldstr = of.readline()
olines = of.readlines()
orowcount = len(olines)
of.close()

print orowcount, 'lines'
totalrowcount = orowcount

linefarr = [olines]

for tfn in cwdcsvfiles[1:]:
    print 'Reading ' + tfn + '...\t',
    tf = open(tfn, mode='r')
    tfieldstr = tf.readline()

    if tfieldstr != ofieldstr:
        print 'ERROR: Column label mismatch'
        sys.exit()
    else:
        tlines = tf.readlines()
        trowcount = len(tlines)
        print trowcount, 'lines'
        totalrowcount += trowcount
        linefarr.append(tlines)

    tf.close()

finalfn = subs + '.csv'
finalf = open(finalfn, mode='w+')
finalf.write(tfieldstr)
for linearr in linefarr:
    finalf.writelines(linearr)
finalf.close()

print 'SUCCESS. Combined data saved to ' + finalfn

pred_bytes = 0
for fn in cwdcsvfiles:
    pred_bytes += os.path.getsize(fn)
print 'Predicted filesize: %d KB; Actual: %d KB' % (pred_bytes / 1000, os.path.getsize(finalfn) / 1000)

endtime = time()
print 'DONE in ' + str(round(endtime - starttime, 2)) + ' seconds.'
	'''
	CSVC.PY: Comma-Separated Value Combine

	Combines all .csv files in the current directory into a single file if of same format. Preserves chronology of data as provided by filenames.

	Author: rwev (https://github.com/rwev)

	Built-in dependencies only.

	USAGE:
	To use CSVC.PY, place all relevant data files of same format in their own folder, and run
	>python csvc.py
	from that directory.

	Procedure:
	1. Recognize all .csv files in the current directory.
	2. Check for a significant common substring in the filenames discovered. Continue if present, stop if not.
	3. For each file
	- Open and check column labels
	- concatenate in memory if column labels are consistent with previous; quit if there are any variations.
	4. Save file as the base stem discovered.

	NOTE:
	- To preserve chronological order of the data set, ensure that the files are named in an order correspondent to their data.
	- For combination process to succeed, all CSV files must have the exact same column names and order.
	'''

	import sys
	import os
	import itertools
	from time import time

	cwdcsvfiles = [f for f in os.listdir('.') if f.endswith('.csv')]
	if len(cwdcsvfiles) < 2:
	print 'ERROR: Not enough CSV files found'
	sys.exit()
	print 'Found CSV files:'
	for fn in cwdcsvfiles:
	print '\t', fn

	# find the common part of filenames
	# requirements to define a common part:
	# - must consist of letters only
	# - must be more than 4 characters long
	# - must occupy the same indices of the string
	# - cannot be the file extension (.csv)

	def findLongestCommonSubstring(string1, string2):
	answer = ""
	len1, len2 = len(string1), len(string2)
	for i in range(len1):
	match = ""
	for j in range(len2):
	if (i + j < len1 and string1[i + j] == string2[j]) and string2[j].isalpha():
	match += string2[j]
	else:
	if (len(match) > len(answer)):
	answer = match
	match = ""
	return answer

	subs = findLongestCommonSubstring(cwdcsvfiles[0], cwdcsvfiles[1])
	for f in cwdcsvfiles[2:]:
	subs = findLongestCommonSubstring(subs, f)
	if not subs or subs == '.csv': #
	print 'ERROR: Unable to find base filename'
	sys.exit()

	print 'Found common filename substring "' + subs + '"'

	ofn = cwdcsvfiles[0]

	starttime = time()

	print 'Reading ' + ofn + '...\t',
	of = open(ofn, mode='r')
	# this script is for CSV only -> comma delimited
	ofieldstr = of.readline()
	olines = of.readlines()
	orowcount = len(olines)
	of.close()

	print orowcount, 'lines'
	totalrowcount = orowcount

	linefarr = [olines]

	for tfn in cwdcsvfiles[1:]:
	print 'Reading ' + tfn + '...\t',
	tf = open(tfn, mode='r')
	tfieldstr = tf.readline()

	if tfieldstr != ofieldstr:
	print 'ERROR: Column label mismatch'
	sys.exit()
	else:
	tlines = tf.readlines()
	trowcount = len(tlines)
	print trowcount, 'lines'
	totalrowcount += trowcount
	linefarr.append(tlines)

	tf.close()

	finalfn = subs + '.csv'
	finalf = open(finalfn, mode='w+')
	finalf.write(tfieldstr)
	for linearr in linefarr:
	finalf.writelines(linearr)
	finalf.close()

	print 'SUCCESS. Combined data saved to ' + finalfn

	pred_bytes = 0
	for fn in cwdcsvfiles:
	pred_bytes += os.path.getsize(fn)
	print 'Predicted filesize: %d KB; Actual: %d KB' % (pred_bytes / 1000, os.path.getsize(finalfn) / 1000)

	endtime = time()
	print 'DONE in ' + str(round(endtime - starttime, 2)) + ' seconds.'