Last active
May 23, 2019 16:35
-
-
Save rwev/ac73c7d0addfbcffb42651335ab973cd to your computer and use it in GitHub Desktop.
Combines all .csv files in the current directory into a single file if they are of the same format (i.e. columns / fields match). Preserves chronology of data as provided by filenames.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
CSVC.PY: Comma-Separated Value Combine | |
Combines all .csv files in the current directory into a single file if of same format. Preserves chronology of data as provided by filenames. | |
Author: rwev (https://github.com/rwev) | |
Built-in dependencies only. | |
USAGE: | |
To use CSVC.PY, place all relevant data files of same format in their own folder, and run | |
>python csvc.py | |
from that directory. | |
Procedure: | |
1. Recognize all .csv files in the current directory. | |
2. Check for a significant common substring in the filenames discovered. Continue if present, stop if not. | |
3. For each file | |
- Open and check column labels | |
- concatenate in memory if column labels are consistent with previous; quit if there are any variations. | |
4. Save file as the base stem discovered. | |
NOTE: | |
- To preserve chronological order of the data set, ensure that the files are named in an order correspondent to their data. | |
- For combination process to succeed, all CSV files must have the exact same column names and order. | |
''' | |
import sys | |
import os | |
import itertools | |
from time import time | |
cwdcsvfiles = [f for f in os.listdir('.') if f.endswith('.csv')] | |
if len(cwdcsvfiles) < 2: | |
print 'ERROR: Not enough CSV files found' | |
sys.exit() | |
print 'Found CSV files:' | |
for fn in cwdcsvfiles: | |
print '\t', fn | |
# find the common part of filenames | |
# requirements to define a common part: | |
# - must consist of letters only | |
# - must be more than 4 characters long | |
# - must occupy the same indices of the string | |
# - cannot be the file extension (.csv) | |
def findLongestCommonSubstring(string1, string2): | |
answer = "" | |
len1, len2 = len(string1), len(string2) | |
for i in range(len1): | |
match = "" | |
for j in range(len2): | |
if (i + j < len1 and string1[i + j] == string2[j]) and string2[j].isalpha(): | |
match += string2[j] | |
else: | |
if (len(match) > len(answer)): | |
answer = match | |
match = "" | |
return answer | |
subs = findLongestCommonSubstring(cwdcsvfiles[0], cwdcsvfiles[1]) | |
for f in cwdcsvfiles[2:]: | |
subs = findLongestCommonSubstring(subs, f) | |
if not subs or subs == '.csv': # | |
print 'ERROR: Unable to find base filename' | |
sys.exit() | |
print 'Found common filename substring "' + subs + '"' | |
ofn = cwdcsvfiles[0] | |
starttime = time() | |
print 'Reading ' + ofn + '...\t', | |
of = open(ofn, mode='r') | |
# this script is for CSV only -> comma delimited | |
ofieldstr = of.readline() | |
olines = of.readlines() | |
orowcount = len(olines) | |
of.close() | |
print orowcount, 'lines' | |
totalrowcount = orowcount | |
linefarr = [olines] | |
for tfn in cwdcsvfiles[1:]: | |
print 'Reading ' + tfn + '...\t', | |
tf = open(tfn, mode='r') | |
tfieldstr = tf.readline() | |
if tfieldstr != ofieldstr: | |
print 'ERROR: Column label mismatch' | |
sys.exit() | |
else: | |
tlines = tf.readlines() | |
trowcount = len(tlines) | |
print trowcount, 'lines' | |
totalrowcount += trowcount | |
linefarr.append(tlines) | |
tf.close() | |
finalfn = subs + '.csv' | |
finalf = open(finalfn, mode='w+') | |
finalf.write(tfieldstr) | |
for linearr in linefarr: | |
finalf.writelines(linearr) | |
finalf.close() | |
print 'SUCCESS. Combined data saved to ' + finalfn | |
pred_bytes = 0 | |
for fn in cwdcsvfiles: | |
pred_bytes += os.path.getsize(fn) | |
print 'Predicted filesize: %d KB; Actual: %d KB' % (pred_bytes / 1000, os.path.getsize(finalfn) / 1000) | |
endtime = time() | |
print 'DONE in ' + str(round(endtime - starttime, 2)) + ' seconds.' | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment