Skip to content

Instantly share code, notes, and snippets.

@sumanthprabhu
Last active January 1, 2016 00:19
Show Gist options
  • Save sumanthprabhu/8066184 to your computer and use it in GitHub Desktop.
Save sumanthprabhu/8066184 to your computer and use it in GitHub Desktop.
A Python utility script that merges the .csv files in a directory. Pass two arguments: 1) the path to the directory containing the files to be merged, and 2) the required number of files after merging.
'''
Combine all .csv files in a directory into the required number of files
'''
import csv
import sys
import os
import time
def fetch(num):
    '''
    Return the 1-based file number that record position ``num`` belongs to.

    The upper bounds are read from the ``fetch.indexer`` function attribute,
    which must be assigned before the first call. The answer is the position
    of the first bound that is greater than or equal to ``num``; when every
    bound is smaller, one past the number of bounds is returned.
    '''
    slot = 0
    for slot, upper_bound in enumerate(fetch.indexer, 1):
        if upper_bound >= num:
            return slot
    # No bound was large enough: point just past the last known slot.
    return slot + 1
def main(argv):
    '''
    Merge every .csv file under a directory and re-split the records.

    argv[1] is the directory to process and argv[2] is the number of output
    files wanted. Every line of the .csv files found by walking the
    directory is copied into a temporary file in the current working
    directory, the source files are deleted, and the lines are then written
    to train_set1.csv .. train_setN.csv inside argv[1]. Each output file
    receives record_count // N lines and the last one also takes the
    remainder. Output files that would receive no lines are not created.

    Raises ValueError when an argument is missing or the file count is not
    a positive integer; this is checked before any file is touched.
    '''
    from itertools import islice

    # Validate everything up front: failing after the sources have been
    # deleted would leave the data stranded in the temporary file.
    if len(argv) < 3:
        raise ValueError("expected arguments: <directory> <number_of_files>")
    directory = argv[1]
    number_of_files = int(argv[2])
    if number_of_files < 1:
        raise ValueError(
            "number_of_files must be at least 1, got %d" % number_of_files)

    record_count = 0
    merged_sources = []
    temp_name = "temp" + str(time.time())
    # Binary mode on both sides so lines are copied byte-for-byte.
    with open(temp_name, "wb") as target_file:
        for root, dirs, files in os.walk(directory):
            dirs.sort()  # deterministic traversal order
            for entry in sorted(files):
                if not entry.endswith('.csv'):
                    continue
                # Join with root, not directory: os.walk also reports
                # files that live in sub-directories.
                source_path = os.path.join(root, entry)
                with open(source_path, 'rb') as srcfile:
                    for line in srcfile:
                        # A file without a trailing newline must not glue
                        # its last record onto the next file's first one.
                        if not line.endswith(b"\n"):
                            line += b"\n"
                        target_file.write(line)
                        record_count += 1
                merged_sources.append(source_path)

    # Delete the sources only once the merged copy is complete, and before
    # the outputs are written so an old train_set*.csv cannot collide.
    for source_path in merged_sources:
        os.remove(source_path)

    # split into required number of files
    lines_per_file = record_count // number_of_files
    remainder = record_count % number_of_files
    print("Beginning writing process..")
    with open(temp_name, "rb") as target_file:
        for file_number in range(1, number_of_files + 1):
            quota = lines_per_file
            if file_number == number_of_files:
                # last file absorbs whatever does not divide evenly
                quota += remainder
            if quota == 0:
                continue
            path = os.path.join(directory, "train_set%s.csv" % file_number)
            with open(path, "ab") as train_piece:
                # islice streams exactly `quota` lines without holding
                # them all in memory.
                train_piece.writelines(islice(target_file, quota))

    # Delete the temp file only after a successful split; if the split
    # fails it is the only remaining copy of the data.
    os.remove(temp_name)
if __name__ == "__main__":
    # Script entry point: hand the raw command-line vector to main(), which
    # reads the directory from argv[1] and the file count from argv[2].
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment