Created
August 7, 2015 00:41
-
-
Save Spaxe/20b8ba483c6c155c2e3d to your computer and use it in GitHub Desktop.
Splits large tab-separated value files into smaller ones.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
'''Breaks large tab-separated value files into smaller files''' | |
import argparse | |
def _write_chunk(filepath, index, header, rows):
    """Write one chunk file: the header line followed by the given rows."""
    new_filepath = filepath + '_{:04d}_'.format(index) + '.txt'
    print('Writing to {}'.format(new_filepath))
    with open(new_filepath, 'w') as t:
        t.write(header)
        # Rows still carry their own line endings, so no separator is added.
        t.writelines(rows)


def split_tsv(filepath, lines, limit):
    """Split a TSV file into numbered chunk files that each repeat the header.

    The first line of ``filepath`` is treated as the header and is copied
    verbatim to the top of every chunk.  Chunks are written next to the
    input as ``<filepath>_0001_.txt``, ``<filepath>_0002_.txt``, and so on.
    A final chunk holding fewer than ``lines`` rows is still written.

    Args:
        filepath: Path of the TSV file to split.
        lines: Maximum number of data rows per output file; must be positive.
        limit: Maximum number of files to write; zero or negative means
            no limit.

    Raises:
        ValueError: If ``lines`` is not a positive integer.
    """
    if lines <= 0:
        raise ValueError('lines must be a positive integer')

    with open(filepath, 'r') as f:
        # Assume there is a header.  Keep it verbatim (no split/re-join) so
        # fields containing spaces or empty fields survive unchanged.
        header = f.readline()
        if header and not header.endswith('\n'):
            header += '\n'

        written = 0
        buff = []
        # Stream the file line by line instead of loading it all in memory.
        for line in f:
            buff.append(line)
            if len(buff) == lines:
                written += 1
                _write_chunk(filepath, written, header, buff)
                buff = []
                if limit > 0 and written >= limit:
                    return

        # Flush the trailing rows that did not fill a whole chunk.
        if buff:
            _write_chunk(filepath, written + 1, header, buff)
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and split the given file.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input', metavar='input_file', type=str, help='Path to TSV input file.')
    parser.add_argument('-c', '--count', metavar='lines', type=int, default=1000000, help='Number of lines to store per file. The default is 1000000 lines.')
    parser.add_argument('-l', '--limit', metavar='limit', type=int, default=0, help='Maximum number of files to write before quitting. The default is no limit.')
    args = parser.parse_args()
    # A non-positive chunk size cannot produce any output; reject it with a
    # usage error instead of failing later inside the split.
    if args.count <= 0:
        parser.error('--count must be a positive integer')
    split_tsv(args.input, args.count, args.limit)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment