Skip to content

Instantly share code, notes, and snippets.

@Spaxe
Created August 7, 2015 00:41
Show Gist options
  • Save Spaxe/20b8ba483c6c155c2e3d to your computer and use it in GitHub Desktop.
Save Spaxe/20b8ba483c6c155c2e3d to your computer and use it in GitHub Desktop.
Splits large tab-separated value files into smaller ones.
#!/usr/bin/env python3
'''Breaks large tab-separated value files into smaller files'''
import argparse
def _write_tsv_chunk(filepath, index, header, rows):
    '''Write one chunk file (header line followed by rows) and return its path.'''
    new_filepath = filepath + '_{:04d}_'.format(index) + '.txt'
    print('Writing to {}'.format(new_filepath))
    with open(new_filepath, 'w') as t:
        t.write(header + '\n')
        # Rows still carry their own line terminators, so write them verbatim
        # instead of joining with '\n' (which would insert blank lines).
        t.writelines(rows)
    return new_filepath


def split_tsv(filepath, lines, limit):
    '''Split a TSV file into numbered chunk files that each repeat the header.

    The first line of ``filepath`` is treated as the header and copied
    unchanged to the top of every chunk. Chunks are written next to the input
    as ``<filepath>_0001_.txt``, ``<filepath>_0002_.txt``, and so on.

    Args:
        filepath: Path to the input TSV file.
        lines: Maximum number of data rows (header excluded) per chunk file.
        limit: Maximum number of chunk files to write; 0 or a negative value
            means no limit.

    Raises:
        ValueError: If ``lines`` is smaller than 1.
    '''
    if lines < 1:
        raise ValueError(
            'lines must be a positive integer, got {!r}'.format(lines))

    with open(filepath, 'r') as f:
        # Assume there is a header. Keep it verbatim (only the line terminator
        # is removed) so column names containing spaces and empty columns
        # survive the split.
        header = f.readline().rstrip('\n')

        written = 0
        buff = []
        # Iterate the file lazily so only one chunk is held in memory at once.
        for line in f:
            buff.append(line)
            if len(buff) == lines:
                written += 1
                _write_tsv_chunk(filepath, written, header, buff)
                buff = []
                if limit > 0 and written >= limit:
                    break

        # Flush the final partial chunk. After a limit-triggered break the
        # buffer is always empty, so this never exceeds the limit.
        if buff:
            written += 1
            _write_tsv_chunk(filepath, written, header, buff)
# Command-line entry point: parse the arguments and split the given file.
# The guard uses equality, the standard form, rather than a substring test
# that would also match any module whose name merely contains '__main__'.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input', metavar='input_file', type=str, help='Path to TSV input file.')
    parser.add_argument('-c', '--count', metavar='lines', type=int, default=1000000, help='Number of lines to store per file. The default is 1000000 lines.')
    parser.add_argument('-l', '--limit', metavar='limit', type=int, default=0, help='Maximum number of files to write before quitting. The default is no limit.')
    args = parser.parse_args()
    # Reject a non-positive chunk size here so the user gets a usage error
    # instead of a traceback from the splitting code.
    if args.count < 1:
        parser.error('--count must be a positive integer')
    split_tsv(args.input, args.count, args.limit)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment