Skip to content

Instantly share code, notes, and snippets.

@aSipiere
Created August 5, 2020 10:15
Show Gist options
  • Save aSipiere/50142b8d731f1e6c9b3144c7e1877363 to your computer and use it in GitHub Desktop.
Save aSipiere/50142b8d731f1e6c9b3144c7e1877363 to your computer and use it in GitHub Desktop.
A Python 3.8 update of https://gist.github.com/jrivero/1085501, adding argparse and tqdm.
import os
import csv
import argparse
from tqdm import tqdm
def split(filehandler, delimiter=',', row_limit=10000,
          output_name_template='output_%s.csv', output_path='.', keep_headers=True,
          total=None, progress=True):
    """
    Splits a CSV file into multiple pieces.

    Rows are read from `filehandler` and written to consecutively numbered
    output files (numbering starts at 1), each holding at most `row_limit`
    data rows. The first output file is always created, even when the input
    contains no data rows.

    Arguments:
        `filehandler`: An open, readable text file (or any iterable of lines)
            containing the CSV data.
        `delimiter`: Field delimiter used for both reading and writing.
        `row_limit`: The number of rows you want in each output file.
            10,000 by default. Must be at least 1.
        `output_name_template`: Name of the numbered output files. If it
            contains `%s`, that placeholder is replaced by the piece number;
            otherwise it is treated as a prefix and `<number>.csv` is appended.
        `output_path`: Where to stick the output files.
        `keep_headers`: Whether or not to treat the first row as a header and
            print it at the top of each output file.
        `total`: Optional expected number of data rows, used only to size the
            progress bar. Leave as None when unknown.
        `progress`: Whether to display a tqdm progress bar while splitting.

    Raises:
        ValueError: if `row_limit` is smaller than 1.

    Example usage:
        >> from toolbox import csv_splitter;
        >> csv_splitter.split(open('/home/ben/input.csv', 'r'));
    """
    if row_limit < 1:
        raise ValueError("row_limit must be a positive integer")

    def piece_path(piece):
        # Honour a %s placeholder when present; otherwise fall back to the
        # "prefix + number + .csv" naming that the command-line entry uses.
        if '%s' in output_name_template:
            name = output_name_template.replace('%s', str(piece))
        else:
            name = f"{output_name_template}{piece}.csv"
        return os.path.join(output_path, name)

    reader = csv.reader(filehandler, delimiter=delimiter)

    # next(..., None) keeps an empty input from raising StopIteration.
    headers = next(reader, None) if keep_headers else None

    # The progress bar is optional; total=None lets tqdm show a plain counter.
    rows = tqdm(reader, total=total) if progress else reader

    current_piece = 1
    # newline='' is what the csv module expects; without it some platforms
    # emit a blank line after every row.
    out_file = open(piece_path(current_piece), 'w', newline='')
    try:
        writer = csv.writer(out_file, delimiter=delimiter)
        if headers is not None:
            writer.writerow(headers)
        for i, row in enumerate(rows):
            if i >= row_limit * current_piece:
                # Current piece is full: close it before starting the next
                # one so file handles do not pile up on large inputs.
                out_file.close()
                current_piece += 1
                out_file = open(piece_path(current_piece), 'w', newline='')
                writer = csv.writer(out_file, delimiter=delimiter)
                if headers is not None:
                    writer.writerow(headers)
            writer.writerow(row)
    finally:
        out_file.close()
if __name__ == "__main__":
    # Command-line entry point: parse the options, open the input CSV and
    # hand it to split().
    parser = argparse.ArgumentParser(
        description="Split a CSV file into numbered pieces."
    )
    # The input path is mandatory; without it open() would receive None.
    parser.add_argument("--infile", "-I", required=True, help="input csv path")
    parser.add_argument("--outpath", "-O", default='.', type=str, help="Where to stick the output files.")
    parser.add_argument("--delimiter", "-D", default=',', type=str, help="delimiter")
    parser.add_argument("--rows", "-R", default=10000, type=int, help="number of rows")
    # Help strings are %-formatted by argparse, so the text avoids a bare
    # percent sign.
    parser.add_argument("--name", "-N", default='output_', type=str,
                        help="Prefix for the numbered output files, "
                             "e.g. 'output_' gives output_1.csv, output_2.csv, ...")
    parser.add_argument("--no-headers", action="store_true",
                        help="Treat the first row as data instead of repeating "
                             "it as a header in every output file.")
    args = parser.parse_args()
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(args.infile, newline='') as csvfile:
        split(
            csvfile,
            delimiter=args.delimiter,
            output_name_template=args.name,
            row_limit=args.rows,
            output_path=args.outpath,
            keep_headers=not args.no_headers
        )
@aSipiere
Copy link
Author

aSipiere commented Aug 5, 2020

Just a few notes: there's a hardcoded value in the tqdm loop that I got from running `wc -l` on the file (i.e. the number of newline characters), and keep_headers is always true because I forgot to make it a parser arg.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment