Skip to content

Instantly share code, notes, and snippets.

@SegFaultAX
Last active August 22, 2021 02:40
Show Gist options
  • Save SegFaultAX/9b05d9922ecd2b49c304 to your computer and use it in GitHub Desktop.
Save SegFaultAX/9b05d9922ecd2b49c304 to your computer and use it in GitHub Desktop.
CSV file splitter (each file after the split keeps the header line)
#!/usr/bin/env python
import argparse
import csv
import os
import sys
from itertools import islice
def load_csv(filename, headers=None):
    """Open a CSV file and return its header row plus a reader over the data.

    Args:
        filename: Path of the CSV file to read.
        headers: Optional replacement header row. When ``None`` the first
            row of the file is used as the header. Otherwise the first row
            of the file (if any) is read and discarded and ``headers`` is
            returned unchanged.

    Returns:
        A ``(headers, reader)`` tuple, where ``reader`` is a ``csv.reader``
        positioned on the first row after the header.

    Raises:
        StopIteration: If ``headers`` is ``None`` and the file has no rows.
    """
    # The csv module needs newline translation switched off so it can handle
    # line endings (including newlines inside quoted fields) itself. The
    # previous "rU" mode is rejected outright by Python 3.11+.
    if sys.version_info[0] >= 3:
        handle = open(filename, "r", newline="")
    else:
        handle = open(filename, "rb")
    # NOTE: the handle is not closed here on purpose; the returned reader
    # pulls rows from it lazily, so it must outlive this function.
    reader = csv.reader(handle)
    if headers is None:
        headers = next(reader)
    else:
        # Drop the file's own header row; a file with no rows is tolerated.
        next(reader, None)
    return headers, reader
def split_every(n, iterable):
    """Lazily yield consecutive lists of at most ``n`` items from ``iterable``.

    Every chunk except possibly the last holds exactly ``n`` items; nothing is
    yielded once the input is exhausted (or when a chunk comes back empty).
    """
    source = iter(iterable)

    def take_chunk():
        return list(islice(source, n))

    # Two-argument iter(): keep calling take_chunk until it returns [].
    for chunk in iter(take_chunk, []):
        yield chunk
def split_csv(filename, per_file, headers=None):
    """Split a CSV file into numbered part files that each repeat the header.

    Part files are named ``<base>.part<N><ext>``, with ``N`` counting from 0,
    where ``<base>`` and ``<ext>`` come from splitting ``filename`` at its
    extension (e.g. ``data.csv`` -> ``data.part0.csv``).

    Args:
        filename: Path of the CSV file to split.
        per_file: Maximum number of data rows written to each part file.
        headers: Optional header row to write instead of the file's own
            first row; passed through to ``load_csv``.
    """
    # splitext copes with extension-less names and with dots in directory
    # components, both of which broke the earlier rsplit(".", 1) approach.
    base, ext = os.path.splitext(filename)
    headers, reader = load_csv(filename, headers)
    for idx, rows in enumerate(split_every(per_file, reader)):
        out_file = "".join([base, ".part", str(idx), ext])
        # Call form of print works on both Python 2 and Python 3.
        print("Writing {} rows to {}".format(len(rows), out_file))
        # csv.writer emits its own "\r\n"; disable newline translation so it
        # is not doubled to "\r\r\n" on Windows.
        if sys.version_info[0] >= 3:
            out = open(out_file, "w", newline="")
        else:
            out = open(out_file, "wb")
        with out as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            writer.writerows(rows)
def main():
    """Parse command-line arguments and split the requested CSV file.

    Arguments:
        filename: Path to the CSV file to split.
        -m / --max-rows: Maximum number of rows per output file (default 500).
        --headers: Replacement header row given as one comma-separated
            string, e.g. ``--headers "id,name,email"``.
    """
    parser = argparse.ArgumentParser(
        description="Break CSV into multiple files")
    parser.add_argument("filename", type=str,
                        help="Path to the CSV file")
    parser.add_argument("-m", "--max-rows", type=int, default=500,
                        help="Maximum number of rows per file (default: 500)")
    parser.add_argument("--headers", type=str,
                        help="Custom column headers, comma-separated")
    args = parser.parse_args()

    headers = None
    if args.headers is not None:
        # Parse the option as a single CSV record. Handing the raw string to
        # csv.writer.writerow would emit one column per character.
        headers = next(csv.reader([args.headers]), [])
    split_csv(args.filename, args.max_rows, headers)


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment