Skip to content

Instantly share code, notes, and snippets.

@tsabat
Last active July 1, 2022 16:18
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tsabat/63f36efced3206eaa5514449c0048e62 to your computer and use it in GitHub Desktop.
Breaks a large csv file up into smaller chunks.
#!/usr/bin/env python
import sys
import csv
import argparse
from pathlib import Path
import chardet
from urllib3 import encode_multipart_formdata
# Allow very large individual CSV fields (the default limit is 128 KB,
# which big exports routinely exceed).
csv.field_size_limit(sys.maxsize)

parser = argparse.ArgumentParser(
    description="Chunk a large .csv file into smaller files."
)
parser.add_argument(
    "--n",
    type=str,
    required=True,
    help='Filename of the .csv file to be chunked (do not need file extension / ".csv")',
)
parser.add_argument(
    "--s", type=int, required=True, help="Max number of rows each file should have."
)
parser.add_argument(
    "--e", type=str, required=True, help="Encoding: iso-8859-1"
)
args = parser.parse_args()

# Append ".csv" only when the name does not already end with it.
# Checking with endswith() instead of `".csv" in args.n` avoids
# mis-handling names that merely *contain* ".csv" (e.g. "my.csv.bak").
filename = args.n if args.n.endswith(".csv") else args.n + ".csv"
chunk_size = args.s  # max data rows per output file
encoding = args.e    # text encoding of the input file

# Output directory for the generated chunk files.
part_dir = Path("part")
part_dir.mkdir(exist_ok=True)
def predict_encoding(file_path: Path, n_lines: int = 200) -> str:
    """Guess a file's character encoding from its first *n_lines* lines.

    The file is read in binary mode and the raw bytes are handed to
    chardet; returns chardet's best-guess encoding name.
    """
    sample = bytearray()
    with Path(file_path).open("rb") as handle:
        # readline() returns b"" once EOF is reached, so short files
        # simply contribute fewer bytes to the sample.
        for _ in range(n_lines):
            sample.extend(handle.readline())
    return chardet.detect(bytes(sample))["encoding"]
def write_chunk(header, part, rows, base_name=None):
    """Write one chunk of CSV rows to ``part/<base_name>_part_<part>.csv``.

    Args:
        header: The column-header row, repeated at the top of every chunk.
        part: 1-based index of this chunk, used in the output filename.
        rows: The data rows belonging to this chunk.
        base_name: Stem for the output filename; defaults to the global
            ``filename`` parsed from the command line (backward compatible
            with the original zero-argument lookup).
    """
    name = filename if base_name is None else base_name
    out_path = "part/" + name + "_part_" + str(part) + ".csv"
    # newline="" is required by the csv module: without it the writer
    # emits an extra blank line between rows on Windows.
    with open(out_path, "w", newline="") as f_out:
        writer = csv.writer(f_out)
        writer.writerow(header)
        writer.writerows(rows)
from os.path import exists  # NOTE(review): unused here; kept so the file's imports stay intact

# Stream the input file row-by-row, flushing a chunk file every
# `chunk_size` data rows. The header row is repeated in every chunk.
rows = []
# newline="" lets csv.reader handle embedded newlines inside quoted
# fields correctly (per the csv module docs).
with open(filename, "r", encoding=encoding, newline="") as csvfile:
    count = 0      # data rows read so far (header excluded)
    filecount = 0  # chunk files written
    datareader = csv.reader(csvfile)
    header = next(datareader)
    file_number = 0  # index of the most recently completed full chunk
    print("creating files")
    for row in datareader:
        count += 1
        rows.append(row)
        if count % chunk_size == 0:
            file_number = count // chunk_size
            print(f"creating {file_number}")
            write_chunk(header, file_number, rows)
            rows = []
            filecount += 1
    # Flush any remaining rows that did not fill a whole chunk.
    if rows:
        print("creating last file")
        write_chunk(header, file_number + 1, rows)
        filecount += 1
print(str(filecount) + " files created from " + filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment