Skip to content

Instantly share code, notes, and snippets.

@tsabat
Last active July 1, 2022 16:18
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tsabat/63f36efced3206eaa5514449c0048e62 to your computer and use it in GitHub Desktop.
Breaks a large csv file up into smaller chunks.
#!/usr/bin/env python
import sys
import csv
import argparse
from pathlib import Path
import chardet
from urllib3 import encode_multipart_formdata
# Allow very large individual CSV fields (the default limit is 128 KB,
# which big exports routinely exceed).
csv.field_size_limit(sys.maxsize)

parser = argparse.ArgumentParser(
    description="Chunk a large .csv file into smaller files."
)
parser.add_argument(
    "--n",
    type=str,
    required=True,
    help='Filename of the .csv file to be chunked (do not need file extension / ".csv")',
)
parser.add_argument(
    "--s", type=int, required=True, help="Max number of rows each file should have."
)
parser.add_argument(
    "--e", type=str, required=True, help="Encoding: iso-8859-1"
)
args = parser.parse_args()

# Append ".csv" only when the name does not already end with it.
# Checking with endswith() instead of `".csv" in args.n` avoids
# mis-handling names that merely *contain* ".csv" (e.g. "my.csv.bak").
filename = args.n if args.n.endswith(".csv") else args.n + ".csv"
chunk_size = args.s  # max data rows per output file
encoding = args.e    # text encoding of the input file

# Output directory for the generated chunk files.
part_dir = Path("part")
part_dir.mkdir(exist_ok=True)
def predict_encoding(file_path: Path, n_lines: int = 200) -> str:
    """Guess a file's character encoding from its first *n_lines* lines.

    The file is read in binary mode and the raw bytes are handed to
    chardet; returns chardet's best-guess encoding name.
    """
    sample = bytearray()
    with Path(file_path).open("rb") as handle:
        # readline() returns b"" once EOF is reached, so short files
        # simply contribute fewer bytes to the sample.
        for _ in range(n_lines):
            sample.extend(handle.readline())
    return chardet.detect(bytes(sample))["encoding"]
def write_chunk(header, part, rows, base_name=None):
    """Write one chunk of CSV rows to ``part/<base_name>_part_<part>.csv``.

    Args:
        header: The column-header row, repeated at the top of every chunk.
        part: 1-based index of this chunk, used in the output filename.
        rows: The data rows belonging to this chunk.
        base_name: Stem for the output filename; defaults to the global
            ``filename`` parsed from the command line (backward compatible
            with the original zero-argument lookup).
    """
    name = filename if base_name is None else base_name
    out_path = "part/" + name + "_part_" + str(part) + ".csv"
    # newline="" is required by the csv module: without it the writer
    # emits an extra blank line between rows on Windows.
    with open(out_path, "w", newline="") as f_out:
        writer = csv.writer(f_out)
        writer.writerow(header)
        writer.writerows(rows)
from os.path import exists  # NOTE(review): unused here; kept so the file's imports stay intact

# Stream the input file row-by-row, flushing a chunk file every
# `chunk_size` data rows. The header row is repeated in every chunk.
rows = []
# newline="" lets csv.reader handle embedded newlines inside quoted
# fields correctly (per the csv module docs).
with open(filename, "r", encoding=encoding, newline="") as csvfile:
    count = 0      # data rows read so far (header excluded)
    filecount = 0  # chunk files written
    datareader = csv.reader(csvfile)
    header = next(datareader)
    file_number = 0  # index of the most recently completed full chunk
    print("creating files")
    for row in datareader:
        count += 1
        rows.append(row)
        if count % chunk_size == 0:
            file_number = count // chunk_size
            print(f"creating {file_number}")
            write_chunk(header, file_number, rows)
            rows = []
            filecount += 1
    # Flush any remaining rows that did not fill a whole chunk.
    if rows:
        print("creating last file")
        write_chunk(header, file_number + 1, rows)
        filecount += 1
print(str(filecount) + " files created from " + filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment