Last active
May 12, 2020 20:23
-
-
Save samvarankashyap/b745e2b487379581fad00a38cfba9d96 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#This is source code from filesplit package | |
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
import logging | |
import os | |
import ntpath | |
class FileSplit(object):
    """
    Split a file into numbered chunks without ever breaking a line.

    Chunks are written to the output directory as ``<name>_<n><ext>`` where
    ``<name>``/``<ext>`` come from the input file name and ``<n>`` starts at 1.
    A line that would push a chunk past the split size is carried over and
    becomes the first data line of the next chunk.
    """

    def __init__(self, file, splitsize, output_dir="."):
        """
        Constructor
        :param str file: full path to the input file
        :param long splitsize: file split size in bytes
        :param str output_dir: path to the output directory
        :raises FileNotFoundError: if ``file`` is not an existing regular file
        :raises NotADirectoryError: if ``output_dir`` is not an existing directory
        """
        self.log = logging.getLogger(__name__)
        self.log.info("Initializing file split module.")
        if not os.path.isfile(file):
            raise FileNotFoundError("The given file path '{0}' is not valid".format(file))
        self.file = file
        self.log.info("Given input file : '{0}'".format(self.file))
        # isdir (not exists): an existing regular file is not a usable output directory.
        if not os.path.isdir(output_dir):
            raise NotADirectoryError("The given output path '{0}' is not a valid directory".format(output_dir))
        self.output_dir = output_dir
        self.log.info("Given output directory: '{0}'".format(self.output_dir))
        self._buffer_size = 1000000  # 1MB of lines are buffered before each write
        self._header = None  # first line of the input, captured when include_header is set
        self._carryover = None  # line that did not fit into the previous chunk
        self._split_size = splitsize

    def split(self, include_header=False, callback=None):
        """
        Method to split the file to chunks as-is. The input is read and the chunks are written in
        binary mode, so no decoding or encoding takes place.
        :param bool include_header: set to True to include header in each splits. Default: False
        :param callable callback: (Optional) callback function [func (str, long, long)] that accepts
        three arguments - full file path to the destination, size of the file in bytes and line count.
        :return: None
        """
        self._split_(include_header, callback,
                     read_args={"mode": "rb"},
                     write_args={"mode": "ab"},
                     wenc=None)

    def splitbyencoding(self, rencoding="utf-8", wencoding="utf-8", include_header=False, callback=None):
        """
        Method to split the file to chunks based on the given encoding. Use this function if the file needs to be
        read and be written to chunks of specific encoding format
        :param str rencoding: encoding of the input file; default utf-8
        :param str wencoding: encoding of the output file; default utf-8
        :param bool include_header: set to True to include header in each splits
        :param callable callback: (Optional) callback function [func (str, long, long)] that accepts
        three arguments - full file path to the destination, size of the file in bytes and line count.
        :return: None
        """
        self._split_(include_header, callback,
                     read_args={"mode": "r", "encoding": rencoding},
                     write_args={"mode": "a", "encoding": wencoding},
                     wenc=wencoding)

    def _split_(self, include_header, callback, read_args, write_args, wenc):
        """
        Private driver shared by split() and splitbyencoding(): creates the chunk files one by one
        :param include_header: set to True if header needs to be included in the split files
        :param callback: optional callable invoked as callback(path, size, line_count) per chunk
        :param dict read_args: keyword arguments for open() on the input file
        :param dict write_args: keyword arguments for open() on each chunk file
        :param wenc: encoding forwarded to _process_; None when working in binary mode
        :return: None
        """
        # Every run starts from a clean state; otherwise a header or carryover left behind by
        # an earlier call on the same instance would leak into this one.
        self._header = None
        self._carryover = None
        self.log.info("Total file size in bytes: {0}".format(os.path.getsize(self.file)))
        self.log.info("Given split size: {0} bytes".format(self._split_size))
        # ntpath.split accepts both "/" and "\\" as separators.
        _, file = ntpath.split(self.file)
        filename, ext = os.path.splitext(file)
        # Keep track of file splits and increment the file counter accordingly
        filecounter = 1
        with open(self.file, **read_args) as f:
            while True:
                outfile = os.path.join(self.output_dir, "{0}_{1}{2}".format(filename, filecounter, ext))
                # Remove any existing file with the generated file name so that a chunk left over
                # from a previous run is regenerated instead of being appended to.
                if os.path.exists(outfile):
                    self.log.debug("Removing an existing file with the filename '{0}'".format(outfile))
                    os.remove(outfile)
                with open(outfile, **write_args) as of:
                    self.log.info("Writing to file '{0}'".format(outfile))
                    total_size, line_count, carryover = self._process_(f, of, include_header, wenc)
                # The chunk is closed (and flushed) before it is reported.
                self.log.info("Wrote to file '{0}' with {1} bytes of data".format(outfile, total_size))
                if callback is not None:
                    callback(outfile, total_size, line_count)
                # A pending carryover line means another chunk is needed; otherwise we are done.
                if not carryover:
                    break
                filecounter += 1
        self.log.info("File split complete.")

    def _process_(self, f, of, include_header, wenc=None):
        """
        Private function to handle the file splits
        :param f: read file object stream
        :param of: write file object stream
        :param include_header: set to True if header needs to be included in the split files
        :param wenc: encoding of the split files; default 'None' if the file needs to be written in binary mode
        :return tuple: total size, line count, carryover (bool)
        """
        def measure(chunk):
            # Size in bytes: encoded length in text mode, raw length in binary mode.
            return len(chunk.encode(wenc)) if wenc is not None else len(chunk)

        # Buffered lines are str in text mode and bytes in binary mode.
        joiner = "" if wenc is not None else b""
        total_size = 0  # bytes written to this chunk (reported to the caller)
        current_size = 0  # bytes accumulated since the buffer was last flushed
        line_count = 0
        li = []

        # If the header needs to be included, treat the first line as header and capture the value beforehand
        if include_header and self._header is None:
            self._header = f.readline()
        # Once captured, the header is written at the top of every chunk
        if self._header is not None:
            of.write(self._header)
            size = measure(self._header)
            current_size += size
            total_size += size
            line_count += 1
        # The line that did not fit into the previous chunk opens this one, whatever its size.
        if self._carryover is not None:
            of.write(self._carryover)
            size = measure(self._carryover)
            current_size += size
            total_size += size
            self._carryover = None
            line_count += 1
        for line in f:
            size = measure(line)
            current_size += size
            total_size += size
            line_count += 1
            if total_size > self._split_size:
                # The split size threshold is reached: do not write the current line to this
                # chunk. Carry it over and take it out of this chunk's totals again.
                self._carryover = line
                total_size -= size
                line_count -= 1
                break
            li.append(line)
            # Flush the buffer once it outgrows the buffer size to keep memory bounded.
            if current_size > self._buffer_size:
                of.write(joiner.join(li))
                current_size = 0
                li = []
        # Empty whatever is still buffered before leaving this chunk.
        of.write(joiner.join(li))
        # Set the carryover flag if there are lines pending to be written next split
        carryover = self._carryover is not None
        return total_size, line_count, carryover
import time | |
import sys | |
# Command-line driver: split <file_path> into chunks of <splitsize in MB> inside <output_directory>.
start_time = time.time()
print("This is the name of the script: ", sys.argv[0])
print("Number of arguments: ", len(sys.argv))
# sys.argv[0] is the script itself, so the three required arguments make len(sys.argv) == 4.
if len(sys.argv) < 4:
    raise Exception("Missing arguments")
print("The arguments are: ", str(sys.argv))
print("first arugument file_path ", sys.argv[1])
print("second argument splitsize in MB", str(sys.argv[2]))
print("Third argument output_directory", str(sys.argv[3]))
fpath = sys.argv[1]
# The split size is given in (decimal) megabytes on the command line; FileSplit expects bytes.
split_size = int(sys.argv[2]) * 1000000
output_dir = sys.argv[3]
fs = FileSplit(file=fpath, splitsize=split_size, output_dir=output_dir)
fs.split()
print("--- %s seconds ---" % (time.time() - start_time))
yes test string | head -c 20GB > 20gbfile.txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
python3 filesplit.py /path/to/file/ 1024 /output/dir/