Skip to content

Instantly share code, notes, and snippets.

@samvarankashyap
Last active May 12, 2020 20:23
Show Gist options
  • Save samvarankashyap/b745e2b487379581fad00a38cfba9d96 to your computer and use it in GitHub Desktop.
Save samvarankashyap/b745e2b487379581fad00a38cfba9d96 to your computer and use it in GitHub Desktop.
#This is source code from filesplit package
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import logging
import os
import ntpath
class FileSplit(object):
    """
    Split a file into numbered chunks on line boundaries.

    Chunks are written to the output directory as ``<name>_<counter><ext>``. A chunk
    only ever holds whole lines: the first line that would push a chunk past the split
    size is carried over and becomes the first data line of the next chunk.
    """

    def __init__(self, file, splitsize, output_dir="."):
        """
        Constructor
        :param str file: full path to the input file
        :param long splitsize: file split size in bytes
        :param str output_dir: path to the output directory
        :raises FileNotFoundError: if ``file`` is not an existing regular file
        :raises NotADirectoryError: if ``output_dir`` is not an existing directory
        """
        self.log = logging.getLogger(__name__)
        self.log.info("Initializing file split module.")
        if not os.path.isfile(file):
            raise FileNotFoundError("The given file path '{0}' is not valid".format(file))
        self.file = file
        self.log.info("Given input file : '%s'", self.file)
        # isdir rather than exists: an existing regular file is not a usable output directory.
        if not os.path.isdir(output_dir):
            raise NotADirectoryError("The given output path '{0}' is not a valid directory".format(output_dir))
        self.output_dir = output_dir
        self.log.info("Given output directory: '%s'", self.output_dir)
        self._buffer_size = 1000000  # 1MB
        self._header = None
        self._carryover = None
        self._split_size = splitsize

    def split(self, include_header=False, callback=None):
        """
        Method to split the file to chunks. Use this function if the file needs to be
        read and written to chunks as-is (binary mode, no decoding)
        :param bool include_header: set to True to include header in each splits. Default: False
        :param callable callback: (Optional) callback function [func (str, long, long)] that accepts
        three arguments - full file path to the destination, size of the file in bytes and line count.
        :return: None
        """
        self._split_(include_header, callback)

    def splitbyencoding(self, rencoding="utf-8", wencoding="utf-8", include_header=False, callback=None):
        """
        Method to split the file to chunks based on the given encoding. Use this function if the file needs to be
        read and be written to chunks of specific encoding format
        :param str rencoding: encoding of the input file; default utf-8
        :param str wencoding: encoding of the output file; default utf-8
        :param bool include_header: set to True to include header in each splits
        :param callable callback: (Optional) callback function [func (str, long, long)] that accepts
        three arguments - full file path to the destination, size of the file in bytes and line count.
        :return: None
        """
        self._split_(include_header, callback, rencoding=rencoding, wencoding=wencoding)

    def _split_(self, include_header, callback, rencoding=None, wencoding=None):
        """
        Private driver shared by split() and splitbyencoding()
        :param include_header: set to True if header needs to be included in the split files
        :param callback: optional callable invoked as callback(path, size_in_bytes, line_count) per split file
        :param rencoding: encoding of the input file; only used when wencoding is given
        :param wencoding: encoding of the split files; 'None' reads and writes in binary mode
        :return: None
        """
        self.log.info("Total file size in bytes: %s", os.path.getsize(self.file))
        self.log.info("Given split size: %s bytes", self._split_size)
        filename, ext = os.path.splitext(ntpath.basename(self.file))
        # Start every run from a clean state so a reused instance neither replays a stale
        # header nor skips re-reading it from the input file.
        self._header = None
        self._carryover = None
        if wencoding is None:
            read_args = {"mode": "rb"}
            write_args = {"mode": "ab"}
        else:
            read_args = {"mode": "r", "encoding": rencoding}
            write_args = {"mode": "a", "encoding": wencoding}
        # Keep track of file splits and increment the file counter accordingly
        filecounter = 1
        with open(self.file, **read_args) as f:
            while True:
                outfile = os.path.join(self.output_dir, "{0}_{1}{2}".format(filename, filecounter, ext))
                # Remove any existing file with the generated file name so that a re-run
                # regenerates the split instead of appending to old content.
                if os.path.exists(outfile):
                    self.log.debug("Removing an existing file with the filename '%s'", outfile)
                    os.remove(outfile)
                with open(outfile, **write_args) as of:
                    self.log.info("Writing to file '%s'", outfile)
                    total_size, line_count, carryover = self._process_(f, of, include_header, wencoding)
                self.log.info("Wrote to file '%s' with %s bytes of data", outfile, total_size)
                # Return the file details to the callback function if applicable
                if callback is not None:
                    callback(outfile, total_size, line_count)
                # A pending carryover line means there is at least one more split to write
                if not carryover:
                    break
                filecounter += 1
        self.log.info("File split complete.")

    def _process_(self, f, of, include_header, wenc=None):
        """
        Private function to handle the file splits
        :param f: read file object stream
        :param of: write file object stream
        :param include_header: set to True if header needs to be included in the split files
        :param wenc: encoding of the split files; default 'None' if the file needs to be written in binary mode
        :return tuple: total size written to ``of``, line count written to ``of``, carryover (bool)
        """
        text_mode = wenc is not None
        joiner = "" if text_mode else b""

        def size_of(chunk):
            # Sizes are measured in output bytes, so text is measured after encoding
            return len(chunk.encode(wenc)) if text_mode else len(chunk)

        total_size = 0
        line_count = 0
        # If the header needs to be included, treat the first line as header and capture the value beforehand
        if include_header and self._header is None:
            self._header = f.readline()
        # Once captured, the header is written at the top of every split
        if self._header is not None:
            of.write(self._header)
            total_size += size_of(self._header)
            line_count += 1
        # The line that did not fit into the previous split always opens this one,
        # even if it is larger than the split size on its own.
        if self._carryover is not None:
            of.write(self._carryover)
            total_size += size_of(self._carryover)
            line_count += 1
            self._carryover = None
        pending = []
        pending_size = 0
        for line in f:
            size = size_of(line)
            # The line would overflow this split: keep it for the next file and do not
            # count it towards this file's size or line count.
            if total_size + size > self._split_size:
                self._carryover = line
                break
            pending.append(line)
            pending_size += size
            total_size += size
            line_count += 1
            # Flush once the in-memory buffer grows past the buffer size
            if pending_size > self._buffer_size:
                of.write(joiner.join(pending))
                pending = []
                pending_size = 0
        # Write whatever is still buffered before handing the file back
        if pending:
            of.write(joiner.join(pending))
        return total_size, line_count, self._carryover is not None
import time
import sys


def main(argv=None):
    """
    Command line entry point: split a file into chunks of the requested size.

    Usage: ``python3 filesplit.py <file_path> <split_size_in_MB> <output_directory>``

    :param list argv: argument vector including the script name; defaults to ``sys.argv``
    :raises ValueError: if fewer than three arguments follow the script name, or if the
        split size is not an integer
    """
    argv = sys.argv if argv is None else argv
    start_time = time.time()
    print("This is the name of the script: ", argv[0])
    print("Number of arguments: ", len(argv))
    # Script name plus three positional arguments are required
    if len(argv) < 4:
        raise ValueError("Missing arguments")
    print("The arguments are: ", str(argv))
    print("first arugument file_path ", argv[1])
    print("second argument splitsize in MB", str(argv[2]))
    print("Third argument output_directory", str(argv[3]))
    fpath = argv[1]
    # The split size is given in megabytes on the command line; FileSplit expects bytes
    split_size = int(argv[2]) * 1000000
    output_dir = argv[3]
    fs = FileSplit(file=fpath, splitsize=split_size, output_dir=output_dir)
    fs.split()
    print("--- %s seconds ---" % (time.time() - start_time))


# Guarded so that importing this module does not start a split
if __name__ == "__main__":
    main()
@samvarankashyap
Copy link
Author

python3 filesplit.py /path/to/file/ 1024 /output/dir/

@samvarankashyap
Copy link
Author

yes test string | head -c 20GB > 20gbfile.txt

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment