Last active
April 8, 2020 00:14
-
-
Save maximpertsov/7011378282ac0b1d873d to your computer and use it in GitHub Desktop.
Split a text file into several smaller files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python | |
from __future__ import division | |
from argparse import ArgumentParser, Action | |
from os.path import splitext | |
from itertools import izip_longest | |
class FileSplitter(object):
    """
    Split a text file into numbered files of at most N lines each.

    Each output file is named by inserting a 1-based chunk number between
    the input file's base name and its extension, e.g. ``data.txt`` becomes
    ``data1.txt``, ``data2.txt``, ...
    """

    def execute(self, argv=None):
        """
        Split a text file into several smaller files. User must specify
        maximum line count per file.

        argv -- list of argument strings to parse; when None, argparse
                falls back to the process command line.
        """
        # Read arguments and split file
        args = self._parse_arguments(argv)
        newfiles = self.split_by_line_count(args.file, args.limit)

        # Output message. A parenthesised single argument prints the same
        # text whether ``print`` is a statement or a function.
        print("Split {0} into {1} files".format(args.file, len(newfiles)))

        # Verbose output message
        if args.verbose:
            for f in newfiles:
                print(f)

    def split_by_line_count(self, myfile, maxlines):
        """
        Split a text file into several smaller files, each with no more than
        <maxlines> lines per file. Returns a list of the newly created files.

        An empty input file, or a <maxlines> below 1, produces no output
        files and an empty list.
        """
        # Imported locally: islice exists under the same name on every
        # Python version, unlike izip_longest/zip_longest.
        from itertools import islice

        newfiles = []
        with open(myfile, 'r') as infile:
            if maxlines < 1:
                return newfiles

            chunk_number = 1
            while True:
                # Pull the next <maxlines> lines; an empty list means EOF.
                chunk = list(islice(infile, maxlines))
                if not chunk:
                    break
                newfile = self._write_to_file(chunk, myfile, 'w', chunk_number)
                newfiles.append(newfile)
                chunk_number += 1
        return newfiles

    def _write_to_file(self, lines, myfile, mode='w', suffix=''):
        """
        Write lines to a text file, and return the text file name
        if successful. Can optionally supply a suffix to the filename.

        lines  -- iterable of strings; None entries are skipped
        myfile -- path the output name is derived from
        mode   -- mode passed to open() for the output file
        suffix -- inserted between the base name and the extension; when
                  falsy, <myfile> itself is the output path
        """
        # Add suffix to filename if supplied
        if suffix:
            fn, ext = splitext(myfile)
            newfile = "{0}{1}{2}".format(fn, suffix, ext)
        else:
            newfile = myfile

        # Write supplied lines to file
        with open(newfile, mode) as f:
            for line in lines:
                if line is not None:
                    f.write(line)
        return newfile

    def _parse_arguments(self, argv=None):
        """
        Define necessary inputs and parse command-line arguments.

        Returns the argparse namespace with ``file``, ``limit`` (an int of
        at least 1) and ``verbose`` attributes.
        """
        parser = ArgumentParser(
            description='Split text file into several smaller files')
        parser.add_argument(
            'file', help='File to be split')
        # ``const`` carries the minimum accepted value for the custom action.
        parser.add_argument(
            'limit', help='Limit value',
            type=int, const=1, action=self.AssertAtLeastConst)
        parser.add_argument(
            '-v', '--verbose', help='Verbosity of output message',
            action='store_true')
        return parser.parse_args(argv)

    class AssertAtLeastConst(Action):
        """argparse action that rejects values smaller than ``const``."""

        def __call__(self, parser, namespace, values, option_string=None):
            """
            Ensures that an argument is greater than or equal a given constant
            """
            if (self.const is not None) and (values < self.const):
                arg_name = self._get_arg_output_name(option_string)
                # parser.error() reports the message and exits.
                parser.error(
                    "{0} must be at least {1}".format(arg_name, self.const)
                )
            else:
                setattr(namespace, self.dest, values)

        def _get_arg_output_name(self, option_string=None):
            """
            Return the argument name, which should be taken from one of the
            following sources (in order of priority):
            1. Option String
            2. Metavar
            3. Dest
            """
            for text in (option_string, self.metavar, self.dest):
                if text is not None:
                    return text
            # Every source was None: use a generic label rather than
            # failing on an unbound local.
            return 'argument'
if __name__ == '__main__':
    # Script entry point: arguments are taken from the command line.
    splitter = FileSplitter()
    splitter.execute()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python | |
from __future__ import division # force integer division to return a floating point number | |
from argparse import ArgumentParser, Action | |
from os.path import splitext | |
from math import ceil | |
from itertools import izip_longest | |
class FileSplitter(object):
    """
    Split a text file into numbered smaller files, limited either by the
    number of output files or by the number of lines per output file.

    Each output file is named by inserting a 1-based chunk number between
    the input file's base name and its extension, e.g. ``data.txt`` becomes
    ``data1.txt``, ``data2.txt``, ...
    """

    def execute(self, argv=None):
        """
        Split a text file into several smaller files. User must specify either a maximum file count
        (-f/--files) or a maximum line count per file (-l/--lines). The two flags are mutually
        exclusive, so the parser rejects a command line that gives both.

        argv -- list of argument strings to parse; when None, argparse falls back to the
                process command line.

        Raises ValueError when neither flag is given.
        """
        # Read arguments and split file
        args = self._parse_arguments(argv)
        if args.files:
            splitfunc = self.split_by_filecount
        elif args.lines:
            splitfunc = self.split_by_linecount
        else:
            raise ValueError("Must specify either a maximum file count or a maximum line count")
        newfiles = splitfunc(args.file, args.limit)

        # Output message. A parenthesised single argument prints the same
        # text whether ``print`` is a statement or a function.
        print("Split {0} into {1} files".format(args.file, len(newfiles)))

        # Verbose output message
        if args.verbose:
            for f in newfiles:
                print(f)

    def split_by_filecount(self, myfile, maxfiles):
        """
        Split a text file into, at most, <maxfiles> smaller files.
        Returns a list of the newly created files.

        The file is read once to count its lines, then split with
        ceil(lines / maxfiles) lines per output file. A <maxfiles> of 0
        raises ZeroDivisionError.
        """
        # Count lines inside a context manager so the handle is closed.
        with open(myfile, 'r') as infile:
            lines = sum(1 for _ in infile)
        # Integer ceiling division: exact, and independent of whether "/"
        # performs true or floor division.
        maxlines = (lines + maxfiles - 1) // maxfiles
        return self.split_by_linecount(myfile, maxlines)

    def split_by_linecount(self, myfile, maxlines):
        """
        Split a text file into several smaller files, each with no more than <maxlines> lines per file.
        Returns a list of the newly created files.

        An empty input file, or a <maxlines> below 1, produces no output files and an empty list.
        """
        # Imported locally: islice exists under the same name on every
        # Python version, unlike izip_longest/zip_longest.
        from itertools import islice

        newfiles = []
        with open(myfile, 'r') as infile:
            if maxlines < 1:
                return newfiles

            chunk_number = 1
            while True:
                # Pull the next <maxlines> lines; an empty list means EOF.
                chunk = list(islice(infile, maxlines))
                if not chunk:
                    break
                newfile = self._write_to_file(chunk, myfile, 'w', chunk_number)
                newfiles.append(newfile)
                chunk_number += 1
        return newfiles

    def _write_to_file(self, lines, myfile, mode='w', suffix=''):
        """
        Write lines to a text file, and return the text file name if successful.
        Can optionally supply a suffix to the filename.

        lines  -- iterable of strings; None entries are skipped
        myfile -- path the output name is derived from
        mode   -- mode passed to open() for the output file
        suffix -- inserted between the base name and the extension; when falsy, <myfile> itself
                  is the output path
        """
        # Add suffix to filename if supplied
        if suffix:
            fn, ext = splitext(myfile)
            newfile = "{0}{1}{2}".format(fn, suffix, ext)
        else:
            newfile = myfile

        # Write supplied lines to file
        with open(newfile, mode) as f:
            for line in lines:
                if line is not None:
                    f.write(line)
        return newfile

    def _parse_arguments(self, argv=None):
        """
        Define necessary inputs and parse command-line arguments.

        Returns the argparse namespace with ``file``, ``limit`` (an int of at least 1),
        ``verbose``, ``files`` and ``lines`` attributes.
        """
        parser = ArgumentParser(description='Split text file into several smaller files')
        parser.add_argument('file', help='File to be split')
        # ``const`` carries the minimum accepted value for the custom action.
        parser.add_argument('limit', help='Limit value', type=int, const=1, action=self.AssertAtLeastConst)
        parser.add_argument('-v', '--verbose', help='Verbosity of output message', action='store_true')
        # -f and -l select how ``limit`` is interpreted; at most one may be given.
        group = parser.add_mutually_exclusive_group()
        group.add_argument('-f', '--files', help='Limit number new files to create', action='store_true')
        group.add_argument('-l', '--lines', help='Limit number of lines per new file', action='store_true')
        return parser.parse_args(argv)

    class AssertAtLeastConst(Action):
        """argparse action that rejects values smaller than ``const``."""

        def __call__(self, parser, namespace, values, option_string=None):
            """Ensures that an argument is greater than or equal a given constant"""
            if (self.const is not None) and (values < self.const):
                arg_name = self._get_arg_output_name(option_string)
                # parser.error() reports the message and exits.
                parser.error("{0} must be at least {1}".format(arg_name, self.const))
            else:
                setattr(namespace, self.dest, values)

        def _get_arg_output_name(self, option_string=None):
            """
            Return the argument name, which should be taken from one of the following sources
            (in order of priority): 1. Option String, 2. Metavar, 3. Dest
            """
            for text in (option_string, self.metavar, self.dest):
                if text is not None:
                    return text
            # Every source was None: use a generic label rather than
            # failing on an unbound local.
            return 'argument'
if __name__ == '__main__':
    # Script entry point: arguments are taken from the command line.
    splitter = FileSplitter()
    splitter.execute()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1---------------- | |
24 | |
532 | |
1 | |
432 | |
4321 | |
4321 | |
4321 | |
32 | |
32 | |
4 | |
321 | |
432---------------- | |
4 | |
24 | |
21 | |
43 | |
321 | |
34 | |
321 | |
4 | |
324 | |
3 | |
24 | |
32---------------- | |
432 | |
14 | |
32 | |
4 | |
32 | |
432 | |
324 | |
421 | |
2143 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment