Skip to content

Instantly share code, notes, and snippets.

@msharp
Last active August 25, 2023 04:33
Show Gist options
  • Save msharp/8571150 to your computer and use it in GitHub Desktop.
Save msharp/8571150 to your computer and use it in GitHub Desktop.
Python script to split a (large) file into multiple (smaller) files, each with a specified number of lines
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os
class FileSplitter:
    """Split a text file into numbered chunk files of ``split_size`` lines each.

    Chunk files are named ``<base>.<n><ext>`` (for example ``data.1.txt``,
    ``data.2.txt``) where ``<base>`` and ``<ext>`` come from
    ``os.path.splitext`` applied to the input file name, and are created via
    ``os.path.join(working_dir, ...)``.

    The code deliberately sticks to constructs that behave the same on
    Python 2.7 and Python 3 (single-argument ``print(...)`` calls and
    ``%`` formatting).
    """

    def __init__(self):
        """Configure the splitter from the process command line (``sys.argv``)."""
        self.parse_args(sys.argv)

    @staticmethod
    def run():
        """Build a splitter from ``sys.argv`` and split the input file."""
        splitter = FileSplitter()
        splitter.split()

    def split(self):
        """Copy ``self.in_file`` into chunk files of ``self.split_size`` lines.

        Every chunk holds exactly ``split_size`` lines except possibly the
        last one, which holds the remainder. An output file is only created
        once there is a line to put in it, so no empty trailing file is
        produced when the input length is an exact multiple of
        ``split_size`` (and no file at all is produced for an empty input).

        The input file and the current output file are always closed, even
        if reading or writing fails part-way through.
        """
        print("Splitting %s into multiple files with %s lines" % (
            os.path.join(self.working_dir, self.file_base_name + self.file_ext),
            str(self.split_size)))
        file_count = 0
        lines_in_chunk = 0
        out_file = None
        try:
            for line in self.in_file:
                if out_file is None:
                    # Open lazily: only start a new chunk when a line exists.
                    file_count += 1
                    lines_in_chunk = 0
                    out_file = self.get_new_file(file_count)
                out_file.write(line)
                lines_in_chunk += 1
                if lines_in_chunk >= self.split_size:
                    # Chunk is full; the next line (if any) opens a new file.
                    out_file.close()
                    out_file = None
        finally:
            if out_file is not None:
                out_file.close()
            self.in_file.close()
        print("Created %s files." % (str(file_count)))

    def get_new_file(self, file_number):
        """Return a new file object, opened for writing, for chunk ``file_number``."""
        new_file_name = "%s.%s%s" % (self.file_base_name, str(file_number), self.file_ext)
        new_file_path = os.path.join(self.working_dir, new_file_name)
        print("creating file %s" % (new_file_path))
        return open(new_file_path, 'w')

    def parse_args(self, argv):
        """Parse ``argv`` and set up the instance variables used by ``split``.

        ``argv[1]`` is the input file name (required) and ``argv[2]`` is the
        optional number of lines per chunk (default 1000, must be >= 1).

        Sets ``split_size``, ``file_name``, ``in_file``, ``working_dir``,
        ``file_base_name`` and ``file_ext``. On a missing or invalid
        argument, or if the input file cannot be opened, the reason is
        written to stderr, the usage text is printed and the process exits
        with status 1.
        """
        try:
            if len(argv) < 2:
                raise ValueError("missing <file_name> argument")
            self.split_size = 1000
            if len(argv) > 2:
                self.split_size = int(argv[2])
            # Validate before opening the input so a bad count leaks no handle.
            if self.split_size < 1:
                raise ValueError("row_count must be a positive integer")
            self.file_name = argv[1]
            self.in_file = open(self.file_name, "r")
            self.working_dir = os.getcwd()
            self.file_base_name, self.file_ext = os.path.splitext(self.file_name)
        except (ValueError, IOError, OSError) as err:
            # Only expected argument/file errors are handled here; anything
            # else (e.g. KeyboardInterrupt) propagates instead of being hidden.
            sys.stderr.write("Error: %s\n" % err)
            print(self.usage())
            sys.exit(1)

    def usage(self):
        """Return the command-line usage text."""
        return """
Split a large file into many smaller files with set number of rows.
Usage:
$ python file_splitter.py <file_name> [row_count]
row_count is optional (default is 1000)
"""
# Script entry point: only split when executed directly, not when imported.
if __name__ == "__main__":
    FileSplitter.run()
@SubodhChiwate-TomTom
Copy link

Nice utility. I ran into issues on Python 3.8; the fix is to use print().
I am new to Python, so I don't know the cause of this issue. It seems something changed in the language syntax.

@msharp
Copy link
Author

msharp commented Jun 2, 2021

Thanks for the message @SubodhChiwate-TomTom

This is an older script developed with Python 2.7. In Python 3+, print is a function rather than a statement, so its arguments must be wrapped in parentheses.

If you are using Linux, there is the split utility which will be faster and more effective than this script. https://pubs.opengroup.org/onlinepubs/9699919799/utilities/split.html

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment