Last active
October 18, 2018 06:48
-
-
Save tkrs/97fc0b3cc0d050a283b39dd006c528e5 to your computer and use it in GitHub Desktop.
Splits a file into numbered pieces, each containing at most the given number of lines.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import gzip | |
from itertools import islice | |
class FileSpliter:
    """Split a file into numbered pieces of a bounded number of lines.

    The source is read in binary mode and iterated line by line. When the
    source path ends in ``.gz`` it is read through ``gzip`` and every piece
    is written gzip-compressed as ``<to_file_prefix>.<index>.gz``; otherwise
    pieces are written as plain files named ``<to_file_prefix>.<index>``.
    Indexes start at 0.
    """

    def __init__(self, from_file, to_file_prefix):
        """Store the source path and the prefix used to build output paths."""
        self.from_file = from_file
        self.to_file_prefix = to_file_prefix

    def _is_gzip(self):
        """Return True when the source path carries a ``.gz`` extension."""
        return self.from_file.endswith('.gz')

    def _open_from(self):
        """Open the source for binary reading, via gzip if it is a ``.gz``."""
        if self._is_gzip():
            return gzip.open(self.from_file, 'rb')
        return open(self.from_file, 'rb')

    def _open_to(self, suffix):
        """Open the output piece numbered ``suffix`` for binary writing.

        The compression of the output mirrors the source: a ``.gz`` source
        yields ``.gz`` pieces.
        """
        path = '%s.%i' % (self.to_file_prefix, suffix)
        if self._is_gzip():
            return gzip.open(path + '.gz', 'wb')
        return open(path, 'wb')

    @staticmethod
    def _chunk(head, rest):
        """Yield ``head`` followed by every item of ``rest``.

        Taking both values as arguments gives each chunk its own binding;
        a closure over the caller's loop variables would see them rebound
        by later iterations.
        """
        yield head
        for line in rest:
            yield line

    def _read(self, num_lines):
        """Yield one lazy iterator of lines per output piece.

        Each yielded iterator produces at most ``num_lines`` lines. All of
        them draw from the same open file, so a chunk must be fully consumed
        before the next one is requested, otherwise its unread lines spill
        into the following chunk. The file is closed once the source is
        exhausted.
        """
        with self._open_from() as fpr:
            while True:
                lines = islice(fpr, num_lines)
                # Pull the first line eagerly so that an exhausted source
                # ends the iteration instead of producing an empty piece.
                try:
                    head = next(lines)
                except StopIteration:
                    return
                yield self._chunk(head, lines)

    def _write(self, num_files, lines):
        """Write ``lines`` to the output piece numbered ``num_files``."""
        with self._open_to(num_files) as fpw:
            fpw.writelines(lines)

    def split(self, num_lines):
        """Split the source into pieces of at most ``num_lines`` lines.

        ``num_lines`` is handed to ``itertools.islice`` as the stop value,
        so ``None`` means "no limit" and puts everything into piece 0.

        Raises:
            ValueError: if ``num_lines`` is less than 1. Zero would
                otherwise silently produce no output at all.
        """
        if num_lines is not None and num_lines < 1:
            raise ValueError(
                'num_lines must be a positive integer, got %r' % (num_lines,))
        for index, lines in enumerate(self._read(num_lines)):
            self._write(index, lines)
def main():
    """Command-line entry point: split a file into fixed-size pieces.

    Expects three positional arguments: the source file, the prefix for the
    output files, and the maximum number of lines per output file. Any
    further arguments are ignored. Exits with a usage message when fewer
    than three arguments are given.
    """
    if len(sys.argv) < 4:
        sys.exit('usage: %s FROM_FILE TO_FILE_PREFIX NUM_LINES' % sys.argv[0])
    from_file, to_file_prefix, num_lines = sys.argv[1:4]
    fs = FileSpliter(from_file, to_file_prefix)
    fs.split(int(num_lines))


# Run only when executed as a script, so importing this module has no
# side effects.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment