Skip to content

Instantly share code, notes, and snippets.

@daragh
Last active June 29, 2019 22:58
Show Gist options
  • Save daragh/27cf82fd7d6558990fbb9a2045287f7c to your computer and use it in GitHub Desktop.
Save daragh/27cf82fd7d6558990fbb9a2045287f7c to your computer and use it in GitHub Desktop.
External Merge Sort in Python
#!/usr/bin/env python
import sys
import os
from tempfile import mkstemp
from argparse import ArgumentParser
def main(argv):
parser = ArgumentParser()
parser.add_argument('path')
parser.add_argument('--limit', default=100, metavar='MEGABYTES', type=int)
parsed = parser.parse_args()
megabyte = 1024 ** 2
limit = parsed.limit * megabyte
stdout = getattr(sys.stdout, 'buffer', sys.stdout)
f = open(parsed.path, 'rb')
for line in external_merge_sort(f, limit):
stdout.write(line)
f.close()
def external_merge_sort(f, limit, keyfn=lambda x: x):
paths = []
try:
for lines in divide_into_chunks(f, limit):
lines.sort(key=keyfn)
path = write_file(lines)
paths.append(path)
while len(paths) > 1:
infile1 = open(paths.pop(), 'rb')
os.remove(infile1.name)
infile2 = open(paths.pop(), 'rb')
os.remove(infile2.name)
outpath = merge_to_file(infile1, infile2, keyfn)
infile1.close()
infile2.close()
paths.append(outpath)
outfile = open(paths.pop(), 'rb')
os.remove(outfile.name)
for line in outfile:
yield line
outfile.close()
finally:
for path in paths:
os.remove(path)
def divide_into_chunks(f, limit):
(size, lines) = (0, [])
for line in f:
size += len(line)
lines.append(line)
if size > limit:
yield lines
(size, lines) = (0, [])
yield lines
def write_file(lines):
(fd, path) = mkstemp()
f = os.fdopen(fd, 'wb')
for line in lines:
f.write(line)
f.close()
return path
def merge_to_file(infile1, infile2, keyfn):
(fd, path) = mkstemp()
outfile = os.fdopen(fd, 'wb')
for line in merge_buffers(infile1, infile2, keyfn):
outfile.write(line)
outfile.close()
return path
def merge_buffers(f1, f2, keyfn):
while True:
(f1, f2) = sorted([f1, f2], key=lambda x: keyfn(peekline(x)))
line = f1.readline()
if line:
yield line
else:
for line in f2:
yield line
break
def peekline(f):
line = f.readline()
f.seek(-len(line), 1)
return line
if __name__ == '__main__': main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment