'''This script is a "simulation" of an external merge sort in a DBMS. In a
DBMS you read pages from disk and can only hold a certain number of them in
memory at once. This implementation allows you to set the number of pages by
setting the MAX_BUFFERS variable, and the size of a page by setting the
MAX_BUFFER_SPACE variable.'''
import sys
import os
import tempfile

#How many lines we can keep in memory at once per buffer before we are forced
#to write out
MAX_BUFFER_SPACE = 10000

#How many buffers we can have in memory
MAX_BUFFERS = 10
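
#For illustration (not part of the original gist): the script expects an
#input file containing one integer per line. A quick way to generate a test
#file, using a hypothetical path "numbers.txt", is:
#
#    import random
#    with open("numbers.txt", "w") as f:
#        for _ in xrange(1000000):
#            f.write(str(random.randint(0, 10 ** 9)) + "\n")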

def get_lines(file_name):
    '''Utility function returning a generator which yields the lines of a
    file, converted to integers.'''
    with open(file_name) as f:
        for line in f:
            yield int(line)

def get_next(iterator):
    '''A wrapper to get the next value from an iterator without having to
    worry about catching exceptions. It simply returns None when we have no
    more elements in the iterator.'''
    try:
        return next(iterator)
    except StopIteration:
        #We have run out
        return None

def get_temp_file():
    '''Creates a temporary file in a particular location (for easy debugging)
    and returns a file wrapper and a path in a tuple.'''
    #Note: this directory is machine specific; change it to suit your system
    temp_file_handle, temp_file_path = tempfile.mkstemp(dir="H:/My Documents/")
    temp_file = os.fdopen(temp_file_handle, "w")
    return (temp_file, temp_file_path)

def write_temp_file(buff):
    '''Writes a list of integers out to a temporary file.'''
    temp_file, temp_file_path = get_temp_file()
    for line in buff:
        temp_file.write(str(line) + "\n")
    temp_file.close()
    return temp_file_path

def split_into_buffers(file_name):
    '''Splits a large file into multiple sorted run files. While this isn't
    necessary, and indeed even hurts performance, I did this to keep the
    algorithm simple. With this split, we only need to think about how the
    n-way merge algorithm works.'''
    output_files = []
    output = []
    #Get our iterator
    lines = get_lines(file_name)
    while True:
        #Get the next line and append it to the output
        next_line = get_next(lines)
        if next_line is not None:
            output.append(next_line)
        #If we run out of buffer space, or data, then sort the buffer and
        #write it out to disk
        if len(output) >= MAX_BUFFER_SPACE or next_line is None:
            if len(output) > 0:
                output.sort()
                output_files.append(write_temp_file(output))
                output = []
            if next_line is None:
                break
    #Return the list of files we have written out
    return output_files
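
#Worked example (hypothetical numbers, not from the original gist): with
#MAX_BUFFER_SPACE = 3, an input file containing 8 7 1 9 2 5 4 would produce
#three sorted run files holding [1, 7, 8], [2, 5, 9] and [4], and the list
#of their three paths would be returned.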

def merge_files(initial_file_names):
    '''Implements a naive n-way merge of pre-sorted files. The naive method
    simply looks at the current value of each file it has open, finds the
    smallest of these, writes it to the output file, and repeats until all
    files have been used up completely.'''
    all_file_names = initial_file_names[:]
    #This algorithm repeats until we have no files left to merge
    while True:
        #If we have a single file we are finished
        if len(all_file_names) == 1:
            return all_file_names[0]
        #If we can fit at least 1 line from each file in memory we can just
        #go ahead and do a regular merge of all of them. If not, we need to
        #take a subset and merge those, then iterate.
        #This isn't strictly what a DBMS would do, as we don't have to load
        #an entire page into memory and can instead go line by line. We only
        #load as many files as pages to approximate the performance.
        if len(all_file_names) > MAX_BUFFERS:
            file_names = all_file_names[:MAX_BUFFERS]
            remaining_file_names = all_file_names[MAX_BUFFERS:]
        else:
            file_names = all_file_names[:]
            remaining_file_names = []
        print "Merging", len(remaining_file_names) + len(file_names), "files"
        #Since we know every file is sorted, we can merge them directly,
        #keeping only one value per file in memory, and keep going until all
        #buffers are exhausted
        generators = map(get_lines, file_names)
        output_file, output_file_path = get_temp_file()
        current_values = []
        #Pre-populate the very first values from each file
        for generator in generators:
            current_values.append(get_next(generator))
        while True:
            smallest_index = 0
            #Find the first value that isn't None (i.e. a file which still
            #has values in it)
            while current_values[smallest_index] is None:
                smallest_index += 1
                #If the entire list is None then break out
                if smallest_index >= len(current_values):
                    break
            #Check that we have at least one number left
            if smallest_index < len(current_values):
                #Find the smallest number remaining
                for i in range(smallest_index, len(current_values)):
                    if current_values[i] is None:
                        continue
                    if current_values[i] < current_values[smallest_index]:
                        smallest_index = i
                #Write it out
                output_file.write(str(current_values[smallest_index]) + "\n")
                #Replace the smallest value with the next one from its file
                current_values[smallest_index] = get_next(generators[smallest_index])
            #Check if all files are empty
            if smallest_index >= len(current_values):
                #All files are exhausted so we are done with this pass. Add
                #the output file to the list and repeat everything until we
                #are left with just a single file
                all_file_names = remaining_file_names[:]
                all_file_names.append(output_file_path)
                output_file.close()
                #Delete the merged input files to stop us filling the disk
                for file_name in file_names:
                    os.remove(file_name)
                break
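
#Aside (not part of the original gist): the Python standard library offers
#an equivalent streaming k-way merge, heapq.merge, which keeps one value per
#input in a heap instead of scanning a list linearly. A minimal sketch of a
#single merge pass using it, assuming the same run-file format and the
#helpers defined above:
#
#    import heapq
#    def merge_pass(file_names):
#        output_file, output_file_path = get_temp_file()
#        for value in heapq.merge(*map(get_lines, file_names)):
#            output_file.write(str(value) + "\n")
#        output_file.close()
#        return output_file_path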

def main():
    if len(sys.argv) != 2:
        print "Expected a single argument of a file name"
        sys.exit(1)
    files = split_into_buffers(sys.argv[1])
    print "Split files"
    print "Sorted file:", merge_files(files)

if __name__ == "__main__":
    main()
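
#Example usage (assuming this file is saved under a hypothetical name,
#external_sort.py, and run with Python 2):
#
#    python external_sort.py numbers.txt
#
#This prints the path of the temporary file containing the sorted output.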