'''This script is a "simulation" of an external merge sort in a DBMS. In a
DBMS you read pages from disk and can only hold a certain number of them in
memory at once. This implementation allows you to set the number of pages by
setting the MAX_BUFFERS variable, and the size of a page by setting the
MAX_BUFFER_SPACE variable.'''
import sys
import os
import tempfile

#How many lines we can keep in memory at once per buffer before we are forced
#to write out
MAX_BUFFER_SPACE = 10000

#How many buffers we can have in memory
MAX_BUFFERS = 10
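
#For illustration (not part of the original gist): the script expects an
#input file containing one integer per line. A quick way to generate a test
#file, using a hypothetical path "numbers.txt", is:
#
#    import random
#    with open("numbers.txt", "w") as f:
#        for _ in xrange(1000000):
#            f.write(str(random.randint(0, 10 ** 9)) + "\n")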

def get_lines(file_name):
    '''Utility function returning a generator which yields the lines of a
    file, converted to integers.'''
    with open(file_name) as f:
        for line in f:
            yield int(line)

def get_next(iterator):
    '''A wrapper to get the next value from an iterator without having to
    worry about catching exceptions. It simply returns None when we have no
    more elements in the iterator.'''
    try:
        return next(iterator)
    except StopIteration:
        #We have run out
        return None

def get_temp_file():
    '''Creates a temporary file in a particular location (for easy debugging)
    and returns a file wrapper and a path in a tuple.'''
    #Note: this directory is machine specific; change it to suit your system
    temp_file_handle, temp_file_path = tempfile.mkstemp(dir="H:/My Documents/")
    temp_file = os.fdopen(temp_file_handle, "w")
    return (temp_file, temp_file_path)

def write_temp_file(buff):
    '''Writes a list of integers out to a temporary file.'''
    temp_file, temp_file_path = get_temp_file()
    for line in buff:
        temp_file.write(str(line) + "\n")
    temp_file.close()
    return temp_file_path

def split_into_buffers(file_name):
    '''Splits a large file into multiple sorted run files. While this isn't
    necessary, and indeed even hurts performance, I did this to keep the
    algorithm simple. With this split, we only need to think about how the
    n-way merge algorithm works.'''
    output_files = []
    output = []
    #Get our iterator
    lines = get_lines(file_name)
    while True:
        #Get the next line and append it to the output
        next_line = get_next(lines)
        if next_line is not None:
            output.append(next_line)
        #If we run out of buffer space, or data, then sort the buffer and
        #write it out to disk
        if len(output) >= MAX_BUFFER_SPACE or next_line is None:
            if len(output) > 0:
                output.sort()
                output_files.append(write_temp_file(output))
                output = []
            if next_line is None:
                break
    #Return the list of files we have written out
    return output_files
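
#Worked example (hypothetical numbers, not from the original gist): with
#MAX_BUFFER_SPACE = 3, an input file containing 8 7 1 9 2 5 4 would produce
#three sorted run files holding [1, 7, 8], [2, 5, 9] and [4], and the list
#of their three paths would be returned.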

def merge_files(initial_file_names):
    '''Implements a naive n-way merge of pre-sorted files. The naive method
    simply looks at the current value of each file it has open, finds the
    smallest of these, writes it to the output file, and repeats until all
    files have been used up completely.'''
    all_file_names = initial_file_names[:]
    #This algorithm repeats until we have no files left to merge
    while True:
        #If we have a single file we are finished
        if len(all_file_names) == 1:
            return all_file_names[0]
        #If we can fit at least 1 line from each file in memory we can just
        #go ahead and do a regular merge of all of them. If not, we need to
        #take a subset and merge those, then iterate.
        #This isn't strictly what a DBMS would do, as we don't have to load
        #an entire page into memory and can instead go line by line. We only
        #load as many files as pages to approximate the performance.
        if len(all_file_names) > MAX_BUFFERS:
            file_names = all_file_names[:MAX_BUFFERS]
            remaining_file_names = all_file_names[MAX_BUFFERS:]
        else:
            file_names = all_file_names[:]
            remaining_file_names = []
        print "Merging", len(remaining_file_names) + len(file_names), "files"
        #Since we know every file is sorted, we can merge them directly,
        #keeping only one value per file in memory, and keep going until all
        #buffers are exhausted
        generators = map(get_lines, file_names)
        output_file, output_file_path = get_temp_file()
        current_values = []
        #Pre-populate the very first values from each file
        for generator in generators:
            current_values.append(get_next(generator))
        while True:
            smallest_index = 0
            #Find the first value that isn't None (i.e. a file which still
            #has values in it)
            while current_values[smallest_index] is None:
                smallest_index += 1
                #If the entire list is None then break out
                if smallest_index >= len(current_values):
                    break
            #Check that we have at least one number left
            if smallest_index < len(current_values):
                #Find the smallest number remaining
                for i in range(smallest_index, len(current_values)):
                    if current_values[i] is None:
                        continue
                    if current_values[i] < current_values[smallest_index]:
                        smallest_index = i
                #Write it out
                output_file.write(str(current_values[smallest_index]) + "\n")
                #Replace the smallest value with the next one from its file
                current_values[smallest_index] = get_next(generators[smallest_index])
            #Check if all files are empty
            if smallest_index >= len(current_values):
                #All files are exhausted so we are done with this pass. Add
                #the output file to the list and repeat everything until we
                #are left with just a single file
                all_file_names = remaining_file_names[:]
                all_file_names.append(output_file_path)
                output_file.close()
                #Delete the merged input files to stop us filling the disk
                for file_name in file_names:
                    os.remove(file_name)
                break
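
#Aside (not part of the original gist): the Python standard library offers
#an equivalent streaming k-way merge, heapq.merge, which keeps one value per
#input in a heap instead of scanning a list linearly. A minimal sketch of a
#single merge pass using it, assuming the same run-file format and the
#helpers defined above:
#
#    import heapq
#    def merge_pass(file_names):
#        output_file, output_file_path = get_temp_file()
#        for value in heapq.merge(*map(get_lines, file_names)):
#            output_file.write(str(value) + "\n")
#        output_file.close()
#        return output_file_path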

def main():
    if len(sys.argv) != 2:
        print "Expected a single argument of a file name"
        sys.exit(1)
    files = split_into_buffers(sys.argv[1])
    print "Split files"
    print "Sorted file:", merge_files(files)

if __name__ == "__main__":
    main()
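
#Example usage (assuming this file is saved under a hypothetical name,
#external_sort.py, and run with Python 2):
#
#    python external_sort.py numbers.txt
#
#This prints the path of the temporary file containing the sorted output.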