Created
November 17, 2019 22:19
-
-
Save manangandhi7/f260eb1a41defdedcce8d9f8a7e30c0c to your computer and use it in GitHub Desktop.
External sort in python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import shutil\n", | |
"import os" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# buffer size is the max number your memory can handle, set a small number in the beginning for trial here\n", | |
"buffer_size = 10000\n", | |
"# total size of the number that needs to be sorted, only needed if you want to generate a file of unsorted integers\n", | |
"total_size = 55000" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Helper functions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def save_array_to_file(file_name, array_to_save):\n", | |
" np.savetxt(file_name, array_to_save, fmt = '%d')\n", | |
" \n", | |
"def sort_and_write(file_name, array_to_sort):\n", | |
" array_to_sort.sort()\n", | |
" save_array_to_file(file_name, array_to_sort)\n", | |
" \n", | |
"def read_n_int(file_, numbers_to_read):\n", | |
" array_ = []\n", | |
" \n", | |
" if numbers_to_read <= 0 :\n", | |
" return array_\n", | |
" \n", | |
" num = file_.readline()\n", | |
" while(num):\n", | |
" array_.append(int(num))\n", | |
" if len(array_) >= numbers_to_read:\n", | |
" break\n", | |
" num = file_.readline()\n", | |
" \n", | |
" return array_\n", | |
"\n", | |
"def create_unsorted_file(size_, file_name_ = 'unsorted.csv'):\n", | |
" arr = np.arange(size_)\n", | |
" np.random.shuffle(arr)\n", | |
" save_array_to_file(file_name_, arr)\n", | |
" arr = None" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Methods for external sorting" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import heapq\n", | |
"\n", | |
"def sort_slices(file_name, buffer_size_):\n", | |
" read_arr = []\n", | |
" chunk = 1\n", | |
" f = open(file_name)\n", | |
"\n", | |
" if os.path.exists('./tmp/'):\n", | |
" shutil.rmtree('./tmp/')\n", | |
" os.mkdir('./tmp/')\n", | |
"\n", | |
" #TODO : delete contents of tmp directory\n", | |
" read_arr = read_n_int(f, buffer_size_)\n", | |
" while (len(read_arr) > 0):\n", | |
" sort_and_write('./tmp/sorted_' + str(chunk), read_arr)\n", | |
" read_arr = read_n_int(f, buffer_size_)\n", | |
" chunk = chunk + 1\n", | |
"\n", | |
" f.close()\n", | |
"\n", | |
"def min_heap_sort(output_file):\n", | |
" sorted_file = open(output_file, 'w+')\n", | |
"\n", | |
" min_heap = []\n", | |
" heapq.heapify(min_heap)\n", | |
" \n", | |
" open_files = []\n", | |
" for f in os.listdir('./tmp/'):\n", | |
" if os.path.isfile('./tmp/' + f):\n", | |
" file_ = open('./tmp/' + f)\n", | |
" open_files.append(file_)\n", | |
" val = file_.readline()\n", | |
" heapq.heappush(min_heap, (int(val), file_))\n", | |
"\n", | |
" while(len(min_heap) > 0):\n", | |
" min_element = heapq.heappop(min_heap)\n", | |
" sorted_file.write(str(min_element[0]) + '\\n')\n", | |
" next_str = min_element[1].readline()\n", | |
" if next_str:\n", | |
" heapq.heappush(min_heap, (int(next_str), min_element[1]))\n", | |
" else:\n", | |
" min_element[1].close()\n", | |
"\n", | |
" sorted_file.close()\n", | |
"\n", | |
"def external_sort(input_file, output_file, buffer_size_ = 10000):\n", | |
" sort_slices(input_file, buffer_size_)\n", | |
" min_heap_sort(output_file)\n", | |
" print('Sorted values are written to' , str(output_file))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Wall time: 184 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"create_unsorted_file(total_size, file_name_ = 'unsorted.csv')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## This function performs external sort, just call it with input and output file name and it will do the magic" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Sorted values are written to sorted_external.csv\n", | |
"Wall time: 396 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"external_sort(input_file= 'unsorted.csv', output_file='sorted_external.csv', buffer_size_ = buffer_size)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# %%time\n", | |
"# read_arr = []\n", | |
"\n", | |
"# with open('unsorted.csv') as f:\n", | |
"# num = f.readline()\n", | |
"# while (num):\n", | |
"# read_arr.append(int(num))\n", | |
"# num = f.readline()\n", | |
" \n", | |
"# print('Sorting numbers now')\n", | |
"# read_arr.sort()\n", | |
"\n", | |
"# print('writing sorted numbers to file')\n", | |
"# save_array_to_file('sorted_in_mem.csv', read_arr)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment