Skip to content

Instantly share code, notes, and snippets.

@manangandhi7
Created November 17, 2019 22:19
Show Gist options
  • Save manangandhi7/f260eb1a41defdedcce8d9f8a7e30c0c to your computer and use it in GitHub Desktop.
Save manangandhi7/f260eb1a41defdedcce8d9f8a7e30c0c to your computer and use it in GitHub Desktop.
External sort in python
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import shutil\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# buffer size is the max number your memory can handle, set a small number in the beginning for trial here\n",
"buffer_size = 10000\n",
"# total size of the number that needs to be sorted, only needed if you want to generate a file of unsorted integers\n",
"total_size = 55000"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Helper functions"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def save_array_to_file(file_name, array_to_save):\n",
" np.savetxt(file_name, array_to_save, fmt = '%d')\n",
" \n",
"def sort_and_write(file_name, array_to_sort):\n",
" array_to_sort.sort()\n",
" save_array_to_file(file_name, array_to_sort)\n",
" \n",
"def read_n_int(file_, numbers_to_read):\n",
" array_ = []\n",
" \n",
" if numbers_to_read <= 0 :\n",
" return array_\n",
" \n",
" num = file_.readline()\n",
" while(num):\n",
" array_.append(int(num))\n",
" if len(array_) >= numbers_to_read:\n",
" break\n",
" num = file_.readline()\n",
" \n",
" return array_\n",
"\n",
"def create_unsorted_file(size_, file_name_ = 'unsorted.csv'):\n",
" arr = np.arange(size_)\n",
" np.random.shuffle(arr)\n",
" save_array_to_file(file_name_, arr)\n",
" arr = None"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Methods for external sorting"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import heapq\n",
"\n",
"def sort_slices(file_name, buffer_size_):\n",
" read_arr = []\n",
" chunk = 1\n",
" f = open(file_name)\n",
"\n",
" if os.path.exists('./tmp/'):\n",
" shutil.rmtree('./tmp/')\n",
" os.mkdir('./tmp/')\n",
"\n",
" #TODO : delete contents of tmp directory\n",
" read_arr = read_n_int(f, buffer_size_)\n",
" while (len(read_arr) > 0):\n",
" sort_and_write('./tmp/sorted_' + str(chunk), read_arr)\n",
" read_arr = read_n_int(f, buffer_size_)\n",
" chunk = chunk + 1\n",
"\n",
" f.close()\n",
"\n",
"def min_heap_sort(output_file):\n",
" sorted_file = open(output_file, 'w+')\n",
"\n",
" min_heap = []\n",
" heapq.heapify(min_heap)\n",
" \n",
" open_files = []\n",
" for f in os.listdir('./tmp/'):\n",
" if os.path.isfile('./tmp/' + f):\n",
" file_ = open('./tmp/' + f)\n",
" open_files.append(file_)\n",
" val = file_.readline()\n",
" heapq.heappush(min_heap, (int(val), file_))\n",
"\n",
" while(len(min_heap) > 0):\n",
" min_element = heapq.heappop(min_heap)\n",
" sorted_file.write(str(min_element[0]) + '\\n')\n",
" next_str = min_element[1].readline()\n",
" if next_str:\n",
" heapq.heappush(min_heap, (int(next_str), min_element[1]))\n",
" else:\n",
" min_element[1].close()\n",
"\n",
" sorted_file.close()\n",
"\n",
"def external_sort(input_file, output_file, buffer_size_ = 10000):\n",
" sort_slices(input_file, buffer_size_)\n",
" min_heap_sort(output_file)\n",
" print('Sorted values are written to' , str(output_file))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 184 ms\n"
]
}
],
"source": [
"%%time\n",
"create_unsorted_file(total_size, file_name_ = 'unsorted.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## This function performs external sort, just call it with input and output file name and it will do the magic"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sorted values are written to sorted_external.csv\n",
"Wall time: 396 ms\n"
]
}
],
"source": [
"%%time\n",
"external_sort(input_file= 'unsorted.csv', output_file='sorted_external.csv', buffer_size_ = buffer_size)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# %%time\n",
"# read_arr = []\n",
"\n",
"# with open('unsorted.csv') as f:\n",
"# num = f.readline()\n",
"# while (num):\n",
"# read_arr.append(int(num))\n",
"# num = f.readline()\n",
" \n",
"# print('Sorting numbers now')\n",
"# read_arr.sort()\n",
"\n",
"# print('writing sorted numbers to file')\n",
"# save_array_to_file('sorted_in_mem.csv', read_arr)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment