Last active
November 20, 2022 02:55
-
-
Save Blokyk/525ddd6e3250d60e4094952b29227cc2 to your computer and use it in GitHub Desktop.
An implementation of MergeSort using two alternating buffers to avoid memcpy's (in pure C)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* MIT License | |
* | |
* Copyright (c) 2022 blokyk | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the "Software"), to deal | |
* in the Software without restriction, including without limitation the rights | |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
* copies of the Software, and to permit persons to whom the Software is | |
* furnished to do so, subject to the following conditions: | |
* | |
* The above copyright notice and this permission notice shall be included in all | |
* copies or substantial portions of the Software. | |
* | |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
* SOFTWARE. | |
*/ | |
#include <stdlib.h> | |
#include <string.h> | |
// This version of MergeSort basically just alternates between two buffers | |
// for writing the sorted version. To do this, the function that merges | |
// each array checks if the *left* array is inside the dst buffer, | |
// and, if that's the case, swaps the write destination to the src buffer. | |
// Since recurse_merge_sort makes sure length(left) >= length(right), we can safely write | |
// into the new destination buffer by starting "above" `left` (i.e. at the same | |
// offset as `left` is but in the opposite buffer) no matter where `right` is actually | |
// stored, because they will never overlap with `right` before we've written then to | |
// the destination. | |
// Once we've written into the destination, we just use that as the "source" for the | |
// next merge, and up and up the stack frames we go. | |
void merge_sort(int* arr, size_t length) { | |
int* tmp_dst = malloc(length * sizeof(int)); | |
memcpy(tmp_dst, arr, length * sizeof(int)); // fill tmp array, cf recurse_merge_sort | |
// note: we need to keep copies since recurse_merge_sort doesn't always | |
// assign the right values to p_src and p_dst | |
// todo: figure out why the fuck recurse_merge_sort does that (esp. since it worked before) | |
int* original = arr; | |
int* dest = tmp_dst; | |
recurse_merge_sort(&arr, &tmp_dst, 0, length - 1); | |
// if the buffers were swapped, just copy the other way | |
if (tmp_dst != dest) { | |
memcpy(dest, original, length * sizeof(int)); | |
} else { | |
memcpy(original, dest, length * sizeof(int)); | |
} | |
free(dest); | |
} | |
// recurse_merge_sort is just a modified version of a basic MergeSort method | |
// with a few lines added to track the src/dst buffers for each side. | |
void recurse_merge_sort(int** p_src, int** p_dst, int start, int end) { | |
if (start == end) { | |
// technically, we should copy the element from *p_src[start] to *p_dst[start] | |
// here since we're expected to write a correct value into the dst buffer. | |
// However, | |
// (a) merge_sort calls memcpy(3) on its temp array (which will | |
// be the dst buffer in some situations), | |
// (b)this path is only executed once for every index, | |
// (c) AND those indexes are always "pristine," i.e. they haven't | |
// been written to by merge_sides yet; | |
// This guarantees that the value will be the same as in the original buffer, | |
// thus rendering assignment useless | |
return; | |
} else if (end == start + 1) { // if this is only two elements | |
// just do a very basic swap in case its needed, and don't write otherwise | |
// (cf above, it's basically the same justification + the fact there's only two | |
// possible permutations for two elements, so this won't have the same catastrophic | |
// perf as BubbleSort) | |
int a = (*p_src)[start]; | |
int b = (*p_src)[end]; | |
if (a > b) { | |
(*p_dst)[start] = b; | |
(*p_dst)[end] = a; | |
} | |
return; | |
} | |
// we add 1 to guarantee that we'll have a bias towards the left side | |
// (i.e. potentially more elements on the left than the right) | |
int mid = (start + end) / 2 + 1; | |
// we need to keep track of each src and dst since recurse_merge_sort | |
// could swap them (through merge_sides) | |
int* left_src = *p_src; | |
int* left_dst = *p_dst; | |
recurse_merge_sort(&left_src, &left_dst, mid, end); | |
int* right_src = *p_src; | |
int* right_dst = *p_dst; | |
recurse_merge_sort(&right_src, &right_dst, start, mid - 1); | |
// since both recurse_merge_sort calls WROTE the sorted values INTO the dst | |
// arrays, we use those for the merge, and not the "original" (i.e. src) | |
merge_sides(p_src, p_dst, start, left_dst + start, mid - start, right_dst + mid, end - (start + end)/2); | |
} | |
void merge_sides(int** p_src, int** p_dst, int dst_offset, int* left, size_t l_length, int* right, size_t r_length) { | |
size_t length = l_length + r_length; | |
// the actual array we'll store stuff into, | |
// add offset directly so that indexing is more readable and fool-proof | |
int* dest = *p_dst + dst_offset; | |
// If left overlaps with the destination buffer, then use the | |
// "source" buffer for the destination instead | |
if (left >= *p_dst + dst_offset && left < *p_dst + dst_offset + length) { | |
dest = *p_src; | |
*p_src = *p_dst; | |
*p_dst = dest; | |
} | |
int l_idx = 0; | |
int r_idx = 0; | |
for (int i = 0; i < length; i++) { | |
// if we've already copied everything from left, | |
// then just copy the rest of right into dest and exit | |
if (l_idx >= l_length) { | |
size_t rest_length = length - i; // == r_length - r_idx | |
// if we only have one element left, just do it manually, it'll be faster | |
// and doesn't take up too much source-code. | |
// | |
// If you absolutely wanted to, you could probably write variations up to | |
// something like 8 items and it (might) be faster, since it wouldn't fit | |
// into AVX-256 registers/buses anyway if under <32 bytes, i.e. 8*sizeof(int) | |
// and wouldn't require additional function calls, but that's pretty debatable. | |
// You might also be able to cast stuff to long/long* to get only half the | |
// number of mov's, although that's basically nano-optimization at that point, | |
// and the lack of data locality with this algorithm will be a much bigger | |
// problem if I had to guess | |
if (rest_length == 1) { | |
dest[i] = right[r_idx]; | |
} else { | |
memcpy(dest + i, right + r_idx, rest_length * sizeof(int)); | |
} | |
return; | |
} | |
// same as above | |
if (r_idx >= r_length) { | |
size_t rest_length = length - i; | |
if (rest_length == 1) { | |
dest[i] = left[l_idx]; | |
} else { | |
memcpy(dest + i, left + l_idx, rest_length * sizeof(int)); | |
} | |
return; | |
} | |
if (left[l_idx] > right[r_idx]) { | |
dest[i] = right[r_idx]; | |
r_idx++; | |
} else { | |
dest[i] = left[l_idx]; | |
l_idx++; | |
} | |
} | |
return; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment