Skip to content

Instantly share code, notes, and snippets.

@Blokyk
Last active November 20, 2022 02:55
Show Gist options
  • Save Blokyk/525ddd6e3250d60e4094952b29227cc2 to your computer and use it in GitHub Desktop.
Save Blokyk/525ddd6e3250d60e4094952b29227cc2 to your computer and use it in GitHub Desktop.
An implementation of MergeSort using two alternating buffers to avoid memcpy's (in pure C)
/*
* MIT License
*
* Copyright (c) 2022 blokyk
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <stdlib.h>
#include <string.h>
// This version of MergeSort basically just alternates between two buffers
// for writing the sorted version. To do this, the function that merges
// each array checks if the *left* array is inside the dst buffer,
// and, if that's the case, swaps the write destination to the src buffer.
// Since recurse_merge_sort makes sure length(left) >= length(right), we can safely write
// into the new destination buffer by starting "above" `left` (i.e. at the same
// offset as `left` is but in the opposite buffer) no matter where `right` is actually
// stored, because they will never overlap with `right` before we've written then to
// the destination.
// Once we've written into the destination, we just use that as the "source" for the
// next merge, and up and up the stack frames we go.
void merge_sort(int* arr, size_t length) {
int* tmp_dst = malloc(length * sizeof(int));
memcpy(tmp_dst, arr, length * sizeof(int)); // fill tmp array, cf recurse_merge_sort
// note: we need to keep copies since recurse_merge_sort doesn't always
// assign the right values to p_src and p_dst
// todo: figure out why the fuck recurse_merge_sort does that (esp. since it worked before)
int* original = arr;
int* dest = tmp_dst;
recurse_merge_sort(&arr, &tmp_dst, 0, length - 1);
// if the buffers were swapped, just copy the other way
if (tmp_dst != dest) {
memcpy(dest, original, length * sizeof(int));
} else {
memcpy(original, dest, length * sizeof(int));
}
free(dest);
}
// recurse_merge_sort is just a modified version of a basic MergeSort method
// with a few lines added to track the src/dst buffers for each side.
void recurse_merge_sort(int** p_src, int** p_dst, int start, int end) {
if (start == end) {
// technically, we should copy the element from *p_src[start] to *p_dst[start]
// here since we're expected to write a correct value into the dst buffer.
// However,
// (a) merge_sort calls memcpy(3) on its temp array (which will
// be the dst buffer in some situations),
// (b)this path is only executed once for every index,
// (c) AND those indexes are always "pristine," i.e. they haven't
// been written to by merge_sides yet;
// This guarantees that the value will be the same as in the original buffer,
// thus rendering assignment useless
return;
} else if (end == start + 1) { // if this is only two elements
// just do a very basic swap in case its needed, and don't write otherwise
// (cf above, it's basically the same justification + the fact there's only two
// possible permutations for two elements, so this won't have the same catastrophic
// perf as BubbleSort)
int a = (*p_src)[start];
int b = (*p_src)[end];
if (a > b) {
(*p_dst)[start] = b;
(*p_dst)[end] = a;
}
return;
}
// we add 1 to guarantee that we'll have a bias towards the left side
// (i.e. potentially more elements on the left than the right)
int mid = (start + end) / 2 + 1;
// we need to keep track of each src and dst since recurse_merge_sort
// could swap them (through merge_sides)
int* left_src = *p_src;
int* left_dst = *p_dst;
recurse_merge_sort(&left_src, &left_dst, mid, end);
int* right_src = *p_src;
int* right_dst = *p_dst;
recurse_merge_sort(&right_src, &right_dst, start, mid - 1);
// since both recurse_merge_sort calls WROTE the sorted values INTO the dst
// arrays, we use those for the merge, and not the "original" (i.e. src)
merge_sides(p_src, p_dst, start, left_dst + start, mid - start, right_dst + mid, end - (start + end)/2);
}
void merge_sides(int** p_src, int** p_dst, int dst_offset, int* left, size_t l_length, int* right, size_t r_length) {
size_t length = l_length + r_length;
// the actual array we'll store stuff into,
// add offset directly so that indexing is more readable and fool-proof
int* dest = *p_dst + dst_offset;
// If left overlaps with the destination buffer, then use the
// "source" buffer for the destination instead
if (left >= *p_dst + dst_offset && left < *p_dst + dst_offset + length) {
dest = *p_src;
*p_src = *p_dst;
*p_dst = dest;
}
int l_idx = 0;
int r_idx = 0;
for (int i = 0; i < length; i++) {
// if we've already copied everything from left,
// then just copy the rest of right into dest and exit
if (l_idx >= l_length) {
size_t rest_length = length - i; // == r_length - r_idx
// if we only have one element left, just do it manually, it'll be faster
// and doesn't take up too much source-code.
//
// If you absolutely wanted to, you could probably write variations up to
// something like 8 items and it (might) be faster, since it wouldn't fit
// into AVX-256 registers/buses anyway if under <32 bytes, i.e. 8*sizeof(int)
// and wouldn't require additional function calls, but that's pretty debatable.
// You might also be able to cast stuff to long/long* to get only half the
// number of mov's, although that's basically nano-optimization at that point,
// and the lack of data locality with this algorithm will be a much bigger
// problem if I had to guess
if (rest_length == 1) {
dest[i] = right[r_idx];
} else {
memcpy(dest + i, right + r_idx, rest_length * sizeof(int));
}
return;
}
// same as above
if (r_idx >= r_length) {
size_t rest_length = length - i;
if (rest_length == 1) {
dest[i] = left[l_idx];
} else {
memcpy(dest + i, left + l_idx, rest_length * sizeof(int));
}
return;
}
if (left[l_idx] > right[r_idx]) {
dest[i] = right[r_idx];
r_idx++;
} else {
dest[i] = left[l_idx];
l_idx++;
}
}
return;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment