Blokyk/better_merge_sort.c

## better_merge_sort.c
/*
* MIT License
*
* Copyright (c) 2022 blokyk
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

#include <stdlib.h>
#include <string.h>

// This version of MergeSort basically just alternates between two buffers
// for writing the sorted version. To do this, the function that merges
// each array checks if the *left* array is inside the dst buffer,
// and, if that's the case, swaps the write destination to the src buffer.
// Since recurse_merge_sort makes sure length(left) >= length(right), we can safely write
// into the new destination buffer by starting "above" `left` (i.e. at the same
// offset as `left` is but in the opposite buffer) no matter where `right` is actually
// stored, because they will never overlap with `right` before we've written then to
// the destination.
// Once we've written into the destination, we just use that as the "source" for the
// next merge, and up and up the stack frames we go.

void merge_sort(int* arr, size_t length) {
    int* tmp_dst = malloc(length * sizeof(int));
    memcpy(tmp_dst, arr, length * sizeof(int)); // fill tmp array, cf recurse_merge_sort

    // note: we need to keep copies since recurse_merge_sort doesn't always
    // assign the right values to p_src and p_dst
    // todo: figure out why the fuck recurse_merge_sort does that (esp. since it worked before)
    int* original = arr;
    int* dest = tmp_dst;

    recurse_merge_sort(&arr, &tmp_dst, 0, length - 1);

    // if the buffers were swapped, just copy the other way
    if (tmp_dst != dest) {
    	memcpy(dest, original, length * sizeof(int));
    } else {
    	memcpy(original, dest, length * sizeof(int));
    }

    free(dest);
}


// recurse_merge_sort is just a modified version  of a basic MergeSort method
// with a few lines added to track the src/dst buffers for each side.

void recurse_merge_sort(int** p_src, int** p_dst, int start, int end) {
    if (start == end) {
    	// technically, we should copy the element from *p_src[start] to *p_dst[start]
    	// here since we're expected to write a correct value into the dst buffer.
    	// However,
    	//		(a) merge_sort calls memcpy(3) on its temp array (which will
    	// 			be the dst buffer in some situations),
    	//		(b)this path is only executed once for every index,
    	//		(c) AND those indexes are always "pristine," i.e. they haven't
    	// 			been written to by merge_sides yet;
    	// This guarantees that the value will be the same as in the original buffer,
    	// thus rendering assignment useless
    	return;
    } else if (end == start + 1) { // if this is only two elements
    	// just do a very basic swap in case its needed, and don't write otherwise
    	// (cf above, it's basically the same justification + the fact there's only two
    	// possible permutations for two elements, so this won't have the same catastrophic
    	// perf as BubbleSort)

        int a = (*p_src)[start];
	    int b = (*p_src)[end];

	    if (a > b) {
	    	(*p_dst)[start] = b;
	    	(*p_dst)[end] = a;
	    }

	    return;
    }

    // we add 1 to guarantee that we'll have a bias towards the left side
    // (i.e. potentially more elements on the left than the right)
    int mid = (start + end) / 2 + 1;

    // we need to keep track of each src and dst since recurse_merge_sort
    // could swap them (through merge_sides)

    int* left_src = *p_src;
    int* left_dst = *p_dst;
    recurse_merge_sort(&left_src, &left_dst, mid, end);

    int* right_src = *p_src;
    int* right_dst = *p_dst;
    recurse_merge_sort(&right_src, &right_dst, start, mid - 1);

    // since both recurse_merge_sort calls WROTE the sorted values INTO the dst
    // arrays, we use those for the merge, and not the "original" (i.e. src)
    merge_sides(p_src, p_dst, start, left_dst + start, mid - start, right_dst + mid, end - (start + end)/2);
}

void merge_sides(int** p_src, int** p_dst, int dst_offset, int* left, size_t l_length, int* right, size_t r_length) {
    size_t length = l_length + r_length;

    // the actual array we'll store stuff into,
    // add offset directly so that indexing is more readable and fool-proof
    int* dest = *p_dst + dst_offset;

    // If left overlaps with the destination buffer, then use the
    // "source" buffer for the destination instead
    if (left >= *p_dst + dst_offset && left < *p_dst + dst_offset + length) {
    	dest = *p_src;
    	*p_src = *p_dst;
    	*p_dst = dest;
    }

    int l_idx = 0;
    int r_idx = 0;

    for (int i = 0; i < length; i++) {
    	// if we've already copied everything from left,
    	// then just copy the rest of right into dest and exit
    	if (l_idx >= l_length) {
    		size_t rest_length = length - i; // == r_length - r_idx

    		// if we only have one element left, just do it manually, it'll be faster
    		// and doesn't take up too much source-code.
    		//
    		// If you absolutely wanted to, you could probably write variations up to
    		// something like 8 items and it (might) be faster, since it wouldn't fit
    		// into AVX-256 registers/buses anyway if under <32 bytes, i.e. 8*sizeof(int)
    		// and wouldn't require additional function calls, but that's pretty debatable.
    		// You might also be able to cast stuff to long/long* to get only half the
    		// number of mov's, although that's basically nano-optimization at that point,
    		// and the lack of data locality with this algorithm will be a much bigger
    		// problem if I had to guess
    		if (rest_length == 1) {
    			dest[i] = right[r_idx];
    		} else {
    			memcpy(dest + i, right + r_idx, rest_length * sizeof(int));
    		}

    		return;
    	}

    	// same as above
    	if (r_idx >= r_length) {
    		size_t rest_length = length - i;

    		if (rest_length == 1) {
    			dest[i] = left[l_idx];
    		} else {
    			memcpy(dest + i, left + l_idx, rest_length * sizeof(int));
    		}

    		return;
    	}

    	if (left[l_idx] > right[r_idx]) {
    		dest[i] = right[r_idx];
    		r_idx++;
    	} else {
    		dest[i] = left[l_idx];
    		l_idx++;
    	}
    }

    return;
}
	/*
	* MIT License
	*
	* Copyright (c) 2022 blokyk
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in all
	* copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	* SOFTWARE.
	*/

	#include <stdlib.h>
	#include <string.h>

	// This version of MergeSort basically just alternates between two buffers
	// for writing the sorted version. To do this, the function that merges
	// each array checks if the left array is inside the dst buffer,
	// and, if that's the case, swaps the write destination to the src buffer.
	// Since recurse_merge_sort makes sure length(left) >= length(right), we can safely write
	// into the new destination buffer by starting "above" `left` (i.e. at the same
	// offset as `left` is but in the opposite buffer) no matter where `right` is actually
	// stored, because they will never overlap with `right` before we've written then to
	// the destination.
	// Once we've written into the destination, we just use that as the "source" for the
	// next merge, and up and up the stack frames we go.

	void merge_sort(int* arr, size_t length) {
	int* tmp_dst = malloc(length * sizeof(int));
	memcpy(tmp_dst, arr, length * sizeof(int)); // fill tmp array, cf recurse_merge_sort

	// note: we need to keep copies since recurse_merge_sort doesn't always
	// assign the right values to p_src and p_dst
	// todo: figure out why the fuck recurse_merge_sort does that (esp. since it worked before)
	int* original = arr;
	int* dest = tmp_dst;

	recurse_merge_sort(&arr, &tmp_dst, 0, length - 1);

	// if the buffers were swapped, just copy the other way
	if (tmp_dst != dest) {
	memcpy(dest, original, length * sizeof(int));
	} else {
	memcpy(original, dest, length * sizeof(int));
	}

	free(dest);
	}


	// recurse_merge_sort is just a modified version of a basic MergeSort method
	// with a few lines added to track the src/dst buffers for each side.

	void recurse_merge_sort(int p_src, int p_dst, int start, int end) {
	if (start == end) {
	// technically, we should copy the element from p_src[start] to p_dst[start]
	// here since we're expected to write a correct value into the dst buffer.
	// However,
	// (a) merge_sort calls memcpy(3) on its temp array (which will
	// be the dst buffer in some situations),
	// (b)this path is only executed once for every index,
	// (c) AND those indexes are always "pristine," i.e. they haven't
	// been written to by merge_sides yet;
	// This guarantees that the value will be the same as in the original buffer,
	// thus rendering assignment useless
	return;
	} else if (end == start + 1) { // if this is only two elements
	// just do a very basic swap in case its needed, and don't write otherwise
	// (cf above, it's basically the same justification + the fact there's only two
	// possible permutations for two elements, so this won't have the same catastrophic
	// perf as BubbleSort)

	int a = (*p_src)[start];
	int b = (*p_src)[end];

	if (a > b) {
	(*p_dst)[start] = b;
	(*p_dst)[end] = a;
	}

	return;
	}

	// we add 1 to guarantee that we'll have a bias towards the left side
	// (i.e. potentially more elements on the left than the right)
	int mid = (start + end) / 2 + 1;

	// we need to keep track of each src and dst since recurse_merge_sort
	// could swap them (through merge_sides)

	int* left_src = *p_src;
	int* left_dst = *p_dst;
	recurse_merge_sort(&left_src, &left_dst, mid, end);

	int* right_src = *p_src;
	int* right_dst = *p_dst;
	recurse_merge_sort(&right_src, &right_dst, start, mid - 1);

	// since both recurse_merge_sort calls WROTE the sorted values INTO the dst
	// arrays, we use those for the merge, and not the "original" (i.e. src)
	merge_sides(p_src, p_dst, start, left_dst + start, mid - start, right_dst + mid, end - (start + end)/2);
	}

	void merge_sides(int p_src, int p_dst, int dst_offset, int* left, size_t l_length, int* right, size_t r_length) {
	size_t length = l_length + r_length;

	// the actual array we'll store stuff into,
	// add offset directly so that indexing is more readable and fool-proof
	int* dest = *p_dst + dst_offset;

	// If left overlaps with the destination buffer, then use the
	// "source" buffer for the destination instead
	if (left >= p_dst + dst_offset && left < p_dst + dst_offset + length) {
	dest = *p_src;
	p_src = p_dst;
	*p_dst = dest;
	}

	int l_idx = 0;
	int r_idx = 0;

	for (int i = 0; i < length; i++) {
	// if we've already copied everything from left,
	// then just copy the rest of right into dest and exit
	if (l_idx >= l_length) {
	size_t rest_length = length - i; // == r_length - r_idx

	// if we only have one element left, just do it manually, it'll be faster
	// and doesn't take up too much source-code.
	//
	// If you absolutely wanted to, you could probably write variations up to
	// something like 8 items and it (might) be faster, since it wouldn't fit
	// into AVX-256 registers/buses anyway if under <32 bytes, i.e. 8*sizeof(int)
	// and wouldn't require additional function calls, but that's pretty debatable.
	// You might also be able to cast stuff to long/long* to get only half the
	// number of mov's, although that's basically nano-optimization at that point,
	// and the lack of data locality with this algorithm will be a much bigger
	// problem if I had to guess
	if (rest_length == 1) {
	dest[i] = right[r_idx];
	} else {
	memcpy(dest + i, right + r_idx, rest_length * sizeof(int));
	}

	return;
	}

	// same as above
	if (r_idx >= r_length) {
	size_t rest_length = length - i;

	if (rest_length == 1) {
	dest[i] = left[l_idx];
	} else {
	memcpy(dest + i, left + l_idx, rest_length * sizeof(int));
	}

	return;
	}

	if (left[l_idx] > right[r_idx]) {
	dest[i] = right[r_idx];
	r_idx++;
	} else {
	dest[i] = left[l_idx];
	l_idx++;
	}
	}

	return;
	}