Skip to content

Instantly share code, notes, and snippets.

@amonakov
Last active October 6, 2020 12:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amonakov/477f7370aa1fe51e77409f6f4a9f9dbe to your computer and use it in GitHub Desktop.
Save amonakov/477f7370aa1fe51e77409f6f4a9f9dbe to your computer and use it in GitHub Desktop.
// SPDX-License-Identifier: MIT
// Copyright 2018 Alexander Monakov <amonakov@ispras.ru>
/* This implements a sort function suitable for GCC use cases:
- signature-compatible to C qsort, but with relaxed contract:
- may apply the comparator to elements moved to a temporary buffer
- may abort on allocation failure
- deterministic (not stable, but allows a stable variant at low cost)
- fast, especially for common cases (0-5 elements of size 8 or 4)
The implementation uses a sorting network for up to 5 elements and
a merge sort on top of that. Neither stage has branches depending on
comparator result, trading extra arithmetic for branch mispredictions. */
#include <stdlib.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#ifdef __GNUC__
#define likely(cond) __builtin_expect((cond), 1)
#define noinline __attribute__((__noinline__))
#else
#define likely(cond) (cond)
#define noinline
#endif
typedef int cmp_fn(const void *, const void *);
/* Structure holding read-mostly (read-only in netsort) context. */
struct sort_ctx {
cmp_fn *cmp; // pointer to comparator
char *out; // output buffer
size_t n; // number of elements
size_t size; // element size
};
/* Helper for netsort. Permute, possibly in-place, 2 or 3 elements,
placing E0 to C->OUT, E1 to C->OUT + C->SIZE, and so on. */
static void reorder23(struct sort_ctx *c, char *e0, char *e1, char *e2)
{
#define REORDER_23(TYPE, STRIDE, OFFSET) do { \
TYPE t0, t1; \
memcpy(&t0, e0 + OFFSET, sizeof t0); \
memcpy(&t1, e1 + OFFSET, sizeof t1); \
char *out = c->out + OFFSET; \
if (likely(c->n == 3)) \
memmove(out + 2*STRIDE, e2 + OFFSET, sizeof t0); \
memcpy(out, &t0, sizeof t0); out += STRIDE; \
memcpy(out, &t1, sizeof t1); \
} while (0)
if (likely(c->size == sizeof(size_t)))
REORDER_23(size_t, sizeof(size_t), 0);
else if (likely(c->size == sizeof(int)))
REORDER_23(int, sizeof(int), 0);
else {
size_t offset = 0, step = sizeof(size_t);
for (; offset + step <= c->size; offset += step)
REORDER_23(size_t, c->size, offset);
for (; offset < c->size; offset++)
REORDER_23(char, c->size, offset);
}
}
/* Like reorder23, but permute 4 or 5 elements. */
static void
reorder45(struct sort_ctx *c, char *e0, char *e1, char *e2, char *e3, char *e4)
{
#define REORDER_45(TYPE, STRIDE, OFFSET) do { \
TYPE t0, t1, t2, t3; \
memcpy(&t0, e0 + OFFSET, sizeof t0); \
memcpy(&t1, e1 + OFFSET, sizeof t1); \
memcpy(&t2, e2 + OFFSET, sizeof t2); \
memcpy(&t3, e3 + OFFSET, sizeof t3); \
char *out = c->out + OFFSET; \
if (likely(c->n == 5)) \
memmove(out + 4*STRIDE, e4 + OFFSET, sizeof t0); \
memcpy(out, &t0, sizeof t0); out += STRIDE; \
memcpy(out, &t1, sizeof t1); out += STRIDE; \
memcpy(out, &t2, sizeof t2); out += STRIDE; \
memcpy(out, &t3, sizeof t3); \
} while (0)
if (likely(c->size == sizeof(size_t)))
REORDER_45(size_t, sizeof(size_t), 0);
else if (likely(c->size == sizeof(int)))
REORDER_45(int, sizeof(int), 0);
else {
size_t offset = 0, step = sizeof(size_t);
for (; offset + step <= c->size; offset += step)
REORDER_45(size_t, c->size, offset);
for (; offset < c->size; offset++)
REORDER_45(char, c->size, offset);
}
}
/* Helper for netsort. Invoke comparator CMP on E0 and E1.
Return E0^E1 if E0 compares less than E1, zero otherwise.
This is noinline to reduce code growth and confine invocation
to a single call site, assisting indirect branch prediction. */
noinline static intptr_t cmp1(char *e0, char *e1, cmp_fn *cmp)
{
intptr_t x = (intptr_t) e0 ^ (intptr_t) e1;
return x & (cmp(e0, e1) >> 31);
}
/* Apply a sorting network to 2 to 5 elements from IN, placing them into C->OUT.
IN may be equal to C->OUT, in which case elements are sorted in place.
For 2 or 3 elements this is a stable sort. */
static void netsort(char *in, struct sort_ctx *c)
{
#define CMP(e0, e1) do { \
intptr_t x = cmp1(e1, e0, c->cmp); \
e0 = (char *)((intptr_t)e0 ^ x); \
e1 = (char *)((intptr_t)e1 ^ x); \
} while (0)
char *e0 = in, *e1 = e0 + c->size, *e2 = e1 + c->size;
CMP(e0, e1);
if (likely(c->n == 3)) {
CMP(e1, e2);
CMP(e0, e1);
}
if (c->n <= 3) {
reorder23(c, e0, e1, e2);
return;
}
char *e3 = e2 + c->size, *e4 = e3 + c->size;
if (likely(c->n == 5)) {
CMP(e3, e4);
CMP(e2, e4);
}
CMP(e2, e3);
if (likely(c->n == 5)) {
CMP(e0, e3);
CMP(e1, e4);
}
CMP(e0, e2);
CMP(e1, e3);
CMP(e1, e2);
reorder45(c, e0, e1, e2, e3, e4);
}
/* Execute merge sort on N elements from IN, placing them into OUT,
using TMP as temporary storage if IN is equal to OUT.
This is a stable sort if netsort is used only for 2 or 3 elements. */
static void mergesort(char *in, struct sort_ctx *c, size_t n, char *out, char *tmp)
{
if (likely(n <= 5)) {
c->out = out;
c->n = n;
netsort(in, c);
return;
}
size_t nl = n / 2, nr = n - nl, sz = nl * c->size;
char *mid = in + sz, *r = out + sz, *l = in == out ? tmp : in;
/* Sort the right half, outputting to right half of OUT. */
mergesort(mid, c, nr, r, tmp);
/* Sort the left half, leaving left half of OUT free. */
mergesort(in, c, nl, l, mid);
/* Merge sorted halves given by L, R to [OUT, END). */
#define MERGE_ELTSIZE(SIZE) do { \
intptr_t mr = c->cmp(r, l) >> 31; \
intptr_t lr = (intptr_t)l ^ (intptr_t)r; \
lr = (intptr_t)l ^ (lr & mr); \
out = memcpy(out, (char *)lr, SIZE); \
out += SIZE; \
r += mr & SIZE; \
if (r == out) return; \
l += ~mr & SIZE; \
} while (r != end)
if (likely(c->cmp(r, l + (r - out) - c->size) < 0)) {
char *end = out + n * c->size;
if (sizeof(size_t) == 8 && likely(c->size == 8))
MERGE_ELTSIZE(8);
else if (likely(c->size == 4))
MERGE_ELTSIZE(4);
else
MERGE_ELTSIZE(c->size);
}
memcpy(out, l, r - out);
}
void metsort(void *base, size_t n, size_t size, cmp_fn *cmp)
{
if (n < 2)
return;
struct sort_ctx c = {cmp, base, n, size};
_Alignas(max_align_t) char scratch[256];
size_t bufsz = (n / 2) * size;
void *buf = bufsz <= sizeof scratch ? scratch : malloc(bufsz);
if (!buf) abort();
mergesort(base, &c, n, base, buf);
if (buf != scratch)
free(buf);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment