@Alexis-D
Created February 27, 2012 11:20
Parallel Radix Sort with OpenMP
#include <stdlib.h>
#include <string.h>
#define INITIAL_SHIFT (((sizeof (long long)) - 1) * 010)
#define NTHREADS 64
#define SWITCH_TO_INSERTION 16
#define SWITCH_TO_QSORT 8192
#define DO_NOT_RADIX 2626144
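/*
* INITIAL_SHIFT: shift selecting the most significant byte of a
*                long long (56 when long long is 8 bytes; 010 is octal 8)
* NTHREADS: number of OpenMP threads requested
* SWITCH_TO_INSERTION: below this size q_sort uses insertion sort
* SWITCH_TO_QSORT: below this size _radix_sort falls back to q_sort
* DO_NOT_RADIX: below this size psort skips the radix pass entirely
*/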
/*
* Notes:
* - We assume 8-bit bytes
* - Lots of small (dirty) hacks to win a few cycles here & there
* - Compiles only with GCC (add a few macros and it should work everywhere)
*/
static inline unsigned long long max(unsigned long long, unsigned long long);
static void q_sort(unsigned long long [], size_t);
static void _radix_sort(unsigned long long [], unsigned long long [],
register size_t, unsigned long long);
static inline void radix_sort(long long [], size_t, unsigned long long);
void psort(long long [], int);
static inline unsigned long long max(unsigned long long a, unsigned long long b) {
/* return the max of a and b */
return a > b ? a : b;
}
static void q_sort(unsigned long long a[], size_t size) {
/*
* sort the array a of size `size` in place using quicksort;
* once a (sub)array is smaller than SWITCH_TO_INSERTION,
* switch to insertion sort (faster on small arrays thanks
* to better cache behaviour)
*/
if(size < SWITCH_TO_INSERTION) {
/*
* inlined insertion sort to avoid a function call
* (an inline function would work just as well)
*/
// hint GCC (__builtin_expect) that this branch is unlikely
if(__builtin_expect(size < 2, 0)) {
return;
}
register unsigned long long v;
register size_t i, j;
for(i = 0; i < size; ++i) {
v = a[i];
for(j = i; j && v < a[j - 1]; --j) {
a[j] = a[j - 1];
}
a[j] = v;
}
return;
}
register unsigned long long
mid = a[size >> 1],
*l = a,
*r = a + size - 1,
t,
/*
* take the pivot as the median of these 3 items to
* avoid a degenerate (quadratic) qsort (why not
* take the average?)
*/
p = *l < mid ?
(mid < *r ? mid : max(*r, *l))
: (*l < *r ? *l : max(mid, *r));
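// e.g. *l == 5, mid == 9, *r == 1 gives p == 5, the median of the three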
while(l <= r) {
// skip elements already on the correct side of the pivot
while(*l < p) {
l++;
}
// same thing, coming from the right
while(*r > p) {
r--;
}
// if we haven't finished partitioning, swap the elements
if(l <= r) {
t = *l;
*l++ = *r;
*r-- = t;
}
}
// sort sub-arrays
q_sort(a, (r - a) + 1);
q_sort(l, (a - l) + size);
}
static void _radix_sort(unsigned long long a[], unsigned long long *buffer,
register size_t size, unsigned long long shift) {
/*
* sort the array a using the memory provided by buffer.
* a & buffer must both hold `size` elements; shift selects
* the digit currently being sorted on (in base 0x100)
*/
// stop condition if we only use radix
// if(size < 2) {
// return;
// }
// switch to a quick sort if array is small or shift == 0
if(size < SWITCH_TO_QSORT || !shift) {
q_sort(a, size);
return;
}
register size_t i, radix;
size_t buckets_size[0x100] = {0}, // size needed in each bucket
buckets_index[0x100] = {0}, // next free slot within each bucket
buckets_cindex[0x101] = {0}; // cumulative offsets, 0x101 slots because of the i + 1 below
// count how many numbers fall into each bucket
if(__builtin_expect(shift == INITIAL_SHIFT, 0)) {
size_t local_buckets_size[0x100] = {0}; // size needed in each bucket/thread
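/*
* firstprivate gives each thread its own zeroed copy of
* local_buckets_size; the partial histograms are then merged,
* one thread at a time, in the critical section below
*/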
#pragma omp parallel num_threads(NTHREADS) firstprivate(local_buckets_size) private(radix)
{
#pragma omp for nowait
for(i = 0; i < size; ++i) {
// count this number in its thread-local bucket
radix = (a[i] >> shift) & 0xff;
++local_buckets_size[radix];
}
#pragma omp critical
// decreasing loop *may* be faster on some architectures
for(i = 0x100; i--;) {
buckets_size[i] += local_buckets_size[i];
}
}
}
else {
for(i = size; i;) {
radix = (a[--i] >> shift) & 0xff;
++buckets_size[radix];
}
}
unsigned long long *buckets[0x100];
register unsigned long long *ptr = buffer;
for(i = 0; i < 0x100; ++i) {
buckets[i] = ptr;
ptr += (unsigned long long)buckets_size[i];
buckets_cindex[i + 1] = buckets_cindex[i] + buckets_size[i];
}
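/*
* e.g. buckets_size == {3, 1, 4, ...} puts the buckets at offsets
* 0, 3, 4, 8, ... into buffer and yields buckets_cindex == {0, 3, 4, 8, ...}
*/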
// add the numbers in the correct buckets
for(i = size; i--;) {
radix = (a[i] >> shift) & 0xff;
buckets[radix][buckets_index[radix]++] = a[i];
}
// the next level of recursion looks at the next (less significant) byte
const register unsigned long long new_shift = shift - 010;
/*
* copy buffer back into a BEFORE recursing, and then free buffer:
* much better for space complexity (need to do a scan/fold
* in the bucket-filling loop to be efficient & easily
* parallelized + memcpy :))
*/
if(__builtin_expect(shift == INITIAL_SHIFT, 0)) {
#pragma omp parallel for num_threads(NTHREADS)
for(i = 0; i < 0x100; ++i) {
_radix_sort(buckets[i], a + buckets_cindex[i], buckets_size[i], new_shift);
memcpy(a + buckets_cindex[i], buckets[i], buckets_size[i] * sizeof (unsigned long long));
}
}
else {
// already sorted if shift == 0
// if(shift) {
for(i = 0; i < 0x100; ++i) {
_radix_sort(buckets[i], a + buckets_cindex[i], buckets_size[i], new_shift);
}
// }
memcpy(a, buffer, size * sizeof (unsigned long long));
}
}
static inline void radix_sort(long long a[], size_t size,
unsigned long long shift) {
const register unsigned long long
xor = ((unsigned long long)0x80) << shift;
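// with shift == INITIAL_SHIFT this is 1ULL << 63, i.e. the sign bit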
register size_t i;
long long *copy = a;
#pragma omp parallel for num_threads(NTHREADS)
for(i = 0; i < size; ++i) {
/*
* flip the first bit, and then treat number as unsigned.
* here is why:
* min long long: 0b100...000
* min + 1 : 0b100...001
* min + 2 : 0b100...010
* ...
* 0 : 0b000...000
* 1 : 0b000...001
* 2 : 0b000...010
* ...
* max - 2 : 0b011...101
* max - 1 : 0b011...110
* max long long: 0b011...111
*
* if we flip the first bit and treat the numbers as
* unsigned, then a number whose most significant byte is
* bigger than that of another number is itself bigger
* (that would be false with signed long long, e.g.
* most_significant_byte(-1) > most_significant_byte(0)),
* so the xor trick + unsigned comparison solves the problem
*/
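/*
* concretely: -1 == 0xffff...ffff and 0 == 0x0000...0000 become
* 0x7fff...ffff and 0x8000...0000 after the flip, so the unsigned,
* most-significant-byte-first comparison orders them as -1 < 0
*/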
copy[i] ^= xor;
}
unsigned long long *buffer = malloc(sizeof (unsigned long long) * size);
// the magic happens here
_radix_sort((unsigned long long *)a, buffer, size, shift);
free(buffer);
#pragma omp parallel for num_threads(NTHREADS)
for(i = 0; i < size; ++i) {
// restore the true value of numbers
a[i] ^= xor;
}
}
void psort(long long a[], int size) {
/* sort a in place */
// switch to q_sort if size < DO_NOT_RADIX
if(__builtin_expect(size < DO_NOT_RADIX, 0)) {
register size_t i;
const register unsigned long long
xor = ((unsigned long long)0x80) << INITIAL_SHIFT;
for(i = 0; i < size; ++i) {
a[i] ^= xor;
}
q_sort((unsigned long long *)a, size);
for(i = 0; i < size; ++i) {
a[i] ^= xor;
}
}
else {
/*
* shift equals the number of bits in a long long
* minus 8 at the first level of recursion.
*/
radix_sort(a, size, INITIAL_SHIFT);
}
}
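/*
* Minimal usage sketch: the PSORT_DEMO guard, the gcc invocation and
* the file name psort.c are illustrative assumptions, not part of the
* sort itself. Compile with something like:
*   gcc -O2 -fopenmp -DPSORT_DEMO psort.c
*/
#ifdef PSORT_DEMO
#include <stdio.h>
#include <time.h>
int main(void) {
// big enough (> DO_NOT_RADIX) to exercise the parallel radix path
const int n = 1 << 22;
long long *a = malloc(n * sizeof *a);
int i;
if(a == NULL) {
return 1;
}
// fill with pseudo-random signed values, negatives included
srand((unsigned)time(NULL));
for(i = 0; i < n; ++i) {
a[i] = ((long long)rand() << 32) ^ rand();
if(rand() & 1) {
a[i] = -a[i];
}
}
psort(a, n);
// verify the result is non-decreasing
for(i = 1; i < n; ++i) {
if(a[i - 1] > a[i]) {
puts("not sorted");
free(a);
return 1;
}
}
puts("sorted");
free(a);
return 0;
}
#endif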