public
Last active

Quick parallel out-of-place radix-sort implementation

  • Download Gist
parallel_radix_sort.cpp
C++
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <pthread.h>
#include <random>
 
const int DATA_LENGTH = 1000000000;
typedef uint32_t number;
 
void print_time(const char* msg) {
static auto time_previous = std::chrono::high_resolution_clock::now();
auto time_now = std::chrono::high_resolution_clock::now();
printf("%s: %d ms\n",
msg,
(int)std::chrono::duration_cast<std::chrono::milliseconds>(time_now - time_previous).count());
time_previous = time_now;
}
 
void one_step_radix_sort(number *data, number *sorted_data, uint32_t *counts,
int length, int current_cut, int cut_step) {
memset(counts, 0, (1 << cut_step) * sizeof(uint32_t));
uint32_t mask = (1 << cut_step) - 1;
for (int i = 0; i < length; ++i) ++counts[(data[i] >> current_cut) & mask];
for (int i = 1; i < (1 << cut_step); ++i) counts[i] += counts[i - 1];
for (int i = length - 1; i >= 0; --i)
sorted_data[--counts[(data[i] >> current_cut) & mask]] = data[i];
}
 
void radix_sort(number *data, int length, int cut_step = 8) {
auto sorted_data = new number[length];
auto counts = new uint32_t[1 << cut_step];
for (int cut = 0; cut < 64; cut += cut_step) {
one_step_radix_sort(data, sorted_data, counts, length, cut, cut_step);
memcpy(data, sorted_data, length * sizeof(number));
}
delete sorted_data;
}
 
struct radix_sort_arguments {
number *data;
int length;
int cut_step;
};
 
void *one_arg_radix_sort(void *arg) {
radix_sort_arguments arguments = *((radix_sort_arguments*) arg);
radix_sort(arguments.data, arguments.length, arguments.cut_step);
return NULL;
}
 
void parallel_radix_sort(number *data, int length, int front_cut = 3) {
auto sorted_data = new number[length];
uint32_t counts[(1 << front_cut) + 1];
pthread_t threads[1 << front_cut];
one_step_radix_sort(data, sorted_data, counts, length, 64 - front_cut, front_cut);
memcpy(data, sorted_data, length * sizeof(number));
delete sorted_data;
counts[1 << front_cut] = length;
radix_sort_arguments arguments[1 << front_cut];
for (int i = 0; i < (1 << front_cut); ++i) {
arguments[i] = (radix_sort_arguments) {
data + counts[i],
(int) (counts[i + 1] - counts[i]),
8
};
pthread_create(&threads[i],
NULL,
one_arg_radix_sort,
&arguments[i]);
}
for (int i = 0; i < (1 << front_cut); ++i)
pthread_join(threads[i], NULL);
}
 
int main() {
auto data = new number[DATA_LENGTH];
auto data2 = new number[DATA_LENGTH];
auto data3 = new number[DATA_LENGTH];
 
print_time("start");
std::minstd_rand rng;
std::uniform_int_distribution<number> uniform_uint64;
for (int i = 0; i < DATA_LENGTH; ++i)
data[i] = uniform_uint64(rng);
print_time("random number generation");
 
memcpy(data2, data, DATA_LENGTH * sizeof(number));
memcpy(data3, data, DATA_LENGTH * sizeof(number));
print_time("data copy");
 
parallel_radix_sort(data, DATA_LENGTH);
print_time("parallel sort");
 
radix_sort(data2, DATA_LENGTH);
print_time("sequential sort");
 
std::sort(data3, data3 + DATA_LENGTH);
print_time("C++ sort");
 
for (int i = 0; i < DATA_LENGTH; ++i)
if (data[i] != data2[i])
std::cout << "FAILURE: " << data[i] << " != " << data2[i] << std::endl;
for (int i = 0; i < DATA_LENGTH; ++i)
if (data[i] != data3[i])
std::cout << "FAILURE: " << data[i] << " != " << data3[i] << std::endl;
print_time("checking");
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.