-
-
Save zeux/2e4edeef8a09ee0bfab3ec76858aaec1 to your computer and use it in GitHub Desktop.
radix3 vs 4
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cmath> | |
#include <string> | |
#include <vector> | |
inline unsigned int radixUnsignedInt(unsigned int v) | |
{ | |
return v; | |
} | |
inline unsigned int radixInt(int v) | |
{ | |
// flip sign bit | |
return static_cast<unsigned int>(v) ^ 0x80000000; | |
} | |
inline unsigned int radixUnsignedFloat(const float& v) | |
{ | |
return *reinterpret_cast<const unsigned int*>(&v); | |
} | |
inline unsigned int radixFloat(const float& v) | |
{ | |
// if sign bit is 0, flip sign bit | |
// if sign bit is 1, flip everything | |
int f = *reinterpret_cast<const int*>(&v); | |
unsigned int mask = int(f >> 31) | 0x80000000; | |
return f ^ mask; | |
} | |
template <typename T, typename Pred> inline T* radixSort3(T* e0, T* e1, size_t count, Pred pred) | |
{ | |
unsigned int h[2048*3]; | |
for (size_t i = 0; i < 2048*3; ++i) h[i] = 0; | |
unsigned int* h0 = h; | |
unsigned int* h1 = h + 2048; | |
unsigned int* h2 = h + 2048*2; | |
T* e0_end = e0 + count; | |
#define _0(h) ((h) & 2047) | |
#define _1(h) (((h) >> 11) & 2047) | |
#define _2(h) ((h) >> 22) | |
// fill histogram | |
for (const T* i = e0; i != e0_end; ++i) | |
{ | |
unsigned int h = pred(*i); | |
h0[_0(h)]++; h1[_1(h)]++; h2[_2(h)]++; | |
} | |
// compute offsets | |
{ | |
unsigned int sum0 = 0, sum1 = 0, sum2 = 0; | |
for (unsigned int i = 0; i < 2048; ++i) | |
{ | |
unsigned int c0 = h0[i]; | |
unsigned int c1 = h1[i]; | |
unsigned int c2 = h2[i]; | |
h0[i] = sum0; | |
h1[i] = sum1; | |
h2[i] = sum2; | |
sum0 += c0; | |
sum1 += c1; | |
sum2 += c2; | |
} | |
} | |
for (size_t i = 0; i < count; ++i) | |
{ | |
unsigned int h = pred(e0[i]); | |
e1[h0[_0(h)]++] = e0[i]; | |
} | |
for (size_t i = 0; i < count; ++i) | |
{ | |
unsigned int h = pred(e1[i]); | |
e0[h1[_1(h)]++] = e1[i]; | |
} | |
for (size_t i = 0; i < count; ++i) | |
{ | |
unsigned int h = pred(e0[i]); | |
e1[h2[_2(h)]++] = e0[i]; | |
} | |
#undef _0 | |
#undef _1 | |
#undef _2 | |
return e1; | |
} | |
template <typename T, typename Pred> inline T* radixSort4(T* e0, T* e1, size_t count, Pred pred) | |
{ | |
unsigned int h[256*4]; | |
for (size_t i = 0; i < 256*4; ++i) h[i] = 0; | |
unsigned int* h0 = h; | |
unsigned int* h1 = h + 256; | |
unsigned int* h2 = h + 256*2; | |
unsigned int* h3 = h + 256*3; | |
T* e0_end = e0 + count; | |
#define _0(h) ((h) & 255) | |
#define _1(h) (((h) >> 8) & 255) | |
#define _2(h) (((h) >> 16) & 255) | |
#define _3(h) ((h) >> 24) | |
// fill histogram | |
for (const T* i = e0; i != e0_end; ++i) | |
{ | |
unsigned int h = pred(*i); | |
h0[_0(h)]++; h1[_1(h)]++; h2[_2(h)]++; h3[_3(h)]++; | |
} | |
// compute offsets | |
{ | |
unsigned int sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0; | |
for (unsigned int i = 0; i < 256; ++i) | |
{ | |
unsigned int c0 = h0[i]; | |
unsigned int c1 = h1[i]; | |
unsigned int c2 = h2[i]; | |
unsigned int c3 = h3[i]; | |
h0[i] = sum0; | |
h1[i] = sum1; | |
h2[i] = sum2; | |
h3[i] = sum3; | |
sum0 += c0; | |
sum1 += c1; | |
sum2 += c2; | |
sum3 += c3; | |
} | |
} | |
for (size_t i = 0; i < count; ++i) | |
{ | |
unsigned int h = pred(e0[i]); | |
e1[h0[_0(h)]++] = e0[i]; | |
} | |
for (size_t i = 0; i < count; ++i) | |
{ | |
unsigned int h = pred(e1[i]); | |
e0[h1[_1(h)]++] = e1[i]; | |
} | |
for (size_t i = 0; i < count; ++i) | |
{ | |
unsigned int h = pred(e0[i]); | |
e1[h2[_2(h)]++] = e0[i]; | |
} | |
for (size_t i = 0; i < count; ++i) | |
{ | |
unsigned int h = pred(e1[i]); | |
e0[h3[_3(h)]++] = e1[i]; | |
} | |
#undef _0 | |
#undef _1 | |
#undef _2 | |
#undef _3 | |
return e0; | |
} | |
const double kBenchRun = 0.1; | |
#if defined(__linux__) | |
double timestamp() { | |
timespec ts; | |
clock_gettime(CLOCK_MONOTONIC, &ts); | |
return double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec); | |
} | |
#elif defined(_WIN32) | |
struct LARGE_INTEGER { | |
__int64 QuadPart; | |
}; | |
extern "C" __declspec(dllimport) int __stdcall QueryPerformanceCounter( | |
LARGE_INTEGER *lpPerformanceCount); | |
extern "C" __declspec(dllimport) int __stdcall QueryPerformanceFrequency( | |
LARGE_INTEGER *lpFrequency); | |
double timestamp() { | |
LARGE_INTEGER freq, counter; | |
QueryPerformanceFrequency(&freq); | |
QueryPerformanceCounter(&counter); | |
return double(counter.QuadPart) / double(freq.QuadPart); | |
} | |
#else | |
double timestamp() { return double(clock()) / double(CLOCKS_PER_SEC); } | |
#endif | |
typedef struct { | |
uint64_t state; | |
uint64_t inc; | |
} pcg32_random_t; | |
uint32_t pcg32_random_r(pcg32_random_t *rng) { | |
uint64_t oldstate = rng->state; | |
// Advance internal state | |
rng->state = oldstate * 6364136223846793005ULL + (rng->inc | 1); | |
// Calculate output function (XSH RR), uses old state for max ILP | |
uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; | |
uint32_t rot = oldstate >> 59u; | |
return (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); | |
} | |
template <typename T, typename Sort, typename Pred> | |
double runbench(Sort sort, const std::vector<T> &data, Pred pred) { | |
double divider = data.size(); | |
std::vector<T> copy(data.size()); | |
std::vector<T> copy2(data.size()); | |
double time = 0; | |
double start = timestamp(); | |
while (timestamp() - start < kBenchRun) { | |
copy = data; | |
double ts0 = timestamp(); | |
sort(copy.data(), copy2.data(), data.size(), pred); | |
double ts1 = timestamp(); | |
if (ts1 - ts0 < time || time == 0) time = ts1 - ts0; | |
} | |
return time * 1e9 / divider; | |
} | |
template <typename T, typename Pred> | |
void bench(const std::string &name, const std::vector<T> &data, Pred pred) { | |
double t1 = runbench(radixSort3<T, Pred>, data, pred); | |
double t2 = runbench(radixSort4<T, Pred>, data, pred); | |
printf("%s | %.2f ns/op | %.2f ns/op\n", name.c_str(), t1, t2); | |
} | |
int main() { | |
pcg32_random_t rng = {42, 0}; | |
std::vector<uint32_t> test(1000000); | |
for (size_t i = 0; i < test.size(); ++i) test[i] = pcg32_random_r(&rng); | |
bench("random int", test, [](int i) { return radixInt(i); }); | |
std::vector<float> test2(1000000); | |
for (size_t i = 0; i < test2.size(); ++i) test2[i] = pcg32_random_r(&rng) / 4e9; | |
bench("random float", test2, [](float i) { return radixUnsignedFloat(i); }); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment