Skip to content

Instantly share code, notes, and snippets.

@zeux

zeux/benchmark.cpp Secret

Created Apr 10, 2021
Embed
What would you like to do?
radix3 vs 4
#include <cmath>
#include <string>
#include <vector>
inline unsigned int radixUnsignedInt(unsigned int v)
{
return v;
}
inline unsigned int radixInt(int v)
{
// flip sign bit
return static_cast<unsigned int>(v) ^ 0x80000000;
}
inline unsigned int radixUnsignedFloat(const float& v)
{
return *reinterpret_cast<const unsigned int*>(&v);
}
inline unsigned int radixFloat(const float& v)
{
// if sign bit is 0, flip sign bit
// if sign bit is 1, flip everything
int f = *reinterpret_cast<const int*>(&v);
unsigned int mask = int(f >> 31) | 0x80000000;
return f ^ mask;
}
template <typename T, typename Pred> inline T* radixSort3(T* e0, T* e1, size_t count, Pred pred)
{
unsigned int h[2048*3];
for (size_t i = 0; i < 2048*3; ++i) h[i] = 0;
unsigned int* h0 = h;
unsigned int* h1 = h + 2048;
unsigned int* h2 = h + 2048*2;
T* e0_end = e0 + count;
#define _0(h) ((h) & 2047)
#define _1(h) (((h) >> 11) & 2047)
#define _2(h) ((h) >> 22)
// fill histogram
for (const T* i = e0; i != e0_end; ++i)
{
unsigned int h = pred(*i);
h0[_0(h)]++; h1[_1(h)]++; h2[_2(h)]++;
}
// compute offsets
{
unsigned int sum0 = 0, sum1 = 0, sum2 = 0;
for (unsigned int i = 0; i < 2048; ++i)
{
unsigned int c0 = h0[i];
unsigned int c1 = h1[i];
unsigned int c2 = h2[i];
h0[i] = sum0;
h1[i] = sum1;
h2[i] = sum2;
sum0 += c0;
sum1 += c1;
sum2 += c2;
}
}
for (size_t i = 0; i < count; ++i)
{
unsigned int h = pred(e0[i]);
e1[h0[_0(h)]++] = e0[i];
}
for (size_t i = 0; i < count; ++i)
{
unsigned int h = pred(e1[i]);
e0[h1[_1(h)]++] = e1[i];
}
for (size_t i = 0; i < count; ++i)
{
unsigned int h = pred(e0[i]);
e1[h2[_2(h)]++] = e0[i];
}
#undef _0
#undef _1
#undef _2
return e1;
}
template <typename T, typename Pred> inline T* radixSort4(T* e0, T* e1, size_t count, Pred pred)
{
unsigned int h[256*4];
for (size_t i = 0; i < 256*4; ++i) h[i] = 0;
unsigned int* h0 = h;
unsigned int* h1 = h + 256;
unsigned int* h2 = h + 256*2;
unsigned int* h3 = h + 256*3;
T* e0_end = e0 + count;
#define _0(h) ((h) & 255)
#define _1(h) (((h) >> 8) & 255)
#define _2(h) (((h) >> 16) & 255)
#define _3(h) ((h) >> 24)
// fill histogram
for (const T* i = e0; i != e0_end; ++i)
{
unsigned int h = pred(*i);
h0[_0(h)]++; h1[_1(h)]++; h2[_2(h)]++; h3[_3(h)]++;
}
// compute offsets
{
unsigned int sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
for (unsigned int i = 0; i < 256; ++i)
{
unsigned int c0 = h0[i];
unsigned int c1 = h1[i];
unsigned int c2 = h2[i];
unsigned int c3 = h3[i];
h0[i] = sum0;
h1[i] = sum1;
h2[i] = sum2;
h3[i] = sum3;
sum0 += c0;
sum1 += c1;
sum2 += c2;
sum3 += c3;
}
}
for (size_t i = 0; i < count; ++i)
{
unsigned int h = pred(e0[i]);
e1[h0[_0(h)]++] = e0[i];
}
for (size_t i = 0; i < count; ++i)
{
unsigned int h = pred(e1[i]);
e0[h1[_1(h)]++] = e1[i];
}
for (size_t i = 0; i < count; ++i)
{
unsigned int h = pred(e0[i]);
e1[h2[_2(h)]++] = e0[i];
}
for (size_t i = 0; i < count; ++i)
{
unsigned int h = pred(e1[i]);
e0[h3[_3(h)]++] = e1[i];
}
#undef _0
#undef _1
#undef _2
#undef _3
return e0;
}
const double kBenchRun = 0.1;
#if defined(__linux__)
double timestamp() {
timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec);
}
#elif defined(_WIN32)
struct LARGE_INTEGER {
__int64 QuadPart;
};
extern "C" __declspec(dllimport) int __stdcall QueryPerformanceCounter(
LARGE_INTEGER *lpPerformanceCount);
extern "C" __declspec(dllimport) int __stdcall QueryPerformanceFrequency(
LARGE_INTEGER *lpFrequency);
double timestamp() {
LARGE_INTEGER freq, counter;
QueryPerformanceFrequency(&freq);
QueryPerformanceCounter(&counter);
return double(counter.QuadPart) / double(freq.QuadPart);
}
#else
double timestamp() { return double(clock()) / double(CLOCKS_PER_SEC); }
#endif
typedef struct {
uint64_t state;
uint64_t inc;
} pcg32_random_t;
uint32_t pcg32_random_r(pcg32_random_t *rng) {
uint64_t oldstate = rng->state;
// Advance internal state
rng->state = oldstate * 6364136223846793005ULL + (rng->inc | 1);
// Calculate output function (XSH RR), uses old state for max ILP
uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u;
uint32_t rot = oldstate >> 59u;
return (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
}
template <typename T, typename Sort, typename Pred>
double runbench(Sort sort, const std::vector<T> &data, Pred pred) {
double divider = data.size();
std::vector<T> copy(data.size());
std::vector<T> copy2(data.size());
double time = 0;
double start = timestamp();
while (timestamp() - start < kBenchRun) {
copy = data;
double ts0 = timestamp();
sort(copy.data(), copy2.data(), data.size(), pred);
double ts1 = timestamp();
if (ts1 - ts0 < time || time == 0) time = ts1 - ts0;
}
return time * 1e9 / divider;
}
template <typename T, typename Pred>
void bench(const std::string &name, const std::vector<T> &data, Pred pred) {
double t1 = runbench(radixSort3<T, Pred>, data, pred);
double t2 = runbench(radixSort4<T, Pred>, data, pred);
printf("%s | %.2f ns/op | %.2f ns/op\n", name.c_str(), t1, t2);
}
int main() {
pcg32_random_t rng = {42, 0};
std::vector<uint32_t> test(1000000);
for (size_t i = 0; i < test.size(); ++i) test[i] = pcg32_random_r(&rng);
bench("random int", test, [](int i) { return radixInt(i); });
std::vector<float> test2(1000000);
for (size_t i = 0; i < test2.size(); ++i) test2[i] = pcg32_random_r(&rng) / 4e9;
bench("random float", test2, [](float i) { return radixUnsignedFloat(i); });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment