zeux/benchmark.cpp Secret

## benchmark.cpp
#include <cmath>
#include <string>
#include <vector>

inline unsigned int radixUnsignedInt(unsigned int v)
{
  return v;
}

inline unsigned int radixInt(int v)
{
  // flip sign bit
  return static_cast<unsigned int>(v) ^ 0x80000000;
}

inline unsigned int radixUnsignedFloat(const float& v)
{
  return *reinterpret_cast<const unsigned int*>(&v);
}

inline unsigned int radixFloat(const float& v)
{
  // if sign bit is 0, flip sign bit
  // if sign bit is 1, flip everything
  int f = *reinterpret_cast<const int*>(&v);
  unsigned int mask = int(f >> 31) | 0x80000000;
  return f ^ mask;
}

template <typename T, typename Pred> inline T* radixSort3(T* e0, T* e1, size_t count, Pred pred)
{
  unsigned int h[2048*3];

  for (size_t i = 0; i < 2048*3; ++i) h[i] = 0;

  unsigned int* h0 = h;
  unsigned int* h1 = h + 2048;
  unsigned int* h2 = h + 2048*2;

  T* e0_end = e0 + count;

  #define _0(h) ((h) & 2047)
  #define _1(h) (((h) >> 11) & 2047)
  #define _2(h) ((h) >> 22)

  // fill histogram
  for (const T* i = e0; i != e0_end; ++i)
  {
    unsigned int h = pred(*i);

    h0[_0(h)]++; h1[_1(h)]++; h2[_2(h)]++;
  }

  // compute offsets
  {
    unsigned int sum0 = 0, sum1 = 0, sum2 = 0;

    for (unsigned int i = 0; i < 2048; ++i)
    {
      unsigned int c0 = h0[i];
      unsigned int c1 = h1[i];
      unsigned int c2 = h2[i];

      h0[i] = sum0;
      h1[i] = sum1;
      h2[i] = sum2;

      sum0 += c0;
      sum1 += c1;
      sum2 += c2;
    }
  }

  for (size_t i = 0; i < count; ++i)
  {
    unsigned int h = pred(e0[i]);
    e1[h0[_0(h)]++] = e0[i];
  }

  for (size_t i = 0; i < count; ++i)
  {
    unsigned int h = pred(e1[i]);
    e0[h1[_1(h)]++] = e1[i];
  }

  for (size_t i = 0; i < count; ++i)
  {
    unsigned int h = pred(e0[i]);
    e1[h2[_2(h)]++] = e0[i];
  }

  #undef _0
  #undef _1
  #undef _2

  return e1;
}

template <typename T, typename Pred> inline T* radixSort4(T* e0, T* e1, size_t count, Pred pred)
{
  unsigned int h[256*4];

  for (size_t i = 0; i < 256*4; ++i) h[i] = 0;

  unsigned int* h0 = h;
  unsigned int* h1 = h + 256;
  unsigned int* h2 = h + 256*2;
  unsigned int* h3 = h + 256*3;

  T* e0_end = e0 + count;

  #define _0(h) ((h) & 255)
  #define _1(h) (((h) >> 8) & 255)
  #define _2(h) (((h) >> 16) & 255)
  #define _3(h) ((h) >> 24)

  // fill histogram
  for (const T* i = e0; i != e0_end; ++i)
  {
    unsigned int h = pred(*i);

    h0[_0(h)]++; h1[_1(h)]++; h2[_2(h)]++; h3[_3(h)]++;
  }

  // compute offsets
  {
    unsigned int sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;

    for (unsigned int i = 0; i < 256; ++i)
    {
      unsigned int c0 = h0[i];
      unsigned int c1 = h1[i];
      unsigned int c2 = h2[i];
      unsigned int c3 = h3[i];

      h0[i] = sum0;
      h1[i] = sum1;
      h2[i] = sum2;
      h3[i] = sum3;

      sum0 += c0;
      sum1 += c1;
      sum2 += c2;
      sum3 += c3;
    }
  }

  for (size_t i = 0; i < count; ++i)
  {
    unsigned int h = pred(e0[i]);
    e1[h0[_0(h)]++] = e0[i];
  }

  for (size_t i = 0; i < count; ++i)
  {
    unsigned int h = pred(e1[i]);
    e0[h1[_1(h)]++] = e1[i];
  }

  for (size_t i = 0; i < count; ++i)
  {
    unsigned int h = pred(e0[i]);
    e1[h2[_2(h)]++] = e0[i];
  }

  for (size_t i = 0; i < count; ++i)
  {
    unsigned int h = pred(e1[i]);
    e0[h3[_3(h)]++] = e1[i];
  }

  #undef _0
  #undef _1
  #undef _2
  #undef _3

  return e0;
}

const double kBenchRun = 0.1;

#if defined(__linux__)
double timestamp() {
  timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec);
}
#elif defined(_WIN32)
struct LARGE_INTEGER {
  __int64 QuadPart;
};
extern "C" __declspec(dllimport) int __stdcall QueryPerformanceCounter(
    LARGE_INTEGER *lpPerformanceCount);
extern "C" __declspec(dllimport) int __stdcall QueryPerformanceFrequency(
    LARGE_INTEGER *lpFrequency);

double timestamp() {
  LARGE_INTEGER freq, counter;
  QueryPerformanceFrequency(&freq);
  QueryPerformanceCounter(&counter);
  return double(counter.QuadPart) / double(freq.QuadPart);
}
#else
double timestamp() { return double(clock()) / double(CLOCKS_PER_SEC); }
#endif

typedef struct {
  uint64_t state;
  uint64_t inc;
} pcg32_random_t;

uint32_t pcg32_random_r(pcg32_random_t *rng) {
  uint64_t oldstate = rng->state;
  // Advance internal state
  rng->state = oldstate * 6364136223846793005ULL + (rng->inc | 1);
  // Calculate output function (XSH RR), uses old state for max ILP
  uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u;
  uint32_t rot = oldstate >> 59u;
  return (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
}

template <typename T, typename Sort, typename Pred>
double runbench(Sort sort, const std::vector<T> &data, Pred pred) {
  double divider = data.size();

  std::vector<T> copy(data.size());
  std::vector<T> copy2(data.size());

  double time = 0;
  double start = timestamp();

  while (timestamp() - start < kBenchRun) {
    copy = data;

    double ts0 = timestamp();
    sort(copy.data(), copy2.data(), data.size(), pred);
    double ts1 = timestamp();

    if (ts1 - ts0 < time || time == 0) time = ts1 - ts0;
  }

  return time * 1e9 / divider;
}

template <typename T, typename Pred>
void bench(const std::string &name, const std::vector<T> &data, Pred pred) {
  double t1 = runbench(radixSort3<T, Pred>, data, pred);
  double t2 = runbench(radixSort4<T, Pred>, data, pred);

  printf("%s | %.2f ns/op | %.2f ns/op\n", name.c_str(), t1, t2);
}

int main() {
  pcg32_random_t rng = {42, 0};
  std::vector<uint32_t> test(1000000);
  for (size_t i = 0; i < test.size(); ++i) test[i] = pcg32_random_r(&rng);

  bench("random int", test, [](int i) { return radixInt(i); });

  std::vector<float> test2(1000000);
  for (size_t i = 0; i < test2.size(); ++i) test2[i] = pcg32_random_r(&rng) / 4e9;

  bench("random float", test2, [](float i) { return radixUnsignedFloat(i); });
}
	#include <cmath>
	#include <string>
	#include <vector>

	inline unsigned int radixUnsignedInt(unsigned int v)
	{
	return v;
	}

	inline unsigned int radixInt(int v)
	{
	// flip sign bit
	return static_cast<unsigned int>(v) ^ 0x80000000;
	}

	inline unsigned int radixUnsignedFloat(const float& v)
	{
	return reinterpret_cast<const unsigned int>(&v);
	}

	inline unsigned int radixFloat(const float& v)
	{
	// if sign bit is 0, flip sign bit
	// if sign bit is 1, flip everything
	int f = reinterpret_cast<const int>(&v);
	unsigned int mask = int(f >> 31) \| 0x80000000;
	return f ^ mask;
	}

	template <typename T, typename Pred> inline T* radixSort3(T* e0, T* e1, size_t count, Pred pred)
	{
	unsigned int h[2048*3];

	for (size_t i = 0; i < 2048*3; ++i) h[i] = 0;

	unsigned int* h0 = h;
	unsigned int* h1 = h + 2048;
	unsigned int* h2 = h + 2048*2;

	T* e0_end = e0 + count;

	#define _0(h) ((h) & 2047)
	#define _1(h) (((h) >> 11) & 2047)
	#define _2(h) ((h) >> 22)

	// fill histogram
	for (const T* i = e0; i != e0_end; ++i)
	{
	unsigned int h = pred(*i);

	h0[_0(h)]++; h1[_1(h)]++; h2[_2(h)]++;
	}

	// compute offsets
	{
	unsigned int sum0 = 0, sum1 = 0, sum2 = 0;

	for (unsigned int i = 0; i < 2048; ++i)
	{
	unsigned int c0 = h0[i];
	unsigned int c1 = h1[i];
	unsigned int c2 = h2[i];

	h0[i] = sum0;
	h1[i] = sum1;
	h2[i] = sum2;

	sum0 += c0;
	sum1 += c1;
	sum2 += c2;
	}
	}

	for (size_t i = 0; i < count; ++i)
	{
	unsigned int h = pred(e0[i]);
	e1[h0[_0(h)]++] = e0[i];
	}

	for (size_t i = 0; i < count; ++i)
	{
	unsigned int h = pred(e1[i]);
	e0[h1[_1(h)]++] = e1[i];
	}

	for (size_t i = 0; i < count; ++i)
	{
	unsigned int h = pred(e0[i]);
	e1[h2[_2(h)]++] = e0[i];
	}

	#undef _0
	#undef _1
	#undef _2

	return e1;
	}

	template <typename T, typename Pred> inline T* radixSort4(T* e0, T* e1, size_t count, Pred pred)
	{
	unsigned int h[256*4];

	for (size_t i = 0; i < 256*4; ++i) h[i] = 0;

	unsigned int* h0 = h;
	unsigned int* h1 = h + 256;
	unsigned int* h2 = h + 256*2;
	unsigned int* h3 = h + 256*3;

	T* e0_end = e0 + count;

	#define _0(h) ((h) & 255)
	#define _1(h) (((h) >> 8) & 255)
	#define _2(h) (((h) >> 16) & 255)
	#define _3(h) ((h) >> 24)

	// fill histogram
	for (const T* i = e0; i != e0_end; ++i)
	{
	unsigned int h = pred(*i);

	h0[_0(h)]++; h1[_1(h)]++; h2[_2(h)]++; h3[_3(h)]++;
	}

	// compute offsets
	{
	unsigned int sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;

	for (unsigned int i = 0; i < 256; ++i)
	{
	unsigned int c0 = h0[i];
	unsigned int c1 = h1[i];
	unsigned int c2 = h2[i];
	unsigned int c3 = h3[i];

	h0[i] = sum0;
	h1[i] = sum1;
	h2[i] = sum2;
	h3[i] = sum3;

	sum0 += c0;
	sum1 += c1;
	sum2 += c2;
	sum3 += c3;
	}
	}

	for (size_t i = 0; i < count; ++i)
	{
	unsigned int h = pred(e0[i]);
	e1[h0[_0(h)]++] = e0[i];
	}

	for (size_t i = 0; i < count; ++i)
	{
	unsigned int h = pred(e1[i]);
	e0[h1[_1(h)]++] = e1[i];
	}

	for (size_t i = 0; i < count; ++i)
	{
	unsigned int h = pred(e0[i]);
	e1[h2[_2(h)]++] = e0[i];
	}

	for (size_t i = 0; i < count; ++i)
	{
	unsigned int h = pred(e1[i]);
	e0[h3[_3(h)]++] = e1[i];
	}

	#undef _0
	#undef _1
	#undef _2
	#undef _3

	return e0;
	}

	const double kBenchRun = 0.1;

	#if defined(__linux__)
	double timestamp() {
	timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec);
	}
	#elif defined(_WIN32)
	struct LARGE_INTEGER {
	__int64 QuadPart;
	};
	extern "C" __declspec(dllimport) int __stdcall QueryPerformanceCounter(
	LARGE_INTEGER *lpPerformanceCount);
	extern "C" __declspec(dllimport) int __stdcall QueryPerformanceFrequency(
	LARGE_INTEGER *lpFrequency);

	double timestamp() {
	LARGE_INTEGER freq, counter;
	QueryPerformanceFrequency(&freq);
	QueryPerformanceCounter(&counter);
	return double(counter.QuadPart) / double(freq.QuadPart);
	}
	#else
	double timestamp() { return double(clock()) / double(CLOCKS_PER_SEC); }
	#endif

	typedef struct {
	uint64_t state;
	uint64_t inc;
	} pcg32_random_t;

	uint32_t pcg32_random_r(pcg32_random_t *rng) {
	uint64_t oldstate = rng->state;
	// Advance internal state
	rng->state = oldstate * 6364136223846793005ULL + (rng->inc \| 1);
	// Calculate output function (XSH RR), uses old state for max ILP
	uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u;
	uint32_t rot = oldstate >> 59u;
	return (xorshifted >> rot) \| (xorshifted << ((-rot) & 31));
	}

	template <typename T, typename Sort, typename Pred>
	double runbench(Sort sort, const std::vector<T> &data, Pred pred) {
	double divider = data.size();

	std::vector<T> copy(data.size());
	std::vector<T> copy2(data.size());

	double time = 0;
	double start = timestamp();

	while (timestamp() - start < kBenchRun) {
	copy = data;

	double ts0 = timestamp();
	sort(copy.data(), copy2.data(), data.size(), pred);
	double ts1 = timestamp();

	if (ts1 - ts0 < time \|\| time == 0) time = ts1 - ts0;
	}

	return time * 1e9 / divider;
	}

	template <typename T, typename Pred>
	void bench(const std::string &name, const std::vector<T> &data, Pred pred) {
	double t1 = runbench(radixSort3<T, Pred>, data, pred);
	double t2 = runbench(radixSort4<T, Pred>, data, pred);

	printf("%s \| %.2f ns/op \| %.2f ns/op\n", name.c_str(), t1, t2);
	}

	int main() {
	pcg32_random_t rng = {42, 0};
	std::vector<uint32_t> test(1000000);
	for (size_t i = 0; i < test.size(); ++i) test[i] = pcg32_random_r(&rng);

	bench("random int", test, [](int i) { return radixInt(i); });

	std::vector<float> test2(1000000);
	for (size_t i = 0; i < test2.size(); ++i) test2[i] = pcg32_random_r(&rng) / 4e9;

	bench("random float", test2, [](float i) { return radixUnsignedFloat(i); });
	}