Last active
October 15, 2019 05:42
-
-
Save martinus/c43d99ad0008e11fcdbf06982e25f464 to your computer and use it in GitHub Desktop.
fast random bool in C++
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <chrono> | |
#include <iostream> | |
#include <random> | |
#define LIKELY(x) __builtin_expect((x), 1) | |
#define UNLIKELY(x) __builtin_expect((x), 0) | |
#define NO_INLINE __attribute__((noinline)) | |
// extremely fast random number generator that also produces very high quality random. | |
// see PractRand: http://pracrand.sourceforge.net/PractRand.txt | |
class sfc64 { | |
public: | |
using result_type = uint64_t; | |
static constexpr uint64_t(min)() { return 0; } | |
static constexpr uint64_t(max)() { return UINT64_C(-1); } | |
sfc64() : sfc64(std::random_device{}()) {} | |
explicit sfc64(uint64_t seed) : m_a(seed), m_b(seed), m_c(seed), m_counter(1) { | |
for (int i = 0; i < 12; ++i) { | |
operator()(); | |
} | |
} | |
uint64_t operator()() noexcept { | |
auto const tmp = m_a + m_b + m_counter++; | |
m_a = m_b ^ (m_b >> right_shift); | |
m_b = m_c + (m_c << left_shift); | |
m_c = rotl(m_c, rotation) + tmp; | |
return tmp; | |
} | |
private: | |
template <typename T> T rotl(T const x, int k) { return (x << k) | (x >> (8 * sizeof(T) - k)); } | |
static constexpr int rotation = 24; | |
static constexpr int right_shift = 11; | |
static constexpr int left_shift = 3; | |
uint64_t m_a; | |
uint64_t m_b; | |
uint64_t m_c; | |
uint64_t m_counter; | |
}; | |
// Probably the simplest correct implementation of random bool. | |
// Also one of the slowest variant. | |
class UniformDistribution { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
return std::uniform_int_distribution<>{0, 1}(rng); | |
} | |
}; | |
// bounded() is a highly optimized way to produce unbiased bounded random numbers. | |
// It's not really fast when just used for bool though. | |
class BoundedOperator { | |
public: | |
template <typename Rng> uint64_t bounded(Rng &rng, uint64_t boundExcluded) { | |
#ifdef __SIZEOF_INT128__ | |
// fastest solution I could find | |
// 1.78746 ns/op total | |
using u128 = unsigned __int128; | |
u128 m = u128(std::uniform_int_distribution<uint64_t>{}(rng)) * u128(boundExcluded); | |
uint64_t l = (uint64_t)m; | |
if (l < boundExcluded) { | |
uint64_t t = (0 - boundExcluded) % boundExcluded; | |
while (l < t) { | |
m = u128(rng()) * u128(boundExcluded); | |
l = (uint64_t)m; | |
} | |
} | |
return m >> 64; | |
#else | |
// java's method usually needs only 1 modulus, faster than OpenBSD's method or pcg's. | |
// also faster than std::shuffle's method. 9.73759 ns/op total | |
// 7.02644 ns/op for std::random_shuffle which uses this. | |
uint64_t x, r; | |
do { | |
x = rng(); | |
r = x % boundExcluded; | |
} while (x - r > (0 - boundExcluded)); | |
return r; | |
#endif | |
} | |
template <typename Rng> bool operator()(Rng &rng) { return bounded(rng, 2); } | |
}; | |
// Unbiased, but very slow. bernoulli_distribution is too generic for this. | |
class BernoulliDistribution { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { return m_dist(rng); } | |
private: | |
std::bernoulli_distribution m_dist; | |
}; | |
// Biased if the rng does not give random numbers in [0, 2^x]. | |
class BinaryAnd { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { return rng() & 1; } | |
}; | |
// unbiased | |
template <typename U = uint64_t> class BinaryAndUnbiased { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
return (std::uniform_int_distribution<U>{}(rng)) & 1; | |
} | |
}; | |
// Unbiased & quite fast but requires 16byte on 64bit platforms. | |
template <typename U = uint64_t> class RandomizerWithCounterT { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
if (UNLIKELY(0 == m_counter)) { | |
m_rand = std::uniform_int_distribution<U>{}(rng); | |
m_counter = sizeof(m_rand) * 8; | |
} | |
return (m_rand >> --m_counter) & 1; | |
} | |
private: | |
U m_rand = 0; | |
int m_counter = 0; | |
}; | |
// Unbiased & quite fast but requires 16byte on 64bit platforms. | |
template <typename U = uint64_t> class RandomizerWithCounter2 { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
auto b = m_counter & s_mask; | |
if (UNLIKELY(0 == b)) { | |
m_rand = std::uniform_int_distribution<U>{}(rng); | |
} | |
++m_counter; | |
return (m_rand >> b) & 1; | |
} | |
private: | |
static constexpr U s_mask = sizeof(U) * 8 - 1; | |
U m_rand = 0; | |
uint_fast8_t m_counter = 0; | |
}; | |
// Unbiased, fastest variant. Gets rid of the counter by sacrificing 1 bit of randomness. | |
// UNLIKELY macro seems important for clang++. | |
template <typename U = uint64_t> class RandomizerWithShiftT { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
if (UNLIKELY(1 == m_rand)) { | |
m_rand = std::uniform_int_distribution<U>{}(rng) | s_mask_left1; | |
} | |
bool const ret = m_rand & 1; | |
m_rand >>= 1; | |
return ret; | |
} | |
private: | |
static constexpr const U s_mask_left1 = U(1) << (sizeof(U) * 8 - 1); | |
U m_rand = 1; | |
}; | |
template <typename U = size_t> class RandomizerWithShiftMaxSize { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
if (UNLIKELY(1 == m_rand)) { | |
m_rand = std::uniform_int_distribution<U>{0, s_mask_left1 - 1}(rng) | s_mask_left1; | |
} | |
bool const ret = m_rand & 1; | |
m_rand >>= 1; | |
return ret; | |
} | |
private: | |
static constexpr const U s_mask_left1 = U(1) << (sizeof(U) * 8 - 1); | |
U m_rand = 1; | |
}; | |
template <typename U = uint64_t> class RandomizerWithShiftTNoLikely { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
if (1 == m_rand) { | |
m_rand = std::uniform_int_distribution<U>{}(rng) | (U(1) << (sizeof(U) * 8 - 1)); | |
} | |
bool const ret = m_rand & 1; | |
m_rand >>= 1; | |
return ret; | |
} | |
private: | |
U m_rand = 1; | |
}; | |
// fastest variant for 4x unrolled loop | |
template <typename U = uint64_t> class RandomizerWithCounterShiftT { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
if (UNLIKELY(0 == m_counter)) { | |
m_rand = std::uniform_int_distribution<U>{}(rng); | |
m_counter = sizeof(U) * 8; | |
} | |
--m_counter; | |
bool ret = m_rand & 1; | |
m_rand >>= 1; | |
return ret; | |
} | |
private: | |
U m_rand; | |
int m_counter = 0; | |
}; | |
template <typename U> class RandomizerWithMaskShift { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
if (UNLIKELY(0 == m_mask)) { | |
m_rand = std::uniform_int_distribution<U>{}(rng); | |
m_mask = 1; | |
} | |
bool ret = m_rand & m_mask; | |
m_mask <<= 1; | |
return ret; | |
} | |
private: | |
U m_rand; | |
U m_mask = 0; | |
}; | |
template <typename U> class RandomizerWithRotl { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
if (UNLIKELY(1 == m_mask)) { | |
m_rand = std::uniform_int_distribution<U>{}(rng); | |
} | |
m_mask = rotl(m_mask, 1); | |
return m_rand & m_mask; | |
} | |
private: | |
U m_rand; | |
U m_mask = 1; | |
}; | |
template <typename B, typename Rng> | |
NO_INLINE void benchU4(size_t iters, const char *rng_name, const char *method_name) { | |
size_t loops = iters / 4; | |
size_t s = 0; | |
std::vector<double> times; | |
for (size_t i = 0; i < 7; ++i) { | |
Rng rng{123}; | |
B b; | |
auto t_start = std::chrono::high_resolution_clock::now(); | |
for (size_t t = 0; t < loops; ++t) { | |
if (b(rng)) { | |
++s; | |
} | |
if (b(rng)) { | |
++s; | |
} | |
if (b(rng)) { | |
++s; | |
} | |
if (b(rng)) { | |
++s; | |
} | |
} | |
auto t_end = std::chrono::high_resolution_clock::now(); | |
auto t_sec = std::chrono::duration<double>(t_end - t_start).count(); | |
times.push_back(t_sec / iters * 1e9); | |
} | |
std::sort(times.begin(), times.end()); | |
for (auto t : times) { | |
std::cout << t << "; "; | |
} | |
std::cout << sizeof(B) << "; \"" << method_name << "\"; \"" << rng_name << "\"; 4; " << s | |
<< std::endl; | |
} | |
template <typename B, typename Rng> | |
NO_INLINE void benchU1(size_t iters, const char *rng_name, const char *method_name) { | |
size_t s = 0; | |
std::vector<double> times; | |
for (size_t i = 0; i < 7; ++i) { | |
Rng rng{123}; | |
B b; | |
auto t_start = std::chrono::high_resolution_clock::now(); | |
for (size_t t = 0; t < iters; ++t) { | |
if (b(rng)) { | |
++s; | |
} | |
} | |
auto t_end = std::chrono::high_resolution_clock::now(); | |
auto t_sec = std::chrono::duration<double>(t_end - t_start).count(); | |
times.push_back(t_sec / iters * 1e9); | |
} | |
std::sort(times.begin(), times.end()); | |
for (auto t : times) { | |
std::cout << t << "; "; | |
} | |
std::cout << sizeof(B) << "; \"" << method_name << "\"; \"" << rng_name << "\"; 1; " << s | |
<< std::endl; | |
} | |
template <typename Rng> void bench(size_t iters, const char *rng_name) { | |
benchU1<RandomizerWithCounterT<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithCounterT<uint64_t>"); | |
benchU4<RandomizerWithCounterT<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithCounterT<uint64_t>"); | |
benchU1<BinaryAndUnbiased<uint64_t>, Rng>(iters, rng_name, "BinaryAndUnbiased<uint64_t>"); | |
benchU4<BinaryAndUnbiased<uint64_t>, Rng>(iters, rng_name, "BinaryAndUnbiased<uint64_t>"); | |
benchU1<RandomizerWithShiftT<uint64_t>, Rng>(iters, rng_name, "RandomizerWithShiftT<uint64_t>"); | |
benchU4<RandomizerWithShiftT<uint64_t>, Rng>(iters, rng_name, "RandomizerWithShiftT<uint64_t>"); | |
benchU1<RandomizerWithShiftMaxSize<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithShiftMaxSize<uint64_t>"); | |
benchU4<RandomizerWithShiftMaxSize<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithShiftMaxSize<uint64_t>"); | |
benchU1<RandomizerWithMaskShift<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithMaskShift<uint64_t>"); | |
benchU4<RandomizerWithMaskShift<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithMaskShift<uint64_t>"); | |
benchU1<RandomizerWithRotl<uint64_t>, Rng>(iters, rng_name, "RandomizerWithRotl<uint64_t>"); | |
benchU4<RandomizerWithRotl<uint64_t>, Rng>(iters, rng_name, "RandomizerWithRotl<uint64_t>"); | |
benchU1<RandomizerWithCounter2<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithCounter2<uint64_t>"); | |
benchU4<RandomizerWithCounter2<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithCounter2<uint64_t>"); | |
benchU1<RandomizerWithShiftTNoLikely<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithShiftTNoLikely<uint64_t>"); | |
benchU4<RandomizerWithShiftTNoLikely<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithShiftTNoLikely<uint64_t>"); | |
benchU1<RandomizerWithCounterShiftT<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithCounterShiftT<uint64_t>"); | |
benchU4<RandomizerWithCounterShiftT<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithCounterShiftT<uint64_t>"); | |
benchU1<BinaryAnd, Rng>(iters, rng_name, "BinaryAnd"); | |
benchU4<BinaryAnd, Rng>(iters, rng_name, "BinaryAnd"); | |
benchU1<BoundedOperator, Rng>(iters, rng_name, "BoundedOperator"); | |
benchU4<BoundedOperator, Rng>(iters, rng_name, "BoundedOperator"); | |
benchU1<BernoulliDistribution, Rng>(iters, rng_name, "BernoulliDistribution"); | |
benchU4<BernoulliDistribution, Rng>(iters, rng_name, "BernoulliDistribution"); | |
benchU1<UniformDistribution, Rng>(iters, rng_name, "UniformDistribution"); | |
benchU4<UniformDistribution, Rng>(iters, rng_name, "UniformDistribution"); | |
} | |
int main(int, char **) { | |
size_t iters = UINT64_C(1'000'000'000); | |
bench<sfc64>(iters, "sfc64"); | |
bench<std::mt19937>(iters, "std::mt19937"); | |
bench<std::mt19937_64>(iters, "std::mt19937_64"); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment