fast random bool in C++
#include <algorithm> | |
#include <chrono> | |
#include <iostream> | |
#include <random> | |
#define LIKELY(x) __builtin_expect((x), 1) | |
#define UNLIKELY(x) __builtin_expect((x), 0) | |
#define NO_INLINE __attribute__((noinline)) | |
// extremely fast random number generator that also produces very high quality random. | |
// see PractRand: http://pracrand.sourceforge.net/PractRand.txt | |
class sfc64 { | |
public: | |
using result_type = uint64_t; | |
static constexpr uint64_t(min)() { return 0; } | |
static constexpr uint64_t(max)() { return UINT64_C(-1); } | |
sfc64() : sfc64(std::random_device{}()) {} | |
explicit sfc64(uint64_t seed) : m_a(seed), m_b(seed), m_c(seed), m_counter(1) { | |
for (int i = 0; i < 12; ++i) { | |
operator()(); | |
} | |
} | |
uint64_t operator()() noexcept { | |
auto const tmp = m_a + m_b + m_counter++; | |
m_a = m_b ^ (m_b >> right_shift); | |
m_b = m_c + (m_c << left_shift); | |
m_c = rotl(m_c, rotation) + tmp; | |
return tmp; | |
} | |
private: | |
template <typename T> T rotl(T const x, int k) { return (x << k) | (x >> (8 * sizeof(T) - k)); } | |
static constexpr int rotation = 24; | |
static constexpr int right_shift = 11; | |
static constexpr int left_shift = 3; | |
uint64_t m_a; | |
uint64_t m_b; | |
uint64_t m_c; | |
uint64_t m_counter; | |
}; | |
// Probably the simplest correct implementation of random bool. | |
// Also one of the slowest variant. | |
class UniformDistribution { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
return std::uniform_int_distribution<>{0, 1}(rng); | |
} | |
}; | |
// bounded() is a highly optimized way to produce unbiased bounded random numbers. | |
// It's not really fast when just used for bool though. | |
class BoundedOperator { | |
public: | |
template <typename Rng> uint64_t bounded(Rng &rng, uint64_t boundExcluded) { | |
#ifdef __SIZEOF_INT128__ | |
// fastest solution I could find | |
// 1.78746 ns/op total | |
using u128 = unsigned __int128; | |
u128 m = u128(std::uniform_int_distribution<uint64_t>{}(rng)) * u128(boundExcluded); | |
uint64_t l = (uint64_t)m; | |
if (l < boundExcluded) { | |
uint64_t t = (0 - boundExcluded) % boundExcluded; | |
while (l < t) { | |
m = u128(rng()) * u128(boundExcluded); | |
l = (uint64_t)m; | |
} | |
} | |
return m >> 64; | |
#else | |
// java's method usually needs only 1 modulus, faster than OpenBSD's method or pcg's. | |
// also faster than std::shuffle's method. 9.73759 ns/op total | |
// 7.02644 ns/op for std::random_shuffle which uses this. | |
uint64_t x, r; | |
do { | |
x = rng(); | |
r = x % boundExcluded; | |
} while (x - r > (0 - boundExcluded)); | |
return r; | |
#endif | |
} | |
template <typename Rng> bool operator()(Rng &rng) { return bounded(rng, 2); } | |
}; | |
// Unbiased, but very slow. bernoulli_distribution is too generic for this. | |
class BernoulliDistribution { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { return m_dist(rng); } | |
private: | |
std::bernoulli_distribution m_dist; | |
}; | |
// Biased if the rng does not give random numbers in [0, 2^x]. | |
class BinaryAnd { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { return rng() & 1; } | |
}; | |
// unbiased | |
template <typename U = uint64_t> class BinaryAndUnbiased { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
return (std::uniform_int_distribution<U>{}(rng)) & 1; | |
} | |
}; | |
// Unbiased & quite fast but requires 16byte on 64bit platforms. | |
template <typename U = uint64_t> class RandomizerWithCounterT { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
if (UNLIKELY(0 == m_counter)) { | |
m_rand = std::uniform_int_distribution<U>{}(rng); | |
m_counter = sizeof(m_rand) * 8; | |
} | |
return (m_rand >> --m_counter) & 1; | |
} | |
private: | |
U m_rand = 0; | |
int m_counter = 0; | |
}; | |
// Unbiased & quite fast but requires 16byte on 64bit platforms. | |
template <typename U = uint64_t> class RandomizerWithCounter2 { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
auto b = m_counter & s_mask; | |
if (UNLIKELY(0 == b)) { | |
m_rand = std::uniform_int_distribution<U>{}(rng); | |
} | |
++m_counter; | |
return (m_rand >> b) & 1; | |
} | |
private: | |
static constexpr U s_mask = sizeof(U) * 8 - 1; | |
U m_rand = 0; | |
uint_fast8_t m_counter = 0; | |
}; | |
// Unbiased, fastest variant. Gets rid of the counter by sacrificing 1 bit of randomness. | |
// UNLIKELY macro seems important for clang++. | |
template <typename U = uint64_t> class RandomizerWithShiftT { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
if (UNLIKELY(1 == m_rand)) { | |
m_rand = std::uniform_int_distribution<U>{}(rng) | s_mask_left1; | |
} | |
bool const ret = m_rand & 1; | |
m_rand >>= 1; | |
return ret; | |
} | |
private: | |
static constexpr const U s_mask_left1 = U(1) << (sizeof(U) * 8 - 1); | |
U m_rand = 1; | |
}; | |
template <typename U = size_t> class RandomizerWithShiftMaxSize { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
if (UNLIKELY(1 == m_rand)) { | |
m_rand = std::uniform_int_distribution<U>{0, s_mask_left1 - 1}(rng) | s_mask_left1; | |
} | |
bool const ret = m_rand & 1; | |
m_rand >>= 1; | |
return ret; | |
} | |
private: | |
static constexpr const U s_mask_left1 = U(1) << (sizeof(U) * 8 - 1); | |
U m_rand = 1; | |
}; | |
template <typename U = uint64_t> class RandomizerWithShiftTNoLikely { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
if (1 == m_rand) { | |
m_rand = std::uniform_int_distribution<U>{}(rng) | (U(1) << (sizeof(U) * 8 - 1)); | |
} | |
bool const ret = m_rand & 1; | |
m_rand >>= 1; | |
return ret; | |
} | |
private: | |
U m_rand = 1; | |
}; | |
// fastest variant for 4x unrolled loop | |
template <typename U = uint64_t> class RandomizerWithCounterShiftT { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
if (UNLIKELY(0 == m_counter)) { | |
m_rand = std::uniform_int_distribution<U>{}(rng); | |
m_counter = sizeof(U) * 8; | |
} | |
--m_counter; | |
bool ret = m_rand & 1; | |
m_rand >>= 1; | |
return ret; | |
} | |
private: | |
U m_rand; | |
int m_counter = 0; | |
}; | |
template <typename U> class RandomizerWithMaskShift { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
if (UNLIKELY(0 == m_mask)) { | |
m_rand = std::uniform_int_distribution<U>{}(rng); | |
m_mask = 1; | |
} | |
bool ret = m_rand & m_mask; | |
m_mask <<= 1; | |
return ret; | |
} | |
private: | |
U m_rand; | |
U m_mask = 0; | |
}; | |
template <typename U> class RandomizerWithRotl { | |
public: | |
template <typename Rng> bool operator()(Rng &rng) { | |
if (UNLIKELY(1 == m_mask)) { | |
m_rand = std::uniform_int_distribution<U>{}(rng); | |
} | |
m_mask = rotl(m_mask, 1); | |
return m_rand & m_mask; | |
} | |
private: | |
U m_rand; | |
U m_mask = 1; | |
}; | |
template <typename B, typename Rng> | |
NO_INLINE void benchU4(size_t iters, const char *rng_name, const char *method_name) { | |
size_t loops = iters / 4; | |
size_t s = 0; | |
std::vector<double> times; | |
for (size_t i = 0; i < 7; ++i) { | |
Rng rng{123}; | |
B b; | |
auto t_start = std::chrono::high_resolution_clock::now(); | |
for (size_t t = 0; t < loops; ++t) { | |
if (b(rng)) { | |
++s; | |
} | |
if (b(rng)) { | |
++s; | |
} | |
if (b(rng)) { | |
++s; | |
} | |
if (b(rng)) { | |
++s; | |
} | |
} | |
auto t_end = std::chrono::high_resolution_clock::now(); | |
auto t_sec = std::chrono::duration<double>(t_end - t_start).count(); | |
times.push_back(t_sec / iters * 1e9); | |
} | |
std::sort(times.begin(), times.end()); | |
for (auto t : times) { | |
std::cout << t << "; "; | |
} | |
std::cout << sizeof(B) << "; \"" << method_name << "\"; \"" << rng_name << "\"; 4; " << s | |
<< std::endl; | |
} | |
template <typename B, typename Rng> | |
NO_INLINE void benchU1(size_t iters, const char *rng_name, const char *method_name) { | |
size_t s = 0; | |
std::vector<double> times; | |
for (size_t i = 0; i < 7; ++i) { | |
Rng rng{123}; | |
B b; | |
auto t_start = std::chrono::high_resolution_clock::now(); | |
for (size_t t = 0; t < iters; ++t) { | |
if (b(rng)) { | |
++s; | |
} | |
} | |
auto t_end = std::chrono::high_resolution_clock::now(); | |
auto t_sec = std::chrono::duration<double>(t_end - t_start).count(); | |
times.push_back(t_sec / iters * 1e9); | |
} | |
std::sort(times.begin(), times.end()); | |
for (auto t : times) { | |
std::cout << t << "; "; | |
} | |
std::cout << sizeof(B) << "; \"" << method_name << "\"; \"" << rng_name << "\"; 1; " << s | |
<< std::endl; | |
} | |
template <typename Rng> void bench(size_t iters, const char *rng_name) { | |
benchU1<RandomizerWithCounterT<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithCounterT<uint64_t>"); | |
benchU4<RandomizerWithCounterT<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithCounterT<uint64_t>"); | |
benchU1<BinaryAndUnbiased<uint64_t>, Rng>(iters, rng_name, "BinaryAndUnbiased<uint64_t>"); | |
benchU4<BinaryAndUnbiased<uint64_t>, Rng>(iters, rng_name, "BinaryAndUnbiased<uint64_t>"); | |
benchU1<RandomizerWithShiftT<uint64_t>, Rng>(iters, rng_name, "RandomizerWithShiftT<uint64_t>"); | |
benchU4<RandomizerWithShiftT<uint64_t>, Rng>(iters, rng_name, "RandomizerWithShiftT<uint64_t>"); | |
benchU1<RandomizerWithShiftMaxSize<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithShiftMaxSize<uint64_t>"); | |
benchU4<RandomizerWithShiftMaxSize<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithShiftMaxSize<uint64_t>"); | |
benchU1<RandomizerWithMaskShift<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithMaskShift<uint64_t>"); | |
benchU4<RandomizerWithMaskShift<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithMaskShift<uint64_t>"); | |
benchU1<RandomizerWithRotl<uint64_t>, Rng>(iters, rng_name, "RandomizerWithRotl<uint64_t>"); | |
benchU4<RandomizerWithRotl<uint64_t>, Rng>(iters, rng_name, "RandomizerWithRotl<uint64_t>"); | |
benchU1<RandomizerWithCounter2<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithCounter2<uint64_t>"); | |
benchU4<RandomizerWithCounter2<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithCounter2<uint64_t>"); | |
benchU1<RandomizerWithShiftTNoLikely<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithShiftTNoLikely<uint64_t>"); | |
benchU4<RandomizerWithShiftTNoLikely<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithShiftTNoLikely<uint64_t>"); | |
benchU1<RandomizerWithCounterShiftT<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithCounterShiftT<uint64_t>"); | |
benchU4<RandomizerWithCounterShiftT<uint64_t>, Rng>(iters, rng_name, | |
"RandomizerWithCounterShiftT<uint64_t>"); | |
benchU1<BinaryAnd, Rng>(iters, rng_name, "BinaryAnd"); | |
benchU4<BinaryAnd, Rng>(iters, rng_name, "BinaryAnd"); | |
benchU1<BoundedOperator, Rng>(iters, rng_name, "BoundedOperator"); | |
benchU4<BoundedOperator, Rng>(iters, rng_name, "BoundedOperator"); | |
benchU1<BernoulliDistribution, Rng>(iters, rng_name, "BernoulliDistribution"); | |
benchU4<BernoulliDistribution, Rng>(iters, rng_name, "BernoulliDistribution"); | |
benchU1<UniformDistribution, Rng>(iters, rng_name, "UniformDistribution"); | |
benchU4<UniformDistribution, Rng>(iters, rng_name, "UniformDistribution"); | |
} | |
int main(int, char **) { | |
size_t iters = UINT64_C(1'000'000'000); | |
bench<sfc64>(iters, "sfc64"); | |
bench<std::mt19937>(iters, "std::mt19937"); | |
bench<std::mt19937_64>(iters, "std::mt19937_64"); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment