Created
February 2, 2025 22:53
-
-
Save Strilanc/4f0f826e7a00c9dd6e32d5c9d5f913a1 to your computer and use it in GitHub Desktop.
Implements a class that can quickly, approximately sample from geometric distributions whose miss probabilities are specified as an integer root of 1/2. For example, for p=0.1% you would use the 693rd root, because 1 - 0.5^(1/693) ~= 0.1%.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <chrono> | |
| #include <iostream> | |
| #include <random> | |
| #include <bit> | |
| #include "immintrin.h" | |
| #include <algorithm> | |
| __m256i countl_zero_vec(__m256i x) { | |
| // Tried doing vectorized bit twiddling instead, but it was slower. | |
| uint64_t a = (uint64_t)_mm256_extract_epi64(x, 0); | |
| uint64_t b = (uint64_t)_mm256_extract_epi64(x, 1); | |
| uint64_t c = (uint64_t)_mm256_extract_epi64(x, 2); | |
| uint64_t d = (uint64_t)_mm256_extract_epi64(x, 3); | |
| a = std::countl_zero(a); | |
| b = std::countl_zero(b); | |
| c = std::countl_zero(c); | |
| d = std::countl_zero(d); | |
| return _mm256_set_epi64x(a, b, c, d); | |
| } | |
| /// Approximates floor(step * -log2(x / pow(2, 33) - 0.5)) for each even 32 bit word of x. | |
| /// The odd 32 bit words are ignored (due using _mm256_mul_epi32). | |
| /// | |
| /// Args: | |
| /// x: The vector value to operate on. | |
| /// x32[2*k+0] is used. | |
| /// x32[2*k+1] is ignored. | |
| /// step: The end of the range of possible results. The number of biased 0s | |
| /// per leading unbiased 0, when converting biased bits into jumps across | |
| /// unbiased bits. The k in p^k = 0.5 where p is the chance of each biased | |
| /// bit being ON. | |
| /// | |
| /// Returns: | |
| /// A vector value r with r.u64[k] storing the output for input x.u32[2*k]. | |
| /// The outputs are guaranteed to be in the range [0, step). | |
| /// Assuming x.u32[2*k] is a uniformly random integer, then seeing the result | |
| /// r.u64[k] = V+1 should be approximately 1-p times less likely than seeing | |
| /// r.u64[k] = V. | |
| __m256i vec_approx_geometric_step(__m256i x, size_t step) { | |
| x = _mm256_srli_epi32(x, 1); | |
| __m256i total = _mm256_set1_epi64x(-22531715); | |
| total = _mm256_mul_epi32(total, x); | |
| total = _mm256_srli_epi64(total, 32); | |
| total = _mm256_add_epi64(total, _mm256_set1_epi64x(49873802)); | |
| total = _mm256_mul_epi32(total, x); | |
| total = _mm256_srli_epi64(total, 32); | |
| total = _mm256_add_epi64(total, _mm256_set1_epi64x(-54912070)); | |
| total = _mm256_mul_epi32(total, x); | |
| total = _mm256_srli_epi64(total, 32); | |
| total = _mm256_add_epi64(total, _mm256_set1_epi64x(47448117)); | |
| total = _mm256_mul_epi32(total, x); | |
| total = _mm256_srli_epi64(total, 32); | |
| total = _mm256_add_epi64(total, _mm256_set1_epi64x(-48376425)); | |
| total = _mm256_mul_epi32(total, x); | |
| total = _mm256_srli_epi64(total, 32); | |
| total = _mm256_add_epi64(total, _mm256_set1_epi64x(16777200)); | |
| total = _mm256_mul_epi32(total, _mm256_set1_epi64x(step)); | |
| total = _mm256_srli_epi64(total, 24); | |
| return total; | |
| } | |
| struct FastRng { | |
| __m256i state; | |
| static FastRng from_external_entropy() { | |
| std::random_device device; | |
| __m256i state = _mm256_set_epi64x(device(), device(), device(), device()); | |
| return FastRng{state}; | |
| } | |
| __m256i next_uniform() { | |
| // ranqd1, Chapter 7.1, §An Even Quicker Generator, Eq. 7.1.6 parameters | |
| // from Knuth and H. W. Lewis (via Wikipedia) | |
| __m256i multiplier = _mm256_set1_epi32(1664525); | |
| __m256i offset = _mm256_set1_epi32(1013904223); | |
| state = _mm256_mullo_epi32(state, multiplier); | |
| state = _mm256_add_epi32(state, offset); | |
| return state; | |
| } | |
| __m256i next_jump(size_t offset_per_countl_zero) { | |
| __m256i r = next_uniform(); | |
| // Only support probabilities where leading 0s cause an integer offset. | |
| __m256i counts = countl_zero_vec(r); | |
| __m256i offset_from_count = _mm256_mul_epu32(counts,_mm256_set1_epi64x(offset_per_countl_zero)); | |
| // Use low bits to to pick one of the values between leading-zero steps. | |
| // There's a 2^-32 chance of the low bits having been double-used by the leading zero count. | |
| __m256i extra = vec_approx_geometric_step(r, offset_per_countl_zero); | |
| return _mm256_add_epi64(extra, offset_from_count); | |
| } | |
| }; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment