guicho271828/.gitignore

## .gitignore
*~
random_generators_speed_*

## Makefile

all: run

run: run0

run%: random_generators_speed_%
	./$<

random_generators_speed_0: random_generators_speed.cpp random.h Makefile
	g++ -march=native -O0 -o $@ $<
random_generators_speed_1: random_generators_speed.cpp random.h Makefile
	g++ -march=native -O1 -o $@ $<
random_generators_speed_2: random_generators_speed.cpp random.h Makefile
	g++ -march=native -O2 -o $@ $<
random_generators_speed_3: random_generators_speed.cpp random.h Makefile
	g++ -march=native -O3 -o $@ $<

clean:
	-rm random_generators_speed_*

disassemble: random_generators_speed_0
	objdump -d -M att -CSl --no-show-raw-insn $< | awk -v RS= '/operator\(\)\(\)>:/'

## MWC128.c
/*  Written in 2023 by Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>
#include <strings.h>

/* This is a Marsaglia multiply-with-carry generator with period
   approximately 2^127. It is close in speed to a scrambled linear
   generator, as its only 128-bit operations are a multiplication and sum;
   it is an excellent generator based on congruential arithmetic.

   As all MWC generators, it simulates a multiplicative LCG with
   prime modulus m = 0xffebb71d94fcdaf8ffffffffffffffff and
   multiplier given by the inverse of 2^64 modulo m. The modulus has a
   particular form, which creates some theoretical issues, but at this
   size a generator of this kind passes all known statistical tests,
   except for very large size birthday-spacings tests in three dimensions
   (as predicted from the theory). These failures are unlikely to have
   any impact in practice.

   For a generator of the same type with stronger theoretical guarantees,
   consider a Goresky-Klapper generalized multiply-with-carry generator.

   Alternatively, applying a xorshift to the result (i.e., returning
   x ^ (x << 32) instead of x) will eliminate the birthday-spacings
   failures.

   Note that a previous version had a different MWC_A1. Moreover, now
   the result is computed using the current state.
*/

#define MWC_A1 0xffebb71d94fcdaf9

/* The state must be initialized so that 0 < c < MWC_A1 - 1.
   For simplicity, we suggest to set c = 1 and x to a 64-bit seed. */
uint64_t x, c;

uint64_t inline next() {
	const uint64_t result = x; // Or, result = x ^ (x << 32) (see above)
	const __uint128_t t = MWC_A1 * (__uint128_t)x + c;
	x = t;
	c = t >> 64;
	return result;
}


/* The following jump functions use a minimal multiprecision library. */

#define MP_SIZE 3
#include "mp.c"

static uint64_t mod[MP_SIZE] = { 0xffffffffffffffff, MWC_A1 - 1 };


/* This is the jump function for the generator. It is equivalent
   to 2^64 calls to next(); it can be used to generate 2^64
   non-overlapping subsequences for parallel computations.

   Equivalent C++ Boost multiprecision code:

   cpp_int b = cpp_int(1) << 64;
   cpp_int m = MWC_A1 * b - 1;
   cpp_int r = cpp_int("0x2f65fed2e8400983a72f9a3547208003");
   cpp_int s = ((x + c * b) * r) % m;
   x = uint64_t(s);
   c = uint64_t(s >> 64);
*/

void jump(void) {
	static uint64_t jump[MP_SIZE] = { 0xa72f9a3547208003, 0x2f65fed2e8400983 };
	uint64_t state[MP_SIZE] = { x, c };
	mul(state, jump, mod);
	x = state[0];
	c = state[1];
}


/* This is the long-jump function for the generator. It is equivalent to
   2^96 calls to next(); it can be used to generate 2^32 starting points,
   from each of which jump() will generate 2^32 non-overlapping
   subsequences for parallel distributed computations.

   Equivalent C++ Boost multiprecision code:

   cpp_int b = cpp_int(1) << 64;
   cpp_int m = MWC_A1 * b - 1;
   cpp_int r = cpp_int("0x394649cfd6769c91e6f7814467f3fcdd");
   cpp_int s = ((x + c * b) * r) % m;
   x = uint64_t(s);
   c = uint64_t(s >> 64);
*/

void long_jump(void) {
	static uint64_t long_jump[MP_SIZE] = { 0xe6f7814467f3fcdd, 0x394649cfd6769c91 };
	uint64_t state[MP_SIZE] = { x, c };
	mul(state, long_jump, mod);
	x = state[0];
	c = state[1];
}

## MWC192.c
/*  Written in 2023 by Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>
#include <strings.h>

/* This is a Marsaglia multiply-with-carry generator with period
   approximately 2^191. It is faster than a scrambled linear
   generator, as its only 128-bit operations are a multiplication and sum;
   it is an excellent generator based on congruential arithmetic.

   As all MWC generators, it simulates a multiplicative LCG with prime
   modulus m = 0xffa04e67b3c95d85ffffffffffffffffffffffffffffffff
   and multiplier given by the inverse of 2^64 modulo m. The modulus has a
   particular form, which creates some theoretical issues, but at this
   size a generator of this kind passes all known statistical tests. For a
   generator of the same type with stronger theoretical guarantees
   consider a Goresky-Klapper generalized multiply-with-carry generator.
*/

#define MWC_A2 0xffa04e67b3c95d86

/* The state must be initialized so that 0 < c < MWC_A2 - 1.
   For simplicity, we suggest to set c = 1 and x, y to a 128-bit seed. */
uint64_t x, y, c;

uint64_t inline next() {
	const uint64_t result = y;
	const __uint128_t t = MWC_A2 * (__uint128_t)x + c;
	x = y;
	y = t;
	c = t >> 64;
	return result;
}


/* The following jump functions use a minimal multiprecision library. */

#define MP_SIZE 4
#include "mp.c"

static uint64_t mod[MP_SIZE] = { 0xffffffffffffffff, 0xffffffffffffffff, MWC_A2 - 1 };

/* This is the jump function for the generator. It is equivalent
   to 2^96 calls to next(); it can be used to generate 2^96
   non-overlapping subsequences for parallel computations.

   Equivalent C++ Boost multiprecision code:

   cpp_int b = cpp_int(1) << 64;
   cpp_int b2 = b * b;
   cpp_int m = MWC_A2 * b2 - 1;
   cpp_int r = cpp_int("0xdc2be36e4bd21a2afc217e3b9edf985d94fb8d87c7c6437");
   cpp_int s = ((x + y * b + c * b2) * r) % m;
   x = uint64_t(s);
   y = uint64_t(s >> 64);
   c = uint64_t(s >> 128);
*/

void jump(void) {
   static uint64_t jump[MP_SIZE] = { 0xd94fb8d87c7c6437, 0xafc217e3b9edf985, 0xdc2be36e4bd21a2 };
   uint64_t state[MP_SIZE] = { x, y, c };
	mul(state, jump, mod);
	x = state[0];
	y = state[1];
	c = state[2];
}

/* This is the long-jump function for the generator. It is equivalent to
   2^144 calls to next(); it can be used to generate 2^96 starting points,
   from each of which jump() will generate 2^96 non-overlapping
   subsequences for parallel distributed computations.

   Equivalent C++ Boost multiprecision code:

   cpp_int b = cpp_int(1) << 64;
   cpp_int b2 = b * b;
   cpp_int m = MWC_A2 * b2 - 1;
   cpp_int r = cpp_int("0x3c6528aaead6bbddec956c3909137b2dd0e7cedd16a0758e");
   cpp_int s = ((x + y * b + c * b2) * r) % m;
   x = uint64_t(s);
   y = uint64_t(s >> 64);
   c = uint64_t(s >> 128);
*/

void long_jump(void) {
   static uint64_t long_jump[MP_SIZE] = { 0xd0e7cedd16a0758e, 0xec956c3909137b2d, 0x3c6528aaead6bbdd };
   uint64_t state[MP_SIZE] = { x, y, c };
	mul(state, long_jump, mod);
	x = state[0];
	y = state[1];
	c = state[2];
}

## MWC256.c
/*  Written in 2021 by Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>
#include <strings.h>

/* This is a Marsaglia multiply-with-carry generator with period
   approximately 2^255. It is faster than a scrambled linear
   generator, as its only 128-bit operations are a multiplication and sum;
   it is an excellent generator based on congruential arithmetic.

   As all MWC generators, it simulates a multiplicative LCG with prime
   modulus m = 0xfff62cf2ccc0cdaeffffffffffffffffffffffffffffffffffffffffffffffff
   and multiplier given by the inverse of 2^64 modulo m. The modulus has a
   particular form, which creates some theoretical issues, but at this
   size a generator of this kind passes all known statistical tests. For a
   generator of the same type with stronger theoretical guarantees
   consider a Goresky-Klapper generalized multiply-with-carry generator.

   Note that a previous version had a different MWC_A3. Moreover, now
   the result is computed using the current state.
*/

#define MWC_A3 0xfff62cf2ccc0cdaf

/* The state must be initialized so that 0 < c < MWC_A3 - 1.
   For simplicity, we suggest to set c = 1 and x, y, z to a 192-bit seed. */
uint64_t x, y, z, c;

uint64_t inline next() {
	const uint64_t result = z;
	const __uint128_t t = MWC_A3 * (__uint128_t)x + c;
	x = y;
	y = z;
	z = t;
	c = t >> 64;
	return result;
}


/* The following jump functions use a minimal multiprecision library. */

#define MP_SIZE 5
#include "mp.c"

static uint64_t mod[MP_SIZE] = { 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, MWC_A3 - 1 };


/* This is the jump function for the generator. It is equivalent
   to 2^128 calls to next(); it can be used to generate 2^128
   non-overlapping subsequences for parallel computations.

   Equivalent C++ Boost multiprecision code:

   cpp_int b = cpp_int(1) << 64;
   cpp_int b3 = b * b * b;
   cpp_int m = MWC_A3 * b3 - 1;
   cpp_int r = cpp_int("0x4b89aa2cd51c37b9f6f8c3fd02ec98fbfe88c291203b225428c3ff11313847eb");
   cpp_int s = ((x + y * b + z * b * b + c * b3) * r) % m;
   x = uint64_t(s);
   y = uint64_t(s >> 64);
   z = uint64_t(s >> 128);
   c = uint64_t(s >> 192);
*/

void jump(void) {
	static uint64_t jump[MP_SIZE] = { 0x28c3ff11313847eb, 0xfe88c291203b2254, 0xf6f8c3fd02ec98fb, 0x4b89aa2cd51c37b9 };
	uint64_t state[MP_SIZE] = { x, y, z, c };
	mul(state, jump, mod);
	x = state[0];
	y = state[1];
	z = state[2];
	c = state[3];
}


/* This is the long-jump function for the generator. It is equivalent to
   2^192 calls to next(); it can be used to generate 2^64 starting points,
   from each of which jump() will generate 2^64 non-overlapping
   subsequences for parallel distributed computations.

   Equivalent C++ Boost multiprecision code:

   cpp_int b = cpp_int(1) << 64;
   cpp_int b3 = b * b * b;
   cpp_int m = MWC_A3 * b3 - 1;
   cpp_int r = cpp_int("0xaf5ca22408cdc8306c40ce860e0d702f95382f758ac987764c6e39cf92f77a4");
   cpp_int s = ((x + y * b + z * b * b + c * b3) * r) % m;
   x = uint64_t(s);
   y = uint64_t(s >> 64);
   z = uint64_t(s >> 128);
   c = uint64_t(s >> 192);
*/

void long_jump(void) {
	static uint64_t long_jump[MP_SIZE] = { 0x64c6e39cf92f77a4, 0xf95382f758ac9877, 0x6c40ce860e0d702, 0xaf5ca22408cdc83 };
	uint64_t state[MP_SIZE] = { x, y, z, c };
	mul(state, long_jump, mod);
	x = state[0];
	y = state[1];
	z = state[2];
	c = state[3];
}

## random.h
/* Copyright (c) 2018 Arvid Gerstmann. */
/* Copyright (c) 2024 Masataro Asai. */
/* This code is licensed under MIT license. */
#ifndef AG_RANDOM_H
#define AG_RANDOM_H

#include <immintrin.h>

class generator
{
public:
    virtual ~generator() = default;
    using result_type = uint32_t;
    static constexpr result_type (min)() { return 0; }
    static constexpr result_type (max)() { return UINT32_MAX; }

    void seed()
    {
        std::random_device rd;  // non-deterministic source if possible; otherwise a PRNG
        seed(rd);
    }
    void seed(std::random_device &rd)
    {
        seed(rd());
    }
    void seed(std::uint32_t s)
    {
        std::seed_seq seq{s};
        seed(seq);
    }
    void seed(std::seed_seq &seq)
    {
        std::vector<std::uint32_t> seeds(32); // support up to 1024 bit state
        seq.generate(seeds.begin(), seeds.end());
        seed(seeds);
    }

    result_type operator()();

    void discard(unsigned long long n)
    {
        for (unsigned long long i = 0; i < n; ++i)
            operator()();
    }

protected:
    virtual void seed(std::vector<std::uint32_t> &seeds) = 0;
};

class generator64 : public generator
{
public:
    virtual ~generator64() = default;
    using generator::seed;
protected:
    void seed(std::vector<std::uint32_t> &seeds)
    {
        m_seed = uint64_t(seeds[0]) << 31 | uint64_t(seeds[1]);
    }

    uint64_t m_seed;
};

class generator64x2 : public generator
{
public:
    virtual ~generator64x2() = default;
    using generator::seed;
protected:
    void seed(std::vector<std::uint32_t> &seeds)
    {
        for (int i = 0; i < 2; i++){
            s[i] = uint64_t(seeds[2*i]) << 31 | uint64_t(seeds[2*i+1]);
        }
    }

    uint64_t s[2];
};

class generator64x4 : public generator
{
public:
    virtual ~generator64x4() = default;
    using generator::seed;
protected:
    void seed(std::vector<std::uint32_t> &seeds)
    {
        for (int i = 0; i < 4; i++){
            s[i] = uint64_t(seeds[2*i]) << 31 | uint64_t(seeds[2*i+1]);
        }
    }

    uint64_t s[4];
};

class generator64x8 : public generator
{
public:
    virtual ~generator64x8() = default;
    using generator::seed;
protected:
    void seed(std::vector<std::uint32_t> &seeds)
    {
        for (int i = 0; i < 8; i++){
            s[i] = uint64_t(seeds[2*i]) << 31 | uint64_t(seeds[2*i+1]);
        }
    }

    uint64_t s[8];
};

class generator64x16 : public generator
{
public:
    virtual ~generator64x16() = default;
    using generator::seed;
protected:
    void seed(std::vector<std::uint32_t> &seeds)
    {
        for (int i = 0; i < 16; i++){
            s[i] = uint64_t(seeds[2*i]) << 31 | uint64_t(seeds[2*i+1]);
        }
    }

    uint64_t s[16];
};

class generator32 : public generator
{
public:
    virtual ~generator32() = default;
    using generator::seed;
protected:
    void seed(std::vector<std::uint32_t> &seeds)
    {
        m_seed = seeds[0];
    }

    uint32_t m_seed;
};

class generator32x2 : public generator
{
public:
    virtual ~generator32x2() = default;
    using generator::seed;
protected:
    void seed(std::vector<std::uint32_t> &seeds)
    {
        for (int i = 0; i < 2; i++){
            s[i] = seeds[i];
        }
    }

    uint32_t s[2];
};

class generator32x4 : public generator
{
public:
    virtual ~generator32x4() = default;
    using generator::seed;
protected:
    void seed(std::vector<std::uint32_t> &seeds)
    {
        for (int i = 0; i < 4; i++){
            s[i] = seeds[i];
        }
    }

    uint32_t s[4];
};

class generator32x8 : public generator
{
public:
    virtual ~generator32x8() = default;
    using generator::seed;
protected:
    void seed(std::vector<std::uint32_t> &seeds)
    {
        for (int i = 0; i < 8; i++){
            s[i] = seeds[i];
        }
    }

    uint32_t s[8];
};

class generator32x16 : public generator
{
public:
    virtual ~generator32x16() = default;
    using generator::seed;
protected:
    void seed(std::vector<std::uint32_t> &seeds)
    {
        for (int i = 0; i < 16; i++){
            s[i] = seeds[i];
        }
    }

    uint32_t s[16];
};

class generator32x32 : public generator
{
public:
    virtual ~generator32x32() = default;
    using generator::seed;
protected:
    void seed(std::vector<std::uint32_t> &seeds)
    {
        for (int i = 0; i < 32; i++){
            s[i] = seeds[i];
        }
    }

    uint32_t s[32];
};


class splitmix : public generator64
{
public:
    virtual ~splitmix() = default;
    using generator::seed;
    splitmix() { seed(); }
    splitmix(std::uint32_t s) {seed(s);}
    result_type operator()()
    {
        uint64_t z = (m_seed += UINT64_C(0x9E3779B97F4A7C15));
        z = (z ^ (z >> 30)) * UINT64_C(0xBF58476D1CE4E5B9);
        z = (z ^ (z >> 27)) * UINT64_C(0x94D049BB133111EB);
        return result_type((z ^ (z >> 31)) >> 31);
    }
};

class trunc_xorshift32star : public generator64
{
public:
    virtual ~trunc_xorshift32star() = default;
    using generator::seed;
    trunc_xorshift32star() { seed(); }
    trunc_xorshift32star(std::uint32_t s) {seed(s);}
    result_type operator()()
    {
        uint64_t result = m_seed * 0xd989bcacc137dcd5ull;
        m_seed ^= m_seed >> 11;
        m_seed ^= m_seed << 31;
        m_seed ^= m_seed >> 18;
        return uint32_t(result >> 32ull);
    }
};

class xorshift32star : public generator64
{
public:
    virtual ~xorshift32star() = default;
    using generator::seed;
    xorshift32star() { seed(); }
    xorshift32star(std::uint32_t s) {seed(s);}
    result_type operator()()
    {
        uint64_t x;
        uint32_t answer;

        x = m_seed;
        x = x ^ (x >> 12);
        x = x ^ (x << 25);
        x = x ^ (x >> 27);
        m_seed = x;
        answer = ((x * MAGIC) >> 32);

        return answer;
    }

private:
    const uint64_t MAGIC = 0x2545F4914F6CDD1D;
};


class xorshift32 : public generator32
{
public:
    virtual ~xorshift32() = default;
    using generator::seed;
    xorshift32() { seed(); }
    xorshift32(std::uint32_t s) {seed(s);}
    result_type operator()()
    {
	uint64_t x = m_seed;
	x ^= x << 13;
	x ^= x >> 17;
	x ^= x << 5;
	return m_seed = x;
    }
};

class xorshift64 : public generator64
{
public:
    virtual ~xorshift64() = default;
    using generator::seed;
    xorshift64() { seed(); }
    xorshift64(std::uint32_t s) {seed(s);}
    result_type operator()()
    {
	uint64_t x = m_seed;
	x ^= x << 13;
	x ^= x >> 7;
	x ^= x << 17;
	return m_seed = x;
    }
};

// compute 4 numbers at once with SSE.
class xorshift32x4 : public generator32x4
{
    int counter = -1;
public:
    virtual ~xorshift32x4() = default;
    using generator::seed;
    xorshift32x4() { seed(); }
    xorshift32x4(std::uint32_t seed) {generator::seed(seed);}
    result_type operator()()
    {
        counter++;
        int cycle = counter % 4;
        result_type result = s[cycle];
        if (cycle == 3){
            // uint64_t x = m_seed;
            __m128i x = _mm_loadu_si128((__m128i*) s);
            // x ^= x << 13;
            x = _mm_xor_epi32(x, _mm_slli_epi32(x, 13));
            // x ^= x >> 7;
            x = _mm_xor_epi32(x, _mm_srli_epi32(x, 17));
            // x ^= x << 17;
            x = _mm_xor_epi32(x, _mm_slli_epi32(x, 5));
            // return m_seed = x;
            _mm_storeu_si128((__m128i*) s, x);
        }
        return result;
    }
};

// compute 4 numbers at once with SSE.
class xorshift64x4 : public generator64x4
{
    int counter = -1;
public:
    virtual ~xorshift64x4() = default;
    using generator::seed;
    xorshift64x4() { seed(); }
    xorshift64x4(std::uint32_t seed) { generator::seed(seed); }
    result_type operator()()
    {
        counter++;
        int cycle = counter % 4;
        result_type result = s[cycle];
        if (cycle == 3){
            // uint64_t x = m_seed;
            __m256i x = _mm256_loadu_si256((__m256i*) s);
            // x ^= x << 13;
            x = _mm256_xor_epi64(x, _mm256_slli_epi64(x, 13));
            // x ^= x >> 7;
            x = _mm256_xor_epi64(x, _mm256_srli_epi64(x, 7));
            // x ^= x << 17;
            x = _mm256_xor_epi64(x, _mm256_slli_epi64(x, 17));
            // return m_seed = x;
            _mm256_storeu_si256((__m256i*) s, x);
        }
        return result;
    }
};

// compute 8 numbers at once with SSE.
class xorshift32x8 : public generator32x8
{
    int counter = -1;
public:
    virtual ~xorshift32x8() = default;
    using generator::seed;
    xorshift32x8() { seed(); }
    xorshift32x8(std::uint32_t seed) { generator::seed(seed); }
    result_type operator()()
    {
        counter++;
        int cycle = counter % 8;
        result_type result = s[cycle];
        if (cycle == 7){
            // uint64_t x = m_seed;
            __m256i x = _mm256_loadu_si256((__m256i*) s);
            // x ^= x << 13;
            x = _mm256_xor_epi32(x, _mm256_slli_epi32(x, 13));
            // x ^= x >> 7;
            x = _mm256_xor_epi32(x, _mm256_srli_epi32(x, 17));
            // x ^= x << 17;
            x = _mm256_xor_epi32(x, _mm256_slli_epi32(x, 5));
            // return m_seed = x;
            _mm256_storeu_si256((__m256i*) s, x);
        }
        return result;
    }
};

// compute 16 numbers at once with AVX512.
class xorshift32x16 : public generator32x16
{
    int counter = -1;
public:
    virtual ~xorshift32x16() = default;
    using generator::seed;
    xorshift32x16() { seed(); }
    xorshift32x16(std::uint32_t seed) { generator::seed(seed); }
    result_type operator()()
    {
        counter++;
        int cycle = counter % 16;
        result_type result = s[cycle];
        if (cycle == 15){
            // uint64_t x = m_seed;
            __m512i x = _mm512_loadu_si512((__m512i*) s);
            // x ^= x << 13;
            x = _mm512_xor_epi32(x, _mm512_slli_epi32(x, 13));
            // x ^= x >> 7;
            x = _mm512_xor_epi32(x, _mm512_srli_epi32(x, 17));
            // x ^= x << 17;
            x = _mm512_xor_epi32(x, _mm512_slli_epi32(x, 5));
            // return m_seed = x;
            _mm512_storeu_si512((__m512i*) s, x);
        }
        return result;
    }
};


// https://www.pcg-random.org/download.html#minimal-c-implementation
class pcg : public generator64x2
{
public:
    virtual ~pcg() = default;
    using generator::seed;
    pcg() { seed();}
    pcg(std::uint32_t s) {seed(s);}
    result_type operator()()
    {
        uint64_t oldstate = s[0];
        s[0] = oldstate * 6364136223846793005ULL + s[1];
        uint32_t xorshifted = uint32_t(((oldstate >> 18u) ^ oldstate) >> 27u);
        int rot = oldstate >> 59u;
        return (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
    }
protected:
    void seed(std::vector<std::uint32_t> &seeds)
    {
        // proper seeding
        uint64_t s0 = uint64_t(seeds[0]) << 31 | uint64_t(seeds[1]);
        uint64_t s1 = uint64_t(seeds[2]) << 31 | uint64_t(seeds[3]);
        s[0] = 0;
        s[1] = (s1 << 1) | 1;
        operator()();
        s[0] += s0;
        operator()();
    }
};


static uint32_t rotl(const uint32_t x, int k) __attribute__((always_inline));

static inline uint32_t rotl(const uint32_t x, int k) {
	return (x << k) | (x >> (32 - k));
}


class xoshiro128plusplus : public generator32x4
{
public:
    virtual ~xoshiro128plusplus() = default;
    using generator::seed;
    xoshiro128plusplus() { generator::seed(1); }
    xoshiro128plusplus(std::uint32_t s) {seed(s);}
    result_type operator()()
    {
	const uint32_t result = rotl(s[0] + s[3], 7) + s[0];

	const uint32_t t = s[1] << 9;

	s[2] ^= s[0];
	s[3] ^= s[1];
	s[1] ^= s[2];
	s[0] ^= s[3];

	s[2] ^= t;

	s[3] = rotl(s[3], 11);

	return result;
    }
};

class xoshiro128starstar : public generator32x4
{
public:
    virtual ~xoshiro128starstar() = default;
    using generator::seed;
    xoshiro128starstar() { generator::seed(1); }
    xoshiro128starstar(std::uint32_t s) {seed(s);}
    result_type operator()()
    {
	const uint32_t result = rotl(s[1] * 5, 7) * 9;

	const uint32_t t = s[1] << 9;

	s[2] ^= s[0];
	s[3] ^= s[1];
	s[1] ^= s[2];
	s[0] ^= s[3];

	s[2] ^= t;

	s[3] = rotl(s[3], 11);

	return result;
    }
};


// note : dispatching.
static __m128i rotl(const __m128i x, int k) __attribute__((always_inline));

static inline __m128i rotl(const __m128i x, const int k) {
    return _mm_or_epi32(_mm_slli_epi32(x, k), _mm_slli_epi32(x, (32 - k)));
}

class xoshiro128plusplusx4 : public generator32x16
{
    int counter = -1;
    result_type buffer[4];
public:
    virtual ~xoshiro128plusplusx4() = default;
    using generator::seed;
    xoshiro128plusplusx4() { generator::seed(1); }
    xoshiro128plusplusx4(std::uint32_t s) {seed(s);}
    result_type operator()()
    {
        counter++;
        int cycle = counter % 4;
        if (cycle == 0){

            __m128i s0 = _mm_loadu_si128((__m128i*) s);
            __m128i s1 = _mm_loadu_si128(((__m128i*) s)+1);
            __m128i s2 = _mm_loadu_si128(((__m128i*) s)+2);
            __m128i s3 = _mm_loadu_si128(((__m128i*) s)+3);

            // const uint32_t result = rotl(s[0] + s[3], 7) + s[0];
            const __m128i results = _mm_add_epi32(rotl(_mm_add_epi32(s0, s3), 7), s0);

            // const uint32_t t = s[1] << 9;
            const __m128i t = _mm_slli_epi32(s1, 9);

            // s[2] ^= s[0];
            s2 = _mm_xor_epi32(s2, s0);

            // s[3] ^= s[1];
            s3 = _mm_xor_epi32(s3, s1);

            // s[1] ^= s[2];
            s1 = _mm_xor_epi32(s1, s2);

            // s[0] ^= s[3];
            s0 = _mm_xor_epi32(s0, s3);

            // s[2] ^= t;
            s2 = _mm_xor_epi32(s2, t);

            // s[3] = rotl(s[3], 11);
            s3 = rotl(s3, 11);

            _mm_storeu_si128((__m128i*) s, s0);
            _mm_storeu_si128(((__m128i*) s)+1, s1);
            _mm_storeu_si128(((__m128i*) s)+2, s2);
            _mm_storeu_si128(((__m128i*) s)+3, s3);
            _mm_storeu_si128((__m128i*) buffer, results);
        }
        return buffer[cycle];
    }
};

class xoshiro128starstarx4 : public generator32x16
{
    int counter = -1;
    result_type buffer[4];
public:
    virtual ~xoshiro128starstarx4() = default;
    using generator::seed;
    xoshiro128starstarx4() { generator::seed(1); }
    xoshiro128starstarx4(std::uint32_t s) {seed(s);}
    result_type operator()()
    {
        counter++;
        int cycle = counter % 4;
        if (cycle == 0){

            __m128i s0 = _mm_loadu_si128((__m128i*) s);
            __m128i s1 = _mm_loadu_si128(((__m128i*) s)+1);
            __m128i s2 = _mm_loadu_si128(((__m128i*) s)+2);
            __m128i s3 = _mm_loadu_si128(((__m128i*) s)+3);

            const __m128i c5 = _mm_set1_epi32(5);
            const __m128i c9 = _mm_set1_epi32(9);

            // const uint32_t result = rotl(s[1] * 5, 7) * 9;
            const __m128i results = _mm_mul_epi32(rotl(_mm_mul_epi32(s1, c5), 7), c9);

            // const uint32_t t = s[1] << 9;
            const __m128i t = _mm_slli_epi32(s1, 9);

            // s[2] ^= s[0];
            s2 = _mm_xor_epi32(s2, s0);

            // s[3] ^= s[1];
            s3 = _mm_xor_epi32(s3, s1);

            // s[1] ^= s[2];
            s1 = _mm_xor_epi32(s1, s2);

            // s[0] ^= s[3];
            s0 = _mm_xor_epi32(s0, s3);

            // s[2] ^= t;
            s2 = _mm_xor_epi32(s2, t);

            // s[3] = rotl(s[3], 11);
            s3 = rotl(s3, 11);

            _mm_storeu_si128((__m128i*) s, s0);
            _mm_storeu_si128(((__m128i*) s)+1, s1);
            _mm_storeu_si128(((__m128i*) s)+2, s2);
            _mm_storeu_si128(((__m128i*) s)+3, s3);
            _mm_storeu_si128((__m128i*) buffer, results);
        }
        return buffer[cycle];
    }
};


// note : dispatching.
static __m256i rotl(const __m256i x, int k) __attribute__((always_inline));

static inline __m256i rotl(const __m256i x, const int k) {
    return _mm256_or_epi32(_mm256_slli_epi32(x, k), _mm256_slli_epi32(x, (32 - k)));
}

class xoshiro128plusplusx8 : public generator32x32
{
    int counter = -1;
    result_type buffer[4];
public:
    virtual ~xoshiro128plusplusx8() = default;
    using generator::seed;
    xoshiro128plusplusx8() { generator::seed(1); }
    xoshiro128plusplusx8(std::uint32_t s) {seed(s);}
    result_type operator()()
    {
        counter++;
        int cycle = counter % 8;
        if (cycle == 0){

            __m256i s0 = _mm256_loadu_si256((__m256i*) s);
            __m256i s1 = _mm256_loadu_si256(((__m256i*) s)+1);
            __m256i s2 = _mm256_loadu_si256(((__m256i*) s)+2);
            __m256i s3 = _mm256_loadu_si256(((__m256i*) s)+3);

            // const uint32_t result = rotl(s[0] + s[3], 7) + s[0];
            const __m256i results = _mm256_add_epi32(rotl(_mm256_add_epi32(s0, s3), 7), s0);

            // const uint32_t t = s[1] << 9;
            const __m256i t = _mm256_slli_epi32(s1, 9);

            // s[2] ^= s[0];
            s2 = _mm256_xor_epi32(s2, s0);

            // s[3] ^= s[1];
            s3 = _mm256_xor_epi32(s3, s1);

            // s[1] ^= s[2];
            s1 = _mm256_xor_epi32(s1, s2);

            // s[0] ^= s[3];
            s0 = _mm256_xor_epi32(s0, s3);

            // s[2] ^= t;
            s2 = _mm256_xor_epi32(s2, t);

            // s[3] = rotl(s[3], 11);
            s3 = rotl(s3, 11);

            _mm256_storeu_si256((__m256i*) s, s0);
            _mm256_storeu_si256(((__m256i*) s)+1, s1);
            _mm256_storeu_si256(((__m256i*) s)+2, s2);
            _mm256_storeu_si256(((__m256i*) s)+3, s3);
            _mm256_storeu_si256((__m256i*) buffer, results);
        }
        return buffer[cycle];
    }
};

class xoshiro128starstarx8 : public generator32x32
{
    int counter = -1;
    result_type buffer[4];
public:
    virtual ~xoshiro128starstarx8() = default;
    using generator::seed;
    xoshiro128starstarx8() { generator::seed(1); }
    xoshiro128starstarx8(std::uint32_t s) {seed(s);}
    result_type operator()()
    {
        counter++;
        int cycle = counter % 8;
        if (cycle == 0){

            __m256i s0 = _mm256_loadu_si256((__m256i*) s);
            __m256i s1 = _mm256_loadu_si256(((__m256i*) s)+1);
            __m256i s2 = _mm256_loadu_si256(((__m256i*) s)+2);
            __m256i s3 = _mm256_loadu_si256(((__m256i*) s)+3);

            const __m256i c5 = _mm256_set1_epi32(5);
            const __m256i c9 = _mm256_set1_epi32(9);

            // const uint32_t result = rotl(s[1] * 5, 7) * 9;
            const __m256i results = _mm256_mul_epi32(rotl(_mm256_mul_epi32(s1, c5), 7), c9);

            // const uint32_t t = s[1] << 9;
            const __m256i t = _mm256_slli_epi32(s1, 9);

            // s[2] ^= s[0];
            s2 = _mm256_xor_epi32(s2, s0);

            // s[3] ^= s[1];
            s3 = _mm256_xor_epi32(s3, s1);

            // s[1] ^= s[2];
            s1 = _mm256_xor_epi32(s1, s2);

            // s[0] ^= s[3];
            s0 = _mm256_xor_epi32(s0, s3);

            // s[2] ^= t;
            s2 = _mm256_xor_epi32(s2, t);

            // s[3] = rotl(s[3], 11);
            s3 = rotl(s3, 11);

            _mm256_storeu_si256((__m256i*) s, s0);
            _mm256_storeu_si256(((__m256i*) s)+1, s1);
            _mm256_storeu_si256(((__m256i*) s)+2, s2);
            _mm256_storeu_si256(((__m256i*) s)+3, s3);
            _mm256_storeu_si256((__m256i*) buffer, results);
        }
        return buffer[cycle];
    }
};


// from George Marsaglia (1994)
// https://groups.google.com/g/sci.stat.math/c/p7aLW3TsJys/m/QGb1kti6kN0J

#define MWC_A0 1111111464

class MWC64 : public generator32x2
{
public:
    virtual ~MWC64() = default;
    using generator::seed;

/* The state must be initialized so that 0 < c < MWC_A0 - 1.
   For simplicity, we suggest to set c = 1 and x to a 32-bit seed. */
    MWC64() { seed(); }
    MWC64(std::uint32_t s) {seed(s);}
    result_type operator()()
    {
	const uint32_t result = s[0];
	const uint64_t t = MWC_A0 * (uint64_t)s[0] + s[1];
	s[0] = t;
	s[1] = t >> 32;
	return result;
    }
protected:
    void seed(std::vector<std::uint32_t> &seeds)
    {
        s[0] = seeds[0];
        s[1] = seeds[1] / (MWC_A0 - 1);
    }
};


#define MWC_A1 0xffebb71d94fcdaf9

class MWC128 : public generator64x2
{
public:
    virtual ~MWC128() = default;
    using generator::seed;

/* The state must be initialized so that 0 < c < MWC_A1 - 1.
   For simplicity, we suggest to set c = 1 and x to a 64-bit seed. */
    MWC128() { seed(); }
    MWC128(std::uint32_t s) {seed(s);}

    result_type operator()()
    {
	const uint64_t result = s[0];
	const __uint128_t t = MWC_A1 * (__uint128_t)s[0] + s[1];
	s[0] = t;
	s[1] = t >> 64;
	return result;
    }
protected:
    void seed(std::vector<std::uint32_t> &seeds)
    {
        s[0] = uint64_t(seeds[0]) << 31 | uint64_t(seeds[1]);
        s[1] = uint64_t(seeds[2]) << 31 | uint64_t(seeds[3]);
        s[1] = s[1] / (MWC_A1 - 1);
    }
};


#endif /* AG_RANDOM_H */

## random_generators_speed.cpp
#include <iostream>
#include <random>
#include <chrono>
#include <iomanip>
#include "random.h"

template <typename RandomEngineType> void benchmark_random_generator(const std::string &name) {
  RandomEngineType generator(42);

  generator.seed(5);

  constexpr int nIterations = 300000000;

  std::cout << std::setw(20) << name << std::setw(0);

  // check sequence
  // std::cout << " first 8: " ;
  // for (int i = 0; i < 8; ++i) {
  //     std::cout << std::setw(0) << " 0x"
  //               << std::hex
  //               << std::setw(8) << std::setfill('0')
  //               << generator()
  //               << std::setw(0) << std::setfill(' ');
  //
  // }

  const auto start = std::chrono::steady_clock::now();

  for (int i = 0; i < nIterations; ++i) {
      volatile int n = generator();
  }

  const auto end = std::chrono::steady_clock::now();

  std::cout << std::setprecision(3)
            << " | "
            << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() / static_cast<double>(nIterations)
            << " ns / num\n";
};


int main() {

  benchmark_random_generator<trunc_xorshift32star>("trunc_xorshift32star");
  benchmark_random_generator<xorshift32star>("xorshift32star");
  benchmark_random_generator<xorshift64>("xorshift32");
  benchmark_random_generator<xorshift64>("xorshift64");
  benchmark_random_generator<xorshift32x4>("xorshift32x4");
  benchmark_random_generator<xorshift64x4>("xorshift64x4");
  benchmark_random_generator<xorshift32x8>("xorshift32x8");
  benchmark_random_generator<xorshift32x16>("xorshift32x16");

  benchmark_random_generator<splitmix>("splitmix");
  benchmark_random_generator<pcg>("pcg");
  benchmark_random_generator<xoshiro128plusplus>("xoshiro128plusplus");
  benchmark_random_generator<xoshiro128starstar>("xoshiro128starstar");
  benchmark_random_generator<xoshiro128plusplusx4>("xoshiro128plusplusx4");
  benchmark_random_generator<xoshiro128starstarx4>("xoshiro128starstarx4");
  benchmark_random_generator<xoshiro128plusplusx8>("xoshiro128plusplusx8");
  benchmark_random_generator<xoshiro128starstarx8>("xoshiro128starstarx8");
  benchmark_random_generator<MWC128>("MWC128");
  benchmark_random_generator<MWC64>("MWC64");
  benchmark_random_generator<std::minstd_rand>("minstd_rand");
  benchmark_random_generator<std::minstd_rand0>("minstd_rand0");
  benchmark_random_generator<std::mt19937>("mt19937");
  benchmark_random_generator<std::mt19937_64>("mt19937_64");
  benchmark_random_generator<std::ranlux24_base>("ranlux24_base");
  benchmark_random_generator<std::ranlux48_base>("ranlux48_base");
  // benchmark_random_generator<std::ranlux24>("ranlux24"); // very slow
  // benchmark_random_generator<std::ranlux48>("ranlux48"); // very slow
  benchmark_random_generator<std::knuth_b>("knuth_b");

  return EXIT_SUCCESS;
}

## xoroshiro128plus.c
/*  Written in 2016-2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is xoroshiro128+ 1.0, our best and fastest small-state generator
   for floating-point numbers, but its state space is large enough only
   for mild parallelism. We suggest to use its upper bits for
   floating-point generation, as it is slightly faster than
   xoroshiro128++/xoroshiro128**. It passes all tests we are aware of
   except for the four lower bits, which might fail linearity tests (and
   just those), so if low linear complexity is not considered an issue (as
   it is usually the case) it can be used to generate 64-bit outputs, too;
   moreover, this generator has a very mild Hamming-weight dependency
   making our test (http://prng.di.unimi.it/hwd.php) fail after 5 TB of
   output; we believe this slight bias cannot affect any application. If
   you are concerned, use xoroshiro128++, xoroshiro128** or xoshiro256+.

   We suggest to use a sign test to extract a random Boolean value, and
   right shifts to extract subsets of bits.

   The state must be seeded so that it is not everywhere zero. If you have
   a 64-bit seed, we suggest to seed a splitmix64 generator and use its
   output to fill s.

   NOTE: the parameters (a=24, b=16, b=37) of this version give slightly
   better results in our test than the 2016 version (a=55, b=14, c=36).
*/

static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) | (x >> (64 - k));
}


static uint64_t s[2];

uint64_t next(void) {
	const uint64_t s0 = s[0];
	uint64_t s1 = s[1];
	const uint64_t result = s0 + s1;

	s1 ^= s0;
	s[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16); // a, b
	s[1] = rotl(s1, 37); // c

	return result;
}


/* This is the jump function for the generator. It is equivalent
   to 2^64 calls to next(); it can be used to generate 2^64
   non-overlapping subsequences for parallel computations. */

void jump(void) {
	static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
}


/* This is the long-jump function for the generator. It is equivalent to
   2^96 calls to next(); it can be used to generate 2^32 starting points,
   from each of which jump() will generate 2^32 non-overlapping
   subsequences for parallel distributed computations. */

void long_jump(void) {
	static const uint64_t LONG_JUMP[] = { 0xd2a98b26625eee7b, 0xdddf9b1090aa7ac1 };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (LONG_JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
}

## xoroshiro128plusplus.c
/*  Written in 2019 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid,
   small-state generators. It is extremely (sub-ns) fast and it passes all
   tests we are aware of, but its state space is large enough only for
   mild parallelism.

   For generating just floating-point numbers, xoroshiro128+ is even
   faster (but it has a very mild bias, see notes in the comments).

   The state must be seeded so that it is not everywhere zero. If you have
   a 64-bit seed, we suggest to seed a splitmix64 generator and use its
   output to fill s. */


static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) | (x >> (64 - k));
}


static uint64_t s[2];

uint64_t next(void) {
	const uint64_t s0 = s[0];
	uint64_t s1 = s[1];
	const uint64_t result = rotl(s0 + s1, 17) + s0;

	s1 ^= s0;
	s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b
	s[1] = rotl(s1, 28); // c

	return result;
}


/* This is the jump function for the generator. It is equivalent
   to 2^64 calls to next(); it can be used to generate 2^64
   non-overlapping subsequences for parallel computations. */

void jump(void) {
	static const uint64_t JUMP[] = { 0x2bd7a6a6e99c2ddc, 0x0992ccaf6a6fca05 };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
}


/* This is the long-jump function for the generator. It is equivalent to
   2^96 calls to next(); it can be used to generate 2^32 starting points,
   from each of which jump() will generate 2^32 non-overlapping
   subsequences for parallel distributed computations. */

void long_jump(void) {
	static const uint64_t LONG_JUMP[] = { 0x360fd5f2cf8d5d99, 0x9c6e6877736c46e3 };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (LONG_JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
}

## xoroshiro128starstar.c
/*  Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is xoroshiro128** 1.0, one of our all-purpose, rock-solid,
   small-state generators. It is extremely (sub-ns) fast and it passes all
   tests we are aware of, but its state space is large enough only for
   mild parallelism.

   For generating just floating-point numbers, xoroshiro128+ is even
   faster (but it has a very mild bias, see notes in the comments).

   The state must be seeded so that it is not everywhere zero. If you have
   a 64-bit seed, we suggest to seed a splitmix64 generator and use its
   output to fill s. */


static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) | (x >> (64 - k));
}


static uint64_t s[2];

uint64_t next(void) {
	const uint64_t s0 = s[0];
	uint64_t s1 = s[1];
	const uint64_t result = rotl(s0 * 5, 7) * 9;

	s1 ^= s0;
	s[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16); // a, b
	s[1] = rotl(s1, 37); // c

	return result;
}


/* This is the jump function for the generator. It is equivalent
   to 2^64 calls to next(); it can be used to generate 2^64
   non-overlapping subsequences for parallel computations. */

void jump(void) {
	static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
}


/* This is the long-jump function for the generator. It is equivalent to
   2^96 calls to next(); it can be used to generate 2^32 starting points,
   from each of which jump() will generate 2^32 non-overlapping
   subsequences for parallel distributed computations. */

void long_jump(void) {
	static const uint64_t LONG_JUMP[] = { 0xd2a98b26625eee7b, 0xdddf9b1090aa7ac1 };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (LONG_JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
}

## xoroshiro64star.c
/*  Written in 2016 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is xoroshiro64* 1.0, our best and fastest 32-bit small-state
   generator for 32-bit floating-point numbers. We suggest to use its
   upper bits for floating-point generation, as it is slightly faster than
   xoroshiro64**. It passes all tests we are aware of except for linearity
   tests, as the lowest six bits have low linear complexity, so if low
   linear complexity is not considered an issue (as it is usually the
   case) it can be used to generate 32-bit outputs, too.

   We suggest to use a sign test to extract a random Boolean value, and
   right shifts to extract subsets of bits.

   The state must be seeded so that it is not everywhere zero. */


static inline uint32_t rotl(const uint32_t x, int k) {
	return (x << k) | (x >> (32 - k));
}


static uint32_t s[2];

uint32_t next(void) {
	const uint32_t s0 = s[0];
	uint32_t s1 = s[1];
	const uint32_t result = s0 * 0x9E3779BB;

	s1 ^= s0;
	s[0] = rotl(s0, 26) ^ s1 ^ (s1 << 9); // a, b
	s[1] = rotl(s1, 13); // c

	return result;
}

## xoroshiro64starstar.c
/*  Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is xoroshiro64** 1.0, our 32-bit all-purpose, rock-solid,
   small-state generator. It is extremely fast and it passes all tests we
   are aware of, but its state space is not large enough for any parallel
   application.

   For generating just single-precision (i.e., 32-bit) floating-point
   numbers, xoroshiro64* is even faster.

   The state must be seeded so that it is not everywhere zero. */


static inline uint32_t rotl(const uint32_t x, int k) {
	return (x << k) | (x >> (32 - k));
}


static uint32_t s[2];

uint32_t next(void) {
	const uint32_t s0 = s[0];
	uint32_t s1 = s[1];
	const uint32_t result = rotl(s0 * 0x9E3779BB, 5) * 5;

	s1 ^= s0;
	s[0] = rotl(s0, 26) ^ s1 ^ (s1 << 9); // a, b
	s[1] = rotl(s1, 13); // c

	return result;
}

## xoshiro128plus.c
/*  Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is xoshiro128+ 1.0, our best and fastest 32-bit generator for 32-bit
   floating-point numbers. We suggest to use its upper bits for
   floating-point generation, as it is slightly faster than xoshiro128**.
   It passes all tests we are aware of except for
   linearity tests, as the lowest four bits have low linear complexity, so
   if low linear complexity is not considered an issue (as it is usually
   the case) it can be used to generate 32-bit outputs, too.

   We suggest to use a sign test to extract a random Boolean value, and
   right shifts to extract subsets of bits.

   The state must be seeded so that it is not everywhere zero. */


static inline uint32_t rotl(const uint32_t x, int k) {
	return (x << k) | (x >> (32 - k));
}


static uint32_t s[4];

uint32_t next(void) {
	const uint32_t result = s[0] + s[3];

	const uint32_t t = s[1] << 9;

	s[2] ^= s[0];
	s[3] ^= s[1];
	s[1] ^= s[2];
	s[0] ^= s[3];

	s[2] ^= t;

	s[3] = rotl(s[3], 11);

	return result;
}


/* This is the jump function for the generator. It is equivalent
   to 2^64 calls to next(); it can be used to generate 2^64
   non-overlapping subsequences for parallel computations. */

void jump(void) {
	static const uint32_t JUMP[] = { 0x8764000b, 0xf542d2d3, 0x6fa035c3, 0x77f2db5b };

	uint32_t s0 = 0;
	uint32_t s1 = 0;
	uint32_t s2 = 0;
	uint32_t s3 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 32; b++) {
			if (JUMP[i] & UINT32_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}


/* This is the long-jump function for the generator. It is equivalent to
   2^96 calls to next(); it can be used to generate 2^32 starting points,
   from each of which jump() will generate 2^32 non-overlapping
   subsequences for parallel distributed computations. */

void long_jump(void) {
	static const uint32_t LONG_JUMP[] = { 0xb523952e, 0x0b6f099f, 0xccf5a0ef, 0x1c580662 };

	uint32_t s0 = 0;
	uint32_t s1 = 0;
	uint32_t s2 = 0;
	uint32_t s3 = 0;
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
		for(int b = 0; b < 32; b++) {
			if (LONG_JUMP[i] & UINT32_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}

## xoshiro128plusplus.c
/*  Written in 2019 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is xoshiro128++ 1.0, one of our 32-bit all-purpose, rock-solid
   generators. It has excellent speed, a state size (128 bits) that is
   large enough for mild parallelism, and it passes all tests we are aware
   of.

   For generating just single-precision (i.e., 32-bit) floating-point
   numbers, xoshiro128+ is even faster.

   The state must be seeded so that it is not everywhere zero. */


static inline uint32_t rotl(const uint32_t x, int k) {
	return (x << k) | (x >> (32 - k));
}


static uint32_t s[4];

uint32_t next(void) {
	const uint32_t result = rotl(s[0] + s[3], 7) + s[0];

	const uint32_t t = s[1] << 9;

	s[2] ^= s[0];
	s[3] ^= s[1];
	s[1] ^= s[2];
	s[0] ^= s[3];

	s[2] ^= t;

	s[3] = rotl(s[3], 11);

	return result;
}


/* This is the jump function for the generator. It is equivalent
   to 2^64 calls to next(); it can be used to generate 2^64
   non-overlapping subsequences for parallel computations. */

void jump(void) {
	static const uint32_t JUMP[] = { 0x8764000b, 0xf542d2d3, 0x6fa035c3, 0x77f2db5b };

	uint32_t s0 = 0;
	uint32_t s1 = 0;
	uint32_t s2 = 0;
	uint32_t s3 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 32; b++) {
			if (JUMP[i] & UINT32_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}


/* This is the long-jump function for the generator. It is equivalent to
   2^96 calls to next(); it can be used to generate 2^32 starting points,
   from each of which jump() will generate 2^32 non-overlapping
   subsequences for parallel distributed computations. */

void long_jump(void) {
	static const uint32_t LONG_JUMP[] = { 0xb523952e, 0x0b6f099f, 0xccf5a0ef, 0x1c580662 };

	uint32_t s0 = 0;
	uint32_t s1 = 0;
	uint32_t s2 = 0;
	uint32_t s3 = 0;
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
		for(int b = 0; b < 32; b++) {
			if (LONG_JUMP[i] & UINT32_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}

## xoshiro128starstar.c
/*  Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is xoshiro128** 1.1, one of our 32-bit all-purpose, rock-solid
   generators. It has excellent speed, a state size (128 bits) that is
   large enough for mild parallelism, and it passes all tests we are aware
   of.

   Note that version 1.0 had mistakenly s[0] instead of s[1] as state
   word passed to the scrambler.

   For generating just single-precision (i.e., 32-bit) floating-point
   numbers, xoshiro128+ is even faster.

   The state must be seeded so that it is not everywhere zero. */


static inline uint32_t rotl(const uint32_t x, int k) {
	return (x << k) | (x >> (32 - k));
}


static uint32_t s[4];

uint32_t next(void) {
	const uint32_t result = rotl(s[1] * 5, 7) * 9;

	const uint32_t t = s[1] << 9;

	s[2] ^= s[0];
	s[3] ^= s[1];
	s[1] ^= s[2];
	s[0] ^= s[3];

	s[2] ^= t;

	s[3] = rotl(s[3], 11);

	return result;
}


/* This is the jump function for the generator. It is equivalent
   to 2^64 calls to next(); it can be used to generate 2^64
   non-overlapping subsequences for parallel computations. */

void jump(void) {
	static const uint32_t JUMP[] = { 0x8764000b, 0xf542d2d3, 0x6fa035c3, 0x77f2db5b };

	uint32_t s0 = 0;
	uint32_t s1 = 0;
	uint32_t s2 = 0;
	uint32_t s3 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 32; b++) {
			if (JUMP[i] & UINT32_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}


/* This is the long-jump function for the generator. It is equivalent to
   2^96 calls to next(); it can be used to generate 2^32 starting points,
   from each of which jump() will generate 2^32 non-overlapping
   subsequences for parallel distributed computations. */

void long_jump(void) {
	static const uint32_t LONG_JUMP[] = { 0xb523952e, 0x0b6f099f, 0xccf5a0ef, 0x1c580662 };

	uint32_t s0 = 0;
	uint32_t s1 = 0;
	uint32_t s2 = 0;
	uint32_t s3 = 0;
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
		for(int b = 0; b < 32; b++) {
			if (LONG_JUMP[i] & UINT32_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}

## xoshiro256plus.c
/*  Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is xoshiro256+ 1.0, our best and fastest generator for floating-point
   numbers. We suggest to use its upper bits for floating-point
   generation, as it is slightly faster than xoshiro256++/xoshiro256**. It
   passes all tests we are aware of except for the lowest three bits,
   which might fail linearity tests (and just those), so if low linear
   complexity is not considered an issue (as it is usually the case) it
   can be used to generate 64-bit outputs, too.

   We suggest to use a sign test to extract a random Boolean value, and
   right shifts to extract subsets of bits.

   The state must be seeded so that it is not everywhere zero. If you have
   a 64-bit seed, we suggest to seed a splitmix64 generator and use its
   output to fill s. */


static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) | (x >> (64 - k));
}


static uint64_t s[4];

uint64_t next(void) {
	const uint64_t result = s[0] + s[3];

	const uint64_t t = s[1] << 17;

	s[2] ^= s[0];
	s[3] ^= s[1];
	s[1] ^= s[2];
	s[0] ^= s[3];

	s[2] ^= t;

	s[3] = rotl(s[3], 45);

	return result;
}


/* This is the jump function for the generator. It is equivalent
   to 2^128 calls to next(); it can be used to generate 2^128
   non-overlapping subsequences for parallel computations. */

void jump(void) {
	static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	uint64_t s2 = 0;
	uint64_t s3 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}


/* This is the long-jump function for the generator. It is equivalent to
   2^192 calls to next(); it can be used to generate 2^64 starting points,
   from each of which jump() will generate 2^64 non-overlapping
   subsequences for parallel distributed computations. */

void long_jump(void) {
	static const uint64_t LONG_JUMP[] = { 0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635 };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	uint64_t s2 = 0;
	uint64_t s3 = 0;
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (LONG_JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}

## xoshiro256plusplus.c
/*  Written in 2019 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is xoshiro256++ 1.0, one of our all-purpose, rock-solid generators.
   It has excellent (sub-ns) speed, a state (256 bits) that is large
   enough for any parallel application, and it passes all tests we are
   aware of.

   For generating just floating-point numbers, xoshiro256+ is even faster.

   The state must be seeded so that it is not everywhere zero. If you have
   a 64-bit seed, we suggest to seed a splitmix64 generator and use its
   output to fill s. */

static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) | (x >> (64 - k));
}


static uint64_t s[4];

uint64_t next(void) {
	const uint64_t result = rotl(s[0] + s[3], 23) + s[0];

	const uint64_t t = s[1] << 17;

	s[2] ^= s[0];
	s[3] ^= s[1];
	s[1] ^= s[2];
	s[0] ^= s[3];

	s[2] ^= t;

	s[3] = rotl(s[3], 45);

	return result;
}


/* This is the jump function for the generator. It is equivalent
   to 2^128 calls to next(); it can be used to generate 2^128
   non-overlapping subsequences for parallel computations. */

void jump(void) {
	static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	uint64_t s2 = 0;
	uint64_t s3 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}


/* This is the long-jump function for the generator. It is equivalent to
   2^192 calls to next(); it can be used to generate 2^64 starting points,
   from each of which jump() will generate 2^64 non-overlapping
   subsequences for parallel distributed computations. */

void long_jump(void) {
	static const uint64_t LONG_JUMP[] = { 0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635 };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	uint64_t s2 = 0;
	uint64_t s3 = 0;
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (LONG_JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}

## xoshiro256starstar.c
/*  Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is xoshiro256** 1.0, one of our all-purpose, rock-solid
   generators. It has excellent (sub-ns) speed, a state (256 bits) that is
   large enough for any parallel application, and it passes all tests we
   are aware of.

   For generating just floating-point numbers, xoshiro256+ is even faster.

   The state must be seeded so that it is not everywhere zero. If you have
   a 64-bit seed, we suggest to seed a splitmix64 generator and use its
   output to fill s. */

static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) | (x >> (64 - k));
}


static uint64_t s[4];

uint64_t next(void) {
	const uint64_t result = rotl(s[1] * 5, 7) * 9;

	const uint64_t t = s[1] << 17;

	s[2] ^= s[0];
	s[3] ^= s[1];
	s[1] ^= s[2];
	s[0] ^= s[3];

	s[2] ^= t;

	s[3] = rotl(s[3], 45);

	return result;
}


/* This is the jump function for the generator. It is equivalent
   to 2^128 calls to next(); it can be used to generate 2^128
   non-overlapping subsequences for parallel computations. */

void jump(void) {
	static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	uint64_t s2 = 0;
	uint64_t s3 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}


/* This is the long-jump function for the generator. It is equivalent to
   2^192 calls to next(); it can be used to generate 2^64 starting points,
   from each of which jump() will generate 2^64 non-overlapping
   subsequences for parallel distributed computations. */

void long_jump(void) {
	static const uint64_t LONG_JUMP[] = { 0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635 };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	uint64_t s2 = 0;
	uint64_t s3 = 0;
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (LONG_JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}

	all: run

	run: run0

	run%: random_generators_speed_%
	./$<

	random_generators_speed_0: random_generators_speed.cpp random.h Makefile
	g++ -march=native -O0 -o $@ $<
	random_generators_speed_1: random_generators_speed.cpp random.h Makefile
	g++ -march=native -O1 -o $@ $<
	random_generators_speed_2: random_generators_speed.cpp random.h Makefile
	g++ -march=native -O2 -o $@ $<
	random_generators_speed_3: random_generators_speed.cpp random.h Makefile
	g++ -march=native -O3 -o $@ $<

	clean:
	-rm random_generators_speed_*

	disassemble: random_generators_speed_0
	objdump -d -M att -CSl --no-show-raw-insn $< \| awk -v RS= '/operator\(\)\(\)>:/'
	/* Written in 2023 by Sebastiano Vigna (vigna@acm.org)

	To the extent possible under law, the author has dedicated all copyright
	and related and neighboring rights to this software to the public domain
	worldwide. This software is distributed without any warranty.

	See <http://creativecommons.org/publicdomain/zero/1.0/>. */

	#include <stdint.h>
	#include <strings.h>

	/* This is a Marsaglia multiply-with-carry generator with period
	approximately 2^127. It is close in speed to a scrambled linear
	generator, as its only 128-bit operations are a multiplication and sum;
	it is an excellent generator based on congruential arithmetic.

	As all MWC generators, it simulates a multiplicative LCG with
	prime modulus m = 0xffebb71d94fcdaf8ffffffffffffffff and
	multiplier given by the inverse of 2^64 modulo m. The modulus has a
	particular form, which creates some theoretical issues, but at this
	size a generator of this kind passes all known statistical tests,
	except for very large size birthday-spacings tests in three dimensions
	(as predicted from the theory). These failures are unlikely to have
	any impact in practice.

	For a generator of the same type with stronger theoretical guarantees,
	consider a Goresky-Klapper generalized multiply-with-carry generator.

	Alternatively, applying a xorshift to the result (i.e., returning
	x ^ (x << 32) instead of x) will eliminate the birthday-spacings
	failures.

	Note that a previous version had a different MWC_A1. Moreover, now
	the result is computed using the current state.
	*/

	#define MWC_A1 0xffebb71d94fcdaf9

	/* The state must be initialized so that 0 < c < MWC_A1 - 1.
	For simplicity, we suggest to set c = 1 and x to a 64-bit seed. */
	uint64_t x, c;

	uint64_t inline next() {
	const uint64_t result = x; // Or, result = x ^ (x << 32) (see above)
	const __uint128_t t = MWC_A1 * (__uint128_t)x + c;
	x = t;
	c = t >> 64;
	return result;
	}


	/* The following jump functions use a minimal multiprecision library. */

	#define MP_SIZE 3
	#include "mp.c"

	static uint64_t mod[MP_SIZE] = { 0xffffffffffffffff, MWC_A1 - 1 };


	/* This is the jump function for the generator. It is equivalent
	to 2^64 calls to next(); it can be used to generate 2^64
	non-overlapping subsequences for parallel computations.

	Equivalent C++ Boost multiprecision code:

	cpp_int b = cpp_int(1) << 64;
	cpp_int m = MWC_A1 * b - 1;
	cpp_int r = cpp_int("0x2f65fed2e8400983a72f9a3547208003");
	cpp_int s = ((x + c * b) * r) % m;
	x = uint64_t(s);
	c = uint64_t(s >> 64);
	*/

	void jump(void) {
	static uint64_t jump[MP_SIZE] = { 0xa72f9a3547208003, 0x2f65fed2e8400983 };
	uint64_t state[MP_SIZE] = { x, c };
	mul(state, jump, mod);
	x = state[0];
	c = state[1];
	}


	/* This is the long-jump function for the generator. It is equivalent to
	2^96 calls to next(); it can be used to generate 2^32 starting points,
	from each of which jump() will generate 2^32 non-overlapping
	subsequences for parallel distributed computations.

	Equivalent C++ Boost multiprecision code:

	cpp_int b = cpp_int(1) << 64;
	cpp_int m = MWC_A1 * b - 1;
	cpp_int r = cpp_int("0x394649cfd6769c91e6f7814467f3fcdd");
	cpp_int s = ((x + c * b) * r) % m;
	x = uint64_t(s);
	c = uint64_t(s >> 64);
	*/

	void long_jump(void) {
	static uint64_t long_jump[MP_SIZE] = { 0xe6f7814467f3fcdd, 0x394649cfd6769c91 };
	uint64_t state[MP_SIZE] = { x, c };
	mul(state, long_jump, mod);
	x = state[0];
	c = state[1];
	}
	/* Written in 2021 by Sebastiano Vigna (vigna@acm.org)

	To the extent possible under law, the author has dedicated all copyright
	and related and neighboring rights to this software to the public domain
	worldwide. This software is distributed without any warranty.

	See <http://creativecommons.org/publicdomain/zero/1.0/>. */

	#include <stdint.h>
	#include <strings.h>

	/* This is a Marsaglia multiply-with-carry generator with period
	approximately 2^255. It is faster than a scrambled linear
	generator, as its only 128-bit operations are a multiplication and sum;
	it is an excellent generator based on congruential arithmetic.

	As all MWC generators, it simulates a multiplicative LCG with prime
	modulus m = 0xfff62cf2ccc0cdaeffffffffffffffffffffffffffffffffffffffffffffffff
	and multiplier given by the inverse of 2^64 modulo m. The modulus has a
	particular form, which creates some theoretical issues, but at this
	size a generator of this kind passes all known statistical tests. For a
	generator of the same type with stronger theoretical guarantees
	consider a Goresky-Klapper generalized multiply-with-carry generator.

	Note that a previous version had a different MWC_A3. Moreover, now
	the result is computed using the current state.
	*/

	#define MWC_A3 0xfff62cf2ccc0cdaf

	/* The state must be initialized so that 0 < c < MWC_A3 - 1.
	For simplicity, we suggest to set c = 1 and x, y, z to a 192-bit seed. */
	uint64_t x, y, z, c;

	uint64_t inline next() {
	const uint64_t result = z;
	const __uint128_t t = MWC_A3 * (__uint128_t)x + c;
	x = y;
	y = z;
	z = t;
	c = t >> 64;
	return result;
	}


	/* The following jump functions use a minimal multiprecision library. */

	#define MP_SIZE 5
	#include "mp.c"

	static uint64_t mod[MP_SIZE] = { 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, MWC_A3 - 1 };


	/* This is the jump function for the generator. It is equivalent
	to 2^128 calls to next(); it can be used to generate 2^128
	non-overlapping subsequences for parallel computations.

	Equivalent C++ Boost multiprecision code:

	cpp_int b = cpp_int(1) << 64;
	cpp_int b3 = b * b * b;
	cpp_int m = MWC_A3 * b3 - 1;
	cpp_int r = cpp_int("0x4b89aa2cd51c37b9f6f8c3fd02ec98fbfe88c291203b225428c3ff11313847eb");
	cpp_int s = ((x + y * b + z * b * b + c * b3) * r) % m;
	x = uint64_t(s);
	y = uint64_t(s >> 64);
	z = uint64_t(s >> 128);
	c = uint64_t(s >> 192);
	*/

	void jump(void) {
	static uint64_t jump[MP_SIZE] = { 0x28c3ff11313847eb, 0xfe88c291203b2254, 0xf6f8c3fd02ec98fb, 0x4b89aa2cd51c37b9 };
	uint64_t state[MP_SIZE] = { x, y, z, c };
	mul(state, jump, mod);
	x = state[0];
	y = state[1];
	z = state[2];
	c = state[3];
	}


	/* This is the long-jump function for the generator. It is equivalent to
	2^192 calls to next(); it can be used to generate 2^64 starting points,
	from each of which jump() will generate 2^64 non-overlapping
	subsequences for parallel distributed computations.

	Equivalent C++ Boost multiprecision code:

	cpp_int b = cpp_int(1) << 64;
	cpp_int b3 = b * b * b;
	cpp_int m = MWC_A3 * b3 - 1;
	cpp_int r = cpp_int("0xaf5ca22408cdc8306c40ce860e0d702f95382f758ac987764c6e39cf92f77a4");
	cpp_int s = ((x + y * b + z * b * b + c * b3) * r) % m;
	x = uint64_t(s);
	y = uint64_t(s >> 64);
	z = uint64_t(s >> 128);
	c = uint64_t(s >> 192);
	*/

	void long_jump(void) {
	static uint64_t long_jump[MP_SIZE] = { 0x64c6e39cf92f77a4, 0xf95382f758ac9877, 0x6c40ce860e0d702, 0xaf5ca22408cdc83 };
	uint64_t state[MP_SIZE] = { x, y, z, c };
	mul(state, long_jump, mod);
	x = state[0];
	y = state[1];
	z = state[2];
	c = state[3];
	}
	/* Copyright (c) 2018 Arvid Gerstmann. */
	/* Copyright (c) 2024 Masataro Asai. */
	/* This code is licensed under MIT license. */
	#ifndef AG_RANDOM_H
	#define AG_RANDOM_H

	#include <immintrin.h>

	class generator
	{
	public:
	virtual ~generator() = default;
	using result_type = uint32_t;
	static constexpr result_type (min)() { return 0; }
	static constexpr result_type (max)() { return UINT32_MAX; }

	void seed()
	{
	std::random_device rd; // non-deterministic source if possible; otherwise a PRNG
	seed(rd);
	}
	void seed(std::random_device &rd)
	{
	seed(rd());
	}
	void seed(std::uint32_t s)
	{
	std::seed_seq seq{s};
	seed(seq);
	}
	void seed(std::seed_seq &seq)
	{
	std::vector<std::uint32_t> seeds(32); // support up to 1024 bit state
	seq.generate(seeds.begin(), seeds.end());
	seed(seeds);
	}

	result_type operator()();

	void discard(unsigned long long n)
	{
	for (unsigned long long i = 0; i < n; ++i)
	operator()();
	}

	protected:
	virtual void seed(std::vector<std::uint32_t> &seeds) = 0;
	};

	class generator64 : public generator
	{
	public:
	virtual ~generator64() = default;
	using generator::seed;
	protected:
	void seed(std::vector<std::uint32_t> &seeds)
	{
	m_seed = uint64_t(seeds[0]) << 31 \| uint64_t(seeds[1]);
	}

	uint64_t m_seed;
	};

	class generator64x2 : public generator
	{
	public:
	virtual ~generator64x2() = default;
	using generator::seed;
	protected:
	void seed(std::vector<std::uint32_t> &seeds)
	{
	for (int i = 0; i < 2; i++){
	s[i] = uint64_t(seeds[2i]) << 31 \| uint64_t(seeds[2i+1]);
	}
	}

	uint64_t s[2];
	};

	class generator64x4 : public generator
	{
	public:
	virtual ~generator64x4() = default;
	using generator::seed;
	protected:
	void seed(std::vector<std::uint32_t> &seeds)
	{
	for (int i = 0; i < 4; i++){
	s[i] = uint64_t(seeds[2i]) << 31 \| uint64_t(seeds[2i+1]);
	}
	}

	uint64_t s[4];
	};

	class generator64x8 : public generator
	{
	public:
	virtual ~generator64x8() = default;
	using generator::seed;
	protected:
	void seed(std::vector<std::uint32_t> &seeds)
	{
	for (int i = 0; i < 8; i++){
	s[i] = uint64_t(seeds[2i]) << 31 \| uint64_t(seeds[2i+1]);
	}
	}

	uint64_t s[8];
	};

	class generator64x16 : public generator
	{
	public:
	virtual ~generator64x16() = default;
	using generator::seed;
	protected:
	void seed(std::vector<std::uint32_t> &seeds)
	{
	for (int i = 0; i < 16; i++){
	s[i] = uint64_t(seeds[2i]) << 31 \| uint64_t(seeds[2i+1]);
	}
	}

	uint64_t s[16];
	};

	class generator32 : public generator
	{
	public:
	virtual ~generator32() = default;
	using generator::seed;
	protected:
	void seed(std::vector<std::uint32_t> &seeds)
	{
	m_seed = seeds[0];
	}

	uint32_t m_seed;
	};

	class generator32x2 : public generator
	{
	public:
	virtual ~generator32x2() = default;
	using generator::seed;
	protected:
	void seed(std::vector<std::uint32_t> &seeds)
	{
	for (int i = 0; i < 2; i++){
	s[i] = seeds[i];
	}
	}

	uint32_t s[2];
	};

	class generator32x4 : public generator
	{
	public:
	virtual ~generator32x4() = default;
	using generator::seed;
	protected:
	void seed(std::vector<std::uint32_t> &seeds)
	{
	for (int i = 0; i < 4; i++){
	s[i] = seeds[i];
	}
	}

	uint32_t s[4];
	};

	class generator32x8 : public generator
	{
	public:
	virtual ~generator32x8() = default;
	using generator::seed;
	protected:
	void seed(std::vector<std::uint32_t> &seeds)
	{
	for (int i = 0; i < 8; i++){
	s[i] = seeds[i];
	}
	}

	uint32_t s[8];
	};

	class generator32x16 : public generator
	{
	public:
	virtual ~generator32x16() = default;
	using generator::seed;
	protected:
	void seed(std::vector<std::uint32_t> &seeds)
	{
	for (int i = 0; i < 16; i++){
	s[i] = seeds[i];
	}
	}

	uint32_t s[16];
	};

	class generator32x32 : public generator
	{
	public:
	virtual ~generator32x32() = default;
	using generator::seed;
	protected:
	void seed(std::vector<std::uint32_t> &seeds)
	{
	for (int i = 0; i < 32; i++){
	s[i] = seeds[i];
	}
	}

	uint32_t s[32];
	};


	class splitmix : public generator64
	{
	public:
	virtual ~splitmix() = default;
	using generator::seed;
	splitmix() { seed(); }
	splitmix(std::uint32_t s) {seed(s);}
	result_type operator()()
	{
	uint64_t z = (m_seed += UINT64_C(0x9E3779B97F4A7C15));
	z = (z ^ (z >> 30)) * UINT64_C(0xBF58476D1CE4E5B9);
	z = (z ^ (z >> 27)) * UINT64_C(0x94D049BB133111EB);
	return result_type((z ^ (z >> 31)) >> 31);
	}
	};

	class trunc_xorshift32star : public generator64
	{
	public:
	virtual ~trunc_xorshift32star() = default;
	using generator::seed;
	trunc_xorshift32star() { seed(); }
	trunc_xorshift32star(std::uint32_t s) {seed(s);}
	result_type operator()()
	{
	uint64_t result = m_seed * 0xd989bcacc137dcd5ull;
	m_seed ^= m_seed >> 11;
	m_seed ^= m_seed << 31;
	m_seed ^= m_seed >> 18;
	return uint32_t(result >> 32ull);
	}
	};

	class xorshift32star : public generator64
	{
	public:
	virtual ~xorshift32star() = default;
	using generator::seed;
	xorshift32star() { seed(); }
	xorshift32star(std::uint32_t s) {seed(s);}
	result_type operator()()
	{
	uint64_t x;
	uint32_t answer;

	x = m_seed;
	x = x ^ (x >> 12);
	x = x ^ (x << 25);
	x = x ^ (x >> 27);
	m_seed = x;
	answer = ((x * MAGIC) >> 32);

	return answer;
	}

	private:
	const uint64_t MAGIC = 0x2545F4914F6CDD1D;
	};


	class xorshift32 : public generator32
	{
	public:
	virtual ~xorshift32() = default;
	using generator::seed;
	xorshift32() { seed(); }
	xorshift32(std::uint32_t s) {seed(s);}
	result_type operator()()
	{
	uint64_t x = m_seed;
	x ^= x << 13;
	x ^= x >> 17;
	x ^= x << 5;
	return m_seed = x;
	}
	};

	class xorshift64 : public generator64
	{
	public:
	virtual ~xorshift64() = default;
	using generator::seed;
	xorshift64() { seed(); }
	xorshift64(std::uint32_t s) {seed(s);}
	result_type operator()()
	{
	uint64_t x = m_seed;
	x ^= x << 13;
	x ^= x >> 7;
	x ^= x << 17;
	return m_seed = x;
	}
	};

	// compute 4 numbers at once with SSE.
	class xorshift32x4 : public generator32x4
	{
	int counter = -1;
	public:
	virtual ~xorshift32x4() = default;
	using generator::seed;
	xorshift32x4() { seed(); }
	xorshift32x4(std::uint32_t seed) {generator::seed(seed);}
	result_type operator()()
	{
	counter++;
	int cycle = counter % 4;
	result_type result = s[cycle];
	if (cycle == 3){
	// uint64_t x = m_seed;
	__m128i x = _mm_loadu_si128((__m128i*) s);
	// x ^= x << 13;
	x = _mm_xor_epi32(x, _mm_slli_epi32(x, 13));
	// x ^= x >> 7;
	x = _mm_xor_epi32(x, _mm_srli_epi32(x, 17));
	// x ^= x << 17;
	x = _mm_xor_epi32(x, _mm_slli_epi32(x, 5));
	// return m_seed = x;
	_mm_storeu_si128((__m128i*) s, x);
	}
	return result;
	}
	};

	// compute 4 numbers at once with SSE.
	class xorshift64x4 : public generator64x4
	{
	int counter = -1;
	public:
	virtual ~xorshift64x4() = default;
	using generator::seed;
	xorshift64x4() { seed(); }
	xorshift64x4(std::uint32_t seed) { generator::seed(seed); }
	result_type operator()()
	{
	counter++;
	int cycle = counter % 4;
	result_type result = s[cycle];
	if (cycle == 3){
	// uint64_t x = m_seed;
	__m256i x = _mm256_loadu_si256((__m256i*) s);
	// x ^= x << 13;
	x = _mm256_xor_epi64(x, _mm256_slli_epi64(x, 13));
	// x ^= x >> 7;
	x = _mm256_xor_epi64(x, _mm256_srli_epi64(x, 7));
	// x ^= x << 17;
	x = _mm256_xor_epi64(x, _mm256_slli_epi64(x, 17));
	// return m_seed = x;
	_mm256_storeu_si256((__m256i*) s, x);
	}
	return result;
	}
	};

	// compute 8 numbers at once with SSE.
	class xorshift32x8 : public generator32x8
	{
	int counter = -1;
	public:
	virtual ~xorshift32x8() = default;
	using generator::seed;
	xorshift32x8() { seed(); }
	xorshift32x8(std::uint32_t seed) { generator::seed(seed); }
	result_type operator()()
	{
	counter++;
	int cycle = counter % 8;
	result_type result = s[cycle];
	if (cycle == 7){
	// uint64_t x = m_seed;
	__m256i x = _mm256_loadu_si256((__m256i*) s);
	// x ^= x << 13;
	x = _mm256_xor_epi32(x, _mm256_slli_epi32(x, 13));
	// x ^= x >> 7;
	x = _mm256_xor_epi32(x, _mm256_srli_epi32(x, 17));
	// x ^= x << 17;
	x = _mm256_xor_epi32(x, _mm256_slli_epi32(x, 5));
	// return m_seed = x;
	_mm256_storeu_si256((__m256i*) s, x);
	}
	return result;
	}
	};

	// compute 16 numbers at once with AVX512.
	class xorshift32x16 : public generator32x16
	{
	int counter = -1;
	public:
	virtual ~xorshift32x16() = default;
	using generator::seed;
	xorshift32x16() { seed(); }
	xorshift32x16(std::uint32_t seed) { generator::seed(seed); }
	result_type operator()()
	{
	counter++;
	int cycle = counter % 16;
	result_type result = s[cycle];
	if (cycle == 15){
	// uint64_t x = m_seed;
	__m512i x = _mm512_loadu_si512((__m512i*) s);
	// x ^= x << 13;
	x = _mm512_xor_epi32(x, _mm512_slli_epi32(x, 13));
	// x ^= x >> 7;
	x = _mm512_xor_epi32(x, _mm512_srli_epi32(x, 17));
	// x ^= x << 17;
	x = _mm512_xor_epi32(x, _mm512_slli_epi32(x, 5));
	// return m_seed = x;
	_mm512_storeu_si512((__m512i*) s, x);
	}
	return result;
	}
	};


	// https://www.pcg-random.org/download.html#minimal-c-implementation
	class pcg : public generator64x2
	{
	public:
	virtual ~pcg() = default;
	using generator::seed;
	pcg() { seed();}
	pcg(std::uint32_t s) {seed(s);}
	result_type operator()()
	{
	uint64_t oldstate = s[0];
	s[0] = oldstate * 6364136223846793005ULL + s[1];
	uint32_t xorshifted = uint32_t(((oldstate >> 18u) ^ oldstate) >> 27u);
	int rot = oldstate >> 59u;
	return (xorshifted >> rot) \| (xorshifted << ((-rot) & 31));
	}
	protected:
	void seed(std::vector<std::uint32_t> &seeds)
	{
	// proper seeding
	uint64_t s0 = uint64_t(seeds[0]) << 31 \| uint64_t(seeds[1]);
	uint64_t s1 = uint64_t(seeds[2]) << 31 \| uint64_t(seeds[3]);
	s[0] = 0;
	s[1] = (s1 << 1) \| 1;
	operator()();
	s[0] += s0;
	operator()();
	}
	};


	static uint32_t rotl(const uint32_t x, int k) __attribute__((always_inline));

	static inline uint32_t rotl(const uint32_t x, int k) {
	return (x << k) \| (x >> (32 - k));
	}


	class xoshiro128plusplus : public generator32x4
	{
	public:
	virtual ~xoshiro128plusplus() = default;
	using generator::seed;
	xoshiro128plusplus() { generator::seed(1); }
	xoshiro128plusplus(std::uint32_t s) {seed(s);}
	result_type operator()()
	{
	const uint32_t result = rotl(s[0] + s[3], 7) + s[0];

	const uint32_t t = s[1] << 9;

	s[2] ^= s[0];
	s[3] ^= s[1];
	s[1] ^= s[2];
	s[0] ^= s[3];

	s[2] ^= t;

	s[3] = rotl(s[3], 11);

	return result;
	}
	};

	class xoshiro128starstar : public generator32x4
	{
	public:
	virtual ~xoshiro128starstar() = default;
	using generator::seed;
	xoshiro128starstar() { generator::seed(1); }
	xoshiro128starstar(std::uint32_t s) {seed(s);}
	result_type operator()()
	{
	const uint32_t result = rotl(s[1] * 5, 7) * 9;

	const uint32_t t = s[1] << 9;

	s[2] ^= s[0];
	s[3] ^= s[1];
	s[1] ^= s[2];
	s[0] ^= s[3];

	s[2] ^= t;

	s[3] = rotl(s[3], 11);

	return result;
	}
	};


	// note : dispatching.
	static __m128i rotl(const __m128i x, int k) __attribute__((always_inline));

	static inline __m128i rotl(const __m128i x, const int k) {
	return _mm_or_epi32(_mm_slli_epi32(x, k), _mm_slli_epi32(x, (32 - k)));
	}

	class xoshiro128plusplusx4 : public generator32x16
	{
	int counter = -1;
	result_type buffer[4];
	public:
	virtual ~xoshiro128plusplusx4() = default;
	using generator::seed;
	xoshiro128plusplusx4() { generator::seed(1); }
	xoshiro128plusplusx4(std::uint32_t s) {seed(s);}
	result_type operator()()
	{
	counter++;
	int cycle = counter % 4;
	if (cycle == 0){

	__m128i s0 = _mm_loadu_si128((__m128i*) s);
	__m128i s1 = _mm_loadu_si128(((__m128i*) s)+1);
	__m128i s2 = _mm_loadu_si128(((__m128i*) s)+2);
	__m128i s3 = _mm_loadu_si128(((__m128i*) s)+3);

	// const uint32_t result = rotl(s[0] + s[3], 7) + s[0];
	const __m128i results = _mm_add_epi32(rotl(_mm_add_epi32(s0, s3), 7), s0);

	// const uint32_t t = s[1] << 9;
	const __m128i t = _mm_slli_epi32(s1, 9);

	// s[2] ^= s[0];
	s2 = _mm_xor_epi32(s2, s0);

	// s[3] ^= s[1];
	s3 = _mm_xor_epi32(s3, s1);

	// s[1] ^= s[2];
	s1 = _mm_xor_epi32(s1, s2);

	// s[0] ^= s[3];
	s0 = _mm_xor_epi32(s0, s3);

	// s[2] ^= t;
	s2 = _mm_xor_epi32(s2, t);

	// s[3] = rotl(s[3], 11);
	s3 = rotl(s3, 11);

	_mm_storeu_si128((__m128i*) s, s0);
	_mm_storeu_si128(((__m128i*) s)+1, s1);
	_mm_storeu_si128(((__m128i*) s)+2, s2);
	_mm_storeu_si128(((__m128i*) s)+3, s3);
	_mm_storeu_si128((__m128i*) buffer, results);
	}
	return buffer[cycle];
	}
	};

	class xoshiro128starstarx4 : public generator32x16
	{
	int counter = -1;
	result_type buffer[4];
	public:
	virtual ~xoshiro128starstarx4() = default;
	using generator::seed;
	xoshiro128starstarx4() { generator::seed(1); }
	xoshiro128starstarx4(std::uint32_t s) {seed(s);}
	result_type operator()()
	{
	counter++;
	int cycle = counter % 4;
	if (cycle == 0){

	__m128i s0 = _mm_loadu_si128((__m128i*) s);
	__m128i s1 = _mm_loadu_si128(((__m128i*) s)+1);
	__m128i s2 = _mm_loadu_si128(((__m128i*) s)+2);
	__m128i s3 = _mm_loadu_si128(((__m128i*) s)+3);

	const __m128i c5 = _mm_set1_epi32(5);
	const __m128i c9 = _mm_set1_epi32(9);

	// const uint32_t result = rotl(s[1] * 5, 7) * 9;
	const __m128i results = _mm_mul_epi32(rotl(_mm_mul_epi32(s1, c5), 7), c9);

	// const uint32_t t = s[1] << 9;
	const __m128i t = _mm_slli_epi32(s1, 9);

	// s[2] ^= s[0];
	s2 = _mm_xor_epi32(s2, s0);

	// s[3] ^= s[1];
	s3 = _mm_xor_epi32(s3, s1);

	// s[1] ^= s[2];
	s1 = _mm_xor_epi32(s1, s2);

	// s[0] ^= s[3];
	s0 = _mm_xor_epi32(s0, s3);

	// s[2] ^= t;
	s2 = _mm_xor_epi32(s2, t);

	// s[3] = rotl(s[3], 11);
	s3 = rotl(s3, 11);

	_mm_storeu_si128((__m128i*) s, s0);
	_mm_storeu_si128(((__m128i*) s)+1, s1);
	_mm_storeu_si128(((__m128i*) s)+2, s2);
	_mm_storeu_si128(((__m128i*) s)+3, s3);
	_mm_storeu_si128((__m128i*) buffer, results);
	}
	return buffer[cycle];
	}
	};


	// note : dispatching.
	static __m256i rotl(const __m256i x, int k) __attribute__((always_inline));

	static inline __m256i rotl(const __m256i x, const int k) {
	return _mm256_or_epi32(_mm256_slli_epi32(x, k), _mm256_slli_epi32(x, (32 - k)));
	}

	class xoshiro128plusplusx8 : public generator32x32
	{
	int counter = -1;
	result_type buffer[4];
	public:
	virtual ~xoshiro128plusplusx8() = default;
	using generator::seed;
	xoshiro128plusplusx8() { generator::seed(1); }
	xoshiro128plusplusx8(std::uint32_t s) {seed(s);}
	result_type operator()()
	{
	counter++;
	int cycle = counter % 8;
	if (cycle == 0){

	__m256i s0 = _mm256_loadu_si256((__m256i*) s);
	__m256i s1 = _mm256_loadu_si256(((__m256i*) s)+1);
	__m256i s2 = _mm256_loadu_si256(((__m256i*) s)+2);
	__m256i s3 = _mm256_loadu_si256(((__m256i*) s)+3);

	// const uint32_t result = rotl(s[0] + s[3], 7) + s[0];
	const __m256i results = _mm256_add_epi32(rotl(_mm256_add_epi32(s0, s3), 7), s0);

	// const uint32_t t = s[1] << 9;
	const __m256i t = _mm256_slli_epi32(s1, 9);

	// s[2] ^= s[0];
	s2 = _mm256_xor_epi32(s2, s0);

	// s[3] ^= s[1];
	s3 = _mm256_xor_epi32(s3, s1);

	// s[1] ^= s[2];
	s1 = _mm256_xor_epi32(s1, s2);

	// s[0] ^= s[3];
	s0 = _mm256_xor_epi32(s0, s3);

	// s[2] ^= t;
	s2 = _mm256_xor_epi32(s2, t);

	// s[3] = rotl(s[3], 11);
	s3 = rotl(s3, 11);

	_mm256_storeu_si256((__m256i*) s, s0);
	_mm256_storeu_si256(((__m256i*) s)+1, s1);
	_mm256_storeu_si256(((__m256i*) s)+2, s2);
	_mm256_storeu_si256(((__m256i*) s)+3, s3);
	_mm256_storeu_si256((__m256i*) buffer, results);
	}
	return buffer[cycle];
	}
	};

	class xoshiro128starstarx8 : public generator32x32
	{
	int counter = -1;
	result_type buffer[4];
	public:
	virtual ~xoshiro128starstarx8() = default;
	using generator::seed;
	xoshiro128starstarx8() { generator::seed(1); }
	xoshiro128starstarx8(std::uint32_t s) {seed(s);}
	result_type operator()()
	{
	counter++;
	int cycle = counter % 8;
	if (cycle == 0){

	__m256i s0 = _mm256_loadu_si256((__m256i*) s);
	__m256i s1 = _mm256_loadu_si256(((__m256i*) s)+1);
	__m256i s2 = _mm256_loadu_si256(((__m256i*) s)+2);
	__m256i s3 = _mm256_loadu_si256(((__m256i*) s)+3);

	const __m256i c5 = _mm256_set1_epi32(5);
	const __m256i c9 = _mm256_set1_epi32(9);

	// const uint32_t result = rotl(s[1] * 5, 7) * 9;
	const __m256i results = _mm256_mul_epi32(rotl(_mm256_mul_epi32(s1, c5), 7), c9);

	// const uint32_t t = s[1] << 9;
	const __m256i t = _mm256_slli_epi32(s1, 9);

	// s[2] ^= s[0];
	s2 = _mm256_xor_epi32(s2, s0);

	// s[3] ^= s[1];
	s3 = _mm256_xor_epi32(s3, s1);

	// s[1] ^= s[2];
	s1 = _mm256_xor_epi32(s1, s2);

	// s[0] ^= s[3];
	s0 = _mm256_xor_epi32(s0, s3);

	// s[2] ^= t;
	s2 = _mm256_xor_epi32(s2, t);

	// s[3] = rotl(s[3], 11);
	s3 = rotl(s3, 11);

	_mm256_storeu_si256((__m256i*) s, s0);
	_mm256_storeu_si256(((__m256i*) s)+1, s1);
	_mm256_storeu_si256(((__m256i*) s)+2, s2);
	_mm256_storeu_si256(((__m256i*) s)+3, s3);
	_mm256_storeu_si256((__m256i*) buffer, results);
	}
	return buffer[cycle];
	}
	};


	// from George Marsaglia (1994)
	// https://groups.google.com/g/sci.stat.math/c/p7aLW3TsJys/m/QGb1kti6kN0J

	#define MWC_A0 1111111464

	class MWC64 : public generator32x2
	{
	public:
	virtual ~MWC64() = default;
	using generator::seed;

	/* The state must be initialized so that 0 < c < MWC_A0 - 1.
	For simplicity, we suggest to set c = 1 and x to a 32-bit seed. */
	MWC64() { seed(); }
	MWC64(std::uint32_t s) {seed(s);}
	result_type operator()()
	{
	const uint32_t result = s[0];
	const uint64_t t = MWC_A0 * (uint64_t)s[0] + s[1];
	s[0] = t;
	s[1] = t >> 32;
	return result;
	}
	protected:
	void seed(std::vector<std::uint32_t> &seeds)
	{
	s[0] = seeds[0];
	s[1] = seeds[1] / (MWC_A0 - 1);
	}
	};


	#define MWC_A1 0xffebb71d94fcdaf9

	class MWC128 : public generator64x2
	{
	public:
	virtual ~MWC128() = default;
	using generator::seed;

	/* The state must be initialized so that 0 < c < MWC_A1 - 1.
	For simplicity, we suggest to set c = 1 and x to a 64-bit seed. */
	MWC128() { seed(); }
	MWC128(std::uint32_t s) {seed(s);}

	result_type operator()()
	{
	const uint64_t result = s[0];
	const __uint128_t t = MWC_A1 * (__uint128_t)s[0] + s[1];
	s[0] = t;
	s[1] = t >> 64;
	return result;
	}
	protected:
	void seed(std::vector<std::uint32_t> &seeds)
	{
	s[0] = uint64_t(seeds[0]) << 31 \| uint64_t(seeds[1]);
	s[1] = uint64_t(seeds[2]) << 31 \| uint64_t(seeds[3]);
	s[1] = s[1] / (MWC_A1 - 1);
	}
	};


	#endif /* AG_RANDOM_H */
	#include <iostream>
	#include <random>
	#include <chrono>
	#include <iomanip>
	#include "random.h"

	template <typename RandomEngineType> void benchmark_random_generator(const std::string &name) {
	RandomEngineType generator(42);

	generator.seed(5);

	constexpr int nIterations = 300000000;

	std::cout << std::setw(20) << name << std::setw(0);

	// check sequence
	// std::cout << " first 8: " ;
	// for (int i = 0; i < 8; ++i) {
	// std::cout << std::setw(0) << " 0x"
	// << std::hex
	// << std::setw(8) << std::setfill('0')
	// << generator()
	// << std::setw(0) << std::setfill(' ');
	//
	// }

	const auto start = std::chrono::steady_clock::now();

	for (int i = 0; i < nIterations; ++i) {
	volatile int n = generator();
	}

	const auto end = std::chrono::steady_clock::now();

	std::cout << std::setprecision(3)
	<< " \| "
	<< std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() / static_cast<double>(nIterations)
	<< " ns / num\n";
	};




	int main() {

	benchmark_random_generator<trunc_xorshift32star>("trunc_xorshift32star");
	benchmark_random_generator<xorshift32star>("xorshift32star");
	benchmark_random_generator<xorshift64>("xorshift32");
	benchmark_random_generator<xorshift64>("xorshift64");
	benchmark_random_generator<xorshift32x4>("xorshift32x4");
	benchmark_random_generator<xorshift64x4>("xorshift64x4");
	benchmark_random_generator<xorshift32x8>("xorshift32x8");
	benchmark_random_generator<xorshift32x16>("xorshift32x16");

	benchmark_random_generator<splitmix>("splitmix");
	benchmark_random_generator<pcg>("pcg");
	benchmark_random_generator<xoshiro128plusplus>("xoshiro128plusplus");
	benchmark_random_generator<xoshiro128starstar>("xoshiro128starstar");
	benchmark_random_generator<xoshiro128plusplusx4>("xoshiro128plusplusx4");
	benchmark_random_generator<xoshiro128starstarx4>("xoshiro128starstarx4");
	benchmark_random_generator<xoshiro128plusplusx8>("xoshiro128plusplusx8");
	benchmark_random_generator<xoshiro128starstarx8>("xoshiro128starstarx8");
	benchmark_random_generator<MWC128>("MWC128");
	benchmark_random_generator<MWC64>("MWC64");
	benchmark_random_generator<std::minstd_rand>("minstd_rand");
	benchmark_random_generator<std::minstd_rand0>("minstd_rand0");
	benchmark_random_generator<std::mt19937>("mt19937");
	benchmark_random_generator<std::mt19937_64>("mt19937_64");
	benchmark_random_generator<std::ranlux24_base>("ranlux24_base");
	benchmark_random_generator<std::ranlux48_base>("ranlux48_base");
	// benchmark_random_generator<std::ranlux24>("ranlux24"); // very slow
	// benchmark_random_generator<std::ranlux48>("ranlux48"); // very slow
	benchmark_random_generator<std::knuth_b>("knuth_b");

	return EXIT_SUCCESS;
	}
	/* Written in 2016-2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)

	To the extent possible under law, the author has dedicated all copyright
	and related and neighboring rights to this software to the public domain
	worldwide. This software is distributed without any warranty.

	See <http://creativecommons.org/publicdomain/zero/1.0/>. */

	#include <stdint.h>

	/* This is xoroshiro128+ 1.0, our best and fastest small-state generator
	for floating-point numbers, but its state space is large enough only
	for mild parallelism. We suggest to use its upper bits for
	floating-point generation, as it is slightly faster than
	xoroshiro128++/xoroshiro128**. It passes all tests we are aware of
	except for the four lower bits, which might fail linearity tests (and
	just those), so if low linear complexity is not considered an issue (as
	it is usually the case) it can be used to generate 64-bit outputs, too;
	moreover, this generator has a very mild Hamming-weight dependency
	making our test (http://prng.di.unimi.it/hwd.php) fail after 5 TB of
	output; we believe this slight bias cannot affect any application. If
	you are concerned, use xoroshiro128++, xoroshiro128** or xoshiro256+.

	We suggest to use a sign test to extract a random Boolean value, and
	right shifts to extract subsets of bits.

	The state must be seeded so that it is not everywhere zero. If you have
	a 64-bit seed, we suggest to seed a splitmix64 generator and use its
	output to fill s.

	NOTE: the parameters (a=24, b=16, b=37) of this version give slightly
	better results in our test than the 2016 version (a=55, b=14, c=36).
	*/

	static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) \| (x >> (64 - k));
	}


	static uint64_t s[2];

	uint64_t next(void) {
	const uint64_t s0 = s[0];
	uint64_t s1 = s[1];
	const uint64_t result = s0 + s1;

	s1 ^= s0;
	s[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16); // a, b
	s[1] = rotl(s1, 37); // c

	return result;
	}


	/* This is the jump function for the generator. It is equivalent
	to 2^64 calls to next(); it can be used to generate 2^64
	non-overlapping subsequences for parallel computations. */

	void jump(void) {
	static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
	for(int b = 0; b < 64; b++) {
	if (JUMP[i] & UINT64_C(1) << b) {
	s0 ^= s[0];
	s1 ^= s[1];
	}
	next();
	}

	s[0] = s0;
	s[1] = s1;
	}


	/* This is the long-jump function for the generator. It is equivalent to
	2^96 calls to next(); it can be used to generate 2^32 starting points,
	from each of which jump() will generate 2^32 non-overlapping
	subsequences for parallel distributed computations. */

	void long_jump(void) {
	static const uint64_t LONG_JUMP[] = { 0xd2a98b26625eee7b, 0xdddf9b1090aa7ac1 };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
	for(int b = 0; b < 64; b++) {
	if (LONG_JUMP[i] & UINT64_C(1) << b) {
	s0 ^= s[0];
	s1 ^= s[1];
	}
	next();
	}

	s[0] = s0;
	s[1] = s1;
	}
	/* Written in 2019 by David Blackman and Sebastiano Vigna (vigna@acm.org)

	To the extent possible under law, the author has dedicated all copyright
	and related and neighboring rights to this software to the public domain
	worldwide. This software is distributed without any warranty.

	See <http://creativecommons.org/publicdomain/zero/1.0/>. */

	#include <stdint.h>

	/* This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid,
	small-state generators. It is extremely (sub-ns) fast and it passes all
	tests we are aware of, but its state space is large enough only for
	mild parallelism.

	For generating just floating-point numbers, xoroshiro128+ is even
	faster (but it has a very mild bias, see notes in the comments).

	The state must be seeded so that it is not everywhere zero. If you have
	a 64-bit seed, we suggest to seed a splitmix64 generator and use its
	output to fill s. */


	static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) \| (x >> (64 - k));
	}


	static uint64_t s[2];

	uint64_t next(void) {
	const uint64_t s0 = s[0];
	uint64_t s1 = s[1];
	const uint64_t result = rotl(s0 + s1, 17) + s0;

	s1 ^= s0;
	s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b
	s[1] = rotl(s1, 28); // c

	return result;
	}


	/* This is the jump function for the generator. It is equivalent
	to 2^64 calls to next(); it can be used to generate 2^64
	non-overlapping subsequences for parallel computations. */

	void jump(void) {
	static const uint64_t JUMP[] = { 0x2bd7a6a6e99c2ddc, 0x0992ccaf6a6fca05 };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
	for(int b = 0; b < 64; b++) {
	if (JUMP[i] & UINT64_C(1) << b) {
	s0 ^= s[0];
	s1 ^= s[1];
	}
	next();
	}

	s[0] = s0;
	s[1] = s1;
	}


	/* This is the long-jump function for the generator. It is equivalent to
	2^96 calls to next(); it can be used to generate 2^32 starting points,
	from each of which jump() will generate 2^32 non-overlapping
	subsequences for parallel distributed computations. */

	void long_jump(void) {
	static const uint64_t LONG_JUMP[] = { 0x360fd5f2cf8d5d99, 0x9c6e6877736c46e3 };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
	for(int b = 0; b < 64; b++) {
	if (LONG_JUMP[i] & UINT64_C(1) << b) {
	s0 ^= s[0];
	s1 ^= s[1];
	}
	next();
	}

	s[0] = s0;
	s[1] = s1;
	}
	/* Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)

	To the extent possible under law, the author has dedicated all copyright
	and related and neighboring rights to this software to the public domain
	worldwide. This software is distributed without any warranty.

	See <http://creativecommons.org/publicdomain/zero/1.0/>. */

	#include <stdint.h>

	/* This is xoroshiro128** 1.0, one of our all-purpose, rock-solid,
	small-state generators. It is extremely (sub-ns) fast and it passes all
	tests we are aware of, but its state space is large enough only for
	mild parallelism.

	For generating just floating-point numbers, xoroshiro128+ is even
	faster (but it has a very mild bias, see notes in the comments).

	The state must be seeded so that it is not everywhere zero. If you have
	a 64-bit seed, we suggest to seed a splitmix64 generator and use its
	output to fill s. */


	static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) \| (x >> (64 - k));
	}


	static uint64_t s[2];

	uint64_t next(void) {
	const uint64_t s0 = s[0];
	uint64_t s1 = s[1];
	const uint64_t result = rotl(s0 * 5, 7) * 9;

	s1 ^= s0;
	s[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16); // a, b
	s[1] = rotl(s1, 37); // c

	return result;
	}


	/* This is the jump function for the generator. It is equivalent
	to 2^64 calls to next(); it can be used to generate 2^64
	non-overlapping subsequences for parallel computations. */

	void jump(void) {
	static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
	for(int b = 0; b < 64; b++) {
	if (JUMP[i] & UINT64_C(1) << b) {
	s0 ^= s[0];
	s1 ^= s[1];
	}
	next();
	}

	s[0] = s0;
	s[1] = s1;
	}


	/* This is the long-jump function for the generator. It is equivalent to
	2^96 calls to next(); it can be used to generate 2^32 starting points,
	from each of which jump() will generate 2^32 non-overlapping
	subsequences for parallel distributed computations. */

	void long_jump(void) {
	static const uint64_t LONG_JUMP[] = { 0xd2a98b26625eee7b, 0xdddf9b1090aa7ac1 };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
	for(int b = 0; b < 64; b++) {
	if (LONG_JUMP[i] & UINT64_C(1) << b) {
	s0 ^= s[0];
	s1 ^= s[1];
	}
	next();
	}

	s[0] = s0;
	s[1] = s1;
	}
	/* Written in 2016 by David Blackman and Sebastiano Vigna (vigna@acm.org)

	To the extent possible under law, the author has dedicated all copyright
	and related and neighboring rights to this software to the public domain
	worldwide. This software is distributed without any warranty.

	See <http://creativecommons.org/publicdomain/zero/1.0/>. */

	#include <stdint.h>

	/* This is xoroshiro64* 1.0, our best and fastest 32-bit small-state
	generator for 32-bit floating-point numbers. We suggest to use its
	upper bits for floating-point generation, as it is slightly faster than
	xoroshiro64**. It passes all tests we are aware of except for linearity
	tests, as the lowest six bits have low linear complexity, so if low
	linear complexity is not considered an issue (as it is usually the
	case) it can be used to generate 32-bit outputs, too.

	We suggest to use a sign test to extract a random Boolean value, and
	right shifts to extract subsets of bits.

	The state must be seeded so that it is not everywhere zero. */


	static inline uint32_t rotl(const uint32_t x, int k) {
	return (x << k) \| (x >> (32 - k));
	}


	static uint32_t s[2];

	uint32_t next(void) {
	const uint32_t s0 = s[0];
	uint32_t s1 = s[1];
	const uint32_t result = s0 * 0x9E3779BB;

	s1 ^= s0;
	s[0] = rotl(s0, 26) ^ s1 ^ (s1 << 9); // a, b
	s[1] = rotl(s1, 13); // c

	return result;
	}