-
-
Save StefanKarpinski/11284d26d6376a6fdeeb6eef351b5b09 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
commit 669261ffeceaf3af452aeca788cd74f39cd2a7f3 | |
Author: Stefan Karpinski <stefan@karpinski.org> | |
Date: Tue Mar 14 12:26:39 2023 -0400 | |
Tasks: don't advance task RNG on task spawn | |
Previously we had this unfortunate behavior: | |
julia> Random.seed!(123) | |
TaskLocalRNG() | |
julia> randn() | |
-0.6457306721039767 | |
julia> Random.seed!(123) | |
TaskLocalRNG() | |
julia> fetch(@async nothing) | |
julia> randn() | |
0.4922456865251828 | |
In other words: the mere act of spawning a child task affects the parent | |
task's RNG (by advancing it four times). This PR preserves the desirable | |
parts of the previous situation: when seeded, the parent and child RNG | |
streams are reproducible. Moreover, it fixes the undesirable behavior: | |
julia> Random.seed!(123) | |
TaskLocalRNG() | |
julia> randn() | |
-0.6457306721039767 | |
julia> Random.seed!(123) | |
TaskLocalRNG() | |
julia> fetch(@async nothing) | |
julia> randn() | |
-0.6457306721039767 | |
In other words: the parent RNG is unaffected by spawning a child. | |
The design is based on the SplitMix [1] and DotMix [2] RNGs, but with | |
some simplifications based on observing that we only need binary | |
pedigree coefficients: when a task forks a child, zero is appended to | |
its pedigree, and the child's pedigree is the same with a one in the | |
last place. Thus all the pedigree coefficients are binary. How does this | |
help matters? In the of collision resistance in the DotMix paper, | |
working in a prime modulus is only necessary to guarantee that the | |
difference between pedigree coordinates has a multiplicative inverse. If | |
the coefficients are binary, then the difference is always 1, which | |
means we can work in any modulus, including Z/2^64, which lets us use | |
native integer arithmetic. | |
Similar to SplitMix, instead of explicitly storing pedigree coordinates, | |
we store each task's dot product and derive each child task's dot | |
product by adding to the parent dot product, the random weight | |
coefficeint for the current tree depth. No multiplication is required, | |
we just add the random weight for the current tree depth to the parent's | |
dot product to get the child's dot product. | |
Pseudorandom weights for the SplitMix dot product are generated using an | |
internal PCG RNG, specifically the PCG-RXS-M-XS variant. We chose this | |
RNG because we need something small and fast but unlikely to have | |
artifacts which might sabbotage the collision resistance of SplitMix. | |
Since PCG-RXS-M-XS passes BigCrush with only 36 bits of state and we use | |
64 bits of state, it fits the bill quite well. The major caveat of this | |
RNG is that it produces each value once, which makes it insecure, but | |
this is actually beneficial here: we don't want any repeated weights. We | |
also don't care that this RNG is insecure and invertible since it's only | |
used internally to seed forked child tasks. Rather than the classic LCG | |
multiplier inherited from Knuth, we use the best full width multiplier | |
found by Steele & Vigna [3] searching for high spectrum constants. | |
We reuse the PCG-RXS-M-XS output function for mixing the bits of the | |
SplitMix dot product, instead of the MurmurHash3 mixing function that it | |
normally uses since we're already using for the internal PCG generator. | |
The choice of mixing output function for SplitMix is essentially | |
arbitrary and only serves to cascade bit differences in dot products. | |
[1]: https://gee.cs.oswego.edu/dl/papers/oopsla14.pdf | |
[2]: http://supertech.csail.mit.edu/papers/dprng.pdf | |
[3]: https://arxiv.org/pdf/2001.05304.pdf | |
diff --git a/base/sysimg.jl b/base/sysimg.jl | |
index 7d3826eb13..84f88174b8 100644 | |
--- a/base/sysimg.jl | |
+++ b/base/sysimg.jl | |
@@ -27,6 +27,8 @@ let | |
task.rngState1 = 0x7431eaead385992c | |
task.rngState2 = 0x503e1d32781c2608 | |
task.rngState3 = 0x3a77f7189200c20b | |
+ task.rngState4 = 0x5502376d099035ae | |
+ task.rngState5 = 0x01dd7c407e7dcb1b | |
# Stdlibs sorted in dependency, then alphabetical, order by contrib/print_sorted_stdlibs.jl | |
# Run with the `--exclude-jlls` option to filter out all JLL packages | |
diff --git a/src/gc.c b/src/gc.c | |
index 298194b59b..e61f671d5e 100644 | |
--- a/src/gc.c | |
+++ b/src/gc.c | |
@@ -382,9 +382,9 @@ static void jl_gc_run_finalizers_in_list(jl_task_t *ct, arraylist_t *list) JL_NO | |
ct->sticky = sticky; | |
} | |
-static uint64_t finalizer_rngState[4]; | |
+static uint64_t finalizer_rngState[6]; | |
-void jl_rng_split(uint64_t to[4], uint64_t from[4]) JL_NOTSAFEPOINT; | |
+void jl_rng_split(uint64_t dst[6], uint64_t src[6]) JL_NOTSAFEPOINT; | |
JL_DLLEXPORT void jl_gc_init_finalizer_rng_state(void) | |
{ | |
@@ -413,7 +413,7 @@ static void run_finalizers(jl_task_t *ct) | |
jl_atomic_store_relaxed(&jl_gc_have_pending_finalizers, 0); | |
arraylist_new(&to_finalize, 0); | |
- uint64_t save_rngState[4]; | |
+ uint64_t save_rngState[6]; | |
memcpy(&save_rngState[0], &ct->rngState[0], sizeof(save_rngState)); | |
jl_rng_split(ct->rngState, finalizer_rngState); | |
diff --git a/src/jltypes.c b/src/jltypes.c | |
index 0439c8d034..b8a6b95515 100644 | |
--- a/src/jltypes.c | |
+++ b/src/jltypes.c | |
@@ -2768,7 +2768,7 @@ void jl_init_types(void) JL_GC_DISABLED | |
NULL, | |
jl_any_type, | |
jl_emptysvec, | |
- jl_perm_symsvec(15, | |
+ jl_perm_symsvec(17, | |
"next", | |
"queue", | |
"storage", | |
@@ -2780,11 +2780,13 @@ void jl_init_types(void) JL_GC_DISABLED | |
"rngState1", | |
"rngState2", | |
"rngState3", | |
+ "rngState4", | |
+ "rngState5", | |
"_state", | |
"sticky", | |
"_isexception", | |
"priority"), | |
- jl_svec(15, | |
+ jl_svec(17, | |
jl_any_type, | |
jl_any_type, | |
jl_any_type, | |
@@ -2796,6 +2798,8 @@ void jl_init_types(void) JL_GC_DISABLED | |
jl_uint64_type, | |
jl_uint64_type, | |
jl_uint64_type, | |
+ jl_uint64_type, | |
+ jl_uint64_type, | |
jl_uint8_type, | |
jl_bool_type, | |
jl_bool_type, | |
diff --git a/src/julia.h b/src/julia.h | |
index 2186ee3464..c846a47a38 100644 | |
--- a/src/julia.h | |
+++ b/src/julia.h | |
@@ -1921,7 +1921,7 @@ typedef struct _jl_task_t { | |
jl_function_t *start; | |
// 4 byte padding on 32-bit systems | |
// uint32_t padding0; | |
- uint64_t rngState[4]; | |
+ uint64_t rngState[6]; // xoshiro 4 + splitmix 2 | |
_Atomic(uint8_t) _state; | |
uint8_t sticky; // record whether this Task can be migrated to a new thread | |
_Atomic(uint8_t) _isexception; // set if `result` is an exception to throw or that we exited with | |
@@ -1929,7 +1929,6 @@ typedef struct _jl_task_t { | |
// uint8_t padding1; | |
// multiqueue priority | |
uint16_t priority; | |
- | |
// hidden state: | |
// id of owning thread - does not need to be defined until the task runs | |
_Atomic(int16_t) tid; | |
diff --git a/src/task.c b/src/task.c | |
index 7102cdec35..92d4536826 100644 | |
--- a/src/task.c | |
+++ b/src/task.c | |
@@ -866,28 +866,34 @@ uint64_t jl_genrandom(uint64_t rngState[4]) JL_NOTSAFEPOINT | |
return res; | |
} | |
-void jl_rng_split(uint64_t to[4], uint64_t from[4]) JL_NOTSAFEPOINT | |
+// pcg_out = pcg_output_rxs_m_xs_64_64 from | |
+// https://github.com/imneme/pcg-c/blob/83252d9c23df9c82ecb42210afed61a7b42402d7/include/pcg_variants.h#L188-L193 | |
+// | |
+// This is the best statistical output function of the PCG family; it produces | |
+// statistically good output even in the case when the state and output are the | |
+// same size, in this case both being 64 bits. | |
+// | |
+inline uint64_t pcg_out(uint64_t x) | |
{ | |
- /* TODO: consider a less ad-hoc construction | |
- Ideally we could just use the output of the random stream to seed the initial | |
- state of the child. Out of an overabundance of caution we multiply with | |
- effectively random coefficients, to break possible self-interactions. | |
+ int s = x >> 59; | |
+ x ^= x >> (s + 5); | |
+ x *= 0xaef17502108ef2d9; | |
+ return x ^ (x >> 43); | |
+} | |
- It is not the goal to mix bits -- we work under the assumption that the | |
- source is well-seeded, and its output looks effectively random. | |
- However, xoshiro has never been studied in the mode where we seed the | |
- initial state with the output of another xoshiro instance. | |
+const uint64_t LCG_MUL = 0xd1342543de82ef95; // https://arxiv.org/abs/2001.05304 | |
- Constants have nothing up their sleeve: | |
- 0x02011ce34bce797f == hash(UInt(1))|0x01 | |
- 0x5a94851fb48a6e05 == hash(UInt(2))|0x01 | |
- 0x3688cf5d48899fa7 == hash(UInt(3))|0x01 | |
- 0x867b4bb4c42e5661 == hash(UInt(4))|0x01 | |
- */ | |
- to[0] = 0x02011ce34bce797f * jl_genrandom(from); | |
- to[1] = 0x5a94851fb48a6e05 * jl_genrandom(from); | |
- to[2] = 0x3688cf5d48899fa7 * jl_genrandom(from); | |
- to[3] = 0x867b4bb4c42e5661 * jl_genrandom(from); | |
+void jl_rng_split(uint64_t dst[6], uint64_t src[6]) JL_NOTSAFEPOINT | |
+{ | |
+ uint64_t lcg = src[4] * LCG_MUL + 1; // advance PCG's LCG state | |
+ uint64_t dot = src[5] + pcg_out(lcg); // update splitmix dot product | |
+ uint64_t mul = pcg_out(dot) | 1; // state multiplier (odd) | |
+ dst[0] = mul * src[1]; // we multiply here because: | |
+ dst[1] = mul * src[2]; // 1. guarantees not accidentally hitting all zeros state | |
+ dst[2] = mul * src[3]; // 2. multiply is highly non-commutive with the ops that | |
+ dst[3] = mul * src[0]; // xoshiro uses internally & result should be uncorrelated | |
+ dst[4] = src[4] = lcg; // LCG advances in both child and parent | |
+ dst[5] = dot; // dot product is modified in child only | |
} | |
JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion_future, size_t ssize) | |
diff --git a/stdlib/Random/src/Xoshiro.jl b/stdlib/Random/src/Xoshiro.jl | |
index 5b8aa4644d..6ad302e139 100644 | |
--- a/stdlib/Random/src/Xoshiro.jl | |
+++ b/stdlib/Random/src/Xoshiro.jl | |
@@ -113,12 +113,19 @@ struct TaskLocalRNG <: AbstractRNG end | |
TaskLocalRNG(::Nothing) = TaskLocalRNG() | |
rng_native_52(::TaskLocalRNG) = UInt64 | |
-function setstate!(x::TaskLocalRNG, s0::UInt64, s1::UInt64, s2::UInt64, s3::UInt64) | |
+function setstate!( | |
+ x::TaskLocalRNG, | |
+ s0::UInt64, s1::UInt64, s2::UInt64, s3::UInt64, # xoshiro256 state | |
+ s4::UInt64 = hash((s0, s1)), # splitmix weight rng state | |
+ s5::UInt64 = hash((s2, s3)), # splitmix dot product | |
+) | |
t = current_task() | |
t.rngState0 = s0 | |
t.rngState1 = s1 | |
t.rngState2 = s2 | |
t.rngState3 = s3 | |
+ t.rngState4 = s4 | |
+ t.rngState5 = s5 | |
x | |
end | |
@@ -128,11 +135,11 @@ end | |
tmp = s0 + s3 | |
res = ((tmp << 23) | (tmp >> 41)) + s0 | |
t = s1 << 17 | |
- s2 = xor(s2, s0) | |
- s3 = xor(s3, s1) | |
- s1 = xor(s1, s2) | |
- s0 = xor(s0, s3) | |
- s2 = xor(s2, t) | |
+ s2 ⊻= s0 | |
+ s3 ⊻= s1 | |
+ s1 ⊻= s2 | |
+ s0 ⊻= s3 | |
+ s2 ⊻= t | |
s3 = s3 << 45 | s3 >> 19 | |
task.rngState0, task.rngState1, task.rngState2, task.rngState3 = s0, s1, s2, s3 | |
res | |
@@ -159,7 +166,7 @@ seed!(rng::Union{TaskLocalRNG, Xoshiro}, seed::Integer) = seed!(rng, make_seed(s | |
@inline function rand(rng::Union{TaskLocalRNG, Xoshiro}, ::SamplerType{UInt128}) | |
first = rand(rng, UInt64) | |
second = rand(rng,UInt64) | |
- second + UInt128(first)<<64 | |
+ second + UInt128(first) << 64 | |
end | |
@inline rand(rng::Union{TaskLocalRNG, Xoshiro}, ::SamplerType{Int128}) = rand(rng, UInt128) % Int128 | |
@@ -178,14 +185,14 @@ end | |
function copy!(dst::TaskLocalRNG, src::Xoshiro) | |
t = current_task() | |
- t.rngState0, t.rngState1, t.rngState2, t.rngState3 = src.s0, src.s1, src.s2, src.s3 | |
- dst | |
+ setstate!(dst, src.s0, src.s1, src.s2, src.s3) | |
+ return dst | |
end | |
function copy!(dst::Xoshiro, src::TaskLocalRNG) | |
t = current_task() | |
- dst.s0, dst.s1, dst.s2, dst.s3 = t.rngState0, t.rngState1, t.rngState2, t.rngState3 | |
- dst | |
+ setstate!(dst, t.rngState0, t.rngState1, t.rngState2, t.rngState3) | |
+ return dst | |
end | |
function ==(a::Xoshiro, b::TaskLocalRNG) | |
commit 0b8b0fce5df9b04a309a86faa01ab7572b091286 | |
Author: Stefan Karpinski <stefan@karpinski.org> | |
Date: Thu Mar 23 10:44:46 2023 -0400 | |
RNG split: update LCG after => more ILP | |
The incoming LCG state is already random, so just use it immediately to | |
generate a random weight coefficient instead of doing an LCG step first. | |
This gives a little more instruction-level parallelism (ILP) since the | |
LCG step can happen in parallel with the rest of the work. | |
diff --git a/src/task.c b/src/task.c | |
index 92d4536826..f9b57f676f 100644 | |
--- a/src/task.c | |
+++ b/src/task.c | |
@@ -885,15 +885,15 @@ const uint64_t LCG_MUL = 0xd1342543de82ef95; // https://arxiv.org/abs/2001.05304 | |
void jl_rng_split(uint64_t dst[6], uint64_t src[6]) JL_NOTSAFEPOINT | |
{ | |
- uint64_t lcg = src[4] * LCG_MUL + 1; // advance PCG's LCG state | |
+ uint64_t lcg = src[4]; // load internal PCG's LCG state | |
uint64_t dot = src[5] + pcg_out(lcg); // update splitmix dot product | |
- uint64_t mul = pcg_out(dot) | 1; // state multiplier (odd) | |
+ uint64_t mul = pcg_out(dot) | 1; // state multiplier (odd ∴ invertible) | |
dst[0] = mul * src[1]; // we multiply here because: | |
dst[1] = mul * src[2]; // 1. guarantees not accidentally hitting all zeros state | |
- dst[2] = mul * src[3]; // 2. multiply is highly non-commutive with the ops that | |
- dst[3] = mul * src[0]; // xoshiro uses internally & result should be uncorrelated | |
- dst[4] = src[4] = lcg; // LCG advances in both child and parent | |
- dst[5] = dot; // dot product is modified in child only | |
+ dst[2] = mul * src[3]; // 2. multiply is fairly non-commutive with ops xoshiro | |
+ dst[3] = mul * src[0]; // stepping uses so the result should be uncorrelated | |
+ dst[4] = src[4] = lcg * LCG_MUL + 1; // LCG advances in both child and parent | |
+ dst[5] = dot; // dot product modified in child only | |
} | |
JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion_future, size_t ssize) | |
commit 5c6bbdcb6d5a63c0d999d493168ce8e1fa775fe6 | |
Author: Stefan Karpinski <stefan@karpinski.org> | |
Date: Thu Mar 23 11:32:27 2023 -0400 | |
Variation: parent RNG also doesn't affect child RNG | |
diff --git a/src/task.c b/src/task.c | |
index f9b57f676f..63fbe19248 100644 | |
--- a/src/task.c | |
+++ b/src/task.c | |
@@ -887,13 +887,15 @@ void jl_rng_split(uint64_t dst[6], uint64_t src[6]) JL_NOTSAFEPOINT | |
{ | |
uint64_t lcg = src[4]; // load internal PCG's LCG state | |
uint64_t dot = src[5] + pcg_out(lcg); // update splitmix dot product | |
- uint64_t mul = pcg_out(dot) | 1; // state multiplier (odd ∴ invertible) | |
- dst[0] = mul * src[1]; // we multiply here because: | |
- dst[1] = mul * src[2]; // 1. guarantees not accidentally hitting all zeros state | |
- dst[2] = mul * src[3]; // 2. multiply is fairly non-commutive with ops xoshiro | |
- dst[3] = mul * src[0]; // stepping uses so the result should be uncorrelated | |
dst[4] = src[4] = lcg * LCG_MUL + 1; // LCG advances in both child and parent | |
dst[5] = dot; // dot product modified in child only | |
+ // use dot as a PCG state to seed the xoshiro256 registers: | |
+ dst[0] = pcg_out(dot = dot * LCG_MUL + 1); | |
+ dst[1] = pcg_out(dot = dot * LCG_MUL + 1); | |
+ dst[2] = pcg_out(dot = dot * LCG_MUL + 1); | |
+ dst[3] = pcg_out(dot = dot * LCG_MUL + 1); | |
+ // since the PCG state and output are the same size, the outputs must all be | |
+ // distinct, which guarantees that the xoshiro256 state cannot be all zeros | |
} | |
JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion_future, size_t ssize) | |
commit ea965a17818d3f2c24c9dfa8aa5cb95ab67d5567 | |
Author: Stefan Karpinski <stefan@karpinski.org> | |
Date: Fri Mar 24 09:39:08 2023 -0400 | |
Use 4 PCG output function variants instead of itertaing 4x per fork | |
Using four PCG steps iterates through the PCG space 4x faster, getting | |
back to the start after 2^60 splits instead of 2^64 (eh, who cares?). | |
It's also a bit slow and hard to parallelize. It also reduces the space | |
of possible weights for each register of SplitMix dot product from 2^64 | |
to 2^60, which is a significant reduction in collision resistance. | |
This commit instead implements an approach where we use four different | |
PCG output functions on the same LCG state to get four sufficiently | |
linearly unrelated streams of pseudorandom SplitMix dot product weights | |
(one for each xoshiro256 state register). This should give us 256 bits | |
of SplitMix collision resistance, which is formidable and means that | |
collisions are a impossible in practice. | |
Appealingly, this appraich is also easy to vectorize. I even implemented | |
it with SIMD intrinsices just to be sure (code compiled but not tested): | |
``` | |
void jl_rng_split(uint64_t dst[JL_RNG_SIZE], uint64_t src[JL_RNG_SIZE]) | |
{ | |
// load and advance PCG's LCG state | |
uint64_t x = src[4]; | |
// high spectrum multiplier from https://arxiv.org/abs/2001.05304 | |
src[4] = dst[4] = x * 0xd1342543de82ef95 + 1; | |
// manually vectorized PCG-RXS-M-XS with four variants | |
static const uint64_t a[4] = { | |
0xe5f8fa077b92a8a8, // random additive offsets... | |
0x7a0cd918958c124d, | |
0x86222f7d388588d4, | |
0xd30cbd35f2b64f52 | |
}; | |
static const uint64_t m[4] = { | |
0xaef17502108ef2d9, // standard multiplier | |
0xf34026eeb86766af, // random odd multipliers... | |
0x38fd70ad58dd9fbb, | |
0x6677f9b93ab0c04d | |
}; | |
__m256i p, s; | |
p = _mm256_set1_epi64x(x); // p = x | |
p = _mm256_add_epi64(p, _mm256_load_epi64(a)); // p += a | |
s = _mm256_srlv_epi64(p, _mm256_set1_epi64x(59)); // s = x >> 59 | |
s = _mm256_add_epi64(s, _mm256_set1_epi64x(5)); // s += 5 | |
p = _mm256_xor_epi64(p, _mm256_srlv_epi64(p, s)); // p ^= p >> s | |
p = _mm256_mullo_epi64(p, _mm256_load_epi64(m)); // p *= m | |
s = _mm256_set1_epi64x(43); // s = 43 | |
p = _mm256_xor_epi64(p, _mm256_srlv_epi64(p, s)); // p ^= p >> s | |
// load, modify & store xoshiro256 state | |
__m256i sv = _mm256_load_epi64(src); | |
__m256i dv = _mm256_add_epi64(sv, p); // SplitMix dot product | |
_mm256_store_epi64(dst, dv); | |
} | |
``` | |
I didn't end up using this because it only works on hardware with the | |
necessary AVX instructions, so it's not portable, but I wanted to be | |
sure it could be done. The committed version just uses a loop. | |
One concern wit this approach that the 256 bits of SplitMix dot product | |
collision resistance could actually be a mirage. Why? Because the random | |
weights are generated from 64 bits of LCG state. How is that an issue? | |
In the proof of DotMix's collision avoidance, which SplitMix inherits, | |
the number of possible weight values is key: the collision probability | |
is 1/N where N is the number of possible weight vectors. If we consider | |
all four xoshihro256 registers as one big dot product and apply the | |
proof to it, we have a problem: depsite 256 bits of register, there are | |
only 2^64 possible weight values we can generate, so the proof only | |
gives us a pairwise collision probability of 1/2^64. | |
Another way to look at this, however, is to consider the four xoshiro256 | |
register dot products separately: each one has a 1/2^64 collision | |
probability and there are four of them; as long as the chance of each | |
one colliding is independent, the probability of all of them colliding | |
together is (1/2^64)^4 = 1/2^256. Clearly there are ways to generate the | |
four weights that don't satisfy independence. You could use the same | |
weights four times, for example. Or you could use weights that are just | |
scaled copies of each other. Basically any linear relationship between | |
the weights is be problematic. That's yet another reason that iterating | |
PCG multiple times to generate weights may not be ideal: the LCG that | |
drives PCG is very linear; only the output function sabotages the | |
linearity. If the output function being non-linear is crucial, why not | |
use multiple different output functions instead? | |
So that's what I'm doing here: using four different variations on the | |
PCG output function. First we perturb the LCG state by four different | |
random additive constants, which moves it to four distant and unrelated | |
places in the state space and gives the xor shifts different bits to | |
work with. We also use four different multiplicative constants in the | |
middle of the output function: the first is the standard PCG multiplier, | |
so we get known-good weights for one of the registers; the rest are | |
random odd multipliers. A potential improvement is to look for weights | |
with optimal cascading behaviors, but random constants tend to be good. | |
Assuming our four output variants are sufficiently independent, we | |
should get very strong collision resistance with a pairwise collision | |
probability of 1/2^256. It would, however, be reassuring to have | |
empirical evidence that this approach actually works. To that end, I did | |
a test by scaling everything down to four 8-bit SplitMix dot products | |
and tested how many simulated task spawns before we get collisions, and | |
compared to a single 8-bit dot product. Here's the test code: | |
``` | |
function pcg_output_rxs_m_xs_8_8(x::UInt8) | |
p = x | |
p += 0xa0 # random but same as below | |
p ⊻= p >> ((p >> 6) + 2) | |
p *= 0xd9 # standard multiplier | |
p ⊻= p >> 6 | |
end | |
function pcg_output_rxs_m_xs_8_32(x::UInt8) | |
ntuple(4) do i | |
p = x | |
p += (0xa0, 0x98, 0x66, 0x8d)[i] | |
p ⊻= p >> ((p >> 6) + 2) | |
p *= (0xd9, 0x2b, 0x19, 0x9b)[i] | |
p ⊻= p >> 6 | |
end | |
end | |
Base.zero(::Type{NTuple{4, UInt8}}) = (0x0, 0x0, 0x0, 0x0) | |
function gen_collisions( | |
::Type{T}, | |
rec :: Int; | |
cnt :: Dict{T,Int} = Dict{T,Int}(), | |
lcg :: UInt8 = zero(UInt8), | |
dot :: T = zero(T), | |
out :: Function = T == UInt8 ? | |
pcg_output_rxs_m_xs_8_8 : | |
pcg_output_rxs_m_xs_8_32 | |
) where {T} | |
if rec > 0 | |
h = out(lcg) | |
lcg = lcg * 0x8d + 0x01 | |
gen_collisions(rec - 1, T; out, cnt, lcg, dot = dot) | |
gen_collisions(rec - 1, T; out, cnt, lcg, dot = dot .+ h) | |
else | |
cnt[dot] = get(cnt, dot, 0) + 1 | |
end | |
return cnt | |
end | |
``` | |
With this, `gen_collisions(UInt8, 5)` generating 2^5 = 32 dot products, | |
already has collisions, whereas we have to generate 2^20 = 1048576 dot | |
products `gen_collisions(NTuple{4,UInt8}, 20)` to get collisions with | |
four registers. This provides empirical evidence that this approach to | |
generating weights is sufficiently independent and that we can really | |
expect 256 bits of SplitMix collision resistance. | |
diff --git a/base/sysimg.jl b/base/sysimg.jl | |
index 84f88174b8..6f7219afda 100644 | |
--- a/base/sysimg.jl | |
+++ b/base/sysimg.jl | |
@@ -28,7 +28,6 @@ let | |
task.rngState2 = 0x503e1d32781c2608 | |
task.rngState3 = 0x3a77f7189200c20b | |
task.rngState4 = 0x5502376d099035ae | |
- task.rngState5 = 0x01dd7c407e7dcb1b | |
# Stdlibs sorted in dependency, then alphabetical, order by contrib/print_sorted_stdlibs.jl | |
# Run with the `--exclude-jlls` option to filter out all JLL packages | |
diff --git a/src/gc.c b/src/gc.c | |
index e61f671d5e..ee8bd94da9 100644 | |
--- a/src/gc.c | |
+++ b/src/gc.c | |
@@ -382,9 +382,9 @@ static void jl_gc_run_finalizers_in_list(jl_task_t *ct, arraylist_t *list) JL_NO | |
ct->sticky = sticky; | |
} | |
-static uint64_t finalizer_rngState[6]; | |
+static uint64_t finalizer_rngState[JL_RNG_SIZE]; | |
-void jl_rng_split(uint64_t dst[6], uint64_t src[6]) JL_NOTSAFEPOINT; | |
+void jl_rng_split(uint64_t dst[JL_RNG_SIZE], uint64_t src[JL_RNG_SIZE]) JL_NOTSAFEPOINT; | |
JL_DLLEXPORT void jl_gc_init_finalizer_rng_state(void) | |
{ | |
@@ -413,7 +413,7 @@ static void run_finalizers(jl_task_t *ct) | |
jl_atomic_store_relaxed(&jl_gc_have_pending_finalizers, 0); | |
arraylist_new(&to_finalize, 0); | |
- uint64_t save_rngState[6]; | |
+ uint64_t save_rngState[JL_RNG_SIZE]; | |
memcpy(&save_rngState[0], &ct->rngState[0], sizeof(save_rngState)); | |
jl_rng_split(ct->rngState, finalizer_rngState); | |
diff --git a/src/jltypes.c b/src/jltypes.c | |
index b8a6b95515..b4146aae4d 100644 | |
--- a/src/jltypes.c | |
+++ b/src/jltypes.c | |
@@ -2768,7 +2768,7 @@ void jl_init_types(void) JL_GC_DISABLED | |
NULL, | |
jl_any_type, | |
jl_emptysvec, | |
- jl_perm_symsvec(17, | |
+ jl_perm_symsvec(16, | |
"next", | |
"queue", | |
"storage", | |
@@ -2781,12 +2781,11 @@ void jl_init_types(void) JL_GC_DISABLED | |
"rngState2", | |
"rngState3", | |
"rngState4", | |
- "rngState5", | |
"_state", | |
"sticky", | |
"_isexception", | |
"priority"), | |
- jl_svec(17, | |
+ jl_svec(16, | |
jl_any_type, | |
jl_any_type, | |
jl_any_type, | |
@@ -2799,7 +2798,6 @@ void jl_init_types(void) JL_GC_DISABLED | |
jl_uint64_type, | |
jl_uint64_type, | |
jl_uint64_type, | |
- jl_uint64_type, | |
jl_uint8_type, | |
jl_bool_type, | |
jl_bool_type, | |
diff --git a/src/julia.h b/src/julia.h | |
index c846a47a38..880a6017a9 100644 | |
--- a/src/julia.h | |
+++ b/src/julia.h | |
@@ -1910,6 +1910,8 @@ typedef struct _jl_handler_t { | |
size_t world_age; | |
} jl_handler_t; | |
+#define JL_RNG_SIZE 5 // xoshiro 4 + splitmix 1 | |
+ | |
typedef struct _jl_task_t { | |
JL_DATA_TYPE | |
jl_value_t *next; // invasive linked list for scheduler | |
@@ -1921,7 +1923,7 @@ typedef struct _jl_task_t { | |
jl_function_t *start; | |
// 4 byte padding on 32-bit systems | |
// uint32_t padding0; | |
- uint64_t rngState[6]; // xoshiro 4 + splitmix 2 | |
+ uint64_t rngState[JL_RNG_SIZE]; | |
_Atomic(uint8_t) _state; | |
uint8_t sticky; // record whether this Task can be migrated to a new thread | |
_Atomic(uint8_t) _isexception; // set if `result` is an exception to throw or that we exited with | |
diff --git a/src/task.c b/src/task.c | |
index 63fbe19248..096ac3291c 100644 | |
--- a/src/task.c | |
+++ b/src/task.c | |
@@ -866,36 +866,34 @@ uint64_t jl_genrandom(uint64_t rngState[4]) JL_NOTSAFEPOINT | |
return res; | |
} | |
-// pcg_out = pcg_output_rxs_m_xs_64_64 from | |
-// https://github.com/imneme/pcg-c/blob/83252d9c23df9c82ecb42210afed61a7b42402d7/include/pcg_variants.h#L188-L193 | |
-// | |
-// This is the best statistical output function of the PCG family; it produces | |
-// statistically good output even in the case when the state and output are the | |
-// same size, in this case both being 64 bits. | |
-// | |
-inline uint64_t pcg_out(uint64_t x) | |
+void jl_rng_split(uint64_t dst[JL_RNG_SIZE], uint64_t src[JL_RNG_SIZE]) | |
{ | |
- int s = x >> 59; | |
- x ^= x >> (s + 5); | |
- x *= 0xaef17502108ef2d9; | |
- return x ^ (x >> 43); | |
-} | |
+ // load and advance PCG's LCG state | |
+ uint64_t x = src[4]; | |
+ src[4] = dst[4] = x * 0xd1342543de82ef95 + 1; | |
+ // high spectrum multiplier from https://arxiv.org/abs/2001.05304 | |
-const uint64_t LCG_MUL = 0xd1342543de82ef95; // https://arxiv.org/abs/2001.05304 | |
+ static const uint64_t a[4] = { | |
+ 0xe5f8fa077b92a8a8, // random additive offsets... | |
+ 0x7a0cd918958c124d, | |
+ 0x86222f7d388588d4, | |
+ 0xd30cbd35f2b64f52 | |
+ }; | |
+ static const uint64_t m[4] = { | |
+ 0xaef17502108ef2d9, // standard multiplier | |
+ 0xf34026eeb86766af, // random odd multipliers... | |
+ 0x38fd70ad58dd9fbb, | |
+ 0x6677f9b93ab0c04d | |
+ }; | |
-void jl_rng_split(uint64_t dst[6], uint64_t src[6]) JL_NOTSAFEPOINT | |
-{ | |
- uint64_t lcg = src[4]; // load internal PCG's LCG state | |
- uint64_t dot = src[5] + pcg_out(lcg); // update splitmix dot product | |
- dst[4] = src[4] = lcg * LCG_MUL + 1; // LCG advances in both child and parent | |
- dst[5] = dot; // dot product modified in child only | |
- // use dot as a PCG state to seed the xoshiro256 registers: | |
- dst[0] = pcg_out(dot = dot * LCG_MUL + 1); | |
- dst[1] = pcg_out(dot = dot * LCG_MUL + 1); | |
- dst[2] = pcg_out(dot = dot * LCG_MUL + 1); | |
- dst[3] = pcg_out(dot = dot * LCG_MUL + 1); | |
- // since the PCG state and output are the same size, the outputs must all be | |
- // distinct, which guarantees that the xoshiro256 state cannot be all zeros | |
+ // PCG-RXS-M-XS output with four variants | |
+ for (int i = 0; i < 4; i++) { | |
+ uint64_t p = x + a[i]; | |
+ p ^= p >> ((p >> 59) + 5); | |
+ p *= m[i]; | |
+ p ^= p >> 43; | |
+ dst[i] = src[i] + p; // SplitMix dot product | |
+ } | |
} | |
JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion_future, size_t ssize) | |
diff --git a/stdlib/Random/src/Xoshiro.jl b/stdlib/Random/src/Xoshiro.jl | |
index 6ad302e139..58e9f988f5 100644 | |
--- a/stdlib/Random/src/Xoshiro.jl | |
+++ b/stdlib/Random/src/Xoshiro.jl | |
@@ -116,8 +116,7 @@ rng_native_52(::TaskLocalRNG) = UInt64 | |
function setstate!( | |
x::TaskLocalRNG, | |
s0::UInt64, s1::UInt64, s2::UInt64, s3::UInt64, # xoshiro256 state | |
- s4::UInt64 = hash((s0, s1)), # splitmix weight rng state | |
- s5::UInt64 = hash((s2, s3)), # splitmix dot product | |
+ s4::UInt64 = hash((s0, s1, s2, s3)), # splitmix weight rng state | |
) | |
t = current_task() | |
t.rngState0 = s0 | |
@@ -125,7 +124,6 @@ function setstate!( | |
t.rngState2 = s2 | |
t.rngState3 = s3 | |
t.rngState4 = s4 | |
- t.rngState5 = s5 | |
x | |
end | |
commit 8d3d55033296046e06369473f4bf1e8a1a032a0b (HEAD -> sk/rng_split, origin/sk/rng_split) | |
Author: Stefan Karpinski <stefan@karpinski.org> | |
Date: Sat Mar 25 18:15:00 2023 -0400 | |
add tests | |
diff --git a/stdlib/Random/test/runtests.jl b/stdlib/Random/test/runtests.jl | |
index 4e8d3b4ffb..3f570d862b 100644 | |
--- a/stdlib/Random/test/runtests.jl | |
+++ b/stdlib/Random/test/runtests.jl | |
@@ -1018,3 +1018,50 @@ guardseed() do | |
@test f42752(true) === val | |
end | |
end | |
+ | |
+@testset "TaskLocalRNG: stream collision smoke test" begin | |
+ # spawn a trinary tree of tasks: | |
+ # - spawn three recursive child tasks in each | |
+ # - generate a random UInt64 in each before, after and between | |
+ # - collect and count all the generated random values | |
+ # these should all be distinct across all tasks | |
+ function gen(d) | |
+ r = rand(UInt64) | |
+ vals = [r] | |
+ if d ≥ 0 | |
+ append!(vals, gent(d - 1)) | |
+ isodd(r) && append!(vals, gent(d - 1)) | |
+ push!(vals, rand(UInt64)) | |
+ iseven(r) && append!(vals, gent(d - 1)) | |
+ end | |
+ push!(vals, rand(UInt64)) | |
+ end | |
+ gent(d) = fetch(@async gen(d)) | |
+ seeds = rand(RandomDevice(), UInt64, 5) | |
+ for seed in seeds | |
+ Random.seed!(seed) | |
+ vals = gen(6) | |
+ @test allunique(vals) | |
+ end | |
+end | |
+ | |
+@testset "TaskLocalRNG: child doesn't affect parent" begin | |
+ seeds = rand(RandomDevice(), UInt64, 5) | |
+ for seed in seeds | |
+ Random.seed!(seed) | |
+ x = rand(UInt64) | |
+ y = rand(UInt64) | |
+ n = 3 | |
+ for i = 1:n | |
+ Random.seed!(seed) | |
+ @sync for j = 0:i | |
+ @async rand(UInt64) | |
+ end | |
+ @test x == rand(UInt64) | |
+ @sync for j = 0:(n-i) | |
+ @async rand(UInt64) | |
+ end | |
+ @test y == rand(UInt64) | |
+ end | |
+ end | |
+end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment