Skip to content

Instantly share code, notes, and snippets.

@mratsim
Created January 18, 2023 11:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mratsim/967e649f19896b52f13a679a6c66bfe9 to your computer and use it in GitHub Desktop.
Save mratsim/967e649f19896b52f13a679a6c66bfe9 to your computer and use it in GitHub Desktop.
LCG
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# TODO: while there is likely no attacks available that may exploit
# how work is stolen from threads by other threads,
# it would be disappointing to not use a CSPRNG in a cryptographic library.
#
# See:
#
# - Parallel Random Numbers: As Easy as 1, 2, 3
# John K. Salmon, Mark A. Moraes, Ron O. Dror, and David E. Shaw
# https://thesalmons.org/john/random123/papers/random123sc11.pdf
# https://github.com/DEShawResearch/random123
#
# - Fast-key-erasure random-number generators
# Daniel J. Bernstein
# https://blog.cr.yp.to/20170723-random.html
# https://github.com/floodyberry/supercop/blob/a351f2c/crypto_sign/ntrumls439x/ref/fastrandombytes.c
#
# - Randen - fast backtracking-resistant random generator with AES+Feistel+Reverie
# Jan Wassenberg, Robert Obryk, Jyrki Alakuijala, Emmanuel Mogenet
# https://arxiv.org/abs/1810.02227
# https://github.com/google/randen
import ../../../helpers/prng_unsafe
export prng_unsafe
import
../bithacks,
instrumentation/contracts
# Work-stealing is more efficient when the thread we steal from is randomized.
# If all threads steal in the same order, we increase contention
# on the start victims task queues.
#
# The randomness quality is not important besides distributing potential contention,
# i.e. randomly trying thread i, then i+1, then i+n-1 (mod n) is good enough.
#
# Hence for efficiency, so that a thread can go to sleep faster, we want to
# reduce calls to to the RNG as:
# - Getting a random value itself can be expensive, especially if we use a CSPRNG (not a requirement)
# - a CSPRNG can be starved of entropy as with small tasks, threads might make millions of calls.
# - If we want unbiaised thread ID generation in a range, rejection sampling is costly (not a requirement).
#
# Instead of using Fisher-Yates
# - generates the victim set eagerly, inefficient if the first steal attempts are successful
# - needs a RNG call when sampling a victim
# - memory usage: numThreads per thread so numthreads² uint8 (255 threads max) or uint32
# or a sparseset
# - 1 RNG call when sampling a victim
# - memory usage: 2*numThreads per thread so 2*numthreads² uint8 (255 threads max) or uint32
#
# we can use Linear Congruential Generators, a recurrence relation of the form Xₙ₊₁ = aXₙ+c (mod m)
# If we respect the Hull-Dobell theorem requirements, we can generate pseudo-random permutations in [0, m)
# with fixed memory usage whatever the number of potential victims: just 4 registers for a, x, c, m
#
# References:
# - Fisher-Yates: https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
# - Sparse sets: https://dl.acm.org/doi/pdf/10.1145/176454.176484
# https://github.com/mratsim/weave/blob/7682784/weave/datatypes/sparsesets.nim
# https://github.com/status-im/nim-taskpools/blob/4bc0b59/taskpools/sparsesets.nim
# - Linear Congruential Generator: https://en.wikipedia.org/wiki/Linear_congruential_generator
#
# And if we want cryptographic strength:
# - Ciphers with Arbitrary Finite Domains
# John Black and Phillip Rogaway, 2001
# https://eprint.iacr.org/2001/012
# - An Enciphering Scheme Based on a Card Shuffle
# Viet Tung Hoang, Ben Morris, Phillip Rogaway
# https://www.iacr.org/archive/crypto2012/74170001/74170001.pdf
func nextPowerOfTwo_vartime(n: uint32): uint32 {.inline.} =
## Returns x if x is a power of 2
## or the next biggest power of 2
1'u32 shl (log2_vartime(n-1) + 1)
func gcd_vartime(u, v: uint32): uint32 {.used.} =
## Compute the greatest common divisor
## gcd(u, v) == 1 <=> u and v are coprimes
if u == 0: return v
if v == 0: return u
let shift = countTrailingZeroBits(u or v)
var u = u shr u.countTrailingZeroBits()
var v = v
while true:
v = v shr v.countTrailingZeroBits()
if u > v:
swap(u, v)
v = v - u
if v == 0:
return u shl shift
func fastRange(seed, maxPow2: uint32): uint32 {.inline.} =
## Constrain n in range [0, m) with m a power of 2
##
## Alternatively we could use
## https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
##
## However the number of threads is often in the 2-4 bit range (2 to 16 threads)
## so using the low bits as well avoids hammering the 0 thread.
seed and (maxPow2 - 1)
iterator pseudoRandomPermutation*(randomSeed, maxExclusive: uint32): uint32 =
## Create a (low-quality) pseudo-random permutation from [0, max)
# Linear Congruential Generator: https://en.wikipedia.org/wiki/Linear_congruential_generator
#
# Xₙ₊₁ = aXₙ+c (mod m) generates all random number mod m without repetition
# if and only if (Hull-Dobell theorem):
# 1. c and m are coprime
# 2. a-1 is divisible by all prime factors of m
# 3. a-1 is divisible by 4 if m is divisible by 4
#
# By choosing a=1, all conditions are easy to reach.
#
# The randomness quality is not important besides distributing potential contention,
# i.e. randomly trying thread i, then i+1, then i+n-1 (mod n) is good enough.
#
# Assuming 6 threads, co-primes are [1, 5], which means the following permutations
# assuming we start with victim 0:
# - [0, 1, 2, 3, 4, 5]
# - [0, 5, 4, 3, 2, 1]
#
# We can choose m to be the next power of 2, meaning all odd integers are co-primes,
# consequently:
# - we don't need a GCD to find them
# - we don't need to cache coprimes, removing a cache-miss potential
# - we replace a conditional substraction, with multiplication+modulo power-of-two per iteration
# which are both cheap
let M = maxExclusive.nextPowerOfTwo_vartime()
let c = randomSeed.fastRange(M shr 1) * 2 + 1 # c odd and c ∈ [0, M)
let a = randomSeed.fastRange(M shr 2) * 4 + 1 # a-1 divisible by 2 (all prime factors of m) and by 4 if m divisible by 4
let start = randomSeed.fastRange(M)
let mask = M-1 # for mod M
var x = start
while true:
if x < maxExclusive:
yield x
x = (a*x + c) and mask # ax + c (mod M), with M power of 2
if x == start:
break
when isMainModule:
import std/[sequtils, strutils]
proc main(numThreads: uint32) =
const s = 0x0EADBEEF'u32
const H = 0x01000000'u32
let iters = 2*numThreads
echo "NumThreads: ", numThreads
# Low-bits
echo " Testing low-bits change in seed"
for seed in s ..< s+iters:
echo " seed ", seed.toHex(), ": ", seed.pseudoRandomPermutation(numThreads).toSeq().mapIt(align($it, 2)).`$`().replace("\"", "")
echo " ----"
# high-bits
echo " Testing high-bits change in seed"
for seed in countup(s, s+iters*H-1, H):
echo " seed ", seed.toHex(), ": ", seed.pseudoRandomPermutation(numThreads).toSeq().mapIt(align($it, 2)).`$`().replace("\"", "")
echo " ===="
main(2)
main(4)
main(8)
main(12)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment