mratsim/random_permutations.nim

## random_permutations.nim
# Constantine
# Copyright (c) 2018-2019    Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# TODO: while there is likely no attacks available that may exploit
# how work is stolen from threads by other threads,
# it would be disappointing to not use a CSPRNG in a cryptographic library.
#
# See:
#
# - Parallel Random Numbers: As Easy as 1, 2, 3
#   John K. Salmon, Mark A. Moraes, Ron O. Dror, and David E. Shaw
#   https://thesalmons.org/john/random123/papers/random123sc11.pdf
#   https://github.com/DEShawResearch/random123
#
# - Fast-key-erasure random-number generators
#   Daniel J. Bernstein
#   https://blog.cr.yp.to/20170723-random.html
#   https://github.com/floodyberry/supercop/blob/a351f2c/crypto_sign/ntrumls439x/ref/fastrandombytes.c
#
# - Randen - fast backtracking-resistant random generator with AES+Feistel+Reverie
#   Jan Wassenberg, Robert Obryk, Jyrki Alakuijala, Emmanuel Mogenet
#   https://arxiv.org/abs/1810.02227
#   https://github.com/google/randen

import ../../../helpers/prng_unsafe
export prng_unsafe

import
  ../bithacks,
  instrumentation/contracts

# Work-stealing is more efficient when the thread we steal from is randomized.
# If all threads steal in the same order, we increase contention
# on the start victims task queues.
#
# The randomness quality is not important besides distributing potential contention,
# i.e. randomly trying thread i, then i+1, then i+n-1 (mod n) is good enough.
#
# Hence for efficiency, so that a thread can go to sleep faster, we want to
# reduce calls to to the RNG as:
# - Getting a random value itself can be expensive, especially if we use a CSPRNG (not a requirement)
# - a CSPRNG can be starved of entropy as with small tasks, threads might make millions of calls.
# - If we want unbiaised thread ID generation in a range, rejection sampling is costly (not a requirement).
#
# Instead of using Fisher-Yates
#   - generates the victim set eagerly, inefficient if the first steal attempts are successful
#   - needs a RNG call when sampling a victim
#   - memory usage: numThreads per thread so numthreads² uint8 (255 threads max) or uint32
# or a sparseset
#   - 1 RNG call when sampling a victim
#   - memory usage: 2*numThreads per thread so 2*numthreads² uint8 (255 threads max) or uint32
#
# we can use Linear Congruential Generators, a recurrence relation of the form Xₙ₊₁ = aXₙ+c (mod m)
# If we respect the Hull-Dobell theorem requirements, we can generate pseudo-random permutations in [0, m)
# with fixed memory usage whatever the number of potential victims: just 4 registers for a, x, c, m
#
# References:
# - Fisher-Yates: https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
# - Sparse sets: https://dl.acm.org/doi/pdf/10.1145/176454.176484
#                https://github.com/mratsim/weave/blob/7682784/weave/datatypes/sparsesets.nim
#                https://github.com/status-im/nim-taskpools/blob/4bc0b59/taskpools/sparsesets.nim
# - Linear Congruential Generator: https://en.wikipedia.org/wiki/Linear_congruential_generator
#
# And if we want cryptographic strength:
# - Ciphers with Arbitrary Finite Domains
#   John Black and Phillip Rogaway, 2001
#   https://eprint.iacr.org/2001/012
# - An Enciphering Scheme Based on a Card Shuffle
#   Viet Tung Hoang, Ben Morris, Phillip Rogaway
#   https://www.iacr.org/archive/crypto2012/74170001/74170001.pdf

func nextPowerOfTwo_vartime(n: uint32): uint32 {.inline.} =
  ## Returns x if x is a power of 2
  ## or the next biggest power of 2
  1'u32 shl (log2_vartime(n-1) + 1)

func gcd_vartime(u, v: uint32): uint32 {.used.} =
  ## Compute the greatest common divisor
  ## gcd(u, v) == 1 <=> u and v are coprimes
  if u == 0: return v
  if v == 0: return u
  let shift = countTrailingZeroBits(u or v)
  var u = u shr u.countTrailingZeroBits()
  var v = v
  while true:
    v = v shr v.countTrailingZeroBits()
    if u > v:
      swap(u, v)
    v = v - u
    if v == 0:
      return u shl shift

func fastRange(seed, maxPow2: uint32): uint32 {.inline.} =
  ## Constrain n in range [0, m) with m a power of 2
  ##
  ## Alternatively we could use
  ## https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
  ##
  ## However the number of threads is often in the 2-4 bit range (2 to 16 threads)
  ## so using the low bits as well avoids hammering the 0 thread.
  seed and (maxPow2 - 1)

iterator pseudoRandomPermutation*(randomSeed, maxExclusive: uint32): uint32 =
  ## Create a (low-quality) pseudo-random permutation from [0, max)
  # Linear Congruential Generator: https://en.wikipedia.org/wiki/Linear_congruential_generator
  #
  # Xₙ₊₁ = aXₙ+c (mod m) generates all random number mod m without repetition
  # if and only if (Hull-Dobell theorem):
  # 1. c and m are coprime
  # 2. a-1 is divisible by all prime factors of m
  # 3. a-1 is divisible by 4 if m is divisible by 4
  #
  # By choosing a=1, all conditions are easy to reach.
  #
  # The randomness quality is not important besides distributing potential contention,
  # i.e. randomly trying thread i, then i+1, then i+n-1 (mod n) is good enough.
  #
  # Assuming 6 threads, co-primes are [1, 5], which means the following permutations
  # assuming we start with victim 0:
  # - [0, 1, 2, 3, 4, 5]
  # - [0, 5, 4, 3, 2, 1]
  #
  # We can choose m to be the next power of 2, meaning all odd integers are co-primes,
  # consequently:
  # - we don't need a GCD to find them
  # - we don't need to cache coprimes, removing a cache-miss potential
  # - we replace a conditional substraction, with multiplication+modulo power-of-two per iteration
  #   which are both cheap
  let M = maxExclusive.nextPowerOfTwo_vartime()
  let c = randomSeed.fastRange(M shr 1) * 2 + 1    # c odd and c ∈ [0, M)
  let a = randomSeed.fastRange(M shr 2) * 4 + 1    # a-1 divisible by 2 (all prime factors of m) and by 4 if m divisible by 4

  let start = randomSeed.fastRange(M)
  let mask = M-1                                   # for mod M

  var x = start
  while true:
    if x < maxExclusive:
      yield x
    x = (a*x + c) and mask                         # ax + c (mod M), with M power of 2
    if x == start:
      break


when isMainModule:
  import std/[sequtils, strutils]

  proc main(numThreads: uint32) =
    const s = 0x0EADBEEF'u32
    const H = 0x01000000'u32
    let iters = 2*numThreads
    echo "NumThreads: ", numThreads

    # Low-bits
    echo "  Testing low-bits change in seed"
    for seed in s ..< s+iters:
      echo "    seed ", seed.toHex(), ": ", seed.pseudoRandomPermutation(numThreads).toSeq().mapIt(align($it, 2)).`$`().replace("\"", "")
    echo "  ----"
    # high-bits
    echo "  Testing high-bits change in seed"
    for seed in countup(s, s+iters*H-1, H):
      echo "    seed ", seed.toHex(), ": ", seed.pseudoRandomPermutation(numThreads).toSeq().mapIt(align($it, 2)).`$`().replace("\"", "")
    echo "  ===="

  main(2)
  main(4)
  main(8)
  main(12)
	# Constantine
	# Copyright (c) 2018-2019 Status Research & Development GmbH
	# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
	# Licensed and distributed under either of
	# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
	# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
	# at your option. This file may not be copied, modified, or distributed except according to those terms.

	# TODO: while there is likely no attacks available that may exploit
	# how work is stolen from threads by other threads,
	# it would be disappointing to not use a CSPRNG in a cryptographic library.
	#
	# See:
	#
	# - Parallel Random Numbers: As Easy as 1, 2, 3
	# John K. Salmon, Mark A. Moraes, Ron O. Dror, and David E. Shaw
	# https://thesalmons.org/john/random123/papers/random123sc11.pdf
	# https://github.com/DEShawResearch/random123
	#
	# - Fast-key-erasure random-number generators
	# Daniel J. Bernstein
	# https://blog.cr.yp.to/20170723-random.html
	# https://github.com/floodyberry/supercop/blob/a351f2c/crypto_sign/ntrumls439x/ref/fastrandombytes.c
	#
	# - Randen - fast backtracking-resistant random generator with AES+Feistel+Reverie
	# Jan Wassenberg, Robert Obryk, Jyrki Alakuijala, Emmanuel Mogenet
	# https://arxiv.org/abs/1810.02227
	# https://github.com/google/randen

	import ../../../helpers/prng_unsafe
	export prng_unsafe

	import
	../bithacks,
	instrumentation/contracts

	# Work-stealing is more efficient when the thread we steal from is randomized.
	# If all threads steal in the same order, we increase contention
	# on the start victims task queues.
	#
	# The randomness quality is not important besides distributing potential contention,
	# i.e. randomly trying thread i, then i+1, then i+n-1 (mod n) is good enough.
	#
	# Hence for efficiency, so that a thread can go to sleep faster, we want to
	# reduce calls to to the RNG as:
	# - Getting a random value itself can be expensive, especially if we use a CSPRNG (not a requirement)
	# - a CSPRNG can be starved of entropy as with small tasks, threads might make millions of calls.
	# - If we want unbiaised thread ID generation in a range, rejection sampling is costly (not a requirement).
	#
	# Instead of using Fisher-Yates
	# - generates the victim set eagerly, inefficient if the first steal attempts are successful
	# - needs a RNG call when sampling a victim
	# - memory usage: numThreads per thread so numthreads² uint8 (255 threads max) or uint32
	# or a sparseset
	# - 1 RNG call when sampling a victim
	# - memory usage: 2numThreads per thread so 2numthreads² uint8 (255 threads max) or uint32
	#
	# we can use Linear Congruential Generators, a recurrence relation of the form Xₙ₊₁ = aXₙ+c (mod m)
	# If we respect the Hull-Dobell theorem requirements, we can generate pseudo-random permutations in [0, m)
	# with fixed memory usage whatever the number of potential victims: just 4 registers for a, x, c, m
	#
	# References:
	# - Fisher-Yates: https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
	# - Sparse sets: https://dl.acm.org/doi/pdf/10.1145/176454.176484
	# https://github.com/mratsim/weave/blob/7682784/weave/datatypes/sparsesets.nim
	# https://github.com/status-im/nim-taskpools/blob/4bc0b59/taskpools/sparsesets.nim
	# - Linear Congruential Generator: https://en.wikipedia.org/wiki/Linear_congruential_generator
	#
	# And if we want cryptographic strength:
	# - Ciphers with Arbitrary Finite Domains
	# John Black and Phillip Rogaway, 2001
	# https://eprint.iacr.org/2001/012
	# - An Enciphering Scheme Based on a Card Shuffle
	# Viet Tung Hoang, Ben Morris, Phillip Rogaway
	# https://www.iacr.org/archive/crypto2012/74170001/74170001.pdf

	func nextPowerOfTwo_vartime(n: uint32): uint32 {.inline.} =
	## Returns x if x is a power of 2
	## or the next biggest power of 2
	1'u32 shl (log2_vartime(n-1) + 1)

	func gcd_vartime(u, v: uint32): uint32 {.used.} =
	## Compute the greatest common divisor
	## gcd(u, v) == 1 <=> u and v are coprimes
	if u == 0: return v
	if v == 0: return u
	let shift = countTrailingZeroBits(u or v)
	var u = u shr u.countTrailingZeroBits()
	var v = v
	while true:
	v = v shr v.countTrailingZeroBits()
	if u > v:
	swap(u, v)
	v = v - u
	if v == 0:
	return u shl shift

	func fastRange(seed, maxPow2: uint32): uint32 {.inline.} =
	## Constrain n in range [0, m) with m a power of 2
	##
	## Alternatively we could use
	## https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
	##
	## However the number of threads is often in the 2-4 bit range (2 to 16 threads)
	## so using the low bits as well avoids hammering the 0 thread.
	seed and (maxPow2 - 1)

	iterator pseudoRandomPermutation*(randomSeed, maxExclusive: uint32): uint32 =
	## Create a (low-quality) pseudo-random permutation from [0, max)
	# Linear Congruential Generator: https://en.wikipedia.org/wiki/Linear_congruential_generator
	#
	# Xₙ₊₁ = aXₙ+c (mod m) generates all random number mod m without repetition
	# if and only if (Hull-Dobell theorem):
	# 1. c and m are coprime
	# 2. a-1 is divisible by all prime factors of m
	# 3. a-1 is divisible by 4 if m is divisible by 4
	#
	# By choosing a=1, all conditions are easy to reach.
	#
	# The randomness quality is not important besides distributing potential contention,
	# i.e. randomly trying thread i, then i+1, then i+n-1 (mod n) is good enough.
	#
	# Assuming 6 threads, co-primes are [1, 5], which means the following permutations
	# assuming we start with victim 0:
	# - [0, 1, 2, 3, 4, 5]
	# - [0, 5, 4, 3, 2, 1]
	#
	# We can choose m to be the next power of 2, meaning all odd integers are co-primes,
	# consequently:
	# - we don't need a GCD to find them
	# - we don't need to cache coprimes, removing a cache-miss potential
	# - we replace a conditional substraction, with multiplication+modulo power-of-two per iteration
	# which are both cheap
	let M = maxExclusive.nextPowerOfTwo_vartime()
	let c = randomSeed.fastRange(M shr 1) * 2 + 1 # c odd and c ∈ [0, M)
	let a = randomSeed.fastRange(M shr 2) * 4 + 1 # a-1 divisible by 2 (all prime factors of m) and by 4 if m divisible by 4

	let start = randomSeed.fastRange(M)
	let mask = M-1 # for mod M

	var x = start
	while true:
	if x < maxExclusive:
	yield x
	x = (a*x + c) and mask # ax + c (mod M), with M power of 2
	if x == start:
	break


	when isMainModule:
	import std/[sequtils, strutils]

	proc main(numThreads: uint32) =
	const s = 0x0EADBEEF'u32
	const H = 0x01000000'u32
	let iters = 2*numThreads
	echo "NumThreads: ", numThreads

	# Low-bits
	echo " Testing low-bits change in seed"
	for seed in s ..< s+iters:
	echo " seed ", seed.toHex(), ": ", seed.pseudoRandomPermutation(numThreads).toSeq().mapIt(align($it, 2)).`$`().replace("\"", "")
	echo " ----"
	# high-bits
	echo " Testing high-bits change in seed"
	for seed in countup(s, s+iters*H-1, H):
	echo " seed ", seed.toHex(), ": ", seed.pseudoRandomPermutation(numThreads).toSeq().mapIt(align($it, 2)).`$`().replace("\"", "")
	echo " ===="

	main(2)
	main(4)
	main(8)
	main(12)