Last active
August 14, 2020 03:09
-
-
Save rkern/9502a19c5926a139ce7deb5bca76c312 to your computer and use it in GitHub Desktop.
Python implementation of SeedSequence
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" PRNG seed sequence implementation for np.random. | |
The algorithms are derived from Melissa E. O'Neill's C++11 `std::seed_seq` | |
implementation, as it has a lot of nice properties that we want. | |
https://gist.github.com/imneme/540829265469e673d045 | |
http://www.pcg-random.org/posts/developing-a-seed_seq-alternative.html | |
The MIT License (MIT) | |
Copyright (c) 2015 Melissa E. O'Neill | |
Copyright (c) 2019 Robert Kern | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in | |
all copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
""" | |
import abc | |
from itertools import cycle | |
import re | |
import secrets | |
import sys | |
import numpy as np | |
DECIMAL_RE = re.compile(r'[0-9]+') | |
DEFAULT_POOL_SIZE = 4 | |
INIT_A = np.uint32(0x43b0d7e5) | |
MULT_A = np.uint32(0x931e8875) | |
INIT_B = np.uint32(0x8b51f9dd) | |
MULT_B = np.uint32(0x58f38ded) | |
MIX_MULT_L = np.uint32(0xca01f9dd) | |
MIX_MULT_R = np.uint32(0x4973f715) | |
XSHIFT = np.dtype(np.uint32).itemsize * 8 // 2 | |
MASK32 = 0xFFFFFFFF | |
def _int_to_uint32_array(n): | |
arr = [] | |
if n < 0: | |
raise ValueError("expected non-negative integer") | |
if n == 0: | |
arr.append(np.uint32(n)) | |
while n > 0: | |
arr.append(np.uint32(n & MASK32)) | |
n >>= 32 | |
return np.array(arr, dtype=np.uint32) | |
def coerce_to_uint32_array(x): | |
""" Coerce an input to a uint32 array. | |
If a `uint32` array, pass it through directly. | |
If a non-negative integer, then break it up into `uint32` words, lowest | |
bits first. | |
If a string starting with "0x", then interpret as a hex integer, as above. | |
If a string of decimal digits, interpret as a decimal integer, as above. | |
If a sequence of ints or strings, interpret each element as above and | |
concatenate. | |
Note that the handling of `int64` or `uint64` arrays are not just | |
straightforward views as `uint32` arrays. If an element is small enough to | |
fit into a `uint32`, then it will only take up one `uint32` element in the | |
output. This is to make sure that the interpretation of a sequence of | |
integers is the same regardless of numpy's default integer type, which | |
differs on different platforms. | |
Parameters | |
---------- | |
x : int, str, sequence of int or str | |
Returns | |
------- | |
seed_array : uint32 array | |
Examples | |
-------- | |
>>> import numpy as np | |
>>> from seed_seq import coerce_to_uint32_array | |
>>> coerce_to_uint32_array(12345) | |
array([12345], dtype=uint32) | |
>>> coerce_to_uint32_array('12345') | |
array([12345], dtype=uint32) | |
>>> coerce_to_uint32_array('0x12345') | |
array([74565], dtype=uint32) | |
>>> coerce_to_uint32_array([12345, '67890']) | |
array([12345, 67890], dtype=uint32) | |
>>> coerce_to_uint32_array(np.array([12345, 67890], dtype=np.uint32)) | |
array([12345, 67890], dtype=uint32) | |
>>> coerce_to_uint32_array(np.array([12345, 67890], dtype=np.int64)) | |
array([12345, 67890], dtype=uint32) | |
>>> coerce_to_uint32_array([12345, 0x10deadbeef, 67890, 0xdeadbeef]) | |
array([ 12345, 3735928559, 16, 67890, 3735928559], | |
dtype=uint32) | |
>>> coerce_to_uint32_array(1234567890123456789012345678901234567890) | |
array([3460238034, 2898026390, 3235640248, 2697535605, 3], | |
dtype=uint32) | |
""" | |
if isinstance(x, np.ndarray) and x.dtype == np.dtype(np.uint32): | |
return x.copy() | |
elif isinstance(x, str): | |
if x.startswith('0x'): | |
x = int(x, base=16) | |
elif DECIMAL_RE.match(x): | |
x = int(x) | |
else: | |
raise ValueError("unrecognized seed string") | |
if isinstance(x, (int, np.integer)): | |
return _int_to_uint32_array(x) | |
else: | |
if len(x) == 0: | |
return np.array([], dtype=np.uint32) | |
# Should be a sequence of interpretable-as-ints. Convert each one to | |
# a uint32 array and concatenate. | |
subseqs = [coerce_to_uint32_array(v) for v in x] | |
return np.concatenate(subseqs) | |
class ISeedSequence(abc.ABC): | |
""" Abstract base class for seed sequences. | |
""" | |
@abc.abstractmethod | |
def generate_state(self, n_words, dtype=np.uint32): | |
""" Return the requested number of words for PRNG seeding. | |
Parameters | |
---------- | |
n_words : int | |
dtype : np.uint32 or np.uint64, optional | |
The size of each word. This should only be either `uint32` or | |
`uint64`. Strings (`'uint32'`, `'uint64'`) are fine. Note that | |
requesting `uint64` will draw twice as many bits as `uint32` for | |
the same `n_words`. This is a convenience for `BitGenerator`s that | |
express their states as `uint64` arrays. | |
Returns | |
------- | |
state : uint32 or uint64 array, shape=(n_words,) | |
""" | |
class SeedSequence(ISeedSequence): | |
def __init__(self, entropy=None, program_entropy=None, spawn_key=(), | |
pool_size=DEFAULT_POOL_SIZE): | |
if pool_size < DEFAULT_POOL_SIZE: | |
raise ValueError("The size of the entropy pool should be at least " | |
f"{DEFAULT_POOL_SIZE}") | |
if entropy is None: | |
entropy = secrets.randbits(pool_size * 32) | |
self.entropy = entropy | |
self.program_entropy = program_entropy | |
self.spawn_key = tuple(spawn_key) | |
self.pool_size = pool_size | |
self.pool = np.zeros(pool_size, dtype=np.uint32) | |
self.n_children_spawned = 0 | |
self.mix_entropy(self.get_assembled_entropy()) | |
def __repr__(self): | |
lines = [ | |
f'{type(self).__name__}(', | |
f' entropy={self.entropy!r},', | |
] | |
# Omit some entries if they are left as the defaults in order to | |
# simplify things. | |
if self.program_entropy is not None: | |
lines.append(f' program_entropy={self.program_entropy!r},') | |
if self.spawn_key: | |
lines.append(f' spawn_key={self.spawn_key!r},') | |
if self.pool_size != DEFAULT_POOL_SIZE: | |
lines.append(f' pool_size={self.pool_size!r},') | |
lines.append(')') | |
text = '\n'.join(lines) | |
return text | |
@np.errstate(over='ignore') | |
def mix_entropy(self, entropy_array): | |
""" Mix in the given entropy. | |
Parameters | |
---------- | |
entropy_array : 1D uint32 array | |
""" | |
hash_const = INIT_A | |
def hash(value): | |
# We are modifying the multiplier as we go along. | |
nonlocal hash_const | |
value ^= hash_const | |
hash_const *= MULT_A | |
value *= hash_const | |
value ^= value >> XSHIFT | |
return value | |
def mix(x, y): | |
# Ensure that we're emulating C++ uint32_t arithmetic correctly. | |
result = np.uint32(np.uint32(MIX_MULT_L * x) - np.uint32(MIX_MULT_R * y)) | |
result ^= result >> XSHIFT | |
return result | |
mixer = self.pool | |
# Add in the entropy up to the pool size. | |
for i in range(len(mixer)): | |
if i < len(entropy_array): | |
mixer[i] = hash(entropy_array[i]) | |
else: | |
# Our pool size is bigger than our entropy, so just keep | |
# running the hash out. | |
mixer[i] = hash(np.uint32(0)) | |
# Mix all bits together so late bits can affect earlier bits. | |
for i_src in range(len(mixer)): | |
for i_dst in range(len(mixer)): | |
if i_src != i_dst: | |
mixer[i_dst] = mix(mixer[i_dst], hash(mixer[i_src])) | |
# Add any remaining entropy, mixing each new entropy word with each | |
# pool word. | |
for i_src in range(len(mixer), len(entropy_array)): | |
for i_dst in range(len(mixer)): | |
mixer[i_dst] = mix(mixer[i_dst], hash(entropy_array[i_src])) | |
# Should have modified in-place. | |
assert mixer is self.pool | |
def get_assembled_entropy(self): | |
""" Convert and assemble all entropy sources into a uniform uint32 | |
array. | |
Returns | |
------- | |
entropy_array : 1D uint32 array | |
""" | |
# Convert run-entropy, program-entropy, and the spawn key into uint32 | |
# arrays and concatenate them. | |
# We MUST have at least some run-entropy. The others are optional. | |
assert self.entropy is not None | |
run_entropy = coerce_to_uint32_array(self.entropy) | |
if self.program_entropy is None: | |
# We *could* make `coerce_to_uint32_array(None)` handle this case, | |
# but that would make it easier to misuse, a la | |
# `coerce_to_uint32_array([None, 12345])` | |
program_entropy = np.array([], dtype=np.uint32) | |
else: | |
program_entropy = coerce_to_uint32_array(self.program_entropy) | |
spawn_entropy = coerce_to_uint32_array(self.spawn_key) | |
entropy_array = np.concatenate([run_entropy, program_entropy, | |
spawn_entropy]) | |
return entropy_array | |
@np.errstate(over='ignore') | |
def generate_state(self, n_words, dtype=np.uint32): | |
""" Return the requested number of words for PRNG seeding. | |
Parameters | |
---------- | |
n_words : int | |
dtype : np.uint32 or np.uint64, optional | |
The size of each word. This should only be either `uint32` or | |
`uint64`. Strings (`'uint32'`, `'uint64'`) are fine. Note that | |
requesting `uint64` will draw twice as many bits as `uint32` for | |
the same `n_words`. This is a convenience for `BitGenerator`s that | |
express their states as `uint64` arrays. | |
Returns | |
------- | |
state : uint32 or uint64 array, shape=(n_words,) | |
""" | |
hash_const = INIT_B | |
out_dtype = np.dtype(dtype) | |
if out_dtype == np.dtype(np.uint32): | |
pass | |
elif out_dtype == np.dtype(np.uint64): | |
n_words *= 2 | |
else: | |
raise ValueError("only support uint32 or uint64") | |
state = np.zeros(n_words, dtype=np.uint32) | |
src_cycle = cycle(self.pool) | |
for i_dst in range(n_words): | |
data_val = next(src_cycle) | |
data_val ^= hash_const | |
hash_const *= MULT_B | |
data_val *= hash_const | |
data_val ^= data_val >> XSHIFT | |
state[i_dst] = data_val | |
if out_dtype == np.dtype(np.uint64): | |
state = state.view(np.uint64) | |
return state | |
def spawn(self, n_children): | |
""" Spawn a number of child `SeedSequence`s by extending the | |
`spawn_key`. | |
Parameters | |
---------- | |
n_children : int | |
Returns | |
------- | |
seqs : list of `SeedSequence`s | |
""" | |
seqs = [] | |
for i in range(self.n_children_spawned, | |
self.n_children_spawned + n_children): | |
seqs.append(SeedSequence( | |
self.entropy, | |
program_entropy=self.program_entropy, | |
spawn_key=self.spawn_key + (i,), | |
pool_size=self.pool_size, | |
)) | |
return seqs | |
def seed_prng(name, seed_seq): | |
""" Use a `SeedSequence` to initialize the given PRNG. | |
This is not an API I'm recommending for numpy. This is just for | |
bootstrapping this demo. | |
""" | |
def _pcg64_cons(arr): | |
""" For some reason, I can't pass in a 128-bit integer to the | |
constructor, so we'll just punch one in through the state. | |
""" | |
bitgen = np.random.PCG64() | |
state = bitgen.state.copy() | |
state['state']['state'] = int(arr[0]) + int(arr[1]) * (1<<64) | |
# The author now recommends using seed entropy to set the increment as well. | |
state['state']['inc'] = (int(arr[2]) + int(arr[3]) * (1<<64)) | 1 | |
bitgen.state = state | |
return bitgen | |
def _pcg32_cons(arr): | |
bitgen = np.random.PCG32(int(arr[0]), inc=int(arr[1]) | 1) | |
return bitgen | |
def _mt19937_cons(arr): | |
bitgen = np.random.MT19937() | |
state = bitgen.state.copy() | |
state['state']['key'] = arr | |
bitgen.state = state | |
return bitgen | |
n_words, dtype, constructor = { | |
'PCG32': (2, 'uint64', _pcg32_cons), | |
'PCG64': (4, 'uint64', _pcg64_cons), | |
'Xoshiro256': (4, 'uint64', np.random.Xoshiro256), | |
'Xoshiro512': (8, 'uint64', np.random.Xoshiro512), | |
'ThreeFry': (4, 'uint64', lambda key: np.random.ThreeFry(key=key)), | |
'Philox': (2, 'uint64', lambda key: np.random.Philox(key=key)), | |
'MT19937': (624, 'uint32', _mt19937_cons), | |
# DSFMT has a very particular state configuration, so just give it 256 | |
# bits and let its seeding work things out. | |
'DSFMT': (8, 'uint32', np.random.DSFMT), | |
}[name] | |
bitgen = constructor(seed_seq.generate_state(n_words, dtype)) | |
return bitgen | |
class SeededGenerator(np.random.Generator): | |
""" Prototype of `Generator` API incorporating `SeedSequence`. | |
""" | |
def __init__(self, bit_generator, seed_seq=None): | |
# FIXME: seed_seq will probably be owned by the BitGenerator, not | |
# Generator, but for now, it's easiest to subclass just Generator for | |
# demo purposes. | |
super().__init__(bit_generator) | |
self._seed_seq = seed_seq | |
@property | |
def seed(self): | |
if self._seed_seq is None: | |
return None | |
return getattr(self._seed_seq, 'entropy', None) | |
def spawn(self, n_children): | |
child_seeds = self._seed_seq.spawn(n_children) | |
# FIXME: placeholder. Actually, we would just pass the children | |
# SeedSequences to the BitGenerator constructors when they accept them. | |
bitgen_name = type(self.bit_generator).__name__ | |
child_bitgens = [SeededGenerator(seed_prng(bitgen_name, seed), seed) | |
for seed in child_seeds] | |
return child_bitgens | |
def default_gen(seed=None): | |
if isinstance(seed, SeedSequence): | |
seed_seq = seed | |
else: | |
seed_seq = SeedSequence(seed) | |
bitgen = seed_prng('PCG64', seed_seq) | |
gen = SeededGenerator(bitgen, seed_seq) | |
return gen | |
def gen_interleaved_bytes(gens, n_per_gen=1024, output_dtype=np.uint32): | |
while True: | |
draws = [g.integers(np.iinfo(output_dtype).max, dtype=output_dtype, | |
endpoint=True, size=n_per_gen) | |
for g in gens] | |
interleaved = np.column_stack(draws).ravel() | |
bytes_chunk = bytes(interleaved.data) | |
yield bytes_chunk | |
def main(): | |
import argparse | |
parser = argparse.ArgumentParser( | |
formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
ALGORITHMS = [ | |
'PCG32', | |
'PCG64', | |
'Xoshiro256', | |
'Xoshiro512', | |
'ThreeFry', | |
'Philox', | |
'MT19937', | |
'DSFMT', | |
] | |
parser.add_argument('-s', '--seed', type=int, help='The root seed.') | |
parser.add_argument('-d', '--depth', type=int, default=4, | |
help='The depth of the spawn tree.') | |
parser.add_argument('-p', '--ply', type=int, default=8, | |
help='The number of spawns at each level.') | |
parser.add_argument('--dtype', choices=['uint32', 'uint64'], default='uint32', | |
help='The output dtype.') | |
parser.add_argument('-a', '--algorithm', choices=ALGORITHMS, | |
default='PCG64', | |
help='The PRNG algorithm to test.') | |
args = parser.parse_args() | |
ss = SeedSequence(args.seed) | |
root = SeededGenerator(seed_prng(args.algorithm, ss), ss) | |
# See? We have easy access to the original seed. The user can copy-paste | |
# that and use it on the CLI later. Print it to stderr because RNG_test is | |
# consuming stdout. | |
print(f"seed = {root.seed}", file=sys.stderr) | |
# Generate `ply ** depth` leaf `SeededGenerators` by the `spawn()` API. | |
leaves = [] | |
nodes = [root] | |
for i in range(args.depth): | |
children = [] | |
for node in nodes: | |
children.extend(node.spawn(args.ply)) | |
nodes = children | |
gens = nodes | |
dtype = np.dtype(args.dtype) | |
for chunk in gen_interleaved_bytes(gens, output_dtype=dtype): | |
sys.stdout.buffer.write(chunk) | |
if __name__ == '__main__': | |
main() |
Okay, I found my error: https://gist.github.com/rkern/9502a19c5926a139ce7deb5bca76c312#file-seed_seq-py-L217
Ah, found the problem. My code was relying on using np.uint32
scalar objects in generate_state()
to emulate C uint32_t
arithmetic, and that worked because all the objects came from the explicit np.uint32
module-level constants or from indexing np.uint32
arrays. When you turned the module-level constants to cdef uint32_t
Cython constants, the undeclared function-level variables in generate_state()
are just plain-old Python objects that happen to be int
s, not np.uint32
objects. That's why you had to add in that & MASK32
that I didn't have.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Good question. Your implementation and mine seem to give different results, both of which differ from the C++ implementation. The latter may be an error in my C++ driver, though, so I'll track that down.