-
-
Save andreasnoack/9f3f63d7f1d00a07359b to your computer and use it in GitHub Desktop.
module XORShift | |
import Base.rand | |
type XORShiftStar1024 <: AbstractRNG | |
p::Int | |
state::Vector{Uint64} | |
end | |
XORShiftStar() = XORShiftStar1024(1, rand(Uint64, 16)) | |
const globalXORShiftStar1024 = XORShiftStar() | |
function rand(rng::XORShiftStar1024) | |
s = rng.state | |
p = rng.p | |
s0 = s[p] | |
p = p %16 + 1 | |
s1 = s[p] | |
s1 $= s1 << 31 | |
s1 $= s1 >> 11 | |
s0 $= s0 >> 30 | |
s0 $= s1 | |
s[p] = s0 | |
rng.p = p | |
return s0 * 1181783497276652981 | |
end | |
randXORShiftStar() = rand(globalXORShiftStar1024)*5.421010862427522e-20 # The number is 1/2.0^64 | |
end #module |
You can slightly modify it to avoid a couple of unnecessary + 1
operations (just make p
an Int
and start it at 1
). But in this case you need & 15
since %
is more expensive for Int
.
module XORShift3
import Base: rand, rand!
type XORShiftStar1024 <: AbstractRNG
s1::Uint64
s2::Uint64
s3::Uint64
s4::Uint64
s5::Uint64
s6::Uint64
s7::Uint64
s8::Uint64
s9::Uint64
s10::Uint64
s11::Uint64
s12::Uint64
s13::Uint64
s14::Uint64
s15::Uint64
s16::Uint64
p::Int
end
XORShiftStar() =XORShiftStar1024(rand(Uint64, 16)..., 1)
const globalXORShiftStar1024 = XORShiftStar()
function rand(rng::XORShiftStar1024)
@inbounds begin
p = rng.p
s0 = getfield(rng, p)
p = (p & 15) + 1
s1 = getfield(rng, p)
s1 $= s1 << 31
s1 $= s1 >> 11
s0 $= s0 >> 30
s0 $= s1
unsafe_store!(convert(Ptr{Uint64}, pointer_from_objref(rng))+8, s0, p)
rng.p = p
end
return s0 * 1181783497276652981
end
function rand!(rng::XORShiftStar1024, a::AbstractArray{Uint64})
for i = 1:length(a)
@inbounds a[i] = rand(rng)
end
return a
end
rand(rng::XORShiftStar1024, n::Int) = rand!(rng, Array(Uint64, n))
end #module
@stevengj Did you benchmark that? I would assume that the heterogeneous type results in type instability getfield
(although you could always use unsafe_load
if you wanted). Also I think the increments should be optimized out, since computing the address to load requires decrementing the argument to getfield
/unsafe_load!
.
I wonder whether this could be vectorized to compute two or four random numbers per iteration, but that would probably make getting a single number out of rand
more complicated. I also don't think it can be made to work without llvmcall
or maybe the SLP vectorizer.
Here's my implementation, which seems to beat
rand()
by a little bit. It avoidsc_malloc
andunsafe_load
, but I still had to use anunsafe_store!
because apparently we optimizegetfield
but notsetfield!
for integer indices on objects with homogenous field types. The@inline
keyword is kind of cheating but necessary to get the extra ~25%, and clang also inlines the reference implementation.And the results:
I think this could still be slightly faster if it didn't store
p
to memory on each loop iteration. Otherwise the IR looks pretty close to the reference implementation.