-
-
Save andreasnoack/9f3f63d7f1d00a07359b to your computer and use it in GitHub Desktop.
module XORShift | |
import Base.rand | |
type XORShiftStar1024 <: AbstractRNG | |
p::Int | |
state::Vector{Uint64} | |
end | |
XORShiftStar() = XORShiftStar1024(1, rand(Uint64, 16)) | |
const globalXORShiftStar1024 = XORShiftStar() | |
function rand(rng::XORShiftStar1024) | |
s = rng.state | |
p = rng.p | |
s0 = s[p] | |
p = p %16 + 1 | |
s1 = s[p] | |
s1 $= s1 << 31 | |
s1 $= s1 >> 11 | |
s0 $= s0 >> 30 | |
s0 $= s1 | |
s[p] = s0 | |
rng.p = p | |
return s0 * 1181783497276652981 | |
end | |
randXORShiftStar() = rand(globalXORShiftStar1024)*5.421010862427522e-20 # The number is 1/2.0^64 | |
end #module |
Here's my implementation, which seems to beat rand()
by a little bit. It avoids c_malloc
and unsafe_load
, but I still had to use an unsafe_store!
because apparently we optimize getfield
but not setfield!
for integer indices on objects with homogenous field types. The @inline
keyword is kind of cheating but necessary to get the extra ~25%, and clang also inlines the reference implementation.
module XORShift2
import Base.rand
type XORShiftStar1024 <: AbstractRNG
s1::Uint64
s2::Uint64
s3::Uint64
s4::Uint64
s5::Uint64
s6::Uint64
s7::Uint64
s8::Uint64
s9::Uint64
s10::Uint64
s11::Uint64
s12::Uint64
s13::Uint64
s14::Uint64
s15::Uint64
s16::Uint64
p::Uint64
end
XORShiftStar() =XORShiftStar1024(rand(Uint64, 16)..., 0)
const globalXORShiftStar1024 = XORShiftStar()
@inline function rand(rng::XORShiftStar1024)
@inbounds begin
p = rng.p
s0 = getfield(rng, reinterpret(Int, p)+1)
p = (p + 1) % 16
s1 = getfield(rng, reinterpret(Int, p)+1)
s1 $= s1 << 31
s1 $= s1 >> 11
s0 $= s0 >> 30
s0 $= s1
unsafe_store!(convert(Ptr{Uint64}, pointer_from_objref(rng))+8,
s0, reinterpret(Int, p)+1)
rng.p = p
end
return s0 * 1181783497276652981
end
@eval @inline randXORShiftStar() = rand($(globalXORShiftStar1024)) * 5.421010862427522e-20 # The number is 1/2.0^64
end #module
function f()
x = 0.0
@time for _=1:10^9
x += XORShift2.randXORShiftStar()
end
x
end
function g()
x = 0.0
@time for _=1:10^9
x += rand()
end
x
end
And the results:
julia> f()
elapsed time: 3.358085455 seconds (0 bytes allocated)
4.9999501392498094e8
julia> f()
elapsed time: 3.293589685 seconds (0 bytes allocated)
4.999960515687871e8
julia> f()
elapsed time: 3.375352192 seconds (0 bytes allocated)
5.0001651784083945e8
julia> g()
elapsed time: 3.748420765 seconds (0 bytes allocated)
5.000106120737784e8
julia> g()
elapsed time: 3.453261735 seconds (0 bytes allocated)
5.000021824302525e8
julia> g()
elapsed time: 3.770765756 seconds (0 bytes allocated)
5.0000825887505025e8
I think this could still be slightly faster if it didn't store p
to memory on each loop iteration. Otherwise the IR looks pretty close to the reference implementation.
You can slightly modify it to avoid a couple of unnecessary + 1
operations (just make p
an Int
and start it at 1
). But in this case you need & 15
since %
is more expensive for Int
.
module XORShift3
import Base: rand, rand!
type XORShiftStar1024 <: AbstractRNG
s1::Uint64
s2::Uint64
s3::Uint64
s4::Uint64
s5::Uint64
s6::Uint64
s7::Uint64
s8::Uint64
s9::Uint64
s10::Uint64
s11::Uint64
s12::Uint64
s13::Uint64
s14::Uint64
s15::Uint64
s16::Uint64
p::Int
end
XORShiftStar() =XORShiftStar1024(rand(Uint64, 16)..., 1)
const globalXORShiftStar1024 = XORShiftStar()
function rand(rng::XORShiftStar1024)
@inbounds begin
p = rng.p
s0 = getfield(rng, p)
p = (p & 15) + 1
s1 = getfield(rng, p)
s1 $= s1 << 31
s1 $= s1 >> 11
s0 $= s0 >> 30
s0 $= s1
unsafe_store!(convert(Ptr{Uint64}, pointer_from_objref(rng))+8, s0, p)
rng.p = p
end
return s0 * 1181783497276652981
end
function rand!(rng::XORShiftStar1024, a::AbstractArray{Uint64})
for i = 1:length(a)
@inbounds a[i] = rand(rng)
end
return a
end
rand(rng::XORShiftStar1024, n::Int) = rand!(rng, Array(Uint64, n))
end #module
@stevengj Did you benchmark that? I would assume that the heterogeneous type results in type instability getfield
(although you could always use unsafe_load
if you wanted). Also I think the increments should be optimized out, since computing the address to load requires decrementing the argument to getfield
/unsafe_load!
.
I wonder whether this could be vectorized to compute two or four random numbers per iteration, but that would probably make getting a single number out of rand
more complicated. I also don't think it can be made to work without llvmcall
or maybe the SLP vectorizer.
slight modification.
julia> @time for _=1:10^9 XORShift2.randXORShiftStar() end
elapsed time: 2.758581827 seconds (0 bytes allocated)
julia> @time for _=1:10^9 rand() end
elapsed time: 2.523010018 seconds (0 bytes allocated)