Skip to content

Instantly share code, notes, and snippets.

@andreasnoack
Created October 21, 2014 14:54
Show Gist options
  • Save andreasnoack/9f3f63d7f1d00a07359b to your computer and use it in GitHub Desktop.
Save andreasnoack/9f3f63d7f1d00a07359b to your computer and use it in GitHub Desktop.
A new RNG for Julia
module XORShift
import Base.rand
type XORShiftStar1024 <: AbstractRNG
p::Int
state::Vector{Uint64}
end
XORShiftStar() = XORShiftStar1024(1, rand(Uint64, 16))
const globalXORShiftStar1024 = XORShiftStar()
function rand(rng::XORShiftStar1024)
s = rng.state
p = rng.p
s0 = s[p]
p = p %16 + 1
s1 = s[p]
s1 $= s1 << 31
s1 $= s1 >> 11
s0 $= s0 >> 30
s0 $= s1
s[p] = s0
rng.p = p
return s0 * 1181783497276652981
end
randXORShiftStar() = rand(globalXORShiftStar1024)*5.421010862427522e-20 # The number is 1/2.0^64
end #module
@jakebolewski
Copy link

slight modification.

module XORShift2

import Base.rand

immutable XORShiftStar1024 <: AbstractRNG
    stateptr::Ptr{Uint64}
end
XORShiftStar() = begin
    r = XORShiftStar1024(convert(Ptr{Uint64}, c_malloc(sizeof(Ptr{Uint64}) * 17)))
    unsafe_copy!(r.stateptr, convert(Ptr{Uint64}, rand(Uint64, 16)), 16)
    unsafe_store!(r.stateptr, 1, 17)
    return r 
end

const globalXORShiftStar1024 = XORShiftStar()

function rand(rng::XORShiftStar1024)
    @inbounds begin
        s = rng.stateptr::Ptr{Uint64}
        p  = unsafe_load(s, 17)
        s0 = unsafe_load(s, p)
        p = p % 16 + 1
        s1 = unsafe_load(s, p)
        s1 $= s1 << 31
        s1 $= s1 >> 11
        s0 $= s0 >> 30
        s0 $= s1
        unsafe_store!(s, s0, p)
        unsafe_store!(s, p, 17)
    end
    return s0 * 1181783497276652981
end

randXORShiftStar() = rand(globalXORShiftStar1024) * 5.421010862427522e-20 # The number is 1/2.0^64

end #module

julia> @time for _=1:10^9 XORShift2.randXORShiftStar() end
elapsed time: 2.758581827 seconds (0 bytes allocated)

julia> @time for _=1:10^9 rand() end
elapsed time: 2.523010018 seconds (0 bytes allocated)

@simonster
Copy link

Here's my implementation, which seems to beat rand() by a little bit. It avoids c_malloc and unsafe_load, but I still had to use an unsafe_store! because apparently we optimize getfield but not setfield! for integer indices on objects with homogenous field types. The @inline keyword is kind of cheating but necessary to get the extra ~25%, and clang also inlines the reference implementation.

module XORShift2

import Base.rand

type XORShiftStar1024 <: AbstractRNG
    s1::Uint64
    s2::Uint64
    s3::Uint64
    s4::Uint64
    s5::Uint64
    s6::Uint64
    s7::Uint64
    s8::Uint64
    s9::Uint64
    s10::Uint64
    s11::Uint64
    s12::Uint64
    s13::Uint64
    s14::Uint64
    s15::Uint64
    s16::Uint64
    p::Uint64
end
XORShiftStar() =XORShiftStar1024(rand(Uint64, 16)..., 0)

const globalXORShiftStar1024 = XORShiftStar()

@inline function rand(rng::XORShiftStar1024)
    @inbounds begin
        p  = rng.p
        s0 = getfield(rng, reinterpret(Int, p)+1)
        p = (p + 1) % 16
        s1 = getfield(rng, reinterpret(Int, p)+1)
        s1 $= s1 << 31
        s1 $= s1 >> 11
        s0 $= s0 >> 30
        s0 $= s1
        unsafe_store!(convert(Ptr{Uint64}, pointer_from_objref(rng))+8,
                      s0, reinterpret(Int, p)+1)
        rng.p = p
    end
    return s0 * 1181783497276652981
end


@eval @inline randXORShiftStar() = rand($(globalXORShiftStar1024)) * 5.421010862427522e-20 # The number is 1/2.0^64

end #module

function f()
    x = 0.0
    @time for _=1:10^9
        x += XORShift2.randXORShiftStar()
    end
    x
end
function g()
    x = 0.0
    @time for _=1:10^9
        x += rand()
    end
    x
end

And the results:

julia> f()
elapsed time: 3.358085455 seconds (0 bytes allocated)
4.9999501392498094e8

julia> f()
elapsed time: 3.293589685 seconds (0 bytes allocated)
4.999960515687871e8

julia> f()
elapsed time: 3.375352192 seconds (0 bytes allocated)
5.0001651784083945e8

julia> g()
elapsed time: 3.748420765 seconds (0 bytes allocated)
5.000106120737784e8

julia> g()
elapsed time: 3.453261735 seconds (0 bytes allocated)
5.000021824302525e8

julia> g()
elapsed time: 3.770765756 seconds (0 bytes allocated)
5.0000825887505025e8

I think this could still be slightly faster if it didn't store p to memory on each loop iteration. Otherwise the IR looks pretty close to the reference implementation.

@stevengj
Copy link

You can slightly modify it to avoid a couple of unnecessary + 1 operations (just make p an Int and start it at 1). But in this case you need & 15 since % is more expensive for Int.

module XORShift3

import Base: rand, rand!

type XORShiftStar1024 <: AbstractRNG
    s1::Uint64
    s2::Uint64
    s3::Uint64
    s4::Uint64
    s5::Uint64
    s6::Uint64
    s7::Uint64
    s8::Uint64
    s9::Uint64
    s10::Uint64
    s11::Uint64
    s12::Uint64
    s13::Uint64
    s14::Uint64
    s15::Uint64
    s16::Uint64
    p::Int
end
XORShiftStar() =XORShiftStar1024(rand(Uint64, 16)..., 1)

const globalXORShiftStar1024 = XORShiftStar()

function rand(rng::XORShiftStar1024)
    @inbounds begin
        p  = rng.p
        s0 = getfield(rng, p)
        p = (p & 15) + 1
        s1 = getfield(rng, p)
        s1 $= s1 << 31
        s1 $= s1 >> 11
        s0 $= s0 >> 30
        s0 $= s1
        unsafe_store!(convert(Ptr{Uint64}, pointer_from_objref(rng))+8, s0, p)
        rng.p = p
    end
    return s0 * 1181783497276652981
end

function rand!(rng::XORShiftStar1024, a::AbstractArray{Uint64})
    for i = 1:length(a)
        @inbounds a[i] = rand(rng)
    end
    return a
end
rand(rng::XORShiftStar1024, n::Int) = rand!(rng, Array(Uint64, n))

end #module

@simonster
Copy link

@stevengj Did you benchmark that? I would assume that the heterogeneous type results in type instability getfield (although you could always use unsafe_load if you wanted). Also I think the increments should be optimized out, since computing the address to load requires decrementing the argument to getfield/unsafe_load!.

I wonder whether this could be vectorized to compute two or four random numbers per iteration, but that would probably make getting a single number out of rand more complicated. I also don't think it can be made to work without llvmcall or maybe the SLP vectorizer.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment