To compile as a shared library:
    clang -Ofast xoshiro256plusplus.c -march=native -shared -fpic
To see the generated code (and weep), emit assembly or LLVM IR:
    gcc -Ofast xoshiro256plusplus.c -march=native -S
    clang -Ofast xoshiro256plusplus.c -march=native -emit-llvm -S
Output of julia ./xoshiro256plusplus.jl
on my admittedly anemic machine:
julia dsfmt 1024 x UInt64
2.209 μs (0 allocations: 0 bytes)
ref impl, 1 x interleaved, 1024 x UInt64
1.579 μs (0 allocations: 0 bytes)
ref impl, 4 x interleaved, 1024 x UInt64
2.151 μs (0 allocations: 0 bytes)
julia 1 x interleaved 1 x SIMD, 1024 x UInt64
4.333 μs (0 allocations: 0 bytes)
1.827 μs (0 allocations: 0 bytes)
2.269 μs (0 allocations: 0 bytes)
julia 1 x interleaved 2 x SIMD, 1024 x UInt64
2.709 μs (0 allocations: 0 bytes)
1.377 μs (0 allocations: 0 bytes)
4.158 μs (0 allocations: 0 bytes)
julia 1 x interleaved 4 x SIMD, 1024 x UInt64
1.479 μs (0 allocations: 0 bytes)
704.133 ns (0 allocations: 0 bytes)
2.093 μs (0 allocations: 0 bytes)
julia 1 x interleaved 8 x SIMD, 1024 x UInt64
1.072 μs (0 allocations: 0 bytes)
601.562 ns (0 allocations: 0 bytes)
2.031 μs (0 allocations: 0 bytes)
julia 1 x interleaved 16 x SIMD, 1024 x UInt64
1.582 μs (0 allocations: 0 bytes)
671.601 ns (0 allocations: 0 bytes)
1.966 μs (0 allocations: 0 bytes)
Generated xoshiro code is beautiful:
julia> code_llvm(XfillSimd_nostore, (Vector{UInt64}, xstate{4}, UInt64); debuginfo=:none)
define void @julia_XfillSimd_nostore_16539(%jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40), %jl_value_t addrspace(10)* nonnull align 16 dereferenceable(128), i64) {
top:
%3 = addrspacecast %jl_value_t addrspace(10)* %1 to %jl_value_t addrspace(11)*
%4 = bitcast %jl_value_t addrspace(11)* %3 to <4 x i64> addrspace(11)*
%.unpack = load <4 x i64>, <4 x i64> addrspace(11)* %4, align 16
%5 = bitcast %jl_value_t addrspace(11)* %3 to i8 addrspace(11)*
%6 = getelementptr inbounds i8, i8 addrspace(11)* %5, i64 32
%7 = bitcast i8 addrspace(11)* %6 to <4 x i64> addrspace(11)*
%.unpack244 = load <4 x i64>, <4 x i64> addrspace(11)* %7, align 16
%8 = getelementptr inbounds i8, i8 addrspace(11)* %5, i64 64
%9 = bitcast i8 addrspace(11)* %8 to <4 x i64> addrspace(11)*
%.unpack245 = load <4 x i64>, <4 x i64> addrspace(11)* %9, align 16
%10 = getelementptr inbounds i8, i8 addrspace(11)* %5, i64 96
%11 = bitcast i8 addrspace(11)* %10 to <4 x i64> addrspace(11)*
%.unpack246 = load <4 x i64>, <4 x i64> addrspace(11)* %11, align 16
switch i64 %2, label %L38 [
i64 1, label %L47.L52_crit_edge
i64 0, label %L47.L211_crit_edge
]
L38: ; preds = %top
%12 = icmp eq i64 %2, 1
%13 = sub i64 1, %2
%14 = add i64 %2, 3
%value_phi12 = select i1 %12, i64 %13, i64 %14
%15 = and i64 %value_phi12, 3
%16 = icmp ult i64 %2, 2
%17 = sub nsw i64 0, %15
%value_phi14.p = select i1 %16, i64 %15, i64 %17
%value_phi14 = add i64 %value_phi14.p, %2
%18 = icmp eq i64 %value_phi14, 0
br i1 %18, label %L47.L211_crit_edge, label %L47.L52_crit_edge
L47.L211_crit_edge: ; preds = %top, %L38
%19 = shufflevector <4 x i64> %.unpack244, <4 x i64> %.unpack245, <4 x i32> <i32 0, i32 1, i32 7, i32 6>
%20 = shufflevector <4 x i64> %.unpack245, <4 x i64> %.unpack244, <4 x i32> <i32 1, i32 0, i32 6, i32 7>
%21 = shufflevector <4 x i64> %.unpack, <4 x i64> %.unpack246, <4 x i32> <i32 0, i32 1, i32 7, i32 6>
%22 = shufflevector <4 x i64> %.unpack246, <4 x i64> %.unpack, <4 x i32> <i32 1, i32 0, i32 6, i32 7>
br label %L211
L47.L52_crit_edge: ; preds = %top, %L38
%value_phi248 = phi i64 [ %value_phi14, %L38 ], [ 1, %top ]
%res.i254 = add <4 x i64> %.unpack246, %.unpack
%tmp.i239255 = shl <4 x i64> %.unpack244, <i64 17, i64 17, i64 17, i64 17>
%res.i238256 = xor <4 x i64> %.unpack245, %.unpack
%res.i237257 = xor <4 x i64> %.unpack246, %.unpack244
%res.i236258 = xor <4 x i64> %res.i238256, %.unpack244
%res.i235259 = xor <4 x i64> %res.i237257, %.unpack
%res.i234260 = xor <4 x i64> %res.i238256, %tmp.i239255
%tmp.i233261 = shl <4 x i64> %res.i237257, <i64 45, i64 45, i64 45, i64 45>
%tmp.i262 = lshr <4 x i64> %res.i237257, <i64 19, i64 19, i64 19, i64 19>
%res.i232263 = or <4 x i64> %tmp.i262, %tmp.i233261
%23 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
%24 = addrspacecast %jl_value_t addrspace(11)* %23 to %jl_value_t*
%25 = bitcast %jl_value_t* %24 to i8**
br label %L145
L138: ; preds = %L210
call void @julia_throw_inexacterror_14971(%jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140584171307264 to %jl_value_t*) to %jl_value_t addrspace(10)*), %jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140584174586064 to %jl_value_t*) to %jl_value_t addrspace(10)*), i64 %35)
call void @llvm.trap()
unreachable
L145: ; preds = %L47.L52_crit_edge, %L210
%res.i232270 = phi <4 x i64> [ %res.i232263, %L47.L52_crit_edge ], [ %res.i232, %L210 ]
%res.i234269 = phi <4 x i64> [ %res.i234260, %L47.L52_crit_edge ], [ %res.i234, %L210 ]
%res.i235268 = phi <4 x i64> [ %res.i235259, %L47.L52_crit_edge ], [ %res.i235, %L210 ]
%res.i236267 = phi <4 x i64> [ %res.i236258, %L47.L52_crit_edge ], [ %res.i236, %L210 ]
%res.i266 = phi <4 x i64> [ %res.i254, %L47.L52_crit_edge ], [ %res.i, %L210 ]
%value_phi4265 = phi i64 [ 1, %L47.L52_crit_edge ], [ %35, %L210 ]
%.sroa.0193.0264 = phi <4 x i64> [ %.unpack, %L47.L52_crit_edge ], [ %res.i235268, %L210 ]
%tmp.i242 = lshr <4 x i64> %res.i266, <i64 41, i64 41, i64 41, i64 41>
%tmp.i243 = shl <4 x i64> %res.i266, <i64 23, i64 23, i64 23, i64 23>
%res.i241 = or <4 x i64> %tmp.i242, %tmp.i243
%res.i240 = add <4 x i64> %res.i241, %.sroa.0193.0264
%26 = load i8*, i8** %25, align 8
%27 = shl i64 %value_phi4265, 3
%28 = add nsw i64 %27, -8
%29 = getelementptr i8, i8* %26, i64 %28
%ptr.i = bitcast i8* %29 to <4 x i64>*
store <4 x i64> %res.i240, <4 x i64>* %ptr.i, align 8
%30 = icmp eq i64 %value_phi4265, %value_phi248
br i1 %30, label %L205.L211_crit_edge, label %L210
L205.L211_crit_edge: ; preds = %L145
%31 = shufflevector <4 x i64> %res.i236267, <4 x i64> %res.i234269, <4 x i32> <i32 0, i32 1, i32 7, i32 6>
%32 = shufflevector <4 x i64> %res.i234269, <4 x i64> %res.i236267, <4 x i32> <i32 1, i32 0, i32 6, i32 7>
%33 = shufflevector <4 x i64> %res.i235268, <4 x i64> %res.i232270, <4 x i32> <i32 0, i32 1, i32 7, i32 6>
%34 = shufflevector <4 x i64> %res.i232270, <4 x i64> %res.i235268, <4 x i32> <i32 1, i32 0, i32 6, i32 7>
br label %L211
L210: ; preds = %L145
%35 = add nuw i64 %value_phi4265, 4
%res.i = add <4 x i64> %res.i232270, %res.i235268
%tmp.i239 = shl <4 x i64> %res.i236267, <i64 17, i64 17, i64 17, i64 17>
%res.i238 = xor <4 x i64> %res.i234269, %res.i235268
%res.i237 = xor <4 x i64> %res.i232270, %res.i236267
%res.i236 = xor <4 x i64> %res.i238, %res.i236267
%res.i235 = xor <4 x i64> %res.i237, %res.i235268
%res.i234 = xor <4 x i64> %res.i238, %tmp.i239
%tmp.i233 = shl <4 x i64> %res.i237, <i64 45, i64 45, i64 45, i64 45>
%tmp.i = lshr <4 x i64> %res.i237, <i64 19, i64 19, i64 19, i64 19>
%res.i232 = or <4 x i64> %tmp.i, %tmp.i233
%36 = icmp sgt i64 %35, -1
br i1 %36, label %L145, label %L138
L211: ; preds = %L47.L211_crit_edge, %L205.L211_crit_edge
%37 = phi <4 x i64> [ %19, %L47.L211_crit_edge ], [ %31, %L205.L211_crit_edge ]
%38 = phi <4 x i64> [ %20, %L47.L211_crit_edge ], [ %32, %L205.L211_crit_edge ]
%39 = phi <4 x i64> [ %21, %L47.L211_crit_edge ], [ %33, %L205.L211_crit_edge ]
%40 = phi <4 x i64> [ %22, %L47.L211_crit_edge ], [ %34, %L205.L211_crit_edge ]
%41 = shufflevector <4 x i64> %39, <4 x i64> %40, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
%42 = bitcast %jl_value_t addrspace(10)* %1 to <4 x i64> addrspace(10)*
store <4 x i64> %41, <4 x i64> addrspace(10)* %42, align 16
%43 = shufflevector <4 x i64> %37, <4 x i64> %38, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
store <4 x i64> %43, <4 x i64> addrspace(11)* %7, align 16
%44 = shufflevector <4 x i64> %38, <4 x i64> %37, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
store <4 x i64> %44, <4 x i64> addrspace(11)* %9, align 16
%45 = shufflevector <4 x i64> %40, <4 x i64> %39, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
store <4 x i64> %45, <4 x i64> addrspace(11)* %11, align 16
ret void
}
and
julia> code_native(XfillSimd_nostore, (Vector{UInt64}, xstate{4}, UInt64); debuginfo=:none)
.text
pushq %rax
vmovdqu (%rsi), %ymm0
vmovdqu 32(%rsi), %ymm2
vmovdqu 64(%rsi), %ymm3
vmovdqu 96(%rsi), %ymm1
testq %rdx, %rdx
je L312
cmpq $1, %rdx
jne L42
movl $1, %eax
jmp L86
L42:
movl $1, %eax
subl %edx, %eax
leal 3(%rdx), %ecx
cmpq $1, %rdx
cmoveq %rax, %rcx
andl $3, %ecx
movq %rcx, %rax
negq %rax
cmpq $2, %rdx
cmovbq %rcx, %rax
addq %rdx, %rax
je L312
L86:
vpaddq %ymm0, %ymm1, %ymm4
vpsllq $17, %ymm2, %ymm6
vpxor %ymm0, %ymm3, %ymm3
vpxor %ymm2, %ymm1, %ymm7
vpxor %ymm2, %ymm3, %ymm1
vpxor %ymm0, %ymm7, %ymm5
vpxor %ymm6, %ymm3, %ymm3
vpsllq $45, %ymm7, %ymm2
vpsrlq $19, %ymm7, %ymm6
vpor %ymm2, %ymm6, %ymm2
movl $1, %edx
nopw %cs:(%rax,%rax)
L144:
vpsrlq $41, %ymm4, %ymm6
vpsllq $23, %ymm4, %ymm4
vpor %ymm4, %ymm6, %ymm4
vpaddq %ymm0, %ymm4, %ymm4
vmovdqa %ymm5, %ymm0
movq (%rdi), %rcx
vmovdqu %ymm4, -8(%rcx,%rdx,8)
cmpq %rdx, %rax
je L266
addq $4, %rdx
vpaddq %ymm0, %ymm2, %ymm4
vpsllq $17, %ymm1, %ymm6
vpxor %ymm0, %ymm3, %ymm3
vpxor %ymm1, %ymm2, %ymm2
vpxor %ymm1, %ymm3, %ymm1
vpxor %ymm0, %ymm2, %ymm5
vpxor %ymm6, %ymm3, %ymm3
vpsllq $45, %ymm2, %ymm6
vpsrlq $19, %ymm2, %ymm2
vpor %ymm6, %ymm2, %ymm2
jns L144
movabsq $140584241810544, %rax # imm = 0x7FDC51CB4C70
movabsq $140584171307264, %rdi # imm = 0x7FDC4D978100
movabsq $140584174586064, %rsi # imm = 0x7FDC4DC988D0
vzeroupper
callq *%rax
ud2
L266:
vpshufd $78, %ymm3, %ymm4 # ymm4 = ymm3[2,3,0,1,6,7,4,5]
vpblendd $240, %ymm4, %ymm1, %ymm4 # ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
vpshufd $78, %xmm3, %xmm3 # xmm3 = xmm3[2,3,0,1]
vpblendd $240, %ymm1, %ymm3, %ymm3 # ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7]
vpshufd $78, %ymm2, %ymm1 # ymm1 = ymm2[2,3,0,1,6,7,4,5]
vpblendd $240, %ymm1, %ymm0, %ymm5 # ymm5 = ymm0[0,1,2,3],ymm1[4,5,6,7]
vpshufd $78, %xmm2, %xmm1 # xmm1 = xmm2[2,3,0,1]
vpblendd $240, %ymm0, %ymm1, %ymm0 # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
jmp L356
L312:
vpshufd $78, %ymm3, %ymm4 # ymm4 = ymm3[2,3,0,1,6,7,4,5]
vpblendd $240, %ymm4, %ymm2, %ymm4 # ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7]
vpshufd $78, %xmm3, %xmm3 # xmm3 = xmm3[2,3,0,1]
vpblendd $240, %ymm2, %ymm3, %ymm3 # ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7]
vpshufd $78, %ymm1, %ymm2 # ymm2 = ymm1[2,3,0,1,6,7,4,5]
vpblendd $240, %ymm2, %ymm0, %ymm5 # ymm5 = ymm0[0,1,2,3],ymm2[4,5,6,7]
vpshufd $78, %xmm1, %xmm1 # xmm1 = xmm1[2,3,0,1]
vpblendd $240, %ymm0, %ymm1, %ymm0 # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
L356:
vpblendd $240, %ymm0, %ymm5, %ymm1 # ymm1 = ymm5[0,1,2,3],ymm0[4,5,6,7]
vmovdqu %ymm1, (%rsi)
vpblendd $240, %ymm3, %ymm4, %ymm1 # ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
vmovdqu %ymm1, 32(%rsi)
vpblendd $240, %ymm4, %ymm3, %ymm1 # ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7]
vpshufd $78, %ymm1, %ymm1 # ymm1 = ymm1[2,3,0,1,6,7,4,5]
vmovdqu %ymm1, 64(%rsi)
vpblendd $240, %ymm5, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
vpshufd $78, %ymm0, %ymm0 # ymm0 = ymm0[2,3,0,1,6,7,4,5]
vmovdqu %ymm0, 96(%rsi)
popq %rax
vzeroupper
retq
nop