To compile as a shared library:
    clang -Ofast xoshiro256plusplus.c -march=native -shared -fpic
To see the generated code (and weep), emit assembly or LLVM IR:
    gcc -Ofast xoshiro256plusplus.c -march=native -S
    clang -Ofast xoshiro256plusplus.c -march=native -emit-llvm -S
Output of julia ./xoshiro256plusplus.jl
on my admittedly anemic machine:
julia dsfmt 1024 x UInt64
2.209 μs (0 allocations: 0 bytes)
ref impl, 1 x interleaved, 1024 x UInt64
1.579 μs (0 allocations: 0 bytes)
ref impl, 4 x interleaved, 1024 x UInt64
2.151 μs (0 allocations: 0 bytes)
julia 1 x interleaved 1 x SIMD, 1024 x UInt64
4.333 μs (0 allocations: 0 bytes)
1.827 μs (0 allocations: 0 bytes)
2.269 μs (0 allocations: 0 bytes)
julia 1 x interleaved 2 x SIMD, 1024 x UInt64
2.709 μs (0 allocations: 0 bytes)
1.377 μs (0 allocations: 0 bytes)
4.158 μs (0 allocations: 0 bytes)
julia 1 x interleaved 4 x SIMD, 1024 x UInt64
1.479 μs (0 allocations: 0 bytes)
704.133 ns (0 allocations: 0 bytes)
2.093 μs (0 allocations: 0 bytes)
julia 1 x interleaved 8 x SIMD, 1024 x UInt64
1.072 μs (0 allocations: 0 bytes)
601.562 ns (0 allocations: 0 bytes)
2.031 μs (0 allocations: 0 bytes)
julia 1 x interleaved 16 x SIMD, 1024 x UInt64
1.582 μs (0 allocations: 0 bytes)
671.601 ns (0 allocations: 0 bytes)
1.966 μs (0 allocations: 0 bytes)
Generated xoshiro code is beautiful:
julia> code_llvm(XfillSimd_nostore, (Vector{UInt64}, xstate{4}, UInt64); debuginfo=:none)
define void @julia_XfillSimd_nostore_16539(%jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40), %jl_value_t addrspace(10)* nonnull align 16 dereferenceable(128), i64) {
top:
%3 = addrspacecast %jl_value_t addrspace(10)* %1 to %jl_value_t addrspace(11)*
%4 = bitcast %jl_value_t addrspace(11)* %3 to <4 x i64> addrspace(11)*
%.unpack = load <4 x i64>, <4 x i64> addrspace(11)* %4, align 16
%5 = bitcast %jl_value_t addrspace(11)* %3 to i8 addrspace(11)*
%6 = getelementptr inbounds i8, i8 addrspace(11)* %5, i64 32
%7 = bitcast i8 addrspace(11)* %6 to <4 x i64> addrspace(11)*
%.unpack244 = load <4 x i64>, <4 x i64> addrspace(11)* %7, align 16
%8 = getelementptr inbounds i8, i8 addrspace(11)* %5, i64 64
%9 = bitcast i8 addrspace(11)* %8 to <4 x i64> addrspace(11)*
%.unpack245 = load <4 x i64>, <4 x i64> addrspace(11)* %9, align 16
%10 = getelementptr inbounds i8, i8 addrspace(11)* %5, i64 96
%11 = bitcast i8 addrspace(11)* %10 to <4 x i64> addrspace(11)*
%.unpack246 = load <4 x i64>, <4 x i64> addrspace(11)* %11, align 16
switch i64 %2, label %L38 [
i64 1, label %L47.L52_crit_edge
i64 0, label %L47.L211_crit_edge
]
L38: ; preds = %top
%12 = icmp eq i64 %2, 1
%13 = sub i64 1, %2
%14 = add i64 %2, 3
%value_phi12 = select i1 %12, i64 %13, i64 %14
%15 = and i64 %value_phi12, 3
%16 = icmp ult i64 %2, 2
%17 = sub nsw i64 0, %15
%value_phi14.p = select i1 %16, i64 %15, i64 %17
%value_phi14 = add i64 %value_phi14.p, %2
%18 = icmp eq i64 %value_phi14, 0
br i1 %18, label %L47.L211_crit_edge, label %L47.L52_crit_edge
L47.L211_crit_edge: ; preds = %top, %L38
%19 = shufflevector <4 x i64> %.unpack244, <4 x i64> %.unpack245, <4 x i32> <i32 0, i32 1, i32 7, i32 6>
%20 = shufflevector <4 x i64> %.unpack245, <4 x i64> %.unpack244, <4 x i32> <i32 1, i32 0, i32 6, i32 7>
%21 = shufflevector <4 x i64> %.unpack, <4 x i64> %.unpack246, <4 x i32> <i32 0, i32 1, i32 7, i32 6>
%22 = shufflevector <4 x i64> %.unpack246, <4 x i64> %.unpack, <4 x i32> <i32 1, i32 0, i32 6, i32 7>
br label %L211
L47.L52_crit_edge: ; preds = %top, %L38
%value_phi248 = phi i64 [ %value_phi14, %L38 ], [ 1, %top ]
%res.i254 = add <4 x i64> %.unpack246, %.unpack
%tmp.i239255 = shl <4 x i64> %.unpack244, <i64 17, i64 17, i64 17, i64 17>
%res.i238256 = xor <4 x i64> %.unpack245, %.unpack
%res.i237257 = xor <4 x i64> %.unpack246, %.unpack244
%res.i236258 = xor <4 x i64> %res.i238256, %.unpack244
%res.i235259 = xor <4 x i64> %res.i237257, %.unpack
%res.i234260 = xor <4 x i64> %res.i238256, %tmp.i239255
%tmp.i233261 = shl <4 x i64> %res.i237257, <i64 45, i64 45, i64 45, i64 45>
%tmp.i262 = lshr <4 x i64> %res.i237257, <i64 19, i64 19, i64 19, i64 19>
%res.i232263 = or <4 x i64> %tmp.i262, %tmp.i233261
%23 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
%24 = addrspacecast %jl_value_t addrspace(11)* %23 to %jl_value_t*
%25 = bitcast %jl_value_t* %24 to i8**
br label %L145
L138: ; preds = %L210
call void @julia_throw_inexacterror_14971(%jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140584171307264 to %jl_value_t*) to %jl_value_t addrspace(10)*), %jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140584174586064 to %jl_value_t*) to %jl_value_t addrspace(10)*), i64 %35)
call void @llvm.trap()
unreachable
L145: ; preds = %L47.L52_crit_edge, %L210
%res.i232270 = phi <4 x i64> [ %res.i232263, %L47.L52_crit_edge ], [ %res.i232, %L210 ]
%res.i234269 = phi <4 x i64> [ %res.i234260, %L47.L52_crit_edge ], [ %res.i234, %L210 ]
%res.i235268 = phi <4 x i64> [ %res.i235259, %L47.L52_crit_edge ], [ %res.i235, %L210 ]
%res.i236267 = phi <4 x i64> [ %res.i236258, %L47.L52_crit_edge ], [ %res.i236, %L210 ]
%res.i266 = phi <4 x i64> [ %res.i254, %L47.L52_crit_edge ], [ %res.i, %L210 ]
%value_phi4265 = phi i64 [ 1, %L47.L52_crit_edge ], [ %35, %L210 ]
%.sroa.0193.0264 = phi <4 x i64> [ %.unpack, %L47.L52_crit_edge ], [ %res.i235268, %L210 ]
%tmp.i242 = lshr <4 x i64> %res.i266, <i64 41, i64 41, i64 41, i64 41>
%tmp.i243 = shl <4 x i64> %res.i266, <i64 23, i64 23, i64 23, i64 23>
%res.i241 = or <4 x i64> %tmp.i242, %tmp.i243
%res.i240 = add <4 x i64> %res.i241, %.sroa.0193.0264
%26 = load i8*, i8** %25, align 8
%27 = shl i64 %value_phi4265, 3
%28 = add nsw i64 %27, -8
%29 = getelementptr i8, i8* %26, i64 %28
%ptr.i = bitcast i8* %29 to <4 x i64>*
store <4 x i64> %res.i240, <4 x i64>* %ptr.i, align 8
%30 = icmp eq i64 %value_phi4265, %value_phi248
br i1 %30, label %L205.L211_crit_edge, label %L210
L205.L211_crit_edge: ; preds = %L145
%31 = shufflevector <4 x i64> %res.i236267, <4 x i64> %res.i234269, <4 x i32> <i32 0, i32 1, i32 7, i32 6>
%32 = shufflevector <4 x i64> %res.i234269, <4 x i64> %res.i236267, <4 x i32> <i32 1, i32 0, i32 6, i32 7>
%33 = shufflevector <4 x i64> %res.i235268, <4 x i64> %res.i232270, <4 x i32> <i32 0, i32 1, i32 7, i32 6>
%34 = shufflevector <4 x i64> %res.i232270, <4 x i64> %res.i235268, <4 x i32> <i32 1, i32 0, i32 6, i32 7>
br label %L211
L210: ; preds = %L145
%35 = add nuw i64 %value_phi4265, 4
%res.i = add <4 x i64> %res.i232270, %res.i235268
%tmp.i239 = shl <4 x i64> %res.i236267, <i64 17, i64 17, i64 17, i64 17>
%res.i238 = xor <4 x i64> %res.i234269, %res.i235268
%res.i237 = xor <4 x i64> %res.i232270, %res.i236267
%res.i236 = xor <4 x i64> %res.i238, %res.i236267
%res.i235 = xor <4 x i64> %res.i237, %res.i235268
%res.i234 = xor <4 x i64> %res.i238, %tmp.i239
%tmp.i233 = shl <4 x i64> %res.i237, <i64 45, i64 45, i64 45, i64 45>
%tmp.i = lshr <4 x i64> %res.i237, <i64 19, i64 19, i64 19, i64 19>
%res.i232 = or <4 x i64> %tmp.i, %tmp.i233
%36 = icmp sgt i64 %35, -1
br i1 %36, label %L145, label %L138
L211: ; preds = %L47.L211_crit_edge, %L205.L211_crit_edge
%37 = phi <4 x i64> [ %19, %L47.L211_crit_edge ], [ %31, %L205.L211_crit_edge ]
%38 = phi <4 x i64> [ %20, %L47.L211_crit_edge ], [ %32, %L205.L211_crit_edge ]
%39 = phi <4 x i64> [ %21, %L47.L211_crit_edge ], [ %33, %L205.L211_crit_edge ]
%40 = phi <4 x i64> [ %22, %L47.L211_crit_edge ], [ %34, %L205.L211_crit_edge ]
%41 = shufflevector <4 x i64> %39, <4 x i64> %40, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
%42 = bitcast %jl_value_t addrspace(10)* %1 to <4 x i64> addrspace(10)*
store <4 x i64> %41, <4 x i64> addrspace(10)* %42, align 16
%43 = shufflevector <4 x i64> %37, <4 x i64> %38, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
store <4 x i64> %43, <4 x i64> addrspace(11)* %7, align 16
%44 = shufflevector <4 x i64> %38, <4 x i64> %37, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
store <4 x i64> %44, <4 x i64> addrspace(11)* %9, align 16
%45 = shufflevector <4 x i64> %40, <4 x i64> %39, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
store <4 x i64> %45, <4 x i64> addrspace(11)* %11, align 16
ret void
}
and
julia> code_native(XfillSimd_nostore, (Vector{UInt64}, xstate{4}, UInt64); debuginfo=:none)
.text
pushq %rax
vmovdqu (%rsi), %ymm0
vmovdqu 32(%rsi), %ymm2
vmovdqu 64(%rsi), %ymm3
vmovdqu 96(%rsi), %ymm1
testq %rdx, %rdx
je L312
cmpq $1, %rdx
jne L42
movl $1, %eax
jmp L86
L42:
movl $1, %eax
subl %edx, %eax
leal 3(%rdx), %ecx
cmpq $1, %rdx
cmoveq %rax, %rcx
andl $3, %ecx
movq %rcx, %rax
negq %rax
cmpq $2, %rdx
cmovbq %rcx, %rax
addq %rdx, %rax
je L312
L86:
vpaddq %ymm0, %ymm1, %ymm4
vpsllq $17, %ymm2, %ymm6
vpxor %ymm0, %ymm3, %ymm3
vpxor %ymm2, %ymm1, %ymm7
vpxor %ymm2, %ymm3, %ymm1
vpxor %ymm0, %ymm7, %ymm5
vpxor %ymm6, %ymm3, %ymm3
vpsllq $45, %ymm7, %ymm2
vpsrlq $19, %ymm7, %ymm6
vpor %ymm2, %ymm6, %ymm2
movl $1, %edx
nopw %cs:(%rax,%rax)
L144:
vpsrlq $41, %ymm4, %ymm6
vpsllq $23, %ymm4, %ymm4
vpor %ymm4, %ymm6, %ymm4
vpaddq %ymm0, %ymm4, %ymm4
vmovdqa %ymm5, %ymm0
movq (%rdi), %rcx
vmovdqu %ymm4, -8(%rcx,%rdx,8)
cmpq %rdx, %rax
je L266
addq $4, %rdx
vpaddq %ymm0, %ymm2, %ymm4
vpsllq $17, %ymm1, %ymm6
vpxor %ymm0, %ymm3, %ymm3
vpxor %ymm1, %ymm2, %ymm2
vpxor %ymm1, %ymm3, %ymm1
vpxor %ymm0, %ymm2, %ymm5
vpxor %ymm6, %ymm3, %ymm3
vpsllq $45, %ymm2, %ymm6
vpsrlq $19, %ymm2, %ymm2
vpor %ymm6, %ymm2, %ymm2
jns L144
movabsq $140584241810544, %rax # imm = 0x7FDC51CB4C70
movabsq $140584171307264, %rdi # imm = 0x7FDC4D978100
movabsq $140584174586064, %rsi # imm = 0x7FDC4DC988D0
vzeroupper
callq *%rax
ud2
L266:
vpshufd $78, %ymm3, %ymm4 # ymm4 = ymm3[2,3,0,1,6,7,4,5]
vpblendd $240, %ymm4, %ymm1, %ymm4 # ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
vpshufd $78, %xmm3, %xmm3 # xmm3 = xmm3[2,3,0,1]
vpblendd $240, %ymm1, %ymm3, %ymm3 # ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7]
vpshufd $78, %ymm2, %ymm1 # ymm1 = ymm2[2,3,0,1,6,7,4,5]
vpblendd $240, %ymm1, %ymm0, %ymm5 # ymm5 = ymm0[0,1,2,3],ymm1[4,5,6,7]
vpshufd $78, %xmm2, %xmm1 # xmm1 = xmm2[2,3,0,1]
vpblendd $240, %ymm0, %ymm1, %ymm0 # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
jmp L356
L312:
vpshufd $78, %ymm3, %ymm4 # ymm4 = ymm3[2,3,0,1,6,7,4,5]
vpblendd $240, %ymm4, %ymm2, %ymm4 # ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7]
vpshufd $78, %xmm3, %xmm3 # xmm3 = xmm3[2,3,0,1]
vpblendd $240, %ymm2, %ymm3, %ymm3 # ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7]
vpshufd $78, %ymm1, %ymm2 # ymm2 = ymm1[2,3,0,1,6,7,4,5]
vpblendd $240, %ymm2, %ymm0, %ymm5 # ymm5 = ymm0[0,1,2,3],ymm2[4,5,6,7]
vpshufd $78, %xmm1, %xmm1 # xmm1 = xmm1[2,3,0,1]
vpblendd $240, %ymm0, %ymm1, %ymm0 # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
L356:
vpblendd $240, %ymm0, %ymm5, %ymm1 # ymm1 = ymm5[0,1,2,3],ymm0[4,5,6,7]
vmovdqu %ymm1, (%rsi)
vpblendd $240, %ymm3, %ymm4, %ymm1 # ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
vmovdqu %ymm1, 32(%rsi)
vpblendd $240, %ymm4, %ymm3, %ymm1 # ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7]
vpshufd $78, %ymm1, %ymm1 # ymm1 = ymm1[2,3,0,1,6,7,4,5]
vmovdqu %ymm1, 64(%rsi)
vpblendd $240, %ymm5, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
vpshufd $78, %ymm0, %ymm0 # ymm0 = ymm0[2,3,0,1,6,7,4,5]
vmovdqu %ymm0, 96(%rsi)
popq %rax
vzeroupper
retq
nop