-
-
Save toivoh/3cd3481dc66d9224811f to your computer and use it in GitHub Desktop.
# Experiment for comparing the machine code Julia generates for a tiny
# two-element read-modify-write (dest[i] = dest[i] XOR src[i] for i = 1, 2),
# depending on whether memory is accessed through raw pointers or through
# array indexing. Loading the module prints `code_native` for every variant.
#
# Note: the code targets the Julia version this experiment was written for,
# where `$` is bitwise xor, `Uint64` is the 64-bit unsigned integer type and
# method type parameters are written `f{T}(...)`.
module TestSLP

# Variant 1: raw pointers for both loads and stores.
#
# XORs the first two elements behind `src` into the first two elements behind
# `dest`, in place. Both pointers must reference at least two valid elements;
# `unsafe_load` / `unsafe_store!` perform no checking. Returns `nothing`.
function rmw!{T}(dest::Ptr{T}, src::Ptr{T})
    # All four loads are issued before any store.
    s1 = unsafe_load(src, 1)
    s2 = unsafe_load(src, 2)
    d1 = unsafe_load(dest, 1)
    d2 = unsafe_load(dest, 2)
    d1 $= s1    # `$=` is xor-assignment
    d2 $= s2
    unsafe_store!(dest, d1, 1)
    unsafe_store!(dest, d2, 2)
    nothing
end

# Variant 2: takes vectors, but fetches both data pointers once up front and
# then does every load and store through those pointers.
#
# Same in-place XOR as `rmw!`. Both vectors must hold at least two elements;
# nothing is bounds-checked. Returns `nothing`.
function rmw2!{T}(dest::Vector{T}, src::Vector{T})
    psrc, pdest = pointer(src), pointer(dest)
    s1 = unsafe_load(psrc, 1)
    s2 = unsafe_load(psrc, 2)
    d1 = unsafe_load(pdest, 1)
    d2 = unsafe_load(pdest, 2)
    d1 $= s1
    d2 $= s2
    unsafe_store!(pdest, d1, 1)
    unsafe_store!(pdest, d2, 2)
    nothing
end

# Variant 2b: loads through `@inbounds` array indexing, stores through a
# pointer obtained after the loads.
#
# Same in-place XOR as `rmw!`. Both vectors must hold at least two elements;
# `@inbounds` and `unsafe_store!` skip the checks. Returns `nothing`.
function rmw2b!{T}(dest::Vector{T}, src::Vector{T})
    @inbounds s1 = src[1]
    @inbounds s2 = src[2]
    @inbounds d1 = dest[1]
    @inbounds d2 = dest[2]
    d1 $= s1
    d2 $= s2
    # Only the stores go through the raw pointer in this variant.
    pdest = pointer(dest)
    unsafe_store!(pdest, d1, 1)
    unsafe_store!(pdest, d2, 2)
    nothing
end

# Variant 3: `@inbounds` array indexing for both loads and stores; no explicit
# pointers anywhere.
#
# Same in-place XOR as `rmw!`. Both vectors must hold at least two elements,
# since `@inbounds` disables the bounds checks. Returns `nothing`.
function rmw3!{T}(dest::Vector{T}, src::Vector{T})
    @inbounds s1 = src[1]
    @inbounds s2 = src[2]
    @inbounds d1 = dest[1]
    @inbounds d2 = dest[2]
    d1 $= s1
    d2 $= s2
    @inbounds dest[1] = d1
    @inbounds dest[2] = d2
    nothing
end

# Driver: dump the native code of each variant, specialized for Uint64, so the
# listings can be compared side by side.
println("\n\nrmw!: ")
code_native(rmw!, (Ptr{Uint64}, Ptr{Uint64}))

println("\n\nrmw2!: ")
code_native(rmw2!, (Vector{Uint64}, Vector{Uint64}))

println("\n\nrmw2b!: ")
code_native(rmw2b!, (Vector{Uint64}, Vector{Uint64}))

println("\n\nrmw3!: ")
code_native(rmw3!, (Vector{Uint64}, Vector{Uint64}))

end # module
Using LLVM 3.5.0 with branch adr/slpvector
appears to vectorize the second example:
julia> code_native(rmw2!, (Vector{Uint64}, Vector{Uint64}))
.text
Filename: slp.jl
Source line: 0
push rbp
mov rbp, rsp
Source line: 14
mov rax, qword ptr [rsi + 8]
mov rcx, qword ptr [rdi + 8]
Source line: 17
vmovups xmm0, xmmword ptr [rcx]
Source line: 19
vxorps xmm0, xmm0, xmmword ptr [rax]
Source line: 21
vmovups xmmword ptr [rcx], xmm0
Source line: 23
pop rbp
ret
The third example fails to vectorize with the branch and LLVM 3.5.0. It doesn't appear to be an aliasing issue, since the loads/stores have the right `tbaa` tags:
julia> code_llvm(rmw3!, (Vector{Uint64}, Vector{Uint64}))
define void @"julia_rmw3!40180"(%jl_value_t*, %jl_value_t*) {
top:
%2 = getelementptr inbounds %jl_value_t* %1, i64 1, !dbg !8
%3 = bitcast %jl_value_t* %2 to i8**, !dbg !8
%4 = load i8** %3, align 8, !dbg !8, !tbaa %jtbaa_arrayptr
%5 = bitcast i8* %4 to i64*, !dbg !8
%6 = load i64* %5, align 8, !dbg !8, !tbaa %jtbaa_user, !julia_type !15
%7 = getelementptr i8* %4, i64 8, !dbg !16
%8 = bitcast i8* %7 to i64*, !dbg !16
%9 = load i64* %8, align 8, !dbg !16, !tbaa %jtbaa_user, !julia_type !15
%10 = getelementptr inbounds %jl_value_t* %0, i64 1, !dbg !17
%11 = bitcast %jl_value_t* %10 to i8**, !dbg !17
%12 = load i8** %11, align 8, !dbg !17, !tbaa %jtbaa_arrayptr
%13 = bitcast i8* %12 to i64*, !dbg !17
%14 = load i64* %13, align 8, !dbg !17, !tbaa %jtbaa_user, !julia_type !15
%15 = getelementptr i8* %12, i64 8, !dbg !18
%16 = bitcast i8* %15 to i64*, !dbg !18
%17 = load i64* %16, align 8, !dbg !18, !tbaa %jtbaa_user, !julia_type !15
%18 = xor i64 %14, %6, !dbg !19, !julia_type !15
%19 = xor i64 %17, %9, !dbg !20, !julia_type !15
store i64 %18, i64* %13, align 8, !dbg !21, !tbaa %jtbaa_user
%20 = load i8** %11, align 8, !dbg !22, !tbaa %jtbaa_arrayptr
%21 = getelementptr i8* %20, i64 8, !dbg !22
%22 = bitcast i8* %21 to i64*, !dbg !22
store i64 %19, i64* %22, align 8, !dbg !22, !tbaa %jtbaa_user
ret void, !dbg !23
}
I'll poke around some more.
Nice! I don't understand the tbaa tags yet, perhaps I'll read up on them.
The problem seems to have something to do with the stores, though: with LLVM 3.5.0 and branch adr/slpvector I can also vectorize `rmw2b!` (just added above), which uses `@inbounds` for the loads but `unsafe_store!` for the stores. I tried to look at the difference in `code_llvm` output between `rmw2b!` and `rmw3!`, but the main difference I see is that the first has been vectorized :) Maybe there should be a version of `code_llvm` that gives you the LLVM code before optimization.
I've been trying to make Julia emit some reasonable SIMD loads/stores using the branch JuliaLang/julia#6271.

The `rmw!` function seems to manage it by working directly with pointers:

However, just taking the vectors as arguments and taking the pointers up front as in `rmw!` seems to thwart the vectorizer, leaving us completely without SIMD instructions:

The difference between `rmw2!` and `rmw3!` is very small, though the latter works with the vectors all the way (and `@inbounds`
):

There is one difference: in the last case, the code feels the need to reload the array's data pointer at the end, even though the address was already available from the load up top. Presumably this is because it feels that the previous store might have changed the pointer.