Last active
July 21, 2017 03:24
-
-
Save quinnj/8d3ff09e43473979d87a4fb9c1b1f697 to your computer and use it in GitHub Desktop.
isbits Union array benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
julia> function f(v::Vector{Int8}, n) | |
s = Int8(0) | |
@inbounds for i = 1:n | |
s += v[i] | |
end | |
return s | |
end | |
f (generic function with 2 methods) | |
julia> function f(v::Vector{Union{Void, Int8}}, n) | |
s = Int8(0) | |
@inbounds for i = 1:n | |
t = Base.arrayref(v, i) | |
s += t isa Void ? Int8(0) : t | |
end | |
return s | |
end | |
f (generic function with 2 methods) | |
julia> using BenchmarkTools | |
julia> n = 100 | |
100 | |
julia> @benchmark f($(Vector{Int8}(n)), $n) | |
BenchmarkTools.Trial: | |
memory estimate: 0 bytes | |
allocs estimate: 0 | |
-------------- | |
minimum time: 5.468 ns (0.00% GC) | |
median time: 6.115 ns (0.00% GC) | |
mean time: 6.513 ns (0.00% GC) | |
maximum time: 97.110 ns (0.00% GC) | |
-------------- | |
samples: 10000 | |
evals/sample: 1000 | |
julia> V = Vector{Union{Void, Int8}}(n) | |
100-element Array{Union{Int8, Void},1}: | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
⋮ | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
nothing | |
julia> @benchmark f($(V), $n) | |
BenchmarkTools.Trial: | |
memory estimate: 0 bytes | |
allocs estimate: 0 | |
-------------- | |
minimum time: 99.514 ns (0.00% GC) | |
median time: 99.974 ns (0.00% GC) | |
mean time: 103.177 ns (0.00% GC) | |
maximum time: 219.872 ns (0.00% GC) | |
-------------- | |
samples: 10000 | |
evals/sample: 949 | |
julia> @code_llvm f(Vector{Int8}(10), 10) | |
; Function Attrs: sspstrong | |
define i8 @julia_f_68903(%jl_value_t addrspace(10)* dereferenceable(40), i64) #0 !dbg !5 { | |
L14: | |
%2 = icmp slt i64 %1, 1 | |
br i1 %2, label %L34, label %if1.lr.ph | |
if1.lr.ph: ; preds = %L14 | |
%3 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)* | |
%4 = bitcast %jl_value_t addrspace(11)* %3 to i8* addrspace(11)* | |
%5 = load i8*, i8* addrspace(11)* %4, align 8 | |
%min.iters.check = icmp ult i64 %1, 32 | |
br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked | |
min.iters.checked: ; preds = %if1.lr.ph | |
%n.vec = and i64 %1, -32 | |
%cmp.zero = icmp eq i64 %n.vec, 0 | |
%ind.end = or i64 %n.vec, 1 | |
br i1 %cmp.zero, label %scalar.ph, label %vector.ph | |
vector.ph: ; preds = %min.iters.checked | |
br label %vector.body | |
vector.body: ; preds = %vector.body, %vector.ph | |
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] | |
%6 = phi i64 [ 1, %vector.ph ], [ %7, %vector.body ] | |
%vec.phi = phi <32 x i8> [ zeroinitializer, %vector.ph ], [ %11, %vector.body ] | |
%7 = add i64 %6, 32 | |
%8 = add i64 %6, -1 | |
%9 = getelementptr i8, i8* %5, i64 %8 | |
%10 = bitcast i8* %9 to <32 x i8>* | |
%wide.load = load <32 x i8>, <32 x i8>* %10, align 1 | |
%11 = add <32 x i8> %wide.load, %vec.phi | |
%index.next = add i64 %index, 32 | |
%12 = icmp eq i64 %index.next, %n.vec | |
br i1 %12, label %middle.block, label %vector.body | |
middle.block: ; preds = %vector.body | |
%rdx.shuf = shufflevector <32 x i8> %11, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | |
%bin.rdx = add <32 x i8> %11, %rdx.shuf | |
%rdx.shuf7 = shufflevector <32 x i8> %bin.rdx, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | |
%bin.rdx8 = add <32 x i8> %bin.rdx, %rdx.shuf7 | |
%rdx.shuf9 = shufflevector <32 x i8> %bin.rdx8, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | |
%bin.rdx10 = add <32 x i8> %bin.rdx8, %rdx.shuf9 | |
%rdx.shuf11 = shufflevector <32 x i8> %bin.rdx10, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | |
%bin.rdx12 = add <32 x i8> %bin.rdx10, %rdx.shuf11 | |
%rdx.shuf13 = shufflevector <32 x i8> %bin.rdx12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | |
%bin.rdx14 = add <32 x i8> %bin.rdx12, %rdx.shuf13 | |
%13 = extractelement <32 x i8> %bin.rdx14, i32 0 | |
%cmp.n = icmp eq i64 %n.vec, %1 | |
br i1 %cmp.n, label %L34.loopexit, label %scalar.ph | |
scalar.ph: ; preds = %middle.block, %min.iters.checked, %if1.lr.ph | |
%bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %if1.lr.ph ], [ 1, %min.iters.checked ] | |
%bc.merge.rdx = phi i8 [ %13, %middle.block ], [ 0, %if1.lr.ph ], [ 0, %min.iters.checked ] | |
br label %if1 | |
if1: ; preds = %scalar.ph, %if1 | |
%"#temp#.06" = phi i64 [ %bc.resume.val, %scalar.ph ], [ %14, %if1 ] | |
%s.05 = phi i8 [ %bc.merge.rdx, %scalar.ph ], [ %18, %if1 ] | |
%14 = add i64 %"#temp#.06", 1 | |
%15 = add i64 %"#temp#.06", -1 | |
%16 = getelementptr i8, i8* %5, i64 %15 | |
%17 = load i8, i8* %16, align 1 | |
%18 = add i8 %17, %s.05 | |
%19 = icmp eq i64 %"#temp#.06", %1 | |
br i1 %19, label %L34.loopexit, label %if1 | |
L34.loopexit: ; preds = %middle.block, %if1 | |
%.lcssa = phi i8 [ %18, %if1 ], [ %13, %middle.block ] | |
br label %L34 | |
L34: ; preds = %L34.loopexit, %L14 | |
%s.0.lcssa = phi i8 [ 0, %L14 ], [ %.lcssa, %L34.loopexit ] | |
ret i8 %s.0.lcssa | |
} | |
julia> @code_llvm f(Vector{Union{Void, Int8}}(10), 10) | |
; Function Attrs: sspstrong | |
define i8 @julia_f_68893(%jl_value_t addrspace(10)* dereferenceable(40), i64) #0 !dbg !5 { | |
L14: | |
%2 = icmp slt i64 %1, 1 | |
br i1 %2, label %L62, label %if4.lr.ph | |
if4.lr.ph: ; preds = %L14 | |
%3 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)* | |
%4 = bitcast %jl_value_t addrspace(11)* %3 to i8* addrspace(11)* | |
%5 = load i8*, i8* addrspace(11)* %4, align 8 | |
%6 = bitcast %jl_value_t addrspace(11)* %3 to %jl_array_t addrspace(11)* | |
%7 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %6, i64 0, i32 1 | |
%8 = load i64, i64 addrspace(11)* %7, align 8 | |
br label %if4 | |
if4: ; preds = %if4.lr.ph, %post_union_move | |
%t.sroa.0.015 = phi i8 [ undef, %if4.lr.ph ], [ %t.sroa.0.1, %post_union_move ] | |
%"#temp#.014" = phi i64 [ 1, %if4.lr.ph ], [ %16, %post_union_move ] | |
%s.013 = phi i8 [ 0, %if4.lr.ph ], [ %17, %post_union_move ] | |
%9 = add i64 %"#temp#.014", -1 | |
%10 = add i64 %8, %9 | |
%11 = getelementptr i8, i8* %5, i64 %10 | |
%12 = load i8, i8* %11, align 1 | |
%13 = add nuw i8 %12, 1 | |
%trunc = trunc i8 %13 to i7 | |
switch i7 %trunc, label %union_move_skip [ | |
i7 1, label %post_union_move | |
i7 2, label %union_move5 | |
] | |
L62.loopexit: ; preds = %post_union_move | |
br label %L62 | |
L62: ; preds = %L62.loopexit, %L14 | |
%s.0.lcssa = phi i8 [ 0, %L14 ], [ %17, %L62.loopexit ] | |
ret i8 %s.0.lcssa | |
union_move_skip: ; preds = %if4 | |
call void @llvm.trap() | |
unreachable | |
post_union_move: ; preds = %if4, %union_move5 | |
%t.sroa.0.1 = phi i8 [ %20, %union_move5 ], [ %t.sroa.0.015, %if4 ] | |
%14 = and i8 %13, 127 | |
%15 = icmp eq i8 %14, 1 | |
%.t.sroa.0.1 = select i1 %15, i8 0, i8 %t.sroa.0.1 | |
%16 = add i64 %"#temp#.014", 1 | |
%17 = add i8 %.t.sroa.0.1, %s.013 | |
%18 = icmp eq i64 %"#temp#.014", %1 | |
br i1 %18, label %L62.loopexit, label %if4 | |
union_move5: ; preds = %if4 | |
%19 = getelementptr i8, i8* %5, i64 %9 | |
%20 = load i8, i8* %19, align 1 | |
br label %post_union_move | |
} | |
julia> |
@vtjnash here's my attempt at doing eager loading; I guess I don't have a great sense for how expensive doing a load is. It currently outperforms the LLVM generated from codegen on my jq/array-union-bits
, but fails to SIMD. Do you know what would preventing the SIMD-ification here? too many loads?
function f3(a, n)
s = Int8(0)
mx = convert(Ptr{UInt8}, pointer(a)) + length(a) - 1
el = convert(Ptr{Int8}, pointer(a)) - 1
@inbounds for i = 1:n
sel = unsafe_load(mx + i)
val2 = unsafe_load(el + i)
s += ifelse(sel == 0x00, Int8(0), val2)
end
return s
end
julia> @code_llvm f3(V, n)
define i8 @julia_f3_63023(%jl_value_t addrspace(10)* dereferenceable(40), i64) #0 !dbg !5 {
L14:
%2 = icmp slt i64 %1, 1
br i1 %2, label %L64, label %L55.lr.ph
L55.lr.ph: ; preds = %L14
%3 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
%4 = bitcast %jl_value_t addrspace(11)* %3 to i64 addrspace(11)*
%5 = load i64, i64 addrspace(11)* %4, align 8
%6 = add i64 %5, -1
%7 = bitcast %jl_value_t addrspace(11)* %3 to %jl_array_t addrspace(11)*
%8 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %7, i64 0, i32 1
%9 = load i64, i64 addrspace(11)* %8, align 8
%10 = add i64 %6, %9
br label %L55
L64.loopexit: ; preds = %L55
br label %L64
L64: ; preds = %L64.loopexit, %L14
%s.0.lcssa = phi i8 [ 0, %L14 ], [ %20, %L64.loopexit ]
ret i8 %s.0.lcssa
L55: ; preds = %L55.lr.ph, %L55
%"#temp#.010" = phi i64 [ 1, %L55.lr.ph ], [ %11, %L55 ]
%s.09 = phi i8 [ 0, %L55.lr.ph ], [ %20, %L55 ]
%11 = add i64 %"#temp#.010", 1
%12 = add i64 %6, %"#temp#.010"
%13 = inttoptr i64 %12 to i8*
%14 = load i8, i8* %13, align 1
%15 = add i64 %10, %"#temp#.010"
%16 = inttoptr i64 %15 to i8*
%17 = load i8, i8* %16, align 1
%18 = icmp ne i8 %17, 0
%19 = select i1 %18, i8 %14, i8 0
%20 = add i8 %19, %s.09
%21 = icmp eq i64 %"#temp#.010", %1
br i1 %21, label %L64.loopexit, label %L55
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
A better comparison would be:
since it's not quite fair to have a branch in one case but not the other