Skip to content

Instantly share code, notes, and snippets.

@quinnj
Last active July 21, 2017 03:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save quinnj/8d3ff09e43473979d87a4fb9c1b1f697 to your computer and use it in GitHub Desktop.
Save quinnj/8d3ff09e43473979d87a4fb9c1b1f697 to your computer and use it in GitHub Desktop.
isbits Union array benchmark
julia> function f(v::Vector{Int8}, n)
s = Int8(0)
@inbounds for i = 1:n
s += v[i]
end
return s
end
f (generic function with 2 methods)
julia> function f(v::Vector{Union{Void, Int8}}, n)
s = Int8(0)
@inbounds for i = 1:n
t = Base.arrayref(v, i)
s += t isa Void ? Int8(0) : t
end
return s
end
f (generic function with 2 methods)
julia> using BenchmarkTools
julia> n = 100
100
julia> @benchmark f($(Vector{Int8}(n)), $n)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 5.468 ns (0.00% GC)
median time: 6.115 ns (0.00% GC)
mean time: 6.513 ns (0.00% GC)
maximum time: 97.110 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> V = Vector{Union{Void, Int8}}(n)
100-element Array{Union{Int8, Void},1}:
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
nothing
julia> @benchmark f($(V), $n)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 99.514 ns (0.00% GC)
median time: 99.974 ns (0.00% GC)
mean time: 103.177 ns (0.00% GC)
maximum time: 219.872 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 949
julia> @code_llvm f(Vector{Int8}(10), 10)
; Function Attrs: sspstrong
define i8 @julia_f_68903(%jl_value_t addrspace(10)* dereferenceable(40), i64) #0 !dbg !5 {
L14:
%2 = icmp slt i64 %1, 1
br i1 %2, label %L34, label %if1.lr.ph
if1.lr.ph: ; preds = %L14
%3 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
%4 = bitcast %jl_value_t addrspace(11)* %3 to i8* addrspace(11)*
%5 = load i8*, i8* addrspace(11)* %4, align 8
%min.iters.check = icmp ult i64 %1, 32
br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked
min.iters.checked: ; preds = %if1.lr.ph
%n.vec = and i64 %1, -32
%cmp.zero = icmp eq i64 %n.vec, 0
%ind.end = or i64 %n.vec, 1
br i1 %cmp.zero, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %min.iters.checked
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%6 = phi i64 [ 1, %vector.ph ], [ %7, %vector.body ]
%vec.phi = phi <32 x i8> [ zeroinitializer, %vector.ph ], [ %11, %vector.body ]
%7 = add i64 %6, 32
%8 = add i64 %6, -1
%9 = getelementptr i8, i8* %5, i64 %8
%10 = bitcast i8* %9 to <32 x i8>*
%wide.load = load <32 x i8>, <32 x i8>* %10, align 1
%11 = add <32 x i8> %wide.load, %vec.phi
%index.next = add i64 %index, 32
%12 = icmp eq i64 %index.next, %n.vec
br i1 %12, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%rdx.shuf = shufflevector <32 x i8> %11, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx = add <32 x i8> %11, %rdx.shuf
%rdx.shuf7 = shufflevector <32 x i8> %bin.rdx, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = add <32 x i8> %bin.rdx, %rdx.shuf7
%rdx.shuf9 = shufflevector <32 x i8> %bin.rdx8, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx10 = add <32 x i8> %bin.rdx8, %rdx.shuf9
%rdx.shuf11 = shufflevector <32 x i8> %bin.rdx10, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx12 = add <32 x i8> %bin.rdx10, %rdx.shuf11
%rdx.shuf13 = shufflevector <32 x i8> %bin.rdx12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx14 = add <32 x i8> %bin.rdx12, %rdx.shuf13
%13 = extractelement <32 x i8> %bin.rdx14, i32 0
%cmp.n = icmp eq i64 %n.vec, %1
br i1 %cmp.n, label %L34.loopexit, label %scalar.ph
scalar.ph: ; preds = %middle.block, %min.iters.checked, %if1.lr.ph
%bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %if1.lr.ph ], [ 1, %min.iters.checked ]
%bc.merge.rdx = phi i8 [ %13, %middle.block ], [ 0, %if1.lr.ph ], [ 0, %min.iters.checked ]
br label %if1
if1: ; preds = %scalar.ph, %if1
%"#temp#.06" = phi i64 [ %bc.resume.val, %scalar.ph ], [ %14, %if1 ]
%s.05 = phi i8 [ %bc.merge.rdx, %scalar.ph ], [ %18, %if1 ]
%14 = add i64 %"#temp#.06", 1
%15 = add i64 %"#temp#.06", -1
%16 = getelementptr i8, i8* %5, i64 %15
%17 = load i8, i8* %16, align 1
%18 = add i8 %17, %s.05
%19 = icmp eq i64 %"#temp#.06", %1
br i1 %19, label %L34.loopexit, label %if1
L34.loopexit: ; preds = %middle.block, %if1
%.lcssa = phi i8 [ %18, %if1 ], [ %13, %middle.block ]
br label %L34
L34: ; preds = %L34.loopexit, %L14
%s.0.lcssa = phi i8 [ 0, %L14 ], [ %.lcssa, %L34.loopexit ]
ret i8 %s.0.lcssa
}
julia> @code_llvm f(Vector{Union{Void, Int8}}(10), 10)
; Function Attrs: sspstrong
define i8 @julia_f_68893(%jl_value_t addrspace(10)* dereferenceable(40), i64) #0 !dbg !5 {
L14:
%2 = icmp slt i64 %1, 1
br i1 %2, label %L62, label %if4.lr.ph
if4.lr.ph: ; preds = %L14
%3 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
%4 = bitcast %jl_value_t addrspace(11)* %3 to i8* addrspace(11)*
%5 = load i8*, i8* addrspace(11)* %4, align 8
%6 = bitcast %jl_value_t addrspace(11)* %3 to %jl_array_t addrspace(11)*
%7 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %6, i64 0, i32 1
%8 = load i64, i64 addrspace(11)* %7, align 8
br label %if4
if4: ; preds = %if4.lr.ph, %post_union_move
%t.sroa.0.015 = phi i8 [ undef, %if4.lr.ph ], [ %t.sroa.0.1, %post_union_move ]
%"#temp#.014" = phi i64 [ 1, %if4.lr.ph ], [ %16, %post_union_move ]
%s.013 = phi i8 [ 0, %if4.lr.ph ], [ %17, %post_union_move ]
%9 = add i64 %"#temp#.014", -1
%10 = add i64 %8, %9
%11 = getelementptr i8, i8* %5, i64 %10
%12 = load i8, i8* %11, align 1
%13 = add nuw i8 %12, 1
%trunc = trunc i8 %13 to i7
switch i7 %trunc, label %union_move_skip [
i7 1, label %post_union_move
i7 2, label %union_move5
]
L62.loopexit: ; preds = %post_union_move
br label %L62
L62: ; preds = %L62.loopexit, %L14
%s.0.lcssa = phi i8 [ 0, %L14 ], [ %17, %L62.loopexit ]
ret i8 %s.0.lcssa
union_move_skip: ; preds = %if4
call void @llvm.trap()
unreachable
post_union_move: ; preds = %if4, %union_move5
%t.sroa.0.1 = phi i8 [ %20, %union_move5 ], [ %t.sroa.0.015, %if4 ]
%14 = and i8 %13, 127
%15 = icmp eq i8 %14, 1
%.t.sroa.0.1 = select i1 %15, i8 0, i8 %t.sroa.0.1
%16 = add i64 %"#temp#.014", 1
%17 = add i8 %.t.sroa.0.1, %s.013
%18 = icmp eq i64 %"#temp#.014", %1
br i1 %18, label %L62.loopexit, label %if4
union_move5: ; preds = %if4
%19 = getelementptr i8, i8* %5, i64 %9
%20 = load i8, i8* %19, align 1
br label %post_union_move
}
julia>
@vtjnash
Copy link

vtjnash commented Jul 16, 2017

A better comparison would be:

function f(v::Vector{Int8}, m::Vector{Int8}, n)
         s = Int8(0)
         @inbounds for i = 1:n
         if m[i] == 1
           s += v[i]
         end
         end
         return s
       end

since it's not quite fair to have a branch in one case but not the other

@quinnj
Copy link
Author

quinnj commented Jul 21, 2017

@vtjnash here's my attempt at doing eager loading; I guess I don't have a great sense for how expensive doing a load is. It currently outperforms the LLVM generated from codegen on my jq/array-union-bits, but fails to SIMD. Do you know what would preventing the SIMD-ification here? too many loads?

function f3(a, n)
    s = Int8(0)
    mx = convert(Ptr{UInt8}, pointer(a)) + length(a) - 1
    el = convert(Ptr{Int8}, pointer(a)) - 1
    @inbounds for i = 1:n
        sel = unsafe_load(mx + i)
        val2 = unsafe_load(el + i)
        s += ifelse(sel == 0x00, Int8(0), val2)
    end
    return s
end

julia> @code_llvm f3(V, n)

define i8 @julia_f3_63023(%jl_value_t addrspace(10)* dereferenceable(40), i64) #0 !dbg !5 {
L14:
  %2 = icmp slt i64 %1, 1
  br i1 %2, label %L64, label %L55.lr.ph

L55.lr.ph:                                        ; preds = %L14
  %3 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
  %4 = bitcast %jl_value_t addrspace(11)* %3 to i64 addrspace(11)*
  %5 = load i64, i64 addrspace(11)* %4, align 8
  %6 = add i64 %5, -1
  %7 = bitcast %jl_value_t addrspace(11)* %3 to %jl_array_t addrspace(11)*
  %8 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %7, i64 0, i32 1
  %9 = load i64, i64 addrspace(11)* %8, align 8
  %10 = add i64 %6, %9
  br label %L55

L64.loopexit:                                     ; preds = %L55
  br label %L64

L64:                                              ; preds = %L64.loopexit, %L14
  %s.0.lcssa = phi i8 [ 0, %L14 ], [ %20, %L64.loopexit ]
  ret i8 %s.0.lcssa

L55:                                              ; preds = %L55.lr.ph, %L55
  %"#temp#.010" = phi i64 [ 1, %L55.lr.ph ], [ %11, %L55 ]
  %s.09 = phi i8 [ 0, %L55.lr.ph ], [ %20, %L55 ]
  %11 = add i64 %"#temp#.010", 1
  %12 = add i64 %6, %"#temp#.010"
  %13 = inttoptr i64 %12 to i8*
  %14 = load i8, i8* %13, align 1
  %15 = add i64 %10, %"#temp#.010"
  %16 = inttoptr i64 %15 to i8*
  %17 = load i8, i8* %16, align 1
  %18 = icmp ne i8 %17, 0
  %19 = select i1 %18, i8 %14, i8 0
  %20 = add i8 %19, %s.09
  %21 = icmp eq i64 %"#temp#.010", %1
  br i1 %21, label %L64.loopexit, label %L55
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment