quinnj/iuab.jl

## iuab.jl
julia> function f(v::Vector{Int8}, n)
         s = Int8(0)
         @inbounds for i = 1:n
           s += v[i]
         end
         return s
       end
f (generic function with 2 methods)

julia> function f(v::Vector{Union{Void, Int8}}, n)
         s = Int8(0)
         @inbounds for i = 1:n
           t = Base.arrayref(v, i)
           s += t isa Void ? Int8(0) : t
         end
         return s
       end
f (generic function with 2 methods)

julia> using BenchmarkTools

julia> n = 100
100

julia> @benchmark f($(Vector{Int8}(n)), $n)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     5.468 ns (0.00% GC)
  median time:      6.115 ns (0.00% GC)
  mean time:        6.513 ns (0.00% GC)
  maximum time:     97.110 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000

julia> V = Vector{Union{Void, Int8}}(n)
100-element Array{Union{Int8, Void},1}:
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 ⋮
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing
 nothing

julia> @benchmark f($(V), $n)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     99.514 ns (0.00% GC)
  median time:      99.974 ns (0.00% GC)
  mean time:        103.177 ns (0.00% GC)
  maximum time:     219.872 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     949

julia> @code_llvm f(Vector{Int8}(10), 10)

; Function Attrs: sspstrong
define i8 @julia_f_68903(%jl_value_t addrspace(10)* dereferenceable(40), i64) #0 !dbg !5 {
L14:
  %2 = icmp slt i64 %1, 1
  br i1 %2, label %L34, label %if1.lr.ph

if1.lr.ph:                                        ; preds = %L14
  %3 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
  %4 = bitcast %jl_value_t addrspace(11)* %3 to i8* addrspace(11)*
  %5 = load i8*, i8* addrspace(11)* %4, align 8
  %min.iters.check = icmp ult i64 %1, 32
  br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked

min.iters.checked:                                ; preds = %if1.lr.ph
  %n.vec = and i64 %1, -32
  %cmp.zero = icmp eq i64 %n.vec, 0
  %ind.end = or i64 %n.vec, 1
  br i1 %cmp.zero, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %min.iters.checked
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = phi i64 [ 1, %vector.ph ], [ %7, %vector.body ]
  %vec.phi = phi <32 x i8> [ zeroinitializer, %vector.ph ], [ %11, %vector.body ]
  %7 = add i64 %6, 32
  %8 = add i64 %6, -1
  %9 = getelementptr i8, i8* %5, i64 %8
  %10 = bitcast i8* %9 to <32 x i8>*
  %wide.load = load <32 x i8>, <32 x i8>* %10, align 1
  %11 = add <32 x i8> %wide.load, %vec.phi
  %index.next = add i64 %index, 32
  %12 = icmp eq i64 %index.next, %n.vec
  br i1 %12, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %rdx.shuf = shufflevector <32 x i8> %11, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i8> %11, %rdx.shuf
  %rdx.shuf7 = shufflevector <32 x i8> %bin.rdx, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <32 x i8> %bin.rdx, %rdx.shuf7
  %rdx.shuf9 = shufflevector <32 x i8> %bin.rdx8, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx10 = add <32 x i8> %bin.rdx8, %rdx.shuf9
  %rdx.shuf11 = shufflevector <32 x i8> %bin.rdx10, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx12 = add <32 x i8> %bin.rdx10, %rdx.shuf11
  %rdx.shuf13 = shufflevector <32 x i8> %bin.rdx12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx14 = add <32 x i8> %bin.rdx12, %rdx.shuf13
  %13 = extractelement <32 x i8> %bin.rdx14, i32 0
  %cmp.n = icmp eq i64 %n.vec, %1
  br i1 %cmp.n, label %L34.loopexit, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %min.iters.checked, %if1.lr.ph
  %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %if1.lr.ph ], [ 1, %min.iters.checked ]
  %bc.merge.rdx = phi i8 [ %13, %middle.block ], [ 0, %if1.lr.ph ], [ 0, %min.iters.checked ]
  br label %if1

if1:                                              ; preds = %scalar.ph, %if1
  %"#temp#.06" = phi i64 [ %bc.resume.val, %scalar.ph ], [ %14, %if1 ]
  %s.05 = phi i8 [ %bc.merge.rdx, %scalar.ph ], [ %18, %if1 ]
  %14 = add i64 %"#temp#.06", 1
  %15 = add i64 %"#temp#.06", -1
  %16 = getelementptr i8, i8* %5, i64 %15
  %17 = load i8, i8* %16, align 1
  %18 = add i8 %17, %s.05
  %19 = icmp eq i64 %"#temp#.06", %1
  br i1 %19, label %L34.loopexit, label %if1

L34.loopexit:                                     ; preds = %middle.block, %if1
  %.lcssa = phi i8 [ %18, %if1 ], [ %13, %middle.block ]
  br label %L34

L34:                                              ; preds = %L34.loopexit, %L14
  %s.0.lcssa = phi i8 [ 0, %L14 ], [ %.lcssa, %L34.loopexit ]
  ret i8 %s.0.lcssa
}

julia> @code_llvm f(Vector{Union{Void, Int8}}(10), 10)

; Function Attrs: sspstrong
define i8 @julia_f_68893(%jl_value_t addrspace(10)* dereferenceable(40), i64) #0 !dbg !5 {
L14:
  %2 = icmp slt i64 %1, 1
  br i1 %2, label %L62, label %if4.lr.ph

if4.lr.ph:                                        ; preds = %L14
  %3 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
  %4 = bitcast %jl_value_t addrspace(11)* %3 to i8* addrspace(11)*
  %5 = load i8*, i8* addrspace(11)* %4, align 8
  %6 = bitcast %jl_value_t addrspace(11)* %3 to %jl_array_t addrspace(11)*
  %7 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %6, i64 0, i32 1
  %8 = load i64, i64 addrspace(11)* %7, align 8
  br label %if4

if4:                                              ; preds = %if4.lr.ph, %post_union_move
  %t.sroa.0.015 = phi i8 [ undef, %if4.lr.ph ], [ %t.sroa.0.1, %post_union_move ]
  %"#temp#.014" = phi i64 [ 1, %if4.lr.ph ], [ %16, %post_union_move ]
  %s.013 = phi i8 [ 0, %if4.lr.ph ], [ %17, %post_union_move ]
  %9 = add i64 %"#temp#.014", -1
  %10 = add i64 %8, %9
  %11 = getelementptr i8, i8* %5, i64 %10
  %12 = load i8, i8* %11, align 1
  %13 = add nuw i8 %12, 1
  %trunc = trunc i8 %13 to i7
  switch i7 %trunc, label %union_move_skip [
    i7 1, label %post_union_move
    i7 2, label %union_move5
  ]

L62.loopexit:                                     ; preds = %post_union_move
  br label %L62

L62:                                              ; preds = %L62.loopexit, %L14
  %s.0.lcssa = phi i8 [ 0, %L14 ], [ %17, %L62.loopexit ]
  ret i8 %s.0.lcssa

union_move_skip:                                  ; preds = %if4
  call void @llvm.trap()
  unreachable

post_union_move:                                  ; preds = %if4, %union_move5
  %t.sroa.0.1 = phi i8 [ %20, %union_move5 ], [ %t.sroa.0.015, %if4 ]
  %14 = and i8 %13, 127
  %15 = icmp eq i8 %14, 1
  %.t.sroa.0.1 = select i1 %15, i8 0, i8 %t.sroa.0.1
  %16 = add i64 %"#temp#.014", 1
  %17 = add i8 %.t.sroa.0.1, %s.013
  %18 = icmp eq i64 %"#temp#.014", %1
  br i1 %18, label %L62.loopexit, label %if4

union_move5:                                      ; preds = %if4
  %19 = getelementptr i8, i8* %5, i64 %9
  %20 = load i8, i8* %19, align 1
  br label %post_union_move
}

julia>
	julia> function f(v::Vector{Int8}, n)
	s = Int8(0)
	@inbounds for i = 1:n
	s += v[i]
	end
	return s
	end
	f (generic function with 2 methods)

	julia> function f(v::Vector{Union{Void, Int8}}, n)
	s = Int8(0)
	@inbounds for i = 1:n
	t = Base.arrayref(v, i)
	s += t isa Void ? Int8(0) : t
	end
	return s
	end
	f (generic function with 2 methods)

	julia> using BenchmarkTools

	julia> n = 100
	100

	julia> @benchmark f($(Vector{Int8}(n)), $n)
	BenchmarkTools.Trial:
	memory estimate: 0 bytes
	allocs estimate: 0
	--------------
	minimum time: 5.468 ns (0.00% GC)
	median time: 6.115 ns (0.00% GC)
	mean time: 6.513 ns (0.00% GC)
	maximum time: 97.110 ns (0.00% GC)
	--------------
	samples: 10000
	evals/sample: 1000

	julia> V = Vector{Union{Void, Int8}}(n)
	100-element Array{Union{Int8, Void},1}:
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	⋮
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing
	nothing

	julia> @benchmark f($(V), $n)
	BenchmarkTools.Trial:
	memory estimate: 0 bytes
	allocs estimate: 0
	--------------
	minimum time: 99.514 ns (0.00% GC)
	median time: 99.974 ns (0.00% GC)
	mean time: 103.177 ns (0.00% GC)
	maximum time: 219.872 ns (0.00% GC)
	--------------
	samples: 10000
	evals/sample: 949

	julia> @code_llvm f(Vector{Int8}(10), 10)

	; Function Attrs: sspstrong
	define i8 @julia_f_68903(%jl_value_t addrspace(10)* dereferenceable(40), i64) #0 !dbg !5 {
	L14:
	%2 = icmp slt i64 %1, 1
	br i1 %2, label %L34, label %if1.lr.ph

	if1.lr.ph: ; preds = %L14
	%3 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
	%4 = bitcast %jl_value_t addrspace(11)* %3 to i8* addrspace(11)*
	%5 = load i8, i8 addrspace(11)* %4, align 8
	%min.iters.check = icmp ult i64 %1, 32
	br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked

	min.iters.checked: ; preds = %if1.lr.ph
	%n.vec = and i64 %1, -32
	%cmp.zero = icmp eq i64 %n.vec, 0
	%ind.end = or i64 %n.vec, 1
	br i1 %cmp.zero, label %scalar.ph, label %vector.ph

	vector.ph: ; preds = %min.iters.checked
	br label %vector.body

	vector.body: ; preds = %vector.body, %vector.ph
	%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
	%6 = phi i64 [ 1, %vector.ph ], [ %7, %vector.body ]
	%vec.phi = phi <32 x i8> [ zeroinitializer, %vector.ph ], [ %11, %vector.body ]
	%7 = add i64 %6, 32
	%8 = add i64 %6, -1
	%9 = getelementptr i8, i8* %5, i64 %8
	%10 = bitcast i8* %9 to <32 x i8>*
	%wide.load = load <32 x i8>, <32 x i8>* %10, align 1
	%11 = add <32 x i8> %wide.load, %vec.phi
	%index.next = add i64 %index, 32
	%12 = icmp eq i64 %index.next, %n.vec
	br i1 %12, label %middle.block, label %vector.body

	middle.block: ; preds = %vector.body
	%rdx.shuf = shufflevector <32 x i8> %11, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%bin.rdx = add <32 x i8> %11, %rdx.shuf
	%rdx.shuf7 = shufflevector <32 x i8> %bin.rdx, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%bin.rdx8 = add <32 x i8> %bin.rdx, %rdx.shuf7
	%rdx.shuf9 = shufflevector <32 x i8> %bin.rdx8, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%bin.rdx10 = add <32 x i8> %bin.rdx8, %rdx.shuf9
	%rdx.shuf11 = shufflevector <32 x i8> %bin.rdx10, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%bin.rdx12 = add <32 x i8> %bin.rdx10, %rdx.shuf11
	%rdx.shuf13 = shufflevector <32 x i8> %bin.rdx12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%bin.rdx14 = add <32 x i8> %bin.rdx12, %rdx.shuf13
	%13 = extractelement <32 x i8> %bin.rdx14, i32 0
	%cmp.n = icmp eq i64 %n.vec, %1
	br i1 %cmp.n, label %L34.loopexit, label %scalar.ph

	scalar.ph: ; preds = %middle.block, %min.iters.checked, %if1.lr.ph
	%bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %if1.lr.ph ], [ 1, %min.iters.checked ]
	%bc.merge.rdx = phi i8 [ %13, %middle.block ], [ 0, %if1.lr.ph ], [ 0, %min.iters.checked ]
	br label %if1

	if1: ; preds = %scalar.ph, %if1
	%"#temp#.06" = phi i64 [ %bc.resume.val, %scalar.ph ], [ %14, %if1 ]
	%s.05 = phi i8 [ %bc.merge.rdx, %scalar.ph ], [ %18, %if1 ]
	%14 = add i64 %"#temp#.06", 1
	%15 = add i64 %"#temp#.06", -1
	%16 = getelementptr i8, i8* %5, i64 %15
	%17 = load i8, i8* %16, align 1
	%18 = add i8 %17, %s.05
	%19 = icmp eq i64 %"#temp#.06", %1
	br i1 %19, label %L34.loopexit, label %if1

	L34.loopexit: ; preds = %middle.block, %if1
	%.lcssa = phi i8 [ %18, %if1 ], [ %13, %middle.block ]
	br label %L34

	L34: ; preds = %L34.loopexit, %L14
	%s.0.lcssa = phi i8 [ 0, %L14 ], [ %.lcssa, %L34.loopexit ]
	ret i8 %s.0.lcssa
	}

	julia> @code_llvm f(Vector{Union{Void, Int8}}(10), 10)

	; Function Attrs: sspstrong
	define i8 @julia_f_68893(%jl_value_t addrspace(10)* dereferenceable(40), i64) #0 !dbg !5 {
	L14:
	%2 = icmp slt i64 %1, 1
	br i1 %2, label %L62, label %if4.lr.ph

	if4.lr.ph: ; preds = %L14
	%3 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
	%4 = bitcast %jl_value_t addrspace(11)* %3 to i8* addrspace(11)*
	%5 = load i8, i8 addrspace(11)* %4, align 8
	%6 = bitcast %jl_value_t addrspace(11)* %3 to %jl_array_t addrspace(11)*
	%7 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %6, i64 0, i32 1
	%8 = load i64, i64 addrspace(11)* %7, align 8
	br label %if4

	if4: ; preds = %if4.lr.ph, %post_union_move
	%t.sroa.0.015 = phi i8 [ undef, %if4.lr.ph ], [ %t.sroa.0.1, %post_union_move ]
	%"#temp#.014" = phi i64 [ 1, %if4.lr.ph ], [ %16, %post_union_move ]
	%s.013 = phi i8 [ 0, %if4.lr.ph ], [ %17, %post_union_move ]
	%9 = add i64 %"#temp#.014", -1
	%10 = add i64 %8, %9
	%11 = getelementptr i8, i8* %5, i64 %10
	%12 = load i8, i8* %11, align 1
	%13 = add nuw i8 %12, 1
	%trunc = trunc i8 %13 to i7
	switch i7 %trunc, label %union_move_skip [
	i7 1, label %post_union_move
	i7 2, label %union_move5
	]

	L62.loopexit: ; preds = %post_union_move
	br label %L62

	L62: ; preds = %L62.loopexit, %L14
	%s.0.lcssa = phi i8 [ 0, %L14 ], [ %17, %L62.loopexit ]
	ret i8 %s.0.lcssa

	union_move_skip: ; preds = %if4
	call void @llvm.trap()
	unreachable

	post_union_move: ; preds = %if4, %union_move5
	%t.sroa.0.1 = phi i8 [ %20, %union_move5 ], [ %t.sroa.0.015, %if4 ]
	%14 = and i8 %13, 127
	%15 = icmp eq i8 %14, 1
	%.t.sroa.0.1 = select i1 %15, i8 0, i8 %t.sroa.0.1
	%16 = add i64 %"#temp#.014", 1
	%17 = add i8 %.t.sroa.0.1, %s.013
	%18 = icmp eq i64 %"#temp#.014", %1
	br i1 %18, label %L62.loopexit, label %if4

	union_move5: ; preds = %if4
	%19 = getelementptr i8, i8* %5, i64 %9
	%20 = load i8, i8* %19, align 1
	br label %post_union_move
	}

	julia>