With master:
julia> code_native(Base.sum_seq, (Vector{Float64}, Int, Int))
# Native code (Intel-syntax disassembly) for Base.sum_seq(Vector{Float64}, Int, Int) on master.
# Register roles as used below (consistent with a System V AMD64 call taking
# pointer, i64, i64 and returning a double -- NOTE(review): the dump does not
# state the ABI, confirm for the platform it was produced on):
#   RDI  = array object; the element-data pointer is read from [RDI + 8]
#   RSI  = first Int argument, used as a 1-based start index
#   RDX  = second Int argument, used as the inclusive end index
#   XMM0 = value returned
# Branch operands are raw relative byte displacements, not labels.
# "a[k]" in the comments below means the double at 1-based index k.
.text
Filename: reduce.jl
Source line: 226
push RBP
mov RBP, RSP
Source line: 226
# RAX = element-data pointer; XMM0 = a[RSI] (the -8 converts the 1-based index).
mov RAX, QWORD PTR [RDI + 8]
vmovsd XMM0, QWORD PTR [RAX + 8*RSI - 8]
Source line: 210
# If RSI + 6 >= RDX (signed) branch to the short path; the target is presumably
# the "Source line: 212" block near the end of this listing.
lea RCX, QWORD PTR [RSI + 6]
cmp RCX, RDX
jge 181
# Long path: seed four independent partial sums from the first eight elements.
#   XMM1 = a[RSI]   + a[RSI+4]
#   XMM3 = a[RSI+1] + a[RSI+5]
#   XMM2 = a[RSI+2] + a[RSI+6]
#   XMM0 = a[RSI+3] + a[RSI+7]
Source line: 229
vmovsd XMM2, QWORD PTR [RAX + 8*RSI + 16]
Source line: 226
vaddsd XMM1, XMM0, QWORD PTR [RAX + 8*RSI + 24]
Source line: 229
vaddsd XMM0, XMM2, QWORD PTR [RAX + 8*RSI + 48]
Source line: 227
vmovsd XMM3, QWORD PTR [RAX + 8*RSI]
Source line: 228
vmovsd XMM2, QWORD PTR [RAX + 8*RSI + 8]
vaddsd XMM2, XMM2, QWORD PTR [RAX + 8*RSI + 40]
Source line: 227
vaddsd XMM3, XMM3, QWORD PTR [RAX + 8*RSI + 32]
Source line: 232
# R8 = RDX - 3 is the bound for the 4-way unrolled loop; RAX = RSI + 8 is its
# counter i. The loop is skipped entirely when i > R8.
lea R8, QWORD PTR [RDX - 3]
Source line: 231
lea RAX, QWORD PTR [RSI + 8]
cmp RAX, R8
jg 71
Source line: 234
# Reload the data pointer and form four stream pointers to a[i] .. a[i+3]
# (R9, R10, RCX, RSI respectively). RSI is overwritten here and no longer
# holds the start index afterwards.
mov R11, QWORD PTR [RDI + 8]
Source line: 238
lea R9, QWORD PTR [R11 + 8*RSI + 56]
lea R10, QWORD PTR [R11 + 8*RSI + 64]
lea RCX, QWORD PTR [R11 + 8*RSI + 72]
lea RSI, QWORD PTR [R11 + 8*RSI + 80]
# Unrolled loop body: each accumulator adds one element from its own stream,
# every pointer advances 32 bytes (four doubles), and the counter advances by 4
# while RAX <= R8 (signed).
Source line: 237
vaddsd XMM0, XMM0, QWORD PTR [RSI]
Source line: 236
vaddsd XMM2, XMM2, QWORD PTR [RCX]
Source line: 235
vaddsd XMM3, XMM3, QWORD PTR [R10]
Source line: 238
add R10, 32
Source line: 234
vaddsd XMM1, XMM1, QWORD PTR [R9]
Source line: 238
add R9, 32
add RCX, 32
add RSI, 32
add RAX, 4
cmp RAX, R8
jle -47
Source line: 241
# Remainder: while RAX <= RDX, fold the leftover elements one at a time into XMM1.
cmp RAX, RDX
jg 29
Source line: 242
mov RCX, QWORD PTR [RDI + 8]
lea RCX, QWORD PTR [RCX + 8*RAX - 8]
vaddsd XMM1, XMM1, QWORD PTR [RCX]
Source line: 243
add RCX, 8
inc RAX
cmp RAX, RDX
jle -20
Source line: 246
# Combine the four partial sums into XMM0 and return.
vaddsd XMM1, XMM3, XMM1
vaddsd XMM1, XMM2, XMM1
vaddsd XMM0, XMM0, XMM1
pop RBP
ret
Source line: 212
# Short path: XMM0 (= a[RSI]) += a[RSI+1].
vaddsd XMM0, XMM0, QWORD PTR [RAX + 8*RSI]
Source line: 213
# Return immediately when RSI + 1 >= RDX (signed).
lea RAX, QWORD PTR [RSI + 1]
cmp RAX, RDX
jge 32
Source line: 215
# Otherwise RDX becomes the trip count (RDX - 1 - RSI) and RAX walks the
# elements starting at a[RSI+2], one double per iteration, until RDX hits zero.
dec RDX
sub RDX, RSI
mov RAX, QWORD PTR [RDI + 8]
lea RAX, QWORD PTR [RAX + 8*RSI + 8]
vaddsd XMM0, XMM0, QWORD PTR [RAX]
add RAX, 8
dec RDX
jne -17
Source line: 217
pop RBP
ret
julia> code_llvm(Base.sum_seq, (Vector{Float64}, Int, Int))
; LLVM IR for Base.sum_seq(Vector{Float64}, Int, Int) on master.
; %0 = array object (its element-data pointer is loaded from field slot 1),
; %1 and %2 = the two Int arguments; %1 is used as a 1-based start index and
; %2 as an inclusive end bound. Returns the sum as a double.
; Structure: a short sequential path (%if/%L/%L4) and a long path that keeps
; four independent scalar accumulators (%L5/%L7/%L10/%L12/%L15).
define double @julia_sum_seq17922(%jl_value_t*, i64, i64) {
top:
; Load the first element (index %1 - 1) and pick a path: long path when
; %1 + 6 < %2 (signed), short path otherwise.
%3 = add i64 %1, 6, !dbg !590
%4 = icmp slt i64 %3, %2, !dbg !590
%5 = add i64 %1, -1, !dbg !591
%6 = getelementptr inbounds %jl_value_t* %0, i64 1, i32 0, !dbg !591
%7 = load %jl_value_t** %6, align 8, !dbg !591, !tbaa %jtbaa_arrayptr
%8 = getelementptr %jl_value_t* %7, i64 %5, !dbg !591
%9 = bitcast %jl_value_t* %8 to double*, !dbg !591
%10 = load double* %9, align 8, !dbg !591, !tbaa %jtbaa_user
br i1 %4, label %L5, label %if, !dbg !590
if: ; preds = %top
; Short path: add the second element, then loop only if %1 + 1 < %2.
%11 = getelementptr %jl_value_t* %7, i64 %1, !dbg !597
%12 = bitcast %jl_value_t* %11 to double*, !dbg !597
%13 = load double* %12, align 8, !dbg !597, !tbaa %jtbaa_user
%14 = fadd double %10, %13, !dbg !597
%15 = add i64 %1, 1, !dbg !598
%16 = icmp slt i64 %15, %2, !dbg !599
br i1 %16, label %L.preheader, label %L4, !dbg !599
L.preheader: ; preds = %if
; Set up the sequential loop: start pointer at index %1 + 1 and a down-counter
; of %2 - 1 - %1 iterations.
%sunkaddr = ptrtoint %jl_value_t* %0 to i64, !dbg !600
%sunkaddr53 = add i64 %sunkaddr, 8, !dbg !600
%sunkaddr54 = inttoptr i64 %sunkaddr53 to %jl_value_t**, !dbg !600
%17 = load %jl_value_t** %sunkaddr54, align 8, !dbg !600, !tbaa %jtbaa_arrayptr
%18 = add i64 %1, 1, !dbg !600
%scevgep48 = getelementptr %jl_value_t* %17, i64 %18
%19 = add i64 %2, -1, !dbg !600
%20 = sub i64 %19, %1, !dbg !600
br label %L, !dbg !600
L: ; preds = %L, %L.preheader
; Sequential accumulation: one load and one fadd per iteration, counting down to zero.
%lsr.iv52 = phi i64 [ %lsr.iv.next, %L ], [ %20, %L.preheader ], !dbg !600
%lsr.iv49 = phi %jl_value_t* [ %scevgep50, %L ], [ %scevgep48, %L.preheader ]
%s.0 = phi double [ %22, %L ], [ %14, %L.preheader ]
%lsr.iv4951 = bitcast %jl_value_t* %lsr.iv49 to double*
%21 = load double* %lsr.iv4951, align 8, !dbg !600, !tbaa %jtbaa_user
%22 = fadd double %s.0, %21, !dbg !600
%scevgep50 = getelementptr %jl_value_t* %lsr.iv49, i64 1, !dbg !600
%lsr.iv.next = add i64 %lsr.iv52, -1, !dbg !600
%exitcond = icmp eq i64 %lsr.iv.next, 0, !dbg !600
br i1 %exitcond, label %L4, label %L, !dbg !600
L4: ; preds = %L, %if
; Short-path exit.
%s.1 = phi double [ %14, %if ], [ %22, %L ]
ret double %s.1, !dbg !601
L5: ; preds = %top
; Long path: seed four partial sums from eight consecutive elements, pairing
; element j with element j + 4:
;   %27 = [%1-1] + [%1+3],  %35 = [%1] + [%1+4],
;   %44 = [%1+1] + [%1+5],  %50 = [%1+2] + [%1+6]   (indices as used in the GEPs)
%23 = add i64 %1, 3, !dbg !591
%24 = getelementptr %jl_value_t* %7, i64 %23, !dbg !591
%25 = bitcast %jl_value_t* %24 to double*, !dbg !591
%26 = load double* %25, align 8, !dbg !591, !tbaa %jtbaa_user
%27 = fadd double %10, %26, !dbg !591
%28 = getelementptr %jl_value_t* %7, i64 %1, !dbg !602
%29 = bitcast %jl_value_t* %28 to double*, !dbg !602
%30 = load double* %29, align 8, !dbg !602, !tbaa %jtbaa_user
%31 = add i64 %1, 4, !dbg !602
%32 = getelementptr %jl_value_t* %7, i64 %31, !dbg !602
%33 = bitcast %jl_value_t* %32 to double*, !dbg !602
%34 = load double* %33, align 8, !dbg !602, !tbaa %jtbaa_user
%35 = fadd double %30, %34, !dbg !602
%36 = add i64 %1, 1, !dbg !603
%37 = getelementptr %jl_value_t* %7, i64 %36, !dbg !603
%38 = bitcast %jl_value_t* %37 to double*, !dbg !603
%39 = load double* %38, align 8, !dbg !603, !tbaa %jtbaa_user
%40 = add i64 %1, 5, !dbg !603
%41 = getelementptr %jl_value_t* %7, i64 %40, !dbg !603
%42 = bitcast %jl_value_t* %41 to double*, !dbg !603
%43 = load double* %42, align 8, !dbg !603, !tbaa %jtbaa_user
%44 = fadd double %39, %43, !dbg !603
%45 = add i64 %1, 2, !dbg !604
%46 = getelementptr %jl_value_t* %7, i64 %45, !dbg !604
%47 = bitcast %jl_value_t* %46 to double*, !dbg !604
%48 = load double* %47, align 8, !dbg !604, !tbaa %jtbaa_user
%sunkaddr55 = ptrtoint %jl_value_t* %7 to i64, !dbg !604
%sunkaddr56 = mul i64 %1, 8, !dbg !604
%sunkaddr57 = add i64 %sunkaddr55, %sunkaddr56, !dbg !604
%sunkaddr58 = add i64 %sunkaddr57, 48, !dbg !604
%sunkaddr59 = inttoptr i64 %sunkaddr58 to double*, !dbg !604
%49 = load double* %sunkaddr59, align 8, !dbg !604, !tbaa %jtbaa_user
%50 = fadd double %48, %49, !dbg !604
; Loop counter starts at %1 + 8; the unrolled loop runs while counter <= %2 - 3.
%51 = add i64 %1, 8, !dbg !605
%52 = add i64 %2, -3, !dbg !606
%53 = icmp sgt i64 %51, %52, !dbg !607
br i1 %53, label %L10, label %L7.preheader, !dbg !607
L7.preheader: ; preds = %L5
; Four strided pointers (indices %1+7 .. %1+10), one per accumulator.
%sunkaddr60 = ptrtoint %jl_value_t* %0 to i64, !dbg !608
%sunkaddr61 = add i64 %sunkaddr60, 8, !dbg !608
%sunkaddr62 = inttoptr i64 %sunkaddr61 to %jl_value_t**, !dbg !608
%54 = load %jl_value_t** %sunkaddr62, align 8, !dbg !608, !tbaa %jtbaa_arrayptr
%55 = add i64 %1, 10, !dbg !608
%scevgep32 = getelementptr %jl_value_t* %54, i64 %55
%56 = add i64 %1, 9, !dbg !608
%scevgep36 = getelementptr %jl_value_t* %54, i64 %56, !dbg !609
%57 = add i64 %1, 8, !dbg !608
%scevgep40 = getelementptr %jl_value_t* %54, i64 %57, !dbg !609
%58 = add i64 %1, 7, !dbg !608
%scevgep44 = getelementptr %jl_value_t* %54, i64 %58, !dbg !609
br label %L7, !dbg !608
L7: ; preds = %L7.preheader, %L7
; 4-way unrolled loop: four scalar loads and four independent scalar fadds per
; iteration; pointers and the counter advance by 4.
%lsr.iv45 = phi %jl_value_t* [ %scevgep44, %L7.preheader ], [ %scevgep46, %L7 ], !dbg !609
%lsr.iv41 = phi %jl_value_t* [ %scevgep40, %L7.preheader ], [ %scevgep42, %L7 ], !dbg !609
%lsr.iv37 = phi %jl_value_t* [ %scevgep36, %L7.preheader ], [ %scevgep38, %L7 ], !dbg !609
%lsr.iv33 = phi %jl_value_t* [ %scevgep32, %L7.preheader ], [ %scevgep34, %L7 ]
%s3.0 = phi double [ %64, %L7 ], [ %44, %L7.preheader ]
%s2.0 = phi double [ %62, %L7 ], [ %35, %L7.preheader ]
%s1.0 = phi double [ %60, %L7 ], [ %27, %L7.preheader ]
%s4.0 = phi double [ %66, %L7 ], [ %50, %L7.preheader ]
%i.1 = phi i64 [ %67, %L7 ], [ %51, %L7.preheader ]
%lsr.iv4547 = bitcast %jl_value_t* %lsr.iv45 to double*
%lsr.iv4143 = bitcast %jl_value_t* %lsr.iv41 to double*
%lsr.iv3739 = bitcast %jl_value_t* %lsr.iv37 to double*
%lsr.iv3335 = bitcast %jl_value_t* %lsr.iv33 to double*
%59 = load double* %lsr.iv4547, align 8, !dbg !608, !tbaa %jtbaa_user
%60 = fadd double %s1.0, %59, !dbg !608
%61 = load double* %lsr.iv4143, align 8, !dbg !610, !tbaa %jtbaa_user
%62 = fadd double %s2.0, %61, !dbg !610
%63 = load double* %lsr.iv3739, align 8, !dbg !611, !tbaa %jtbaa_user
%64 = fadd double %s3.0, %63, !dbg !611
%65 = load double* %lsr.iv3335, align 8, !dbg !612, !tbaa %jtbaa_user
%66 = fadd double %s4.0, %65, !dbg !612
%67 = add i64 %i.1, 4, !dbg !609
%scevgep34 = getelementptr %jl_value_t* %lsr.iv33, i64 4, !dbg !609
%scevgep38 = getelementptr %jl_value_t* %lsr.iv37, i64 4, !dbg !609
%scevgep42 = getelementptr %jl_value_t* %lsr.iv41, i64 4, !dbg !609
%scevgep46 = getelementptr %jl_value_t* %lsr.iv45, i64 4, !dbg !609
%68 = icmp sgt i64 %67, %52, !dbg !609
br i1 %68, label %L10, label %L7, !dbg !609
L10: ; preds = %L7, %L5
; Merge point after the unrolled loop; run the remainder loop only if counter <= %2.
%s3.1 = phi double [ %44, %L5 ], [ %64, %L7 ]
%s2.1 = phi double [ %35, %L5 ], [ %62, %L7 ]
%s1.1 = phi double [ %27, %L5 ], [ %60, %L7 ]
%s4.1 = phi double [ %50, %L5 ], [ %66, %L7 ]
%i.2 = phi i64 [ %51, %L5 ], [ %67, %L7 ]
%69 = icmp sgt i64 %i.2, %2, !dbg !613
br i1 %69, label %L15, label %L12.preheader, !dbg !613
L12.preheader: ; preds = %L10
; Pointer to the first leftover element (index counter - 1).
%sunkaddr63 = ptrtoint %jl_value_t* %0 to i64, !dbg !614
%sunkaddr64 = add i64 %sunkaddr63, 8, !dbg !614
%sunkaddr65 = inttoptr i64 %sunkaddr64 to %jl_value_t**, !dbg !614
%70 = load %jl_value_t** %sunkaddr65, align 8, !dbg !614, !tbaa %jtbaa_arrayptr
%71 = add i64 %i.2, -1, !dbg !614
%scevgep = getelementptr %jl_value_t* %70, i64 %71
br label %L12, !dbg !614
L12: ; preds = %L12.preheader, %L12
; Remainder loop: leftover elements are folded one at a time into the s1 accumulator.
%lsr.iv = phi %jl_value_t* [ %scevgep, %L12.preheader ], [ %scevgep30, %L12 ]
%s1.2 = phi double [ %73, %L12 ], [ %s1.1, %L12.preheader ]
%i.3 = phi i64 [ %74, %L12 ], [ %i.2, %L12.preheader ]
%lsr.iv31 = bitcast %jl_value_t* %lsr.iv to double*
%72 = load double* %lsr.iv31, align 8, !dbg !614, !tbaa %jtbaa_user
%73 = fadd double %s1.2, %72, !dbg !614
%74 = add i64 %i.3, 1, !dbg !615
%scevgep30 = getelementptr %jl_value_t* %lsr.iv, i64 1, !dbg !615
%75 = icmp sgt i64 %74, %2, !dbg !615
br i1 %75, label %L15, label %L12, !dbg !615
L15: ; preds = %L12, %L10
; Final combination: s4 + (s3 + (s2 + s1)).
%s1.3 = phi double [ %s1.1, %L10 ], [ %73, %L12 ]
%76 = fadd double %s2.1, %s1.3, !dbg !616
%77 = fadd double %s3.1, %76, !dbg !616
%78 = fadd double %s4.1, %77, !dbg !616
ret double %78, !dbg !616
}
With #6928:
julia> code_native(Base.sum_seq, (Vector{Float64}, Int, Int))
# Native code (Intel-syntax disassembly) for Base.sum_seq(Vector{Float64}, Int, Int) with #6928.
# Register roles as used below (consistent with a System V AMD64 call taking
# pointer, i64, i64 and returning a double -- NOTE(review): the dump does not
# state the ABI, confirm for the platform it was produced on):
#   RDI  = array object on entry; reused as the vector-loop index once the
#          data pointer has been copied into RCX
#   RSI  = first Int argument, used as a 1-based start index
#   RDX  = second Int argument (end bound); later reused as scratch
#   R10  = number of elements left to add after the first two
#   R9   = R10 rounded down to a multiple of 16 (elements handled by the vector loop)
#   YMM13, YMM15, YMM14, YMM6 = four 4-lane accumulators
#   XMM0 = value returned
# RBX and R14 are saved/restored around the body and used as scratch.
# Branch operands are raw relative byte displacements, not labels.
# NOTE(review): every 256-bit operand in the loop is assembled from scalar
# vmovsd/vmovhpd loads whose addresses come from lanes extracted out of vector
# index registers; the loop contains no wide (packed) load from the array.
.text
Filename: reduce.jl
Source line: 33
push RBP
mov RBP, RSP
Source line: 33
push R14
push RBX
# Align the stack to 32 bytes and reserve 160 bytes (constant spills plus one YMM temporary).
and RSP, -32
sub RSP, 160
# Trip count: end = (RSI + 2 > RDX) ? RSI + 1 : RDX; R10 = end - (RSI + 2) + 1.
# Both the subtraction and the +1 are overflow-checked; the jo targets are
# presumably the throw sequence at the end of this listing.
lea R8, QWORD PTR [RSI + 1]
lea RAX, QWORD PTR [RSI + 2]
cmp RAX, RDX
cmovg RDX, R8
sub RDX, RAX
jo 683
lea R10, QWORD PTR [RDX + 1]
add RDX, 1
jo 669
Source line: 210
# XMM0 = a[RSI] + a[RSI+1] (1-based; the -8 displacement converts the index).
mov RAX, QWORD PTR [RDI + 8]
vmovsd XMM0, QWORD PTR [RAX + 8*RSI - 8]
vaddsd XMM0, XMM0, QWORD PTR [RAX + 8*RSI]
Source line: 33
# Nothing more to add when R10 <= 0; the branch presumably goes to the epilogue.
test R10, R10
jle 633
# YMM13 = { XMM0 scalar, 0, 0, 0 }: the running sum becomes lane 0 of the first accumulator.
vxorpd YMM15, YMM15, YMM15
vmovsd XMM0, XMM15, XMM0
vinsertf128 YMM13, YMM15, XMM0, 0
Source line: 212
# RCX = element-data pointer; R9 = R10 & -16. When R9 == 0 the vector loop is
# skipped (target is presumably the "xor R9D, R9D" fallback further down).
mov RCX, QWORD PTR [RDI + 8]
mov R9, R10
and R9, -16
je 528
# YMM11 = RSI + 1 broadcast to four 64-bit lanes; YMM15 = 0; RDI = 0 (loop index).
vmovq XMM0, R8
vmovlhps XMM0, XMM0, XMM0 # xmm0 = xmm0[0,0]
vinsertf128 YMM11, YMM0, XMM0, 1
vxorpd YMM15, YMM15, YMM15
xor EDI, EDI
# Seven 16-byte constants are fetched from absolute addresses and spilled to
# [RSP + 16] .. [RSP + 112]. They are presumably the per-lane index offset
# pairs for the 16 elements of one iteration -- their values are not visible
# in this dump.
movabs RAX, 139889361538016
vmovaps XMM0, XMMWORD PTR [RAX]
vmovaps XMMWORD PTR [RSP + 112], XMM0
movabs RAX, 139889361538032
vmovaps XMM0, XMMWORD PTR [RAX]
vmovaps XMMWORD PTR [RSP + 96], XMM0
movabs RAX, 139889361538048
vmovaps XMM0, XMMWORD PTR [RAX]
vmovaps XMMWORD PTR [RSP + 80], XMM0
movabs RAX, 139889361538064
vmovaps XMM0, XMMWORD PTR [RAX]
vmovaps XMMWORD PTR [RSP + 64], XMM0
movabs RAX, 139889361538080
movabs RBX, 139889361538096
movabs RDX, 139889361538112
vmovaps XMM0, XMMWORD PTR [RBX]
vmovaps XMMWORD PTR [RSP + 48], XMM0
vmovaps XMM0, XMMWORD PTR [RAX]
vmovaps XMMWORD PTR [RSP + 32], XMM0
vmovapd XMM0, XMMWORD PTR [RDX]
vmovapd XMMWORD PTR [RSP + 16], XMM0
# R8 = 1 is later shifted into the high qword (vmovq + vpslldq) to build the
# offset pair {0, 1} in a register instead of loading it.
mov R8D, 1
vxorpd YMM14, YMM14, YMM14
vxorpd YMM6, YMM6, YMM6
# ---- vector loop (the backward "jne -346" presumably re-enters around here) ----
# XMM5 = current index broadcast to both qwords. Each vpaddq pair below forms
# two absolute element indices (index + offset pair + (RSI + 1)); the indices
# are moved lane by lane into general-purpose registers (vmovq / vpextrq) and
# used for scalar loads.
vmovq XMM0, RDI
vmovlhps XMM5, XMM0, XMM0 # xmm5 = xmm0[0,0]
vpaddq XMM0, XMM5, XMMWORD PTR [RSP + 112]
vpaddq XMM1, XMM5, XMMWORD PTR [RSP + 96]
vextractf128 XMM7, YMM11, 1
vpaddq XMM2, XMM7, XMM1
vpaddq XMM10, XMM7, XMM0
vpaddq XMM0, XMM5, XMMWORD PTR [RSP + 80]
vpaddq XMM0, XMM11, XMM0
vpextrq R14, XMM0, 1
vmovq R11, XMM0
vmovq RAX, XMM10
vmovq RBX, XMM2
vpaddq XMM0, XMM5, XMMWORD PTR [RSP + 64]
vpaddq XMM12, XMM7, XMM0
vmovq RDX, XMM12
vmovsd XMM9, QWORD PTR [RCX + 8*RDX]
vmovsd XMM8, QWORD PTR [RCX + 8*RBX]
vmovsd XMM3, QWORD PTR [RCX + 8*RAX]
vmovsd XMM0, QWORD PTR [RCX + 8*R11]
vpextrq R11, XMM2, 1
# Advance the loop index by 16 elements; the compare against R9 happens further down.
add RDI, 16
vmovhpd XMM0, XMM0, QWORD PTR [RCX + 8*R14]
# Spill one assembled pair to the stack temporary at [RSP + 128]; it is reloaded below.
vmovapd YMMWORD PTR [RSP + 128], YMM0
vpextrq R14, XMM10, 1
vpextrq RDX, XMM12, 1
vpaddq XMM0, XMM5, XMMWORD PTR [RSP + 48]
vpaddq XMM1, XMM5, XMMWORD PTR [RSP + 32]
vpaddq XMM1, XMM11, XMM1
vpextrq RBX, XMM1, 1
vmovq RAX, XMM1
vmovsd XMM1, QWORD PTR [RCX + 8*RAX]
vmovhpd XMM10, XMM1, QWORD PTR [RCX + 8*RBX]
vpaddq XMM4, XMM11, XMM0
vmovhpd XMM2, XMM9, QWORD PTR [RCX + 8*RDX]
vmovhpd XMM0, XMM8, QWORD PTR [RCX + 8*R11]
vmovhpd XMM1, XMM3, QWORD PTR [RCX + 8*R14]
vpaddq XMM3, XMM5, XMMWORD PTR [RSP + 16]
vpaddq XMM3, XMM7, XMM3
vpextrq RAX, XMM4, 1
vmovq RDX, XMM4
vmovsd XMM4, QWORD PTR [RCX + 8*RDX]
vmovhpd XMM7, XMM4, QWORD PTR [RCX + 8*RAX]
vmovq XMM4, R8
# Glue 128-bit halves into 256-bit operands for the packed adds.
vinsertf128 YMM1, YMM7, XMM1, 1
vinsertf128 YMM0, YMM10, XMM0, 1
vmovapd YMM7, YMMWORD PTR [RSP + 128]
vinsertf128 YMM2, YMM7, XMM2, 1
vpextrq RAX, XMM3, 1
vmovq RDX, XMM3
vpslldq XMM3, XMM4, 8
vpaddq XMM3, XMM5, XMM3
vmovsd XMM4, QWORD PTR [RCX + 8*RDX]
# Loop test is issued early; the vector instructions between here and the jne
# do not modify the flags it sets.
cmp R9, RDI
# Accumulate three of the four 4-lane groups.
vaddpd YMM6, YMM6, YMM2
vaddpd YMM14, YMM14, YMM0
vaddpd YMM15, YMM15, YMM1
vmovhpd XMM0, XMM4, QWORD PTR [RCX + 8*RAX]
vpaddq XMM1, XMM11, XMM3
vpextrq RAX, XMM1, 1
vmovq RDX, XMM1
vmovsd XMM1, QWORD PTR [RCX + 8*RDX]
vmovhpd XMM1, XMM1, QWORD PTR [RCX + 8*RAX]
vinsertf128 YMM0, YMM1, XMM0, 1
# Fourth group goes into the accumulator that was seeded with the initial scalar sum.
vaddpd YMM13, YMM13, YMM0
jne -346
# Loop exit: jump over the three fallback instructions below.
jmpq 12
# Fallback used when no full 16-element chunk exists: R9 = 0 and the two
# accumulators not yet initialised on that path are zeroed.
xor R9D, R9D
vxorpd YMM14, YMM14, YMM14
vxorpd YMM6, YMM6, YMM6
# Reduce the four accumulators to one, then add the high and low 128-bit halves
# and finish with a horizontal add so that XMM0's low lane holds the scalar sum.
vaddpd YMM0, YMM15, YMM13
vaddpd YMM0, YMM14, YMM0
vaddpd YMM0, YMM6, YMM0
vextractf128 XMM1, YMM0, 1
vaddpd YMM0, YMM0, YMM1
vhaddpd YMM0, YMM0, YMM0
# Scalar remainder: R10 = elements not covered by the vector loop; skip if zero.
sub R10, R9
je 25
# RCX = address of the first leftover element; add one double per iteration
# until R10 reaches zero.
add R9, RSI
lea RCX, QWORD PTR [RCX + 8*R9 + 8]
vaddsd XMM0, XMM0, QWORD PTR [RCX]
Source line: 41
add RCX, 8
dec R10
jne -17
Source line: 214
# Epilogue: drop the aligned frame, restore saved registers, and clear the
# upper YMM state before returning.
lea RSP, QWORD PTR [RBP - 16]
pop RBX
pop R14
pop RBP
vzeroupper
ret
Source line: 33
# Overflow path: loads a value from an absolute address into RDI, sets ESI = 33
# and calls an absolute address. Per the LLVM IR for this same function these
# are the jl_overflow_exception object and jl_throw_with_superfluous_argument;
# no code follows the call in this listing.
movabs RAX, 139889360410464
mov RDI, QWORD PTR [RAX]
movabs RAX, 139889345816880
mov ESI, 33
call RAX
julia> code_llvm(Base.sum_seq, (Vector{Float64}, Int, Int))
; LLVM IR for Base.sum_seq(Vector{Float64}, Int, Int) with #6928.
; %0 = array object (its element-data pointer is loaded from field slot 1),
; %1 and %2 = the two Int arguments; %1 is used as a 1-based start index and
; %2 as the end bound. Returns the sum as a double.
; Structure: overflow-checked trip count -> sum of the first two elements ->
; vectorized loop (16 elements per iteration in four <4 x double> accumulators)
; -> horizontal reduction -> scalar remainder loop.
; NOTE(review): in vector.body every lane address is produced by
; extractelement + getelementptr and read with a scalar `load double` that is
; then packed with insertelement; there is no <4 x double> load in this IR.
; NOTE(review): only the scalar remainder fadd (%122) carries the `fast` flag;
; the vector fadds (%112-%115) and the reduction fadds are printed without it.
define double @julia_sum_seq17654(%jl_value_t*, i64, i64) {
top:
; Clamp the end bound: %6 = (%1 + 2 > %2) ? %1 + 1 : %2, then compute
; %6 - (%1 + 2) with a signed-overflow check.
%3 = add i64 %1, 2, !dbg !575
%4 = icmp sgt i64 %3, %2, !dbg !575
%5 = add i64 %1, 1, !dbg !575
%6 = select i1 %4, i64 %5, i64 %2, !dbg !575
%7 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %6, i64 %3), !dbg !575
%8 = extractvalue { i64, i1 } %7, 1, !dbg !575
br i1 %8, label %fail, label %pass, !dbg !575
fail: ; preds = %top
; Subtraction overflowed: throw jl_overflow_exception (second argument is 33).
%9 = load %jl_value_t** @jl_overflow_exception, align 8, !dbg !575
call void @jl_throw_with_superfluous_argument(%jl_value_t* %9, i32 33), !dbg !575
unreachable, !dbg !575
pass: ; preds = %top
; Add 1 (overflow-checked) to obtain the element count, read later as %24.
%10 = extractvalue { i64, i1 } %7, 0, !dbg !575
%11 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %10, i64 1), !dbg !575
%12 = extractvalue { i64, i1 } %11, 1, !dbg !575
br i1 %12, label %fail1, label %pass2, !dbg !575
fail1: ; preds = %pass
; Addition overflowed: same throw as in %fail.
%13 = load %jl_value_t** @jl_overflow_exception, align 8, !dbg !575
call void @jl_throw_with_superfluous_argument(%jl_value_t* %13, i32 33), !dbg !575
unreachable, !dbg !575
pass2: ; preds = %pass
; %23 = sum of the first two elements (indices %1 - 1 and %1). If the remaining
; count %24 is < 1 that sum is returned directly via %L7.
%14 = add i64 %1, -1, !dbg !576
%15 = getelementptr inbounds %jl_value_t* %0, i64 1, i32 0, !dbg !576
%16 = load %jl_value_t** %15, align 8, !dbg !576, !tbaa %jtbaa_arrayptr
%17 = getelementptr %jl_value_t* %16, i64 %14, !dbg !576
%18 = bitcast %jl_value_t* %17 to double*, !dbg !576
%19 = load double* %18, align 8, !dbg !576, !tbaa %jtbaa_user
%20 = getelementptr %jl_value_t* %16, i64 %1, !dbg !576
%21 = bitcast %jl_value_t* %20 to double*, !dbg !576
%22 = load double* %21, align 8, !dbg !576, !tbaa %jtbaa_user
%23 = fadd double %19, %22, !dbg !576
%24 = extractvalue { i64, i1 } %11, 0, !dbg !575
%25 = icmp slt i64 %24, 1, !dbg !582
br i1 %25, label %L7, label %L.preheader, !dbg !582
L.preheader: ; preds = %pass2
; %n.vec = count rounded down to a multiple of 16. %27 = <%23, 0, 0, 0> seeds
; the first vector accumulator with the running scalar sum. With %n.vec == 0
; the vector loop is bypassed.
%sunkaddr = ptrtoint %jl_value_t* %0 to i64, !dbg !583
%sunkaddr30 = add i64 %sunkaddr, 8, !dbg !583
%sunkaddr31 = inttoptr i64 %sunkaddr30 to %jl_value_t**, !dbg !583
%26 = load %jl_value_t** %sunkaddr31, align 8, !dbg !583, !tbaa %jtbaa_arrayptr, !llvm.mem.parallel_loop_access !584
%n.vec = and i64 %24, -16
%cmp.zero = icmp eq i64 %n.vec, 0
%27 = insertelement <4 x double> <double undef, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double %23, i32 0
br i1 %cmp.zero, label %middle.block, label %vector.ph
vector.ph: ; preds = %L.preheader
; Splat of %5 (= %1 + 1), the base element index added to every lane.
%broadcast.splatinsert14 = insertelement <4 x i64> undef, i64 %5, i32 0
%broadcast.splat15 = shufflevector <4 x i64> %broadcast.splatinsert14, <4 x i64> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
; One iteration handles 16 elements: four index vectors (%index + 0..15, plus
; the base splat), sixteen per-lane address computations, sixteen scalar loads
; packed into four <4 x double> values, and four vector fadds.
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x double> [ %27, %vector.ph ], [ %112, %vector.body ]
%vec.phi8 = phi <4 x double> [ zeroinitializer, %vector.ph ], [ %113, %vector.body ]
%vec.phi9 = phi <4 x double> [ zeroinitializer, %vector.ph ], [ %114, %vector.body ]
%vec.phi10 = phi <4 x double> [ zeroinitializer, %vector.ph ], [ %115, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0
%broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
%induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
%induction11 = add <4 x i64> %broadcast.splat, <i64 4, i64 5, i64 6, i64 7>
%induction12 = add <4 x i64> %broadcast.splat, <i64 8, i64 9, i64 10, i64 11>
%induction13 = add <4 x i64> %broadcast.splat, <i64 12, i64 13, i64 14, i64 15>
%28 = add <4 x i64> %broadcast.splat15, %induction
%29 = add <4 x i64> %broadcast.splat15, %induction11
%30 = add <4 x i64> %broadcast.splat15, %induction12
%31 = add <4 x i64> %broadcast.splat15, %induction13
; Per-lane address extraction (lanes 0-3 of each of the four index vectors).
%32 = extractelement <4 x i64> %28, i32 0
%33 = getelementptr %jl_value_t* %26, i64 %32, !dbg !583
%34 = extractelement <4 x i64> %28, i32 1
%35 = getelementptr %jl_value_t* %26, i64 %34, !dbg !583
%36 = extractelement <4 x i64> %28, i32 2
%37 = getelementptr %jl_value_t* %26, i64 %36, !dbg !583
%38 = extractelement <4 x i64> %28, i32 3
%39 = getelementptr %jl_value_t* %26, i64 %38, !dbg !583
%40 = extractelement <4 x i64> %29, i32 0
%41 = getelementptr %jl_value_t* %26, i64 %40, !dbg !583
%42 = extractelement <4 x i64> %29, i32 1
%43 = getelementptr %jl_value_t* %26, i64 %42, !dbg !583
%44 = extractelement <4 x i64> %29, i32 2
%45 = getelementptr %jl_value_t* %26, i64 %44, !dbg !583
%46 = extractelement <4 x i64> %29, i32 3
%47 = getelementptr %jl_value_t* %26, i64 %46, !dbg !583
%48 = extractelement <4 x i64> %30, i32 0
%49 = getelementptr %jl_value_t* %26, i64 %48, !dbg !583
%50 = extractelement <4 x i64> %30, i32 1
%51 = getelementptr %jl_value_t* %26, i64 %50, !dbg !583
%52 = extractelement <4 x i64> %30, i32 2
%53 = getelementptr %jl_value_t* %26, i64 %52, !dbg !583
%54 = extractelement <4 x i64> %30, i32 3
%55 = getelementptr %jl_value_t* %26, i64 %54, !dbg !583
%56 = extractelement <4 x i64> %31, i32 0
%57 = getelementptr %jl_value_t* %26, i64 %56, !dbg !583
%58 = extractelement <4 x i64> %31, i32 1
%59 = getelementptr %jl_value_t* %26, i64 %58, !dbg !583
%60 = extractelement <4 x i64> %31, i32 2
%61 = getelementptr %jl_value_t* %26, i64 %60, !dbg !583
%62 = extractelement <4 x i64> %31, i32 3
%63 = getelementptr %jl_value_t* %26, i64 %62, !dbg !583
; Scalar loads packed lane by lane: %75, %87, %99 and %111 are the four
; <4 x double> operands.
%64 = bitcast %jl_value_t* %33 to double*
%65 = load double* %64, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%66 = insertelement <4 x double> undef, double %65, i32 0
%67 = bitcast %jl_value_t* %35 to double*
%68 = load double* %67, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%69 = insertelement <4 x double> %66, double %68, i32 1
%70 = bitcast %jl_value_t* %37 to double*
%71 = load double* %70, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%72 = insertelement <4 x double> %69, double %71, i32 2
%73 = bitcast %jl_value_t* %39 to double*
%74 = load double* %73, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%75 = insertelement <4 x double> %72, double %74, i32 3
%76 = bitcast %jl_value_t* %41 to double*
%77 = load double* %76, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%78 = insertelement <4 x double> undef, double %77, i32 0
%79 = bitcast %jl_value_t* %43 to double*
%80 = load double* %79, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%81 = insertelement <4 x double> %78, double %80, i32 1
%82 = bitcast %jl_value_t* %45 to double*
%83 = load double* %82, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%84 = insertelement <4 x double> %81, double %83, i32 2
%85 = bitcast %jl_value_t* %47 to double*
%86 = load double* %85, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%87 = insertelement <4 x double> %84, double %86, i32 3
%88 = bitcast %jl_value_t* %49 to double*
%89 = load double* %88, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%90 = insertelement <4 x double> undef, double %89, i32 0
%91 = bitcast %jl_value_t* %51 to double*
%92 = load double* %91, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%93 = insertelement <4 x double> %90, double %92, i32 1
%94 = bitcast %jl_value_t* %53 to double*
%95 = load double* %94, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%96 = insertelement <4 x double> %93, double %95, i32 2
%97 = bitcast %jl_value_t* %55 to double*
%98 = load double* %97, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%99 = insertelement <4 x double> %96, double %98, i32 3
%100 = bitcast %jl_value_t* %57 to double*
%101 = load double* %100, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%102 = insertelement <4 x double> undef, double %101, i32 0
%103 = bitcast %jl_value_t* %59 to double*
%104 = load double* %103, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%105 = insertelement <4 x double> %102, double %104, i32 1
%106 = bitcast %jl_value_t* %61 to double*
%107 = load double* %106, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%108 = insertelement <4 x double> %105, double %107, i32 2
%109 = bitcast %jl_value_t* %63 to double*
%110 = load double* %109, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%111 = insertelement <4 x double> %108, double %110, i32 3
; Accumulate and advance by 16; exit when %index.next == %n.vec.
%112 = fadd <4 x double> %vec.phi, %75
%113 = fadd <4 x double> %vec.phi8, %87
%114 = fadd <4 x double> %vec.phi9, %99
%115 = fadd <4 x double> %vec.phi10, %111
%index.next = add i64 %index, 16
%116 = icmp eq i64 %n.vec, %index.next
br i1 %116, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body, %L.preheader
; Reduce the four accumulators to one vector, then fold lanes (2,3 onto 0,1,
; then 1 onto 0) and extract lane 0 as the scalar %117. If the vector loop
; covered every element (%24 == %resume.val) return, else run the scalar tail.
%resume.val = phi i64 [ 0, %L.preheader ], [ %n.vec, %vector.body ]
%rdx.vec.exit.phi = phi <4 x double> [ %27, %L.preheader ], [ %112, %vector.body ]
%rdx.vec.exit.phi18 = phi <4 x double> [ zeroinitializer, %L.preheader ], [ %113, %vector.body ]
%rdx.vec.exit.phi19 = phi <4 x double> [ zeroinitializer, %L.preheader ], [ %114, %vector.body ]
%rdx.vec.exit.phi20 = phi <4 x double> [ zeroinitializer, %L.preheader ], [ %115, %vector.body ]
%bin.rdx = fadd <4 x double> %rdx.vec.exit.phi18, %rdx.vec.exit.phi
%bin.rdx21 = fadd <4 x double> %rdx.vec.exit.phi19, %bin.rdx
%bin.rdx22 = fadd <4 x double> %rdx.vec.exit.phi20, %bin.rdx21
%rdx.shuf = shufflevector <4 x double> %bin.rdx22, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%bin.rdx23 = fadd <4 x double> %bin.rdx22, %rdx.shuf
%rdx.shuf24 = shufflevector <4 x double> %bin.rdx23, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx25 = fadd <4 x double> %bin.rdx23, %rdx.shuf24
%117 = extractelement <4 x double> %bin.rdx25, i32 0
%cmp.n = icmp eq i64 %24, %resume.val
br i1 %cmp.n, label %L7, label %L.preheader26
L.preheader26: ; preds = %middle.block
; Scalar tail setup: pointer at index %resume.val + %1 + 1, counter = %24 - %resume.val.
%118 = add i64 %resume.val, %1, !dbg !583
%119 = add i64 %118, 1, !dbg !583
%scevgep = getelementptr %jl_value_t* %26, i64 %119
%120 = sub i64 %24, %resume.val, !dbg !583
br label %L, !dbg !583
L: ; preds = %L.preheader26, %L
; Scalar remainder loop: one load and one `fadd fast` per iteration, counting
; down to zero. The back-edge carries the already_vectorized loop metadata.
%lsr.iv29 = phi i64 [ %120, %L.preheader26 ], [ %lsr.iv.next, %L ], !dbg !586
%lsr.iv = phi %jl_value_t* [ %scevgep, %L.preheader26 ], [ %scevgep27, %L ]
%s.0 = phi double [ %122, %L ], [ %117, %L.preheader26 ]
%lsr.iv28 = bitcast %jl_value_t* %lsr.iv to double*
%121 = load double* %lsr.iv28, align 8, !dbg !583, !tbaa %jtbaa_user, !llvm.mem.parallel_loop_access !584
%122 = fadd fast double %s.0, %121, !dbg !583
%scevgep27 = getelementptr %jl_value_t* %lsr.iv, i64 1, !dbg !586
%lsr.iv.next = add i64 %lsr.iv29, -1, !dbg !586
%exitcond = icmp eq i64 %lsr.iv.next, 0, !dbg !586
br i1 %exitcond, label %L7, label %L, !dbg !586, !llvm.loop.parallel !585, !llvm.vectorizer.already_vectorized !585
L7: ; preds = %L, %middle.block, %pass2
; Single return point: the result comes from the early exit (%23), the vector
; reduction (%117) or the scalar tail (%122).
%s.2 = phi double [ %23, %pass2 ], [ %117, %middle.block ], [ %122, %L ]
ret double %s.2, !dbg !587
}