@ffreyer · Created September 2, 2022 17:13
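Context: `reflectorApply!` is LinearAlgebra's Householder-reflector kernel (the workhorse of QR factorization); the dumps below show its native and typed code specialized for `SubArray{Float64, …}` arguments. The gist does not include the method definition itself. Judging from the `LayoutPointers`/`VectorizationBase` `llvmcall`s in the `@code_typed` output, it is a LoopVectorization-based variant; the following is a minimal sketch of what such a definition might look like — argument names and loop structure are assumptions inferred from the IR, not the gist's actual code:

# Hypothetical sketch (not the gist's actual definition): the outer column loop
# stays plain Julia, matching the IR, while the two inner row loops are
# vectorized with @turbo. Real eltype assumed, so conj(τ) == τ.
using LoopVectorization

function reflectorApply!(x::AbstractVector, τ::Number, A::AbstractMatrix)
    m, n = size(A)
    for j in 1:n
        # Fused dot product: vAj = A[1, j] + x[2:m] ⋅ A[2:m, j]
        vAj = A[1, j]
        @turbo for i in 2:m
            vAj += x[i] * A[i, j]
        end
        vAj = τ * vAj
        A[1, j] -= vAj
        # Rank-1 update of the remaining rows of column j
        @turbo for i in 2:m
            A[i, j] -= x[i] * vAj
        end
    end
    return A
end

In the assembly below, the two `@turbo` loops correspond to the `vfmadd231pd` reduction blocks (dot product) and the `vfnmadd213pd` store blocks (column update), each with masked `vmaskmovpd` tails for remainders shorter than a full 4×-unrolled AVX2 stride.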
julia> @code_native debuginfo = :none syntax = :intel reflectorApply!(x, τ, y)
.text
push rbp
mov rbp, rsp
push r15
push r14
push r13
push r12
push rbx
and rsp, -32
sub rsp, 384
vmovq qword ptr [rsp + 224], xmm0
mov qword ptr [rsp + 48], rdx
mov qword ptr [rsp + 144], rsi
mov qword ptr [rsp + 160], rdi
vpxor xmm0, xmm0, xmm0
vmovdqa ymmword ptr [rsp + 288], ymm0
mov rax, qword ptr fs:[0]
mov rdx, qword ptr [rax - 8]
mov qword ptr [rsp + 288], 8
mov rax, qword ptr [rdx]
mov qword ptr [rsp + 296], rax
lea rax, [rsp + 288]
mov qword ptr [rsp + 152], rdx
mov qword ptr [rdx], rax
mov rdi, qword ptr [rcx + 32]
mov qword ptr [rsp + 32], rcx
mov rcx, qword ptr [rcx + 24]
mov rax, rcx
mov qword ptr [rsp + 80], rcx
sub rdi, rcx
inc rdi
test rdi, rdi
jle L1813
mov rcx, qword ptr [rsp + 32]
mov rax, qword ptr [rcx + 8]
mov rdx, qword ptr [rcx + 16]
sub rdx, rax
inc rdx
mov rsi, rdx
sar rsi, 63
andn r11, rsi, rdx
mov rdx, rdi
sar rdx, 63
andn rdx, rdx, rdi
mov qword ptr [rsp + 216], rdx
mov r8, qword ptr [rcx]
mov rdi, qword ptr [r8 + 24]
mov r15, qword ptr [rsp + 48]
mov r9, qword ptr [r15]
mov rcx, qword ptr [r15 + 24]
dec rcx
imul rcx, qword ptr [r9 + 24]
mov rdx, qword ptr [rsp + 80]
lea r10, [rdx - 1]
mov qword ptr [rsp + 200], rdi
imul r10, rdi
lea rdi, [rax + r10]
dec rdi
shl rdi, 3
mov qword ptr [rsp + 96], rdi
lea r13, [r11 - 2]
cmp r11, 18
setl dil
cmp r13, 15
setne bl
and bl, dil
mov byte ptr [rsp + 23], bl
lea rdx, [r11 - 1]
mov r14, rdx
and r14, -16
and rdx, -8
lea ebx, [r11 + 2]
and ebx, 3
xor esi, esi
cmp r11, 1
mov edi, 3
cmovg rdi, rbx
vmovq xmm0, rdi
cmovle r14, rsi
cmovle rdx, rsi
mov qword ptr [rsp + 64], rdx
lea rdx, [rax - 1]
mov qword ptr [rsp + 184], rdx
shl rax, 3
lea rax, [rax + 8*r10]
add rax, 96
mov qword ptr [rsp + 88], rax
mov rax, qword ptr [r15 + 8]
lea rdx, [rax + rcx]
shl rax, 3
lea rax, [rax + 8*rcx]
mov qword ptr [rsp + 192], r9
mov rcx, qword ptr [r9]
lea rdx, [rcx + 8*rdx]
mov qword ptr [rsp + 24], rdx
lea rbx, [rcx + rax + 96]
movabs rax, offset .rodata.cst8
vpbroadcastq ymm1, qword ptr [rax]
movabs rax, offset .rodata.cst32
vmovdqa ymm2, ymmword ptr [rax]
vpbroadcastq ymm0, xmm0
vpxor ymm0, ymm0, ymm1
lea rax, [r11 - 6]
mov qword ptr [rsp + 56], rax
lea rax, [r11 - 10]
mov qword ptr [rsp + 176], rax
add r11, -14
mov qword ptr [rsp + 168], r11
vpcmpgtq ymm1, ymm2, ymm0
vpcmpeqd ymm0, ymm0, ymm0
vmovdqa ymmword ptr [rsp + 320], ymm1
vpxor ymm0, ymm1, ymm0
vmovdqa ymmword ptr [rsp + 256], ymm0
mov qword ptr [rsp + 208], r8
mov rax, qword ptr [r8]
mov qword ptr [rsp + 112], rax
mov r15d, 1
mov qword ptr [rsp + 72], r13
jmp L632
nop dword ptr [rax + rax]
L560:
mov rcx, qword ptr [rsp + 24]
vmovapd ymm3, ymmword ptr [rsp + 256]
vmaskmovpd ymm0, ymm3, ymmword ptr [rcx + 8*r14]
vmaskmovpd ymm1, ymm3, ymmword ptr [rax + 8*r14]
vbroadcastsd ymm2, xmm7
vfnmadd213pd ymm2, ymm0, ymm1 # ymm2 = -(ymm0 * ymm2) + ymm1
vmaskmovpd ymmword ptr [rax + 8*r14], ymm3, ymm2
L602:
mov rcx, qword ptr [rsp + 248]
lea r15, [rcx + 1]
add rsi, 8
cmp rcx, qword ptr [rsp + 216]
je L1813
L632:
mov qword ptr [rsp + 120], rsi
mov rax, qword ptr [rsp + 80]
lea rcx, [r15 + rax]
add rcx, -2
imul rcx, qword ptr [rsp + 200]
add rcx, qword ptr [rsp + 184]
mov rax, qword ptr [rsp + 112]
mov qword ptr [rsp + 40], rcx
vmovsd xmm0, qword ptr [rax + 8*rcx] # xmm0 = mem[0],zero
vmovsd qword ptr [rsp + 128], xmm0
mov rax, qword ptr [rsp + 192]
mov qword ptr [rsp + 312], rax
mov r13, qword ptr [rsp + 208]
mov qword ptr [rsp + 304], r13
movabs r12, 140286483749744
mov rdi, r12
mov rsi, qword ptr [rsp + 48]
movabs rax, offset StrideIndex
vzeroupper
call rax
mov r13, qword ptr [r13]
mov rax, qword ptr [rsp + 96]
add rax, r13
mov qword ptr [rsp + 104], rax
mov rdi, r12
mov rsi, qword ptr [rsp + 32]
movabs rax, offset StrideIndex
call rax
mov qword ptr [rsp + 248], r15
lea rsi, [8*r15 - 8]
mov rcx, rsi
imul rcx, rax
cmp byte ptr [rsp + 23], 0
mov qword ptr [rsp + 240], r13
je L880
vxorpd xmm0, xmm0, xmm0
xor eax, eax
vpxor xmm1, xmm1, xmm1
mov rdx, qword ptr [rsp + 64]
vmovsd xmm5, qword ptr [rsp + 128] # xmm5 = mem[0],zero
mov rdi, qword ptr [rsp + 104]
jmp L1017
nop word ptr cs:[rax + rax]
L880:
mov rdx, qword ptr [rsp + 88]
add rdx, r13
imul rax, qword ptr [rsp + 120]
add rax, rdx
vpxor xmm1, xmm1, xmm1
vxorpd xmm0, xmm0, xmm0
vpxor xmm2, xmm2, xmm2
vxorpd xmm3, xmm3, xmm3
xor edx, edx
nop word ptr cs:[rax + rax]
L928:
vmovupd ymm4, ymmword ptr [rbx + 8*rdx - 96]
vmovupd ymm5, ymmword ptr [rbx + 8*rdx - 64]
vmovupd ymm6, ymmword ptr [rbx + 8*rdx - 32]
vmovupd ymm7, ymmword ptr [rbx + 8*rdx]
vfmadd231pd ymm3, ymm4, ymmword ptr [rax + 8*rdx - 96] # ymm3 = (ymm4 * mem) + ymm3
vfmadd231pd ymm2, ymm5, ymmword ptr [rax + 8*rdx - 64] # ymm2 = (ymm5 * mem) + ymm2
vfmadd231pd ymm0, ymm6, ymmword ptr [rax + 8*rdx - 32] # ymm0 = (ymm6 * mem) + ymm0
vfmadd231pd ymm1, ymm7, ymmword ptr [rax + 8*rdx] # ymm1 = (ymm7 * mem) + ymm1
add rdx, 16
cmp r14, rdx
jne L928
vaddpd ymm0, ymm3, ymm0
vaddpd ymm1, ymm2, ymm1
mov rax, r14
mov rdx, qword ptr [rsp + 64]
vmovsd xmm5, qword ptr [rsp + 128] # xmm5 = mem[0],zero
mov rdi, qword ptr [rsp + 104]
L1017:
add rcx, rdi
add rcx, 8
cmp rax, rdx
jne L1040
mov rax, rdx
mov r13, qword ptr [rsp + 72]
jmp L1078
nop
L1040:
mov rdx, qword ptr [rsp + 24]
vmovupd ymm2, ymmword ptr [rdx + 8*rax]
vmovupd ymm3, ymmword ptr [rdx + 8*rax + 32]
vfmadd231pd ymm0, ymm2, ymmword ptr [rcx + 8*rax] # ymm0 = (ymm2 * mem) + ymm0
vfmadd231pd ymm1, ymm3, ymmword ptr [rcx + 8*rax + 32] # ymm1 = (ymm3 * mem) + ymm1
or rax, 8
mov r13, qword ptr [rsp + 72]
L1078:
cmp rax, r13
mov qword ptr [rsp + 232], rsi
jle L1104
mov rcx, qword ptr [rsp + 40]
jmp L1232
nop dword ptr [rax]
L1104:
cmp qword ptr [rsp + 56], rax
jge L1168
mov rdx, qword ptr [rsp + 24]
vmovapd ymm3, ymmword ptr [rsp + 256]
vmaskmovpd ymm2, ymm3, ymmword ptr [rdx + 8*rax]
vmaskmovpd ymm3, ymm3, ymmword ptr [rcx + 8*rax]
vfmadd213pd ymm3, ymm2, ymm0 # ymm3 = (ymm2 * ymm3) + ymm0
vmovapd ymm2, ymmword ptr [rsp + 320]
vblendvpd ymm0, ymm3, ymm0, ymm2
mov rcx, qword ptr [rsp + 40]
jmp L1232
nop dword ptr [rax]
L1168:
mov rdx, qword ptr [rsp + 24]
vmovapd ymm3, ymmword ptr [rsp + 256]
vmaskmovpd ymm2, ymm3, ymmword ptr [rdx + 8*rax + 32]
vmaskmovpd ymm3, ymm3, ymmword ptr [rcx + 8*rax + 32]
vmovupd ymm4, ymmword ptr [rdx + 8*rax]
vfmadd231pd ymm0, ymm4, ymmword ptr [rcx + 8*rax] # ymm0 = (ymm4 * mem) + ymm0
vfmadd213pd ymm3, ymm2, ymm1 # ymm3 = (ymm2 * ymm3) + ymm1
vmovapd ymm2, ymmword ptr [rsp + 320]
vblendvpd ymm1, ymm3, ymm1, ymm2
mov rcx, qword ptr [rsp + 40]
L1232:
vaddpd ymm0, ymm0, ymm1
vextractf128 xmm1, ymm0, 1
vaddpd xmm0, xmm0, xmm1
vpermilpd xmm1, xmm0, 1 # xmm1 = xmm0[1,0]
vaddsd xmm0, xmm0, xmm1
vaddsd xmm0, xmm5, xmm0
vmulsd xmm1, xmm0, qword ptr [rsp + 224]
vmovapd xmmword ptr [rsp + 128], xmm1
mov rax, qword ptr [rsp + 112]
vmovsd xmm0, qword ptr [rax + 8*rcx] # xmm0 = mem[0],zero
vsubsd xmm0, xmm0, xmm1
vmovsd qword ptr [rax + 8*rcx], xmm0
movabs r12, 140286483749744
mov rdi, r12
mov rsi, qword ptr [rsp + 32]
movabs rax, offset StrideIndex
vzeroupper
call rax
mov r15, rax
mov rdi, r12
mov rsi, qword ptr [rsp + 48]
movabs rax, offset StrideIndex
call rax
vmovapd xmm7, xmmword ptr [rsp + 128]
test r14, r14
mov rsi, qword ptr [rsp + 120]
mov rdx, qword ptr [rsp + 240]
je L1490
vbroadcastsd ymm0, xmm7
mov rax, qword ptr [rsp + 88]
lea rcx, [rdx + rax]
mov rax, r15
imul rax, rsi
add rax, rcx
xor ecx, ecx
nop
L1408:
vmovupd ymm1, ymmword ptr [rbx + 8*rcx - 96]
vmovupd ymm2, ymmword ptr [rbx + 8*rcx - 64]
vmovupd ymm3, ymmword ptr [rbx + 8*rcx - 32]
vmovupd ymm4, ymmword ptr [rbx + 8*rcx]
vfnmadd213pd ymm1, ymm0, ymmword ptr [rax + 8*rcx - 96] # ymm1 = -(ymm0 * ymm1) + mem
vfnmadd213pd ymm2, ymm0, ymmword ptr [rax + 8*rcx - 64] # ymm2 = -(ymm0 * ymm2) + mem
vfnmadd213pd ymm3, ymm0, ymmword ptr [rax + 8*rcx - 32] # ymm3 = -(ymm0 * ymm3) + mem
vfnmadd213pd ymm4, ymm0, ymmword ptr [rax + 8*rcx] # ymm4 = -(ymm0 * ymm4) + mem
vmovupd ymmword ptr [rax + 8*rcx - 96], ymm1
vmovupd ymmword ptr [rax + 8*rcx - 64], ymm2
vmovupd ymmword ptr [rax + 8*rcx - 32], ymm3
vmovupd ymmword ptr [rax + 8*rcx], ymm4
add rcx, 16
cmp r14, rcx
jne L1408
L1490:
cmp r14, r13
jg L602
mov rcx, qword ptr [rsp + 232]
imul rcx, r15
mov rax, qword ptr [rsp + 96]
add rax, rdx
add rax, 8
add rax, rcx
cmp qword ptr [rsp + 56], r14
jl L560
cmp qword ptr [rsp + 176], r14
jge L1615
mov rcx, qword ptr [rsp + 24]
vmovapd ymm4, ymmword ptr [rsp + 256]
vmaskmovpd ymm0, ymm4, ymmword ptr [rcx + 8*r14 + 32]
vmovupd ymm1, ymmword ptr [rcx + 8*r14]
vmaskmovpd ymm2, ymm4, ymmword ptr [rax + 8*r14 + 32]
vbroadcastsd ymm3, xmm7
vfnmadd213pd ymm1, ymm3, ymmword ptr [rax + 8*r14] # ymm1 = -(ymm3 * ymm1) + mem
vfnmadd231pd ymm2, ymm3, ymm0 # ymm2 = -(ymm3 * ymm0) + ymm2
vmovupd ymmword ptr [rax + 8*r14], ymm1
vmaskmovpd ymmword ptr [rax + 8*r14 + 32], ymm4, ymm2
jmp L602
L1615:
mov rcx, qword ptr [rsp + 24]
vmovupd ymm1, ymmword ptr [rcx + 8*r14]
vmovupd ymm0, ymmword ptr [rcx + 8*r14 + 32]
cmp qword ptr [rsp + 168], r14
jge L1720
lea rcx, [rcx + 8*r14]
add rcx, 64
vmovapd ymm5, ymmword ptr [rsp + 256]
vmaskmovpd ymm2, ymm5, ymmword ptr [rcx]
vmaskmovpd ymm3, ymm5, ymmword ptr [rax + 8*r14 + 64]
vbroadcastsd ymm4, xmm7
vfnmadd213pd ymm1, ymm4, ymmword ptr [rax + 8*r14] # ymm1 = -(ymm4 * ymm1) + mem
vfnmadd213pd ymm0, ymm4, ymmword ptr [rax + 8*r14 + 32] # ymm0 = -(ymm4 * ymm0) + mem
vfnmadd231pd ymm3, ymm4, ymm2 # ymm3 = -(ymm4 * ymm2) + ymm3
vmovupd ymmword ptr [rax + 8*r14], ymm1
vmovupd ymmword ptr [rax + 8*r14 + 32], ymm0
vmaskmovpd ymmword ptr [rax + 8*r14 + 64], ymm5, ymm3
jmp L602
L1720:
lea rcx, [rcx + 8*r14]
vmovapd ymm6, ymmword ptr [rsp + 256]
vmaskmovpd ymm2, ymm6, ymmword ptr [rcx + 96]
vmaskmovpd ymm3, ymm6, ymmword ptr [rax + 8*r14 + 96]
vbroadcastsd ymm4, xmm7
vfnmadd213pd ymm1, ymm4, ymmword ptr [rax + 8*r14] # ymm1 = -(ymm4 * ymm1) + mem
vfnmadd213pd ymm0, ymm4, ymmword ptr [rax + 8*r14 + 32] # ymm0 = -(ymm4 * ymm0) + mem
vmovupd ymm5, ymmword ptr [rcx + 64]
vfnmadd213pd ymm5, ymm4, ymmword ptr [rax + 8*r14 + 64] # ymm5 = -(ymm4 * ymm5) + mem
vfnmadd231pd ymm3, ymm4, ymm2 # ymm3 = -(ymm4 * ymm2) + ymm3
vmovupd ymmword ptr [rax + 8*r14], ymm1
vmovupd ymmword ptr [rax + 8*r14 + 32], ymm0
vmovupd ymmword ptr [rax + 8*r14 + 64], ymm5
vmaskmovpd ymmword ptr [rax + 8*r14 + 96], ymm6, ymm3
jmp L602
L1813:
mov rcx, qword ptr [rsp + 32]
mov rax, qword ptr [rcx]
mov rdx, qword ptr [rsp + 144]
mov qword ptr [rdx], rax
vmovups ymm0, ymmword ptr [rcx]
vmovups ymm1, ymmword ptr [rcx + 24]
mov rax, qword ptr [rsp + 160]
vmovups ymmword ptr [rax + 24], ymm1
vmovups ymmword ptr [rax], ymm0
mov rcx, qword ptr [rsp + 296]
mov rdx, qword ptr [rsp + 152]
mov qword ptr [rdx], rcx
lea rsp, [rbp - 40]
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
vzeroupper
ret
nop word ptr [rax + rax]
julia> @code_typed reflectorApply!(x, τ, y)
CodeInfo(
1 ─── %1 = Base.getfield(A, :indices)::Tuple{UnitRange{Int64}, UnitRange{Int64}}
│ %2 = Core.getfield(%1, 1)::UnitRange{Int64}
│ %3 = Core.getfield(%1, 2)::UnitRange{Int64}
│ %4 = Base.getfield(%2, :stop)::Int64
│ %5 = Base.getfield(%2, :start)::Int64
│ %6 = Base.sub_int(%4, %5)::Int64
│ %7 = Base.add_int(%6, 1)::Int64
│ %8 = Base.slt_int(%7, 0)::Bool
│ %9 = Base.ifelse(%8, 0, %7)::Int64
│ %10 = Base.getfield(%3, :stop)::Int64
│ %11 = Base.getfield(%3, :start)::Int64
│ %12 = Base.sub_int(%10, %11)::Int64
│ %13 = Base.add_int(%12, 1)::Int64
│ %14 = Base.slt_int(%13, 0)::Bool
│ %15 = Base.ifelse(%14, 0, %13)::Int64
│ %16 = Base.sle_int(1, %15)::Bool
│ %17 = Base.ifelse(%16, %15, 0)::Int64
│ %18 = Base.slt_int(%17, 1)::Bool
└──── goto #3 if not %18
2 ─── Base.nothing::Nothing
└──── goto #4
3 ─── goto #4
4 ┄── %23 = φ (#2 => true, #3 => false)::Bool
│ %24 = φ (#3 => 1)::Int64
│ %25 = φ (#3 => 1)::Int64
│ %26 = Base.not_int(%23)::Bool
└──── goto #151 if not %26
5 ┄── %28 = φ (#4 => %24, #150 => %682)::Int64
│ %29 = φ (#4 => %25, #150 => %683)::Int64
└──── goto #10 if not false
6 ─── %31 = Core.tuple(1, %28)::Tuple{Int64, Int64}
│ %32 = Base.getfield(A, :indices)::Tuple{UnitRange{Int64}, UnitRange{Int64}}
│ %33 = Core.getfield(%32, 1)::UnitRange{Int64}
│ %34 = Core.getfield(%32, 2)::UnitRange{Int64}
│ %35 = Base.getfield(%33, :stop)::Int64
│ %36 = Base.getfield(%33, :start)::Int64
│ %37 = Base.sub_int(%35, %36)::Int64
│ %38 = Base.add_int(%37, 1)::Int64
│ %39 = Base.slt_int(%38, 0)::Bool
│ %40 = Base.ifelse(%39, 0, %38)::Int64
│ %41 = Base.getfield(%34, :stop)::Int64
│ %42 = Base.getfield(%34, :start)::Int64
│ %43 = Base.sub_int(%41, %42)::Int64
│ %44 = Base.add_int(%43, 1)::Int64
│ %45 = Base.slt_int(%44, 0)::Bool
│ %46 = Base.ifelse(%45, 0, %44)::Int64
│ %47 = Base.sle_int(1, 1)::Bool
│ %48 = Base.sle_int(1, %40)::Bool
│ %49 = Base.and_int(%47, %48)::Bool
│ %50 = Base.sle_int(1, %28)::Bool
│ %51 = Base.sle_int(%28, %46)::Bool
│ %52 = Base.and_int(%50, %51)::Bool
│ %53 = Base.and_int(%52, true)::Bool
│ %54 = Base.and_int(%49, %53)::Bool
└──── goto #8 if not %54
7 ─── Base.nothing::Nothing
└──── goto #9
8 ─── invoke Base.throw_boundserror(A::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, %31::Tuple{Int64, Int64})::Union{}
└──── unreachable
9 ─── nothing::Nothing
10 ┄─ %61 = Base.getfield(A, :indices)::Tuple{UnitRange{Int64}, UnitRange{Int64}}
│ %62 = Base.getfield(%61, 1, false)::UnitRange{Int64}
│ %63 = Base.getfield(%62, :start)::Int64
│ %64 = Base.sub_int(1, 1)::Int64
│ %65 = Base.add_int(%63, %64)::Int64
└──── goto #19 if not false
11 ── %67 = Base.slt_int(0, 1)::Bool
└──── goto #15 if not %67
12 ── %69 = Base.getfield(%62, :stop)::Int64
│ %70 = Base.sle_int(%65, %69)::Bool
└──── goto #14 if not %70
13 ── %72 = Base.getfield(%62, :start)::Int64
│ %73 = Base.sle_int(%72, %65)::Bool
└──── goto #16
14 ── goto #16
15 ── goto #16
16 ┄─ %77 = φ (#13 => %73, #14 => false, #15 => false)::Bool
└──── goto #18 if not %77
17 ── goto #19
18 ── invoke Base.throw_boundserror(%62::UnitRange{Int64}, 1::Int64)::Union{}
└──── unreachable
19 ┄─ goto #20
20 ── %83 = Core.getfield(%61, 2)::UnitRange{Int64}
│ %84 = Base.getfield(%83, :start)::Int64
│ %85 = Base.sub_int(%28, 1)::Int64
│ %86 = Base.add_int(%84, %85)::Int64
└──── goto #29 if not false
21 ── %88 = Base.slt_int(0, %28)::Bool
└──── goto #25 if not %88
22 ── %90 = Base.getfield(%83, :stop)::Int64
│ %91 = Base.sle_int(%86, %90)::Bool
└──── goto #24 if not %91
23 ── %93 = Base.getfield(%83, :start)::Int64
│ %94 = Base.sle_int(%93, %86)::Bool
└──── goto #26
24 ── goto #26
25 ── goto #26
26 ┄─ %98 = φ (#23 => %94, #24 => false, #25 => false)::Bool
└──── goto #28 if not %98
27 ── goto #29
28 ── invoke Base.throw_boundserror(%83::UnitRange{Int64}, %28::Int64)::Union{}
└──── unreachable
29 ┄─ goto #30
30 ── goto #31
31 ── goto #32
32 ── %106 = Base.getfield(A, :parent)::Matrix{Float64}
│ %107 = Base.arrayref(false, %106, %65, %86)::Float64
└──── goto #33
33 ── %109 = Base.slt_int(%9, 2)::Bool
└──── goto #35 if not %109
34 ── goto #36
35 ── goto #36
36 ┄─ goto #37
37 ── goto #38
38 ── %115 = Base.getfield(x, :parent)::Matrix{Float64}
│ %116 = $(Expr(:foreigncall, :(:jl_array_ptr), Ptr{Float64}, svec(Any), 0, :(:ccall), :(%115)))::Ptr{Float64}
│ %117 = Base.getfield(x, :parent)::Matrix{Float64}
│ %118 = Base.getfield(x, :indices)::Tuple{UnitRange{Int64}, Int64}
│ %119 = LayoutPointers.getfield(%118, 1, false)::UnitRange{Int64}
│ %120 = Base.getfield(%119, :start)::Int64
│ %121 = Base.sub_int(%120, 1)::Int64
│ %122 = Core.getfield(%118, 2)::Int64
│ %123 = Base.sub_int(%122, 1)::Int64
│ %124 = Base.arraysize(%117, 1)::Int64
│ Base.arraysize(%117, 2)::Int64
│ %126 = Base.mul_int(1, %124)::Int64
│ %127 = Base.mul_int(%121, 1)::Int64
│ %128 = Base.mul_int(%123, %126)::Int64
│ %129 = Base.add_int(%127, %128)::Int64
│ %130 = Base.mul_int(8, %129)::Int64
│ %131 = Core.bitcast(Core.UInt, %116)::UInt64
│ %132 = Base.bitcast(UInt64, %130)::UInt64
│ %133 = Base.add_ptr(%131, %132)::UInt64
│ %134 = Core.bitcast(Ptr{Float64}, %133)::Ptr{Float64}
│ invoke LayoutPointers.StrideIndex(x::SubArray{Float64, 1, Matrix{Float64}, Tuple{UnitRange{Int64}, Int64}, true})::ArrayInterface.StrideIndex{1, (1,), 1, Tuple{Static.StaticInt{1}}, Tuple{Static.StaticInt{1}}}
│ %136 = Base.getfield(A, :parent)::Matrix{Float64}
│ %137 = $(Expr(:foreigncall, :(:jl_array_ptr), Ptr{Float64}, svec(Any), 0, :(:ccall), :(%136)))::Ptr{Float64}
│ %138 = Base.getfield(A, :parent)::Matrix{Float64}
│ %139 = Base.getfield(A, :indices)::Tuple{UnitRange{Int64}, UnitRange{Int64}}
│ %140 = LayoutPointers.getfield(%139, 1, false)::UnitRange{Int64}
│ %141 = Base.getfield(%140, :start)::Int64
│ %142 = Base.sub_int(%141, 1)::Int64
│ %143 = Core.getfield(%139, 2)::UnitRange{Int64}
│ %144 = Base.getfield(%143, :start)::Int64
│ %145 = Base.sub_int(%144, 1)::Int64
│ %146 = Base.arraysize(%138, 1)::Int64
│ Base.arraysize(%138, 2)::Int64
│ %148 = Base.mul_int(1, %146)::Int64
│ %149 = Base.mul_int(%142, 1)::Int64
│ %150 = Base.mul_int(%145, %148)::Int64
│ %151 = Base.add_int(%149, %150)::Int64
│ %152 = Base.mul_int(8, %151)::Int64
│ %153 = Core.bitcast(Core.UInt, %137)::UInt64
│ %154 = Base.bitcast(UInt64, %152)::UInt64
│ %155 = Base.add_ptr(%153, %154)::UInt64
│ %156 = Core.bitcast(Ptr{Float64}, %155)::Ptr{Float64}
│ %157 = invoke LayoutPointers.StrideIndex(A::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false})::ArrayInterface.StrideIndex{2, (1, 2), 1, Tuple{Static.StaticInt{1}, Int64}, Tuple{Static.StaticInt{1}, Static.StaticInt{1}}}
│ %158 = Base.getfield(%157, :strides)::Tuple{Static.StaticInt{1}, Int64}
│ %159 = Core.getfield(%158, 2)::Int64
│ %160 = Base.mul_int(8, %159)::Int64
│ %161 = Base.llvmcall("%res = sub nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, %28, 1)::Int64
│ %162 = Base.llvmcall("%res = mul nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, %161, %160)::Int64
│ %163 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to i8*\n%ptr.1 = getelementptr inbounds i8, i8* %ptr.0, i64 %1\n%ptr.2 = ptrtoint i8* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %156, %162)::Ptr{Float64}
│ %164 = Base.llvmcall((" \n\n define i64 @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}}, %134)::Ptr{Float64}
│ %165 = Base.llvmcall((" \n\n define i64 @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}}, %163)::Ptr{Float64}
│ %166 = $(Expr(:gc_preserve_begin, :(%115), :(%136)))
│ %167 = Base.sub_int(%9, 2)::Int64
│ %168 = Base.slt_int(%167, 0)::Bool
└──── goto #40 if not %168
39 ── goto #41
40 ── %171 = Base.sub_int(%167, 0)::Int64
│ %172 = Base.add_int(%171, 1)::Int64
└──── goto #41
41 ┄─ %174 = φ (#39 => 0, #40 => %172)::Int64
└──── goto #42
42 ── goto #43
43 ── %177 = Base.slt_int(0, %167)::Bool
│ %178 = (0 === %167)::Bool
│ %179 = Base.or_int(%177, %178)::Bool
│ Base.llvmcall((" declare void @llvm.assume(i1)\n\n define void @entry(i8) alwaysinline {\n top:\n %b = trunc i8 %0 to i1\ncall void @llvm.assume(i1 %b)\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Bool}, %179)::Nothing
│ %181 = Base.slt_int(15, %167)::Bool
│ %182 = (15 === %167)::Bool
│ %183 = Base.or_int(%181, %182)::Bool
└──── goto #52 if not %183
44 ── %185 = Base.llvmcall("ret <4 x double> zeroinitializer", NTuple{4, VecElement{Float64}}, Tuple{})::NTuple{4, VecElement{Float64}}
│ %186 = Base.llvmcall("ret <4 x double> zeroinitializer", NTuple{4, VecElement{Float64}}, Tuple{})::NTuple{4, VecElement{Float64}}
│ %187 = Base.llvmcall("ret <4 x double> zeroinitializer", NTuple{4, VecElement{Float64}}, Tuple{})::NTuple{4, VecElement{Float64}}
└──── %188 = Base.llvmcall("ret <4 x double> zeroinitializer", NTuple{4, VecElement{Float64}}, Tuple{})::NTuple{4, VecElement{Float64}}
45 ┄─ %189 = φ (#44 => 0, #50 => %214)::Int64
│ %190 = φ (#44 => true, #50 => %224)::Bool
│ %191 = φ (#44 => %185, #50 => %210)::NTuple{4, VecElement{Float64}}
│ %192 = φ (#44 => %186, #50 => %211)::NTuple{4, VecElement{Float64}}
│ %193 = φ (#44 => %187, #50 => %212)::NTuple{4, VecElement{Float64}}
│ %194 = φ (#44 => %188, #50 => %213)::NTuple{4, VecElement{Float64}}
│ %195 = φ (#44 => %185, #50 => %210)::NTuple{4, VecElement{Float64}}
│ %196 = φ (#44 => %187, #50 => %212)::NTuple{4, VecElement{Float64}}
│ %197 = φ (#44 => %186, #50 => %211)::NTuple{4, VecElement{Float64}}
│ %198 = φ (#44 => %188, #50 => %213)::NTuple{4, VecElement{Float64}}
└──── goto #51 if not %190
46 ── %200 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %164, %189)::Ptr{Float64}
│ %201 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.1, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %200)::NTuple{4, VecElement{Float64}}
│ %202 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %200)::NTuple{4, VecElement{Float64}}
│ %203 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 8\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %200)::NTuple{4, VecElement{Float64}}
│ %204 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 12\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %200)::NTuple{4, VecElement{Float64}}
│ %205 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %165, %189)::Ptr{Float64}
│ %206 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.1, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %205)::NTuple{4, VecElement{Float64}}
│ %207 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %205)::NTuple{4, VecElement{Float64}}
│ %208 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 8\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %205)::NTuple{4, VecElement{Float64}}
│ %209 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 12\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %205)::NTuple{4, VecElement{Float64}}
│ %210 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %201, %206, %191)::NTuple{4, VecElement{Float64}}
│ %211 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %202, %207, %192)::NTuple{4, VecElement{Float64}}
│ %212 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %203, %208, %193)::NTuple{4, VecElement{Float64}}
│ %213 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %204, %209, %194)::NTuple{4, VecElement{Float64}}
│ %214 = Base.llvmcall("%res = add nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, 16, %189)::Int64
│ %215 = Base.slt_int(%167, 0)::Bool
└──── goto #48 if not %215
47 ── goto #49
48 ── %218 = Base.sub_int(%167, 0)::Int64
│ %219 = Base.add_int(%218, 1)::Int64
└──── goto #49
49 ┄─ %221 = φ (#47 => 0, #48 => %219)::Int64
│ %222 = Base.and_int(%221, -16)::Int64
│ %223 = (%214 === %222)::Bool
│ %224 = Base.not_int(%223)::Bool
└──── goto #50
50 ── goto #45
51 ── %227 = Base.llvmcall("%res = fadd nsz contract <4 x double> %0, %1\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %195, %196)::NTuple{4, VecElement{Float64}}
│ %228 = Base.llvmcall("%res = fadd nsz contract <4 x double> %0, %1\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %197, %198)::NTuple{4, VecElement{Float64}}
└──── goto #53
52 ── %230 = Base.llvmcall("ret <4 x double> zeroinitializer", NTuple{4, VecElement{Float64}}, Tuple{})::NTuple{4, VecElement{Float64}}
└──── %231 = Base.llvmcall("ret <4 x double> zeroinitializer", NTuple{4, VecElement{Float64}}, Tuple{})::NTuple{4, VecElement{Float64}}
53 ┄─ %232 = φ (#51 => %189, #52 => 0)::Int64
│ %233 = φ (#51 => %227, #52 => %230)::NTuple{4, VecElement{Float64}}
│ %234 = φ (#51 => %228, #52 => %231)::NTuple{4, VecElement{Float64}}
│ %235 = φ (#51 => %227, #52 => %230)::NTuple{4, VecElement{Float64}}
│ %236 = φ (#51 => %227, #52 => %230)::NTuple{4, VecElement{Float64}}
│ %237 = φ (#51 => %227, #52 => %230)::NTuple{4, VecElement{Float64}}
│ %238 = φ (#51 => %228, #52 => %231)::NTuple{4, VecElement{Float64}}
│ %239 = φ (#51 => %228, #52 => %231)::NTuple{4, VecElement{Float64}}
│ %240 = φ (#51 => %227, #52 => %230)::NTuple{4, VecElement{Float64}}
│ %241 = φ (#51 => %228, #52 => %231)::NTuple{4, VecElement{Float64}}
│ %242 = φ (#51 => %228, #52 => %231)::NTuple{4, VecElement{Float64}}
│ %243 = Base.slt_int(%167, 0)::Bool
└──── goto #55 if not %243
54 ── goto #56
55 ── %246 = Base.sub_int(%167, 0)::Int64
│ %247 = Base.add_int(%246, 1)::Int64
└──── goto #56
56 ┄─ %249 = φ (#54 => 0, #55 => %247)::Int64
│ %250 = Base.and_int(%249, -8)::Int64
│ %251 = (%232 === %250)::Bool
│ %252 = Base.not_int(%251)::Bool
└──── goto #57
57 ── goto #59 if not %252
58 ── %255 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %164, %232)::Ptr{Float64}
│ %256 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.1, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %255)::NTuple{4, VecElement{Float64}}
│ %257 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %255)::NTuple{4, VecElement{Float64}}
│ %258 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %165, %232)::Ptr{Float64}
│ %259 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.1, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %258)::NTuple{4, VecElement{Float64}}
│ %260 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %258)::NTuple{4, VecElement{Float64}}
│ %261 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %256, %259, %233)::NTuple{4, VecElement{Float64}}
│ %262 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %257, %260, %234)::NTuple{4, VecElement{Float64}}
└──── %263 = Base.llvmcall("%res = add nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, 8, %232)::Int64
59 ┄─ %264 = φ (#58 => %261, #57 => %235)::NTuple{4, VecElement{Float64}}
│ %265 = φ (#58 => %261, #57 => %236)::NTuple{4, VecElement{Float64}}
│ %266 = φ (#58 => %261, #57 => %237)::NTuple{4, VecElement{Float64}}
│ %267 = φ (#58 => %262, #57 => %238)::NTuple{4, VecElement{Float64}}
│ %268 = φ (#58 => %262, #57 => %239)::NTuple{4, VecElement{Float64}}
│ %269 = φ (#58 => %261, #57 => %240)::NTuple{4, VecElement{Float64}}
│ %270 = φ (#58 => %262, #57 => %241)::NTuple{4, VecElement{Float64}}
│ %271 = φ (#58 => %262, #57 => %242)::NTuple{4, VecElement{Float64}}
│ %272 = φ (#58 => %263, #57 => %232)::Int64
│ %273 = Base.sle_int(%272, %167)::Bool
└──── goto #63 if not %273
60 ── %275 = Base.bitcast(UInt64, %174)::UInt64
│ %276 = Base.llvmcall("%res = sub nsw nuw i64 %0, %1\nret i64 %res", UInt64, Tuple{UInt64, UInt64}, %275, 0x0000000000000001)::UInt64
│ %277 = Base.and_int(%276, 0x0000000000000003)::UInt64
│ %278 = Base.llvmcall(" %ie = insertelement <4 x i64> undef, i64 %0, i32 0\n %v = shufflevector <4 x i64> %ie, <4 x i64> undef, <4 x i32> zeroinitializer\n ret <4 x i64> %v\n", NTuple{4, VecElement{UInt64}}, Tuple{UInt64}, %277)::NTuple{4, VecElement{UInt64}}
│ %279 = Base.llvmcall(" %ie = insertelement <4 x i64> undef, i64 %0, i32 0\n %v = shufflevector <4 x i64> %ie, <4 x i64> undef, <4 x i32> zeroinitializer\n %res = add nsw <4 x i64> %v, <i64 0, i64 1, i64 2, i64 3>\n ret <4 x i64> %res\n", NTuple{4, VecElement{UInt64}}, Tuple{UInt64}, 0x0000000000000000)::NTuple{4, VecElement{UInt64}}
│ %280 = Base.llvmcall("%m = icmp uge <4 x i64> %0, %1\n%restrunc.0 = bitcast <4 x i1> %m to i4\n%res.0 = zext i4 %restrunc.0 to i8\nret i8 %res.0", VectorizationBase.UInt8, Tuple{NTuple{4, VecElement{UInt64}}, NTuple{4, VecElement{UInt64}}}, %278, %279)::UInt8
│ Base.llvmcall("%res = add nsw nuw i64 %0, %1\nret i64 %res", UInt64, Tuple{UInt64, UInt64}, %277, 0x0000000000000001)::UInt64
│ %282 = Base.llvmcall("%res = sub nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, %167, 4)::Int64
│ %283 = Base.slt_int(%282, %272)::Bool
└──── goto #62 if not %283
61 ── %285 = Base.llvmcall("%res = mul nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, 8, %272)::Int64
│ %286 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)\n\n define <4 x double> @entry(i64, i64, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to i8*\n%ptr.1 = getelementptr inbounds i8, i8* %ptr.0, i64 %1\n%ptr.2 = bitcast i8* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %2 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\n%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0, <4 x double> zeroinitializer), !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}, Int64, UInt8}, %164, %285, %280)::NTuple{4, VecElement{Float64}}
│ %287 = Base.llvmcall("%res = mul nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, 8, %272)::Int64
│ %288 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)\n\n define <4 x double> @entry(i64, i64, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to i8*\n%ptr.1 = getelementptr inbounds i8, i8* %ptr.0, i64 %1\n%ptr.2 = bitcast i8* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %2 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\n%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0, <4 x double> zeroinitializer), !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}, Int64, UInt8}, %165, %287, %280)::NTuple{4, VecElement{Float64}}
│ %289 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %286, %288, %264)::NTuple{4, VecElement{Float64}}
│ %290 = Base.llvmcall("%masktrunc.0 = trunc i8 %0 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\n%res = select nsz arcp contract reassoc <4 x i1> %mask.0, <4 x double> %1, <4 x double> %2\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{UInt8, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %280, %289, %265)::NTuple{4, VecElement{Float64}}
│ Base.llvmcall("%res = add nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, 4, %272)::Int64
└──── goto #63
62 ── %293 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %164, %272)::Ptr{Float64}
│ %294 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.1, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %293)::NTuple{4, VecElement{Float64}}
│ %295 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)\n\n define <4 x double> @entry(i64, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %1 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\n%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0, <4 x double> zeroinitializer), !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}, UInt8}, %293, %280)::NTuple{4, VecElement{Float64}}
│ %296 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %165, %272)::Ptr{Float64}
│ %297 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.1, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %296)::NTuple{4, VecElement{Float64}}
│ %298 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)\n\n define <4 x double> @entry(i64, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %1 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\n%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0, <4 x double> zeroinitializer), !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}, UInt8}, %296, %280)::NTuple{4, VecElement{Float64}}
│ %299 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %294, %297, %266)::NTuple{4, VecElement{Float64}}
│ %300 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %295, %298, %267)::NTuple{4, VecElement{Float64}}
│ %301 = Base.llvmcall("%masktrunc.0 = trunc i8 %0 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\n%res = select nsz arcp contract reassoc <4 x i1> %mask.0, <4 x double> %1, <4 x double> %2\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{UInt8, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %280, %300, %268)::NTuple{4, VecElement{Float64}}
└──── Base.llvmcall("%res = add nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, 8, %272)::Int64
63 ┄─ %303 = φ (#61 => %290, #62 => %299, #59 => %269)::NTuple{4, VecElement{Float64}}
│ %304 = φ (#61 => %271, #62 => %301, #59 => %270)::NTuple{4, VecElement{Float64}}
│ %305 = Base.llvmcall("%res = fadd nsz contract <4 x double> %0, %1\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %303, %304)::NTuple{4, VecElement{Float64}}
└──── goto #64
64 ── $(Expr(:gc_preserve_end, :(%166)))
│ %308 = Base.llvmcall((" declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)\n\n define double @entry(double, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc double @llvm.vector.reduce.fadd.v4f64(double %0, <4 x double> %1)\nret double %res\n }\n", "entry"), VectorizationBase.Float64, Tuple{Float64, NTuple{4, VecElement{Float64}}}, %107, %305)::Float64
│ %309 = Base.mul_float(τ, %308)::Float64
└──── goto #69 if not false
65 ── %311 = Core.tuple(1, %28)::Tuple{Int64, Int64}
│ %312 = Base.getfield(A, :indices)::Tuple{UnitRange{Int64}, UnitRange{Int64}}
│ %313 = Core.getfield(%312, 1)::UnitRange{Int64}
│ %314 = Core.getfield(%312, 2)::UnitRange{Int64}
│ %315 = Base.getfield(%313, :stop)::Int64
│ %316 = Base.getfield(%313, :start)::Int64
│ %317 = Base.sub_int(%315, %316)::Int64
│ %318 = Base.add_int(%317, 1)::Int64
│ %319 = Base.slt_int(%318, 0)::Bool
│ %320 = Base.ifelse(%319, 0, %318)::Int64
│ %321 = Base.getfield(%314, :stop)::Int64
│ %322 = Base.getfield(%314, :start)::Int64
│ %323 = Base.sub_int(%321, %322)::Int64
│ %324 = Base.add_int(%323, 1)::Int64
│ %325 = Base.slt_int(%324, 0)::Bool
│ %326 = Base.ifelse(%325, 0, %324)::Int64
│ %327 = Base.sle_int(1, 1)::Bool
│ %328 = Base.sle_int(1, %320)::Bool
│ %329 = Base.and_int(%327, %328)::Bool
│ %330 = Base.sle_int(1, %28)::Bool
│ %331 = Base.sle_int(%28, %326)::Bool
│ %332 = Base.and_int(%330, %331)::Bool
│ %333 = Base.and_int(%332, true)::Bool
│ %334 = Base.and_int(%329, %333)::Bool
└──── goto #67 if not %334
66 ── Base.nothing::Nothing
└──── goto #68
67 ── invoke Base.throw_boundserror(A::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, %311::Tuple{Int64, Int64})::Union{}
└──── unreachable
68 ── nothing::Nothing
69 ┄─ %341 = Base.getfield(A, :indices)::Tuple{UnitRange{Int64}, UnitRange{Int64}}
│ %342 = Base.getfield(%341, 1, false)::UnitRange{Int64}
│ %343 = Base.getfield(%342, :start)::Int64
│ %344 = Base.sub_int(1, 1)::Int64
│ %345 = Base.add_int(%343, %344)::Int64
└──── goto #78 if not false
70 ── %347 = Base.slt_int(0, 1)::Bool
└──── goto #74 if not %347
71 ── %349 = Base.getfield(%342, :stop)::Int64
│ %350 = Base.sle_int(%345, %349)::Bool
└──── goto #73 if not %350
72 ── %352 = Base.getfield(%342, :start)::Int64
│ %353 = Base.sle_int(%352, %345)::Bool
└──── goto #75
73 ── goto #75
74 ── goto #75
75 ┄─ %357 = φ (#72 => %353, #73 => false, #74 => false)::Bool
└──── goto #77 if not %357
76 ── goto #78
77 ── invoke Base.throw_boundserror(%342::UnitRange{Int64}, 1::Int64)::Union{}
└──── unreachable
78 ┄─ goto #79
79 ── %363 = Core.getfield(%341, 2)::UnitRange{Int64}
│ %364 = Base.getfield(%363, :start)::Int64
│ %365 = Base.sub_int(%28, 1)::Int64
│ %366 = Base.add_int(%364, %365)::Int64
└──── goto #88 if not false
80 ── %368 = Base.slt_int(0, %28)::Bool
└──── goto #84 if not %368
81 ── %370 = Base.getfield(%363, :stop)::Int64
│ %371 = Base.sle_int(%366, %370)::Bool
└──── goto #83 if not %371
82 ── %373 = Base.getfield(%363, :start)::Int64
│ %374 = Base.sle_int(%373, %366)::Bool
└──── goto #85
83 ── goto #85
84 ── goto #85
85 ┄─ %378 = φ (#82 => %374, #83 => false, #84 => false)::Bool
└──── goto #87 if not %378
86 ── goto #88
87 ── invoke Base.throw_boundserror(%363::UnitRange{Int64}, %28::Int64)::Union{}
└──── unreachable
88 ┄─ goto #89
89 ── goto #90
90 ── goto #91
91 ── %386 = Base.getfield(A, :parent)::Matrix{Float64}
│ %387 = Base.arrayref(false, %386, %345, %366)::Float64
└──── goto #92
92 ── %389 = Base.sub_float(%387, %309)::Float64
└──── goto #97 if not false
93 ── %391 = Core.tuple(1, %28)::Tuple{Int64, Int64}
│ %392 = Base.getfield(A, :indices)::Tuple{UnitRange{Int64}, UnitRange{Int64}}
│ %393 = Core.getfield(%392, 1)::UnitRange{Int64}
│ %394 = Core.getfield(%392, 2)::UnitRange{Int64}
│ %395 = Base.getfield(%393, :stop)::Int64
│ %396 = Base.getfield(%393, :start)::Int64
│ %397 = Base.sub_int(%395, %396)::Int64
│ %398 = Base.add_int(%397, 1)::Int64
│ %399 = Base.slt_int(%398, 0)::Bool
│ %400 = Base.ifelse(%399, 0, %398)::Int64
│ %401 = Base.getfield(%394, :stop)::Int64
│ %402 = Base.getfield(%394, :start)::Int64
│ %403 = Base.sub_int(%401, %402)::Int64
│ %404 = Base.add_int(%403, 1)::Int64
│ %405 = Base.slt_int(%404, 0)::Bool
│ %406 = Base.ifelse(%405, 0, %404)::Int64
│ %407 = Base.sle_int(1, 1)::Bool
│ %408 = Base.sle_int(1, %400)::Bool
│ %409 = Base.and_int(%407, %408)::Bool
│ %410 = Base.sle_int(1, %28)::Bool
│ %411 = Base.sle_int(%28, %406)::Bool
│ %412 = Base.and_int(%410, %411)::Bool
│ %413 = Base.and_int(%412, true)::Bool
│ %414 = Base.and_int(%409, %413)::Bool
└──── goto #95 if not %414
94 ── Base.nothing::Nothing
└──── goto #96
95 ── invoke Base.throw_boundserror(A::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, %391::Tuple{Int64, Int64})::Union{}
└──── unreachable
96 ── nothing::Nothing
97 ┄─ %421 = Base.getfield(A, :indices)::Tuple{UnitRange{Int64}, UnitRange{Int64}}
│ %422 = Base.getfield(%421, 1, false)::UnitRange{Int64}
│ %423 = Base.getfield(%422, :start)::Int64
│ %424 = Base.sub_int(1, 1)::Int64
│ %425 = Base.add_int(%423, %424)::Int64
└──── goto #106 if not false
98 ── %427 = Base.slt_int(0, 1)::Bool
└──── goto #102 if not %427
99 ── %429 = Base.getfield(%422, :stop)::Int64
│ %430 = Base.sle_int(%425, %429)::Bool
└──── goto #101 if not %430
100 ─ %432 = Base.getfield(%422, :start)::Int64
│ %433 = Base.sle_int(%432, %425)::Bool
└──── goto #103
101 ─ goto #103
102 ─ goto #103
103 ┄ %437 = φ (#100 => %433, #101 => false, #102 => false)::Bool
└──── goto #105 if not %437
104 ─ goto #106
105 ─ invoke Base.throw_boundserror(%422::UnitRange{Int64}, 1::Int64)::Union{}
└──── unreachable
106 ┄ goto #107
107 ─ %443 = Core.getfield(%421, 2)::UnitRange{Int64}
│ %444 = Base.getfield(%443, :start)::Int64
│ %445 = Base.sub_int(%28, 1)::Int64
│ %446 = Base.add_int(%444, %445)::Int64
└──── goto #116 if not false
108 ─ %448 = Base.slt_int(0, %28)::Bool
└──── goto #112 if not %448
109 ─ %450 = Base.getfield(%443, :stop)::Int64
│ %451 = Base.sle_int(%446, %450)::Bool
└──── goto #111 if not %451
110 ─ %453 = Base.getfield(%443, :start)::Int64
│ %454 = Base.sle_int(%453, %446)::Bool
└──── goto #113
111 ─ goto #113
112 ─ goto #113
113 ┄ %458 = φ (#110 => %454, #111 => false, #112 => false)::Bool
└──── goto #115 if not %458
114 ─ goto #116
115 ─ invoke Base.throw_boundserror(%443::UnitRange{Int64}, %28::Int64)::Union{}
└──── unreachable
116 ┄ goto #117
117 ─ goto #118
118 ─ goto #119
119 ─ %466 = Base.getfield(A, :parent)::Matrix{Float64}
│ Base.arrayset(false, %466, %389, %425, %446)::Matrix{Float64}
└──── goto #120
120 ─ %469 = Base.slt_int(%9, 2)::Bool
└──── goto #122 if not %469
121 ─ goto #123
122 ─ goto #123
123 ┄ goto #124
124 ─ goto #125
125 ─ %475 = Base.getfield(A, :parent)::Matrix{Float64}
│ %476 = $(Expr(:foreigncall, :(:jl_array_ptr), Ptr{Float64}, svec(Any), 0, :(:ccall), :(%475)))::Ptr{Float64}
│ %477 = Base.getfield(A, :parent)::Matrix{Float64}
│ %478 = Base.getfield(A, :indices)::Tuple{UnitRange{Int64}, UnitRange{Int64}}
│ %479 = LayoutPointers.getfield(%478, 1, false)::UnitRange{Int64}
│ %480 = Base.getfield(%479, :start)::Int64
│ %481 = Base.sub_int(%480, 1)::Int64
│ %482 = Core.getfield(%478, 2)::UnitRange{Int64}
│ %483 = Base.getfield(%482, :start)::Int64
│ %484 = Base.sub_int(%483, 1)::Int64
│ %485 = Base.arraysize(%477, 1)::Int64
│ Base.arraysize(%477, 2)::Int64
│ %487 = Base.mul_int(1, %485)::Int64
│ %488 = Base.mul_int(%481, 1)::Int64
│ %489 = Base.mul_int(%484, %487)::Int64
│ %490 = Base.add_int(%488, %489)::Int64
│ %491 = Base.mul_int(8, %490)::Int64
│ %492 = Core.bitcast(Core.UInt, %476)::UInt64
│ %493 = Base.bitcast(UInt64, %491)::UInt64
│ %494 = Base.add_ptr(%492, %493)::UInt64
│ %495 = Core.bitcast(Ptr{Float64}, %494)::Ptr{Float64}
│ %496 = invoke LayoutPointers.StrideIndex(A::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false})::ArrayInterface.StrideIndex{2, (1, 2), 1, Tuple{Static.StaticInt{1}, Int64}, Tuple{Static.StaticInt{1}, Static.StaticInt{1}}}
│ %497 = Base.getfield(%496, :strides)::Tuple{Static.StaticInt{1}, Int64}
│ %498 = Core.getfield(%497, 2)::Int64
│ %499 = Base.mul_int(8, %498)::Int64
│ %500 = Base.llvmcall("%res = sub nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, %28, 1)::Int64
│ %501 = Base.llvmcall("%res = mul nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, %500, %499)::Int64
│ Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to i8*\n%ptr.1 = getelementptr inbounds i8, i8* %ptr.0, i64 %1\n%ptr.2 = ptrtoint i8* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %495, %501)::Ptr{Float64}
│ %503 = Base.getfield(x, :parent)::Matrix{Float64}
│ %504 = $(Expr(:foreigncall, :(:jl_array_ptr), Ptr{Float64}, svec(Any), 0, :(:ccall), :(%503)))::Ptr{Float64}
│ %505 = Base.getfield(x, :parent)::Matrix{Float64}
│ %506 = Base.getfield(x, :indices)::Tuple{UnitRange{Int64}, Int64}
│ %507 = LayoutPointers.getfield(%506, 1, false)::UnitRange{Int64}
│ %508 = Base.getfield(%507, :start)::Int64
│ %509 = Base.sub_int(%508, 1)::Int64
│ %510 = Core.getfield(%506, 2)::Int64
│ %511 = Base.sub_int(%510, 1)::Int64
│ %512 = Base.arraysize(%505, 1)::Int64
│ Base.arraysize(%505, 2)::Int64
│ %514 = Base.mul_int(1, %512)::Int64
│ %515 = Base.mul_int(%509, 1)::Int64
│ %516 = Base.mul_int(%511, %514)::Int64
│ %517 = Base.add_int(%515, %516)::Int64
│ %518 = Base.mul_int(8, %517)::Int64
│ %519 = Core.bitcast(Core.UInt, %504)::UInt64
│ %520 = Base.bitcast(UInt64, %518)::UInt64
│ %521 = Base.add_ptr(%519, %520)::UInt64
│ %522 = Core.bitcast(Ptr{Float64}, %521)::Ptr{Float64}
│ invoke LayoutPointers.StrideIndex(x::SubArray{Float64, 1, Matrix{Float64}, Tuple{UnitRange{Int64}, Int64}, true})::ArrayInterface.StrideIndex{1, (1,), 1, Tuple{Static.StaticInt{1}}, Tuple{Static.StaticInt{1}}}
│ %524 = Base.llvmcall("%res = sub nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, %28, 1)::Int64
│ %525 = Base.llvmcall("%res = mul nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, %524, %499)::Int64
│ %526 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to i8*\n%ptr.1 = getelementptr inbounds i8, i8* %ptr.0, i64 %1\n%ptr.2 = ptrtoint i8* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %495, %525)::Ptr{Float64}
│ %527 = Base.llvmcall((" \n\n define i64 @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}}, %522)::Ptr{Float64}
│ %528 = Base.llvmcall((" \n\n define i64 @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}}, %526)::Ptr{Float64}
│ %529 = $(Expr(:gc_preserve_begin, :(%503), :(%475)))
│ %530 = Base.sub_int(%9, 2)::Int64
│ %531 = Base.slt_int(%530, 0)::Bool
└──── goto #127 if not %531
126 ─ goto #128
127 ─ %534 = Base.sub_int(%530, 0)::Int64
│ %535 = Base.add_int(%534, 1)::Int64
└──── goto #128
128 ┄ %537 = φ (#126 => 0, #127 => %535)::Int64
└──── goto #129
129 ─ goto #130
130 ─ %540 = Base.slt_int(0, %530)::Bool
│ %541 = (0 === %530)::Bool
│ %542 = Base.or_int(%540, %541)::Bool
└──── Base.llvmcall((" declare void @llvm.assume(i1)\n\n define void @entry(i8) alwaysinline {\n top:\n %b = trunc i8 %0 to i1\ncall void @llvm.assume(i1 %b)\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Bool}, %542)::Nothing
131 ┄ %544 = φ (#130 => 0, #136 => %581)::Int64
│ %545 = Base.slt_int(%530, 0)::Bool
└──── goto #133 if not %545
132 ─ goto #134
133 ─ %548 = Base.sub_int(%530, 0)::Int64
│ %549 = Base.add_int(%548, 1)::Int64
└──── goto #134
134 ┄ %551 = φ (#132 => 0, #133 => %549)::Int64
│ %552 = Base.and_int(%551, -16)::Int64
│ %553 = (%544 === %552)::Bool
│ %554 = Base.not_int(%553)::Bool
└──── goto #135
135 ─ goto #137 if not %554
136 ─ %557 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %527, %544)::Ptr{Float64}
│ %558 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.1, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %557)::NTuple{4, VecElement{Float64}}
│ %559 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %557)::NTuple{4, VecElement{Float64}}
│ %560 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 8\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %557)::NTuple{4, VecElement{Float64}}
│ %561 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 12\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %557)::NTuple{4, VecElement{Float64}}
│ %562 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %528, %544)::Ptr{Float64}
│ %563 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.1, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %562)::NTuple{4, VecElement{Float64}}
│ %564 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %562)::NTuple{4, VecElement{Float64}}
│ %565 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 8\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %562)::NTuple{4, VecElement{Float64}}
│ %566 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 12\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %562)::NTuple{4, VecElement{Float64}}
│ %567 = Base.llvmcall("%res = fneg nsz arcp contract afn reassoc <4 x double> %0\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}}, %558)::NTuple{4, VecElement{Float64}}
│ %568 = Base.llvmcall("%res = fneg nsz arcp contract afn reassoc <4 x double> %0\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}}, %559)::NTuple{4, VecElement{Float64}}
│ %569 = Base.llvmcall("%res = fneg nsz arcp contract afn reassoc <4 x double> %0\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}}, %560)::NTuple{4, VecElement{Float64}}
│ %570 = Base.llvmcall("%res = fneg nsz arcp contract afn reassoc <4 x double> %0\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}}, %561)::NTuple{4, VecElement{Float64}}
│ %571 = Base.llvmcall(" %ie = insertelement <4 x double> undef, double %0, i32 0\n %v = shufflevector <4 x double> %ie, <4 x double> undef, <4 x i32> zeroinitializer\n ret <4 x double> %v\n", NTuple{4, VecElement{Float64}}, Tuple{Float64}, %309)::NTuple{4, VecElement{Float64}}
│ %572 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %567, %571, %563)::NTuple{4, VecElement{Float64}}
│ %573 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %568, %571, %564)::NTuple{4, VecElement{Float64}}
│ %574 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %569, %571, %565)::NTuple{4, VecElement{Float64}}
│ %575 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %570, %571, %566)::NTuple{4, VecElement{Float64}}
│ %576 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %528, %544)::Ptr{Float64}
│ Base.llvmcall((" \n\n define void @entry(i64, <4 x double>) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\nstore <4 x double> %1, <4 x double>* %ptr.1, align 8\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Ptr{Float64}, NTuple{4, VecElement{Float64}}}, %576, %572)::Nothing
│ Base.llvmcall((" \n\n define void @entry(i64, <4 x double>) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\nstore <4 x double> %1, <4 x double>* %ptr.2, align 8\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Ptr{Float64}, NTuple{4, VecElement{Float64}}}, %576, %573)::Nothing
│ Base.llvmcall((" \n\n define void @entry(i64, <4 x double>) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 8\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\nstore <4 x double> %1, <4 x double>* %ptr.2, align 8\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Ptr{Float64}, NTuple{4, VecElement{Float64}}}, %576, %574)::Nothing
│ Base.llvmcall((" \n\n define void @entry(i64, <4 x double>) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 12\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\nstore <4 x double> %1, <4 x double>* %ptr.2, align 8\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Ptr{Float64}, NTuple{4, VecElement{Float64}}}, %576, %575)::Nothing
│ %581 = Base.llvmcall("%res = add nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, 16, %544)::Int64
└──── goto #131
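# blocks #137–#145: remainder handling — %590 is a 4-lane tail mask built from
# ((%537 - 1) & 3); depending on how many elements remain (≤4, ≤8, ≤12, ≤16),
# zero to three full <4 x double> operations run, followed by one masked
# load / fmuladd / store for the final partial vector.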
137 ─ %583 = Base.sle_int(%544, %530)::Bool
└──── goto #145 if not %583
138 ─ %585 = Base.bitcast(UInt64, %537)::UInt64
│ %586 = Base.llvmcall("%res = sub nsw nuw i64 %0, %1\nret i64 %res", UInt64, Tuple{UInt64, UInt64}, %585, 0x0000000000000001)::UInt64
│ %587 = Base.and_int(%586, 0x0000000000000003)::UInt64
│ %588 = Base.llvmcall(" %ie = insertelement <4 x i64> undef, i64 %0, i32 0\n %v = shufflevector <4 x i64> %ie, <4 x i64> undef, <4 x i32> zeroinitializer\n ret <4 x i64> %v\n", NTuple{4, VecElement{UInt64}}, Tuple{UInt64}, %587)::NTuple{4, VecElement{UInt64}}
│ %589 = Base.llvmcall(" %ie = insertelement <4 x i64> undef, i64 %0, i32 0\n %v = shufflevector <4 x i64> %ie, <4 x i64> undef, <4 x i32> zeroinitializer\n %res = add nsw <4 x i64> %v, <i64 0, i64 1, i64 2, i64 3>\n ret <4 x i64> %res\n", NTuple{4, VecElement{UInt64}}, Tuple{UInt64}, 0x0000000000000000)::NTuple{4, VecElement{UInt64}}
│ %590 = Base.llvmcall("%m = icmp uge <4 x i64> %0, %1\n%restrunc.0 = bitcast <4 x i1> %m to i4\n%res.0 = zext i4 %restrunc.0 to i8\nret i8 %res.0", VectorizationBase.UInt8, Tuple{NTuple{4, VecElement{UInt64}}, NTuple{4, VecElement{UInt64}}}, %588, %589)::UInt8
│ Base.llvmcall("%res = add nsw nuw i64 %0, %1\nret i64 %res", UInt64, Tuple{UInt64, UInt64}, %587, 0x0000000000000001)::UInt64
│ %592 = Base.llvmcall("%res = sub nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, %530, 4)::Int64
│ %593 = Base.slt_int(%592, %544)::Bool
└──── goto #140 if not %593
139 ─ %595 = Base.llvmcall("%res = mul nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, 8, %544)::Int64
│ %596 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)\n\n define <4 x double> @entry(i64, i64, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to i8*\n%ptr.1 = getelementptr inbounds i8, i8* %ptr.0, i64 %1\n%ptr.2 = bitcast i8* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %2 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\n%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0, <4 x double> zeroinitializer), !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}, Int64, UInt8}, %527, %595, %590)::NTuple{4, VecElement{Float64}}
│ %597 = Base.llvmcall("%res = mul nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, 8, %544)::Int64
│ %598 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)\n\n define <4 x double> @entry(i64, i64, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to i8*\n%ptr.1 = getelementptr inbounds i8, i8* %ptr.0, i64 %1\n%ptr.2 = bitcast i8* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %2 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\n%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0, <4 x double> zeroinitializer), !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}, Int64, UInt8}, %528, %597, %590)::NTuple{4, VecElement{Float64}}
│ %599 = Base.llvmcall("%res = fneg nsz arcp contract afn reassoc <4 x double> %0\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}}, %596)::NTuple{4, VecElement{Float64}}
│ %600 = Base.llvmcall(" %ie = insertelement <4 x double> undef, double %0, i32 0\n %v = shufflevector <4 x double> %ie, <4 x double> undef, <4 x i32> zeroinitializer\n ret <4 x double> %v\n", NTuple{4, VecElement{Float64}}, Tuple{Float64}, %309)::NTuple{4, VecElement{Float64}}
│ %601 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %599, %600, %598)::NTuple{4, VecElement{Float64}}
│ %602 = Base.llvmcall("%res = mul nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, 8, %544)::Int64
│ Base.llvmcall((" declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)\n\n define void @entry(i64, <4 x double>, i64, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to i8*\n%ptr.1 = getelementptr inbounds i8, i8* %ptr.0, i64 %2\n%ptr.2 = bitcast i8* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %3 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\ncall void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %1, <4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0)\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Ptr{Float64}, NTuple{4, VecElement{Float64}}, Int64, UInt8}, %528, %601, %602, %590)::Nothing
│ Base.llvmcall("%res = add nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, 4, %544)::Int64
└──── goto #145
140 ─ %606 = Base.llvmcall("%res = sub nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, %530, 8)::Int64
│ %607 = Base.slt_int(%606, %544)::Bool
└──── goto #142 if not %607
141 ─ %609 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %527, %544)::Ptr{Float64}
│ %610 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.1, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %609)::NTuple{4, VecElement{Float64}}
│ %611 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)\n\n define <4 x double> @entry(i64, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %1 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\n%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0, <4 x double> zeroinitializer), !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}, UInt8}, %609, %590)::NTuple{4, VecElement{Float64}}
│ %612 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %528, %544)::Ptr{Float64}
│ %613 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.1, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %612)::NTuple{4, VecElement{Float64}}
│ %614 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)\n\n define <4 x double> @entry(i64, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %1 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\n%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0, <4 x double> zeroinitializer), !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}, UInt8}, %612, %590)::NTuple{4, VecElement{Float64}}
│ %615 = Base.llvmcall("%res = fneg nsz arcp contract afn reassoc <4 x double> %0\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}}, %610)::NTuple{4, VecElement{Float64}}
│ %616 = Base.llvmcall("%res = fneg nsz arcp contract afn reassoc <4 x double> %0\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}}, %611)::NTuple{4, VecElement{Float64}}
│ %617 = Base.llvmcall(" %ie = insertelement <4 x double> undef, double %0, i32 0\n %v = shufflevector <4 x double> %ie, <4 x double> undef, <4 x i32> zeroinitializer\n ret <4 x double> %v\n", NTuple{4, VecElement{Float64}}, Tuple{Float64}, %309)::NTuple{4, VecElement{Float64}}
│ %618 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %615, %617, %613)::NTuple{4, VecElement{Float64}}
│ %619 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %616, %617, %614)::NTuple{4, VecElement{Float64}}
│ %620 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %528, %544)::Ptr{Float64}
│ Base.llvmcall((" \n\n define void @entry(i64, <4 x double>) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\nstore <4 x double> %1, <4 x double>* %ptr.1, align 8\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Ptr{Float64}, NTuple{4, VecElement{Float64}}}, %620, %618)::Nothing
│ Base.llvmcall((" declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)\n\n define void @entry(i64, <4 x double>, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %2 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\ncall void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %1, <4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0)\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Ptr{Float64}, NTuple{4, VecElement{Float64}}, UInt8}, %620, %619, %590)::Nothing
│ Base.llvmcall("%res = add nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, 8, %544)::Int64
└──── goto #145
142 ─ %625 = Base.llvmcall("%res = sub nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, %530, 12)::Int64
│ %626 = Base.slt_int(%625, %544)::Bool
└──── goto #144 if not %626
143 ─ %628 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %527, %544)::Ptr{Float64}
│ %629 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.1, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %628)::NTuple{4, VecElement{Float64}}
│ %630 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %628)::NTuple{4, VecElement{Float64}}
│ %631 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)\n\n define <4 x double> @entry(i64, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 8\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %1 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\n%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0, <4 x double> zeroinitializer), !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}, UInt8}, %628, %590)::NTuple{4, VecElement{Float64}}
│ %632 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %528, %544)::Ptr{Float64}
│ %633 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.1, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %632)::NTuple{4, VecElement{Float64}}
│ %634 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %632)::NTuple{4, VecElement{Float64}}
│ %635 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)\n\n define <4 x double> @entry(i64, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 8\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %1 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\n%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0, <4 x double> zeroinitializer), !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}, UInt8}, %632, %590)::NTuple{4, VecElement{Float64}}
│ %636 = Base.llvmcall("%res = fneg nsz arcp contract afn reassoc <4 x double> %0\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}}, %629)::NTuple{4, VecElement{Float64}}
│ %637 = Base.llvmcall("%res = fneg nsz arcp contract afn reassoc <4 x double> %0\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}}, %630)::NTuple{4, VecElement{Float64}}
│ %638 = Base.llvmcall("%res = fneg nsz arcp contract afn reassoc <4 x double> %0\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}}, %631)::NTuple{4, VecElement{Float64}}
│ %639 = Base.llvmcall(" %ie = insertelement <4 x double> undef, double %0, i32 0\n %v = shufflevector <4 x double> %ie, <4 x double> undef, <4 x i32> zeroinitializer\n ret <4 x double> %v\n", NTuple{4, VecElement{Float64}}, Tuple{Float64}, %309)::NTuple{4, VecElement{Float64}}
│ %640 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %636, %639, %633)::NTuple{4, VecElement{Float64}}
│ %641 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %637, %639, %634)::NTuple{4, VecElement{Float64}}
│ %642 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %638, %639, %635)::NTuple{4, VecElement{Float64}}
│ %643 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %528, %544)::Ptr{Float64}
│ Base.llvmcall((" \n\n define void @entry(i64, <4 x double>) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\nstore <4 x double> %1, <4 x double>* %ptr.1, align 8\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Ptr{Float64}, NTuple{4, VecElement{Float64}}}, %643, %640)::Nothing
│ Base.llvmcall((" \n\n define void @entry(i64, <4 x double>) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\nstore <4 x double> %1, <4 x double>* %ptr.2, align 8\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Ptr{Float64}, NTuple{4, VecElement{Float64}}}, %643, %641)::Nothing
│ Base.llvmcall((" declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)\n\n define void @entry(i64, <4 x double>, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 8\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %2 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\ncall void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %1, <4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0)\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Ptr{Float64}, NTuple{4, VecElement{Float64}}, UInt8}, %643, %642, %590)::Nothing
│ Base.llvmcall("%res = add nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, 12, %544)::Int64
└──── goto #145
144 ─ %649 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %527, %544)::Ptr{Float64}
│ %650 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.1, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %649)::NTuple{4, VecElement{Float64}}
│ %651 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %649)::NTuple{4, VecElement{Float64}}
│ %652 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 8\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %649)::NTuple{4, VecElement{Float64}}
│ %653 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)\n\n define <4 x double> @entry(i64, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 12\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %1 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\n%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0, <4 x double> zeroinitializer), !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}, UInt8}, %649, %590)::NTuple{4, VecElement{Float64}}
│ %654 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %528, %544)::Ptr{Float64}
│ %655 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.1, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %654)::NTuple{4, VecElement{Float64}}
│ %656 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %654)::NTuple{4, VecElement{Float64}}
│ %657 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n\n\n define <4 x double> @entry(i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 8\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%res = load <4 x double>, <4 x double>* %ptr.2, align 8, !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}}, %654)::NTuple{4, VecElement{Float64}}
│ %658 = Base.llvmcall((" !1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)\n\n define <4 x double> @entry(i64, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 12\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %1 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\n%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0, <4 x double> zeroinitializer), !alias.scope !3\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{Ptr{Float64}, UInt8}, %654, %590)::NTuple{4, VecElement{Float64}}
│ %659 = Base.llvmcall("%res = fneg nsz arcp contract afn reassoc <4 x double> %0\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}}, %650)::NTuple{4, VecElement{Float64}}
│ %660 = Base.llvmcall("%res = fneg nsz arcp contract afn reassoc <4 x double> %0\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}}, %651)::NTuple{4, VecElement{Float64}}
│ %661 = Base.llvmcall("%res = fneg nsz arcp contract afn reassoc <4 x double> %0\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}}, %652)::NTuple{4, VecElement{Float64}}
│ %662 = Base.llvmcall("%res = fneg nsz arcp contract afn reassoc <4 x double> %0\nret <4 x double> %res", NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}}, %653)::NTuple{4, VecElement{Float64}}
│ %663 = Base.llvmcall(" %ie = insertelement <4 x double> undef, double %0, i32 0\n %v = shufflevector <4 x double> %ie, <4 x double> undef, <4 x i32> zeroinitializer\n ret <4 x double> %v\n", NTuple{4, VecElement{Float64}}, Tuple{Float64}, %309)::NTuple{4, VecElement{Float64}}
│ %664 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %659, %663, %655)::NTuple{4, VecElement{Float64}}
│ %665 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %660, %663, %656)::NTuple{4, VecElement{Float64}}
│ %666 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %661, %663, %657)::NTuple{4, VecElement{Float64}}
│ %667 = Base.llvmcall((" declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)\n\n define <4 x double> @entry(<4 x double>, <4 x double>, <4 x double>) alwaysinline {\n top:\n %res = call nsz arcp contract afn reassoc <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)\nret <4 x double> %res\n }\n", "entry"), NTuple{4, VecElement{Float64}}, Tuple{NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}, NTuple{4, VecElement{Float64}}}, %662, %663, %658)::NTuple{4, VecElement{Float64}}
│ %668 = Base.llvmcall((" \n\n define i64 @entry(i64, i64) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i64 %1\n%ptr.2 = ptrtoint double* %ptr.1 to i64\nret i64 %ptr.2\n }\n", "entry"), Ptr{Float64}, Tuple{Ptr{Float64}, Int64}, %528, %544)::Ptr{Float64}
│ Base.llvmcall((" \n\n define void @entry(i64, <4 x double>) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = bitcast double* %ptr.0 to <4 x double>*\nstore <4 x double> %1, <4 x double>* %ptr.1, align 8\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Ptr{Float64}, NTuple{4, VecElement{Float64}}}, %668, %664)::Nothing
│ Base.llvmcall((" \n\n define void @entry(i64, <4 x double>) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 4\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\nstore <4 x double> %1, <4 x double>* %ptr.2, align 8\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Ptr{Float64}, NTuple{4, VecElement{Float64}}}, %668, %665)::Nothing
│ Base.llvmcall((" \n\n define void @entry(i64, <4 x double>) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 8\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\nstore <4 x double> %1, <4 x double>* %ptr.2, align 8\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Ptr{Float64}, NTuple{4, VecElement{Float64}}}, %668, %666)::Nothing
│ Base.llvmcall((" declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)\n\n define void @entry(i64, <4 x double>, i8) alwaysinline {\n top:\n %ptr.0 = inttoptr i64 %0 to double*\n%ptr.1 = getelementptr inbounds double, double* %ptr.0, i32 12\n%ptr.2 = bitcast double* %ptr.1 to <4 x double>*\n%masktrunc.0 = trunc i8 %2 to i4\n%mask.0 = bitcast i4 %masktrunc.0 to <4 x i1>\ncall void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %1, <4 x double>* %ptr.2, i32 8, <4 x i1> %mask.0)\nret void\n }\n", "entry"), VectorizationBase.Cvoid, Tuple{Ptr{Float64}, NTuple{4, VecElement{Float64}}, UInt8}, %668, %667, %590)::Nothing
└──── Base.llvmcall("%res = add nsw i64 %0, %1\nret i64 %res", Int64, Tuple{Int64, Int64}, 16, %544)::Int64
145 ┄ goto #146
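# blocks #146–#151: close the gc-preserve region, then either increment the
# outer (column) counter (%680 = %29 + 1) and jump back to #5, or return A
# once %29 === %17.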
146 ─ $(Expr(:gc_preserve_end, :(%529)))
│ %676 = (%29 === %17)::Bool
└──── goto #148 if not %676
147 ─ Base.nothing::Nothing
└──── goto #149
148 ─ %680 = Base.add_int(%29, 1)::Int64
└──── goto #149
149 ┄ %682 = φ (#148 => %680)::Int64
│ %683 = φ (#148 => %680)::Int64
│ %684 = φ (#147 => true, #148 => false)::Bool
│ %685 = Base.not_int(%684)::Bool
└──── goto #151 if not %685
150 ─ goto #5
151 ┄ return A
) => SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}
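For reference, a hedged sketch (not the gist's original source) of a plain-Julia kernel whose vectorization would plausibly produce IR like the above: the fneg + fmuladd pattern in blocks #131–#145 matches an axpy-style column update A[i,j] -= x[i] * v, unrolled four <4 x double> vectors wide with a masked tail. The function name, argument layout, and the `@simd` annotation below are illustrative assumptions.

# Hedged reconstruction for illustration; names and signature are assumptions.
function apply_reflector_column!(A::AbstractMatrix{Float64},
                                 x::AbstractVector{Float64},
                                 v::Float64, j::Integer)
    m = size(A, 1)
    @inbounds @simd for i in 2:m
        # matches the fneg + fmuladd pattern in the IR: A[i,j] = -(x[i]*v) + A[i,j]
        A[i, j] = muladd(-x[i], v, A[i, j])
    end
    return A
end

# Usage (hypothetical values):
A = rand(10, 3); x = rand(10)
apply_reflector_column!(A, x, 0.25, 2)

With `@simd` (or LoopVectorization's `@turbo`), the compiler is free to emit exactly this shape: a 16-element unrolled body plus a masked remainder, as seen in the typed IR above.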