Skip to content

Instantly share code, notes, and snippets.

@yuyichao
Last active June 4, 2016 17:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yuyichao/5b07f71c1f19248ec5511d758532a4b0 to your computer and use it in GitHub Desktop.
Save yuyichao/5b07f71c1f19248ec5511d758532a4b0 to your computer and use it in GitHub Desktop.
--- llvm37.s 2016-06-04 13:23:34.947819989 -0400
+++ llvm38.s 2016-06-04 13:14:25.455283889 -0400
@@ -4,103 +4,95 @@
vxorps %xmm0, %xmm0, %xmm0
testq %rsi, %rsi
je .LBB1_13
-# BB#1: # %overflow.checked
+# BB#1: # %.lr.ph.preheader
+ vxorps %xmm0, %xmm0, %xmm0
+ xorl %ecx, %ecx
+ cmpq $15, %rsi
+ jbe .LBB1_2
+# BB#4: # %min.iters.checked
xorl %ecx, %ecx
movq %rsi, %rax
- vxorps %ymm0, %ymm0, %ymm0
- vxorps %ymm1, %ymm1, %ymm1
- vxorps %ymm2, %ymm2, %ymm2
- vxorps %ymm3, %ymm3, %ymm3
- andq $-32, %rax
- je .LBB1_10
-# BB#2: # %vector.body.preheader
- leaq -32(%rsi), %r8
+ andq $-16, %rax
+ je .LBB1_2
+# BB#5: # %vector.body.preheader
+ leaq -16(%rsi), %r8
movl %r8d, %ecx
- shrl $5, %ecx
+ shrl $4, %ecx
addl $1, %ecx
xorl %edx, %edx
- testb $3, %cl
- je .LBB1_3
-# BB#4: # %vector.body.prol.preheader
- leal -32(%rsi), %ecx
- shrl $5, %ecx
+ testb $7, %cl
+ je .LBB1_6
+# BB#7: # %vector.body.prol.preheader
+ leal -16(%rsi), %ecx
+ shrl $4, %ecx
addl $1, %ecx
- andl $3, %ecx
+ andl $7, %ecx
negq %rcx
vxorps %ymm0, %ymm0, %ymm0
xorl %edx, %edx
vxorps %ymm1, %ymm1, %ymm1
- vxorps %ymm2, %ymm2, %ymm2
- vxorps %ymm3, %ymm3, %ymm3
.align 16, 0x90
-.LBB1_5: # %vector.body.prol
+.LBB1_8: # %vector.body.prol
# =>This Inner Loop Header: Depth=1
vaddps (%rdi,%rdx,4), %ymm0, %ymm0
vaddps 32(%rdi,%rdx,4), %ymm1, %ymm1
- vaddps 64(%rdi,%rdx,4), %ymm2, %ymm2
- vaddps 96(%rdi,%rdx,4), %ymm3, %ymm3
- addq $32, %rdx
+ addq $16, %rdx
addq $1, %rcx
- jne .LBB1_5
- jmp .LBB1_6
-.LBB1_3:
+ jne .LBB1_8
+ jmp .LBB1_9
+.LBB1_6:
vxorps %ymm0, %ymm0, %ymm0
vxorps %ymm1, %ymm1, %ymm1
- vxorps %ymm2, %ymm2, %ymm2
- vxorps %ymm3, %ymm3, %ymm3
-.LBB1_6: # %vector.body.preheader.split
- cmpq $96, %r8
- jb .LBB1_9
-# BB#7: # %vector.body.preheader.split.split
+.LBB1_9: # %vector.body.preheader.split
+ cmpq $112, %r8
+ jb .LBB1_12
+# BB#10: # %vector.body.preheader.split.split
movq %rsi, %rcx
- andq $-32, %rcx
+ andq $-16, %rcx
subq %rdx, %rcx
leaq 480(%rdi,%rdx,4), %rdx
.align 16, 0x90
-.LBB1_8: # %vector.body
+.LBB1_11: # %vector.body
# =>This Inner Loop Header: Depth=1
vaddps -480(%rdx), %ymm0, %ymm0
vaddps -448(%rdx), %ymm1, %ymm1
- vaddps -416(%rdx), %ymm2, %ymm2
- vaddps -384(%rdx), %ymm3, %ymm3
+ vaddps -416(%rdx), %ymm0, %ymm0
+ vaddps -384(%rdx), %ymm1, %ymm1
vaddps -352(%rdx), %ymm0, %ymm0
vaddps -320(%rdx), %ymm1, %ymm1
- vaddps -288(%rdx), %ymm2, %ymm2
- vaddps -256(%rdx), %ymm3, %ymm3
+ vaddps -288(%rdx), %ymm0, %ymm0
+ vaddps -256(%rdx), %ymm1, %ymm1
vaddps -224(%rdx), %ymm0, %ymm0
vaddps -192(%rdx), %ymm1, %ymm1
- vaddps -160(%rdx), %ymm2, %ymm2
- vaddps -128(%rdx), %ymm3, %ymm3
+ vaddps -160(%rdx), %ymm0, %ymm0
+ vaddps -128(%rdx), %ymm1, %ymm1
vaddps -96(%rdx), %ymm0, %ymm0
vaddps -64(%rdx), %ymm1, %ymm1
- vaddps -32(%rdx), %ymm2, %ymm2
- vaddps (%rdx), %ymm3, %ymm3
+ vaddps -32(%rdx), %ymm0, %ymm0
+ vaddps (%rdx), %ymm1, %ymm1
addq $512, %rdx # imm = 0x200
addq $-128, %rcx
- jne .LBB1_8
-.LBB1_9:
- movq %rax, %rcx
-.LBB1_10: # %middle.block
+ jne .LBB1_11
+.LBB1_12: # %middle.block
vaddps %ymm0, %ymm1, %ymm0
- vaddps %ymm0, %ymm2, %ymm0
- vaddps %ymm0, %ymm3, %ymm0
vextractf128 $1, %ymm0, %xmm1
vaddps %ymm1, %ymm0, %ymm0
- vpermilpd $1, %ymm0, %ymm1 # ymm1 = ymm0[1,0,2,2]
+ vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddps %ymm1, %ymm0, %ymm0
vhaddps %ymm0, %ymm0, %ymm0
- cmpq %rsi, %rcx
+ movq %rax, %rcx
+ cmpq %rsi, %rax
je .LBB1_13
-# BB#11: # %.lr.ph.preheader
+.LBB1_2: # %.lr.ph.preheader13
leaq (%rdi,%rcx,4), %rax
subq %rcx, %rsi
.align 16, 0x90
-.LBB1_12: # %.lr.ph
+.LBB1_3: # %.lr.ph
# =>This Inner Loop Header: Depth=1
vaddss (%rax), %xmm0, %xmm0
addq $4, %rax
addq $-1, %rsi
- jne .LBB1_12
+ jne .LBB1_3
.LBB1_13: # %._crit_edge
#APP
#NO_APP
//
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <stdio.h>
#include <string.h>
/*
 * Read the monotonic clock and return it as nanoseconds (uint64_t)
 * since an arbitrary epoch.  Used only to time the benchmark loop.
 */
uint64_t gettime_ns()
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * UINT64_C(1000000000) + (uint64_t)ts.tv_nsec;
}
/*
 * Sum the n floats in a[] in order and return the total.
 * Kept out-of-line (noinline) and followed by a compiler memory barrier
 * so that repeated calls in the benchmark loop cannot be folded away.
 */
__attribute__((noinline)) float sum32(float *a, size_t n)
{
    /* a = (float*)__builtin_assume_aligned(a, 64); */
    float total = 0.0f;
    size_t idx = 0;
    while (idx < n) {
        total += a[idx];
        ++idx;
    }
    __asm__ volatile ("" ::: "memory");
    return total;
}
/*
 * Benchmark driver: allocate a 64-byte-aligned buffer of 1024 floats,
 * zero it, call sum32() on it 1024*1024 times, and print the average
 * wall-clock nanoseconds per call.
 *
 * Fixes vs. the original:
 *  - aligned_alloc() can return NULL; the result is now checked before
 *    memset() dereferences it.
 *  - the end timestamp is taken before free(), so deallocation time is
 *    no longer folded into the measured interval.
 */
int main()
{
    enum { NFLOATS = 1024, ITERS = 1024 * 1024 };
    float *p = aligned_alloc(64, sizeof(float) * NFLOATS);
    if (p == NULL) {
        fprintf(stderr, "aligned_alloc failed\n");
        return 1;
    }
    memset(p, 0, sizeof(float) * NFLOATS);
    uint64_t start = gettime_ns();
    for (int i = 0; i < ITERS; i++)
        sum32(p, NFLOATS);
    uint64_t end = gettime_ns();   /* stop the clock before freeing */
    free(p);
    printf("%f\n", (end - start) / (double)ITERS);
    return 0;
}
# ----------------------------------------------------------------------
# sum32.constprop.0 -- GCC output (AT&T syntax, SysV AMD64) for sum32()
# with n const-propagated to 1024.
# In:   %rdi = a (float*)
# Out:  %xmm0 = sum of the 1024 floats
# Shape: scalar-peel 0..7 leading floats into %xmm2 until the pointer is
# 32-byte aligned, run one AVX loop accumulating 8 floats/iteration into
# %ymm1, horizontally reduce, then a fully unrolled scalar tail of up to
# 7 trailing floats.
# NOTE(review): pasted compiler output kept for codegen comparison --
# do not hand-edit the instructions.
# ----------------------------------------------------------------------
sum32.constprop.0:
.LFB36:
.cfi_startproc
# rax = (-(a >> 2)) & 7 : number of floats to peel before a is 32B-aligned
movq %rdi, %rax
shrq $2, %rax
negq %rax
andl $7, %eax
je .L9
# Peel chain: accumulate up to 7 leading elements into %xmm2, jumping to
# the matching setup stub (.L10..).L15) once the peel count is consumed.
vmovss (%rdi), %xmm2
cmpq $1, %rax
je .L10
vaddss 4(%rdi), %xmm2, %xmm2
cmpq $2, %rax
je .L11
vaddss 8(%rdi), %xmm2, %xmm2
cmpq $3, %rax
je .L12
vaddss 12(%rdi), %xmm2, %xmm2
cmpq $4, %rax
je .L13
vaddss 16(%rdi), %xmm2, %xmm2
cmpq $5, %rax
je .L14
vaddss 20(%rdi), %xmm2, %xmm2
cmpq $7, %rax
jne .L15
vaddss 24(%rdi), %xmm2, %xmm2
movl $1017, %ecx
movl $7, %edx
# Common setup for all peeled entries (rcx/rdx set by the stub above).
.L3:
movl $1024, %esi
movl $1016, %r8d
movl $127, %r10d
subq %rax, %rsi
# Vector-loop setup: r9 = first aligned element, ymm1 = 0, rax = counter.
.L2:
leaq (%rdi,%rax,4), %r9
vxorps %xmm1, %xmm1, %xmm1
xorl %eax, %eax
# Main AVX loop: one 8-float vaddps per iteration, r10 iterations.
.L4:
addq $1, %rax
vaddps (%r9), %ymm1, %ymm1
addq $32, %r9
cmpq %r10, %rax
jb .L4
# Horizontal reduction of %ymm1, then fold in the peel sum (%xmm2).
vhaddps %ymm1, %ymm1, %ymm1
leaq (%rdx,%r8), %rax
movq %rcx, %rdx
subq %r8, %rdx
vhaddps %ymm1, %ymm1, %ymm0
vperm2f128 $1, %ymm0, %ymm0, %ymm1
vaddps %ymm0, %ymm1, %ymm0
vaddss %xmm2, %xmm0, %xmm0
cmpq %r8, %rsi
je .L7
# Fully unrolled scalar tail: up to 7 trailing floats (rdx = tail count).
vaddss (%rdi,%rax,4), %xmm0, %xmm0
leaq 1(%rax), %rcx
cmpq $1, %rdx
je .L7
vaddss (%rdi,%rcx,4), %xmm0, %xmm0
leaq 2(%rax), %rcx
cmpq $2, %rdx
je .L7
vaddss (%rdi,%rcx,4), %xmm0, %xmm0
leaq 3(%rax), %rcx
cmpq $3, %rdx
je .L7
vaddss (%rdi,%rcx,4), %xmm0, %xmm0
leaq 4(%rax), %rcx
cmpq $4, %rdx
je .L7
vaddss (%rdi,%rcx,4), %xmm0, %xmm0
leaq 5(%rax), %rcx
cmpq $5, %rdx
je .L7
vaddss (%rdi,%rcx,4), %xmm0, %xmm0
addq $6, %rax
cmpq $6, %rdx
je .L7
vaddss (%rdi,%rax,4), %xmm0, %xmm0
# Exit: clear upper YMM state before returning to (possibly SSE) caller.
.L7:
vzeroupper
ret
.p2align 4,,10
.p2align 3
# Entry case: pointer already 32-byte aligned, no peel (xmm2 = 0).
.L9:
movl $1024, %esi
movl $1024, %ecx
vxorps %xmm2, %xmm2, %xmm2
xorl %edx, %edx
movl $1024, %r8d
movl $128, %r10d
jmp .L2
.p2align 4,,10
.p2align 3
# Setup stubs: per-peel-count loop bounds feeding .L3/.L2.
.L15:
movl $1018, %ecx
movl $6, %edx
jmp .L3
.p2align 4,,10
.p2align 3
.L10:
movl $1023, %ecx
movl $1, %edx
jmp .L3
.p2align 4,,10
.p2align 3
.L11:
movl $2, %edx
movl $1022, %ecx
jmp .L3
.p2align 4,,10
.p2align 3
.L12:
movl $3, %edx
movl $1021, %ecx
jmp .L3
.p2align 4,,10
.p2align 3
.L13:
movl $4, %edx
movl $1020, %ecx
jmp .L3
.p2align 4,,10
.p2align 3
.L14:
movl $5, %edx
movl $1019, %ecx
jmp .L3
.cfi_endproc
; ======================================================================
; LLVM 3.7 IR for sum32() after optimization.
; Vectorized at VF=8 (<8 x float>) with interleave factor 4, i.e. four
; independent accumulators and 32 floats per vector iteration; the
; runtime unroller then splits the loop into a prologue (remainder of a
; 4x unroll) plus a 4x-unrolled main body (128 floats per trip).
; A scalar loop (.lr.ph) handles the final n mod 32 elements.
; NOTE(review): pasted compiler output kept for comparison -- do not
; hand-edit.  Metadata (!7, !9, !11, !14, !15) and attribute groups
; (#1, #3) are defined outside this excerpt.
; ======================================================================
; Function Attrs: noinline nounwind uwtable
define float @sum32(float* nocapture readonly %a, i64 %n) #3 {
%1 = icmp eq i64 %n, 0
br i1 %1, label %._crit_edge, label %overflow.checked
; n.vec = n rounded down to a multiple of 32 (the vectorized trip count).
overflow.checked: ; preds = %0
%n.vec = and i64 %n, -32
%cmp.zero = icmp eq i64 %n.vec, 0
br i1 %cmp.zero, label %middle.block, label %vector.body.preheader
; xtraiter = (#vector iterations) mod 4 -> prologue trip count.
vector.body.preheader: ; preds = %overflow.checked
%2 = add i64 %n, -32
%3 = lshr i64 %2, 5
%4 = add nuw nsw i64 %3, 1
%xtraiter = and i64 %4, 3
%lcmp.mod = icmp eq i64 %xtraiter, 0
br i1 %lcmp.mod, label %vector.body.preheader.split, label %vector.body.prol.preheader
vector.body.prol.preheader: ; preds = %vector.body.preheader
br label %vector.body.prol
; Prologue loop: one 32-float step per trip, 4 accumulators.
vector.body.prol: ; preds = %vector.body.prol.preheader, %vector.body.prol
%index.prol = phi i64 [ %index.next.prol, %vector.body.prol ], [ 0, %vector.body.prol.preheader ]
%vec.phi.prol = phi <8 x float> [ %13, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ]
%vec.phi6.prol = phi <8 x float> [ %14, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ]
%vec.phi7.prol = phi <8 x float> [ %15, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ]
%vec.phi8.prol = phi <8 x float> [ %16, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ]
%prol.iter = phi i64 [ %prol.iter.sub, %vector.body.prol ], [ %xtraiter, %vector.body.prol.preheader ]
%5 = getelementptr inbounds float, float* %a, i64 %index.prol
%6 = bitcast float* %5 to <8 x float>*
%wide.load.prol = load <8 x float>, <8 x float>* %6, align 4, !tbaa !7
%7 = getelementptr float, float* %5, i64 8
%8 = bitcast float* %7 to <8 x float>*
%wide.load9.prol = load <8 x float>, <8 x float>* %8, align 4, !tbaa !7
%9 = getelementptr float, float* %5, i64 16
%10 = bitcast float* %9 to <8 x float>*
%wide.load10.prol = load <8 x float>, <8 x float>* %10, align 4, !tbaa !7
%11 = getelementptr float, float* %5, i64 24
%12 = bitcast float* %11 to <8 x float>*
%wide.load11.prol = load <8 x float>, <8 x float>* %12, align 4, !tbaa !7
%13 = fadd fast <8 x float> %wide.load.prol, %vec.phi.prol
%14 = fadd fast <8 x float> %wide.load9.prol, %vec.phi6.prol
%15 = fadd fast <8 x float> %wide.load10.prol, %vec.phi7.prol
%16 = fadd fast <8 x float> %wide.load11.prol, %vec.phi8.prol
%index.next.prol = add i64 %index.prol, 32
%prol.iter.sub = add i64 %prol.iter, -1
%prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0
br i1 %prol.iter.cmp, label %vector.body.preheader.split.loopexit, label %vector.body.prol, !llvm.loop !9
vector.body.preheader.split.loopexit: ; preds = %vector.body.prol
%index.next.prol.lcssa = phi i64 [ %index.next.prol, %vector.body.prol ]
%.lcssa35 = phi <8 x float> [ %16, %vector.body.prol ]
%.lcssa34 = phi <8 x float> [ %15, %vector.body.prol ]
%.lcssa33 = phi <8 x float> [ %14, %vector.body.prol ]
%.lcssa32 = phi <8 x float> [ %13, %vector.body.prol ]
br label %vector.body.preheader.split
; Merge prologue results; skip the 4x-unrolled body if fewer than
; 4 vector iterations remain (%2 < 96).
vector.body.preheader.split: ; preds = %vector.body.preheader.split.loopexit, %vector.body.preheader
%.lcssa27.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa35, %vector.body.preheader.split.loopexit ]
%.lcssa26.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa34, %vector.body.preheader.split.loopexit ]
%.lcssa25.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa33, %vector.body.preheader.split.loopexit ]
%.lcssa24.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa32, %vector.body.preheader.split.loopexit ]
%index.unr = phi i64 [ 0, %vector.body.preheader ], [ %index.next.prol.lcssa, %vector.body.preheader.split.loopexit ]
%vec.phi.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa32, %vector.body.preheader.split.loopexit ]
%vec.phi6.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa33, %vector.body.preheader.split.loopexit ]
%vec.phi7.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa34, %vector.body.preheader.split.loopexit ]
%vec.phi8.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa35, %vector.body.preheader.split.loopexit ]
%17 = icmp ult i64 %2, 96
br i1 %17, label %middle.block.loopexit, label %vector.body.preheader.split.split
vector.body.preheader.split.split: ; preds = %vector.body.preheader.split
br label %vector.body
; Main body: 4x unrolled, 128 floats per trip (4 accumulators x 4 steps).
vector.body: ; preds = %vector.body, %vector.body.preheader.split.split
%index = phi i64 [ %index.unr, %vector.body.preheader.split.split ], [ %index.next.3, %vector.body ]
%vec.phi = phi <8 x float> [ %vec.phi.unr, %vector.body.preheader.split.split ], [ %62, %vector.body ]
%vec.phi6 = phi <8 x float> [ %vec.phi6.unr, %vector.body.preheader.split.split ], [ %63, %vector.body ]
%vec.phi7 = phi <8 x float> [ %vec.phi7.unr, %vector.body.preheader.split.split ], [ %64, %vector.body ]
%vec.phi8 = phi <8 x float> [ %vec.phi8.unr, %vector.body.preheader.split.split ], [ %65, %vector.body ]
%18 = getelementptr inbounds float, float* %a, i64 %index
%19 = bitcast float* %18 to <8 x float>*
%wide.load = load <8 x float>, <8 x float>* %19, align 4, !tbaa !7
%20 = getelementptr float, float* %18, i64 8
%21 = bitcast float* %20 to <8 x float>*
%wide.load9 = load <8 x float>, <8 x float>* %21, align 4, !tbaa !7
%22 = getelementptr float, float* %18, i64 16
%23 = bitcast float* %22 to <8 x float>*
%wide.load10 = load <8 x float>, <8 x float>* %23, align 4, !tbaa !7
%24 = getelementptr float, float* %18, i64 24
%25 = bitcast float* %24 to <8 x float>*
%wide.load11 = load <8 x float>, <8 x float>* %25, align 4, !tbaa !7
%26 = fadd fast <8 x float> %wide.load, %vec.phi
%27 = fadd fast <8 x float> %wide.load9, %vec.phi6
%28 = fadd fast <8 x float> %wide.load10, %vec.phi7
%29 = fadd fast <8 x float> %wide.load11, %vec.phi8
%index.next = add i64 %index, 32
%30 = getelementptr inbounds float, float* %a, i64 %index.next
%31 = bitcast float* %30 to <8 x float>*
%wide.load.1 = load <8 x float>, <8 x float>* %31, align 4, !tbaa !7
%32 = getelementptr float, float* %30, i64 8
%33 = bitcast float* %32 to <8 x float>*
%wide.load9.1 = load <8 x float>, <8 x float>* %33, align 4, !tbaa !7
%34 = getelementptr float, float* %30, i64 16
%35 = bitcast float* %34 to <8 x float>*
%wide.load10.1 = load <8 x float>, <8 x float>* %35, align 4, !tbaa !7
%36 = getelementptr float, float* %30, i64 24
%37 = bitcast float* %36 to <8 x float>*
%wide.load11.1 = load <8 x float>, <8 x float>* %37, align 4, !tbaa !7
%38 = fadd fast <8 x float> %wide.load.1, %26
%39 = fadd fast <8 x float> %wide.load9.1, %27
%40 = fadd fast <8 x float> %wide.load10.1, %28
%41 = fadd fast <8 x float> %wide.load11.1, %29
%index.next.1 = add i64 %index, 64
%42 = getelementptr inbounds float, float* %a, i64 %index.next.1
%43 = bitcast float* %42 to <8 x float>*
%wide.load.2 = load <8 x float>, <8 x float>* %43, align 4, !tbaa !7
%44 = getelementptr float, float* %42, i64 8
%45 = bitcast float* %44 to <8 x float>*
%wide.load9.2 = load <8 x float>, <8 x float>* %45, align 4, !tbaa !7
%46 = getelementptr float, float* %42, i64 16
%47 = bitcast float* %46 to <8 x float>*
%wide.load10.2 = load <8 x float>, <8 x float>* %47, align 4, !tbaa !7
%48 = getelementptr float, float* %42, i64 24
%49 = bitcast float* %48 to <8 x float>*
%wide.load11.2 = load <8 x float>, <8 x float>* %49, align 4, !tbaa !7
%50 = fadd fast <8 x float> %wide.load.2, %38
%51 = fadd fast <8 x float> %wide.load9.2, %39
%52 = fadd fast <8 x float> %wide.load10.2, %40
%53 = fadd fast <8 x float> %wide.load11.2, %41
%index.next.2 = add i64 %index, 96
%54 = getelementptr inbounds float, float* %a, i64 %index.next.2
%55 = bitcast float* %54 to <8 x float>*
%wide.load.3 = load <8 x float>, <8 x float>* %55, align 4, !tbaa !7
%56 = getelementptr float, float* %54, i64 8
%57 = bitcast float* %56 to <8 x float>*
%wide.load9.3 = load <8 x float>, <8 x float>* %57, align 4, !tbaa !7
%58 = getelementptr float, float* %54, i64 16
%59 = bitcast float* %58 to <8 x float>*
%wide.load10.3 = load <8 x float>, <8 x float>* %59, align 4, !tbaa !7
%60 = getelementptr float, float* %54, i64 24
%61 = bitcast float* %60 to <8 x float>*
%wide.load11.3 = load <8 x float>, <8 x float>* %61, align 4, !tbaa !7
%62 = fadd fast <8 x float> %wide.load.3, %50
%63 = fadd fast <8 x float> %wide.load9.3, %51
%64 = fadd fast <8 x float> %wide.load10.3, %52
%65 = fadd fast <8 x float> %wide.load11.3, %53
%index.next.3 = add i64 %index, 128
%66 = icmp eq i64 %index.next.3, %n.vec
br i1 %66, label %middle.block.loopexit.unr-lcssa, label %vector.body, !llvm.loop !11
middle.block.loopexit.unr-lcssa: ; preds = %vector.body
%.lcssa31 = phi <8 x float> [ %65, %vector.body ]
%.lcssa30 = phi <8 x float> [ %64, %vector.body ]
%.lcssa29 = phi <8 x float> [ %63, %vector.body ]
%.lcssa28 = phi <8 x float> [ %62, %vector.body ]
br label %middle.block.loopexit
middle.block.loopexit: ; preds = %vector.body.preheader.split, %middle.block.loopexit.unr-lcssa
%.lcssa27 = phi <8 x float> [ %.lcssa27.unr, %vector.body.preheader.split ], [ %.lcssa31, %middle.block.loopexit.unr-lcssa ]
%.lcssa26 = phi <8 x float> [ %.lcssa26.unr, %vector.body.preheader.split ], [ %.lcssa30, %middle.block.loopexit.unr-lcssa ]
%.lcssa25 = phi <8 x float> [ %.lcssa25.unr, %vector.body.preheader.split ], [ %.lcssa29, %middle.block.loopexit.unr-lcssa ]
%.lcssa24 = phi <8 x float> [ %.lcssa24.unr, %vector.body.preheader.split ], [ %.lcssa28, %middle.block.loopexit.unr-lcssa ]
br label %middle.block
; Reduction: fold 4 accumulators into one vector, then a shuffle tree
; reduces the 8 lanes to the scalar partial sum %67.
middle.block: ; preds = %middle.block.loopexit, %overflow.checked
%resume.val = phi i64 [ 0, %overflow.checked ], [ %n.vec, %middle.block.loopexit ]
%rdx.vec.exit.phi = phi <8 x float> [ zeroinitializer, %overflow.checked ], [ %.lcssa24, %middle.block.loopexit ]
%rdx.vec.exit.phi14 = phi <8 x float> [ zeroinitializer, %overflow.checked ], [ %.lcssa25, %middle.block.loopexit ]
%rdx.vec.exit.phi15 = phi <8 x float> [ zeroinitializer, %overflow.checked ], [ %.lcssa26, %middle.block.loopexit ]
%rdx.vec.exit.phi16 = phi <8 x float> [ zeroinitializer, %overflow.checked ], [ %.lcssa27, %middle.block.loopexit ]
%bin.rdx = fadd fast <8 x float> %rdx.vec.exit.phi14, %rdx.vec.exit.phi
%bin.rdx17 = fadd fast <8 x float> %rdx.vec.exit.phi15, %bin.rdx
%bin.rdx18 = fadd fast <8 x float> %rdx.vec.exit.phi16, %bin.rdx17
%rdx.shuf = shufflevector <8 x float> %bin.rdx18, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx19 = fadd fast <8 x float> %bin.rdx18, %rdx.shuf
%rdx.shuf20 = shufflevector <8 x float> %bin.rdx19, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx21 = fadd fast <8 x float> %bin.rdx19, %rdx.shuf20
%rdx.shuf22 = shufflevector <8 x float> %bin.rdx21, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx23 = fadd fast <8 x float> %bin.rdx21, %rdx.shuf22
%67 = extractelement <8 x float> %bin.rdx23, i32 0
%cmp.n = icmp eq i64 %resume.val, %n
br i1 %cmp.n, label %._crit_edge, label %.lr.ph.preheader
.lr.ph.preheader: ; preds = %middle.block
br label %.lr.ph
._crit_edge.loopexit: ; preds = %.lr.ph
%.lcssa = phi float [ %70, %.lr.ph ]
br label %._crit_edge
; Exit: the empty inline asm is the source-level compiler barrier.
._crit_edge: ; preds = %._crit_edge.loopexit, %middle.block, %0
%s.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %67, %middle.block ], [ %.lcssa, %._crit_edge.loopexit ]
tail call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() #1, !srcloc !14
ret float %s.0.lcssa
; Scalar tail: handles the last n mod 32 elements one float at a time.
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph
%i.02 = phi i64 [ %71, %.lr.ph ], [ %resume.val, %.lr.ph.preheader ]
%s.01 = phi float [ %70, %.lr.ph ], [ %67, %.lr.ph.preheader ]
%68 = getelementptr inbounds float, float* %a, i64 %i.02
%69 = load float, float* %68, align 4, !tbaa !7
%70 = fadd fast float %69, %s.01
%71 = add nuw i64 %i.02, 1
%exitcond = icmp eq i64 %71, %n
br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph, !llvm.loop !15
}
# ======================================================================
# sum32 -- LLVM 3.7 output (AT&T syntax, SysV AMD64).
# In:   %rdi = a (float*), %rsi = n
# Out:  %xmm0 = sum
# Four <8 x float> accumulators (%ymm0-%ymm3, 32 floats per vector
# iteration); .LBB1_5 is the unroll-prologue loop, .LBB1_8 the 4x
# unrolled main loop (128 floats per trip), .LBB1_12 the scalar tail.
# NOTE(review): pasted compiler output kept for codegen comparison --
# do not hand-edit the instructions.
# ======================================================================
sum32: # @sum32
.cfi_startproc
# BB#0:
vxorps %xmm0, %xmm0, %xmm0
testq %rsi, %rsi
je .LBB1_13
# BB#1: # %overflow.checked
xorl %ecx, %ecx
movq %rsi, %rax
vxorps %ymm0, %ymm0, %ymm0
vxorps %ymm1, %ymm1, %ymm1
vxorps %ymm2, %ymm2, %ymm2
vxorps %ymm3, %ymm3, %ymm3
andq $-32, %rax
je .LBB1_10
# BB#2: # %vector.body.preheader
# ecx = ((n-32)>>5)+1 vector iterations; low 2 bits = prologue count.
leaq -32(%rsi), %r8
movl %r8d, %ecx
shrl $5, %ecx
addl $1, %ecx
xorl %edx, %edx
testb $3, %cl
je .LBB1_3
# BB#4: # %vector.body.prol.preheader
leal -32(%rsi), %ecx
shrl $5, %ecx
addl $1, %ecx
andl $3, %ecx
negq %rcx
vxorps %ymm0, %ymm0, %ymm0
xorl %edx, %edx
vxorps %ymm1, %ymm1, %ymm1
vxorps %ymm2, %ymm2, %ymm2
vxorps %ymm3, %ymm3, %ymm3
.align 16, 0x90
# Prologue loop: one 32-float step per trip until rcx reaches zero.
.LBB1_5: # %vector.body.prol
# =>This Inner Loop Header: Depth=1
vaddps (%rdi,%rdx,4), %ymm0, %ymm0
vaddps 32(%rdi,%rdx,4), %ymm1, %ymm1
vaddps 64(%rdi,%rdx,4), %ymm2, %ymm2
vaddps 96(%rdi,%rdx,4), %ymm3, %ymm3
addq $32, %rdx
addq $1, %rcx
jne .LBB1_5
jmp .LBB1_6
.LBB1_3:
vxorps %ymm0, %ymm0, %ymm0
vxorps %ymm1, %ymm1, %ymm1
vxorps %ymm2, %ymm2, %ymm2
vxorps %ymm3, %ymm3, %ymm3
.LBB1_6: # %vector.body.preheader.split
cmpq $96, %r8
jb .LBB1_9
# BB#7: # %vector.body.preheader.split.split
movq %rsi, %rcx
andq $-32, %rcx
subq %rdx, %rcx
leaq 480(%rdi,%rdx,4), %rdx
.align 16, 0x90
# Main loop: 4x unrolled, 16 vaddps = 128 floats per trip.
.LBB1_8: # %vector.body
# =>This Inner Loop Header: Depth=1
vaddps -480(%rdx), %ymm0, %ymm0
vaddps -448(%rdx), %ymm1, %ymm1
vaddps -416(%rdx), %ymm2, %ymm2
vaddps -384(%rdx), %ymm3, %ymm3
vaddps -352(%rdx), %ymm0, %ymm0
vaddps -320(%rdx), %ymm1, %ymm1
vaddps -288(%rdx), %ymm2, %ymm2
vaddps -256(%rdx), %ymm3, %ymm3
vaddps -224(%rdx), %ymm0, %ymm0
vaddps -192(%rdx), %ymm1, %ymm1
vaddps -160(%rdx), %ymm2, %ymm2
vaddps -128(%rdx), %ymm3, %ymm3
vaddps -96(%rdx), %ymm0, %ymm0
vaddps -64(%rdx), %ymm1, %ymm1
vaddps -32(%rdx), %ymm2, %ymm2
vaddps (%rdx), %ymm3, %ymm3
addq $512, %rdx # imm = 0x200
addq $-128, %rcx
jne .LBB1_8
.LBB1_9:
movq %rax, %rcx
# Reduction: fold 4 accumulators, then reduce 8 lanes to a scalar.
.LBB1_10: # %middle.block
vaddps %ymm0, %ymm1, %ymm0
vaddps %ymm0, %ymm2, %ymm0
vaddps %ymm0, %ymm3, %ymm0
vextractf128 $1, %ymm0, %xmm1
vaddps %ymm1, %ymm0, %ymm0
vpermilpd $1, %ymm0, %ymm1 # ymm1 = ymm0[1,0,2,2]
vaddps %ymm1, %ymm0, %ymm0
vhaddps %ymm0, %ymm0, %ymm0
cmpq %rsi, %rcx
je .LBB1_13
# BB#11: # %.lr.ph.preheader
leaq (%rdi,%rcx,4), %rax
subq %rcx, %rsi
.align 16, 0x90
# Scalar tail: one float per iteration for the last n mod 32 elements.
.LBB1_12: # %.lr.ph
# =>This Inner Loop Header: Depth=1
vaddss (%rax), %xmm0, %xmm0
addq $4, %rax
addq $-1, %rsi
jne .LBB1_12
.LBB1_13: # %._crit_edge
#APP
#NO_APP
vzeroupper
retq
.Lfunc_end1:
.size sum32, .Lfunc_end1-sum32
.cfi_endproc
; ======================================================================
; LLVM 3.8 IR for sum32() after optimization.
; Vectorized at VF=8 with interleave factor 2 (two <8 x float>
; accumulators, 16 floats per vector iteration), then 8x runtime
; unrolled (128 floats per main-loop trip).  A min-iters guard sends
; short inputs (n < 16) straight to the scalar loop.
; NOTE(review): pasted compiler output kept for comparison -- do not
; hand-edit.  Metadata (!7, !9, !11, !14, !15) and attribute groups
; (#3, #4) are defined outside this excerpt.
; ======================================================================
; Function Attrs: noinline nounwind uwtable
define float @sum32(float* nocapture readonly %a, i64 %n) #3 {
%1 = icmp eq i64 %n, 0
br i1 %1, label %._crit_edge, label %.lr.ph.preheader
; Guard: n < 16 -> skip vectorization entirely.
.lr.ph.preheader: ; preds = %0
%min.iters.check = icmp ult i64 %n, 16
br i1 %min.iters.check, label %.lr.ph.preheader13, label %min.iters.checked
; Shared scalar-loop entry: start index/partial sum come from whichever
; predecessor reached it (fresh start or post-vector resume).
.lr.ph.preheader13: ; preds = %middle.block, %min.iters.checked, %.lr.ph.preheader
%i.02.ph = phi i64 [ 0, %min.iters.checked ], [ 0, %.lr.ph.preheader ], [ %n.vec, %middle.block ]
%s.01.ph = phi float [ 0.000000e+00, %min.iters.checked ], [ 0.000000e+00, %.lr.ph.preheader ], [ %61, %middle.block ]
br label %.lr.ph
min.iters.checked: ; preds = %.lr.ph.preheader
%n.vec = and i64 %n, -16
%cmp.zero = icmp eq i64 %n.vec, 0
br i1 %cmp.zero, label %.lr.ph.preheader13, label %vector.body.preheader
; xtraiter = (#vector iterations) mod 8 -> prologue trip count.
vector.body.preheader: ; preds = %min.iters.checked
%2 = add i64 %n, -16
%3 = lshr i64 %2, 4
%4 = add nuw nsw i64 %3, 1
%xtraiter = and i64 %4, 7
%lcmp.mod = icmp eq i64 %xtraiter, 0
br i1 %lcmp.mod, label %vector.body.preheader.split, label %vector.body.prol.preheader
vector.body.prol.preheader: ; preds = %vector.body.preheader
br label %vector.body.prol
; Prologue loop: one 16-float step per trip, 2 accumulators.
vector.body.prol: ; preds = %vector.body.prol.preheader, %vector.body.prol
%index.prol = phi i64 [ %index.next.prol, %vector.body.prol ], [ 0, %vector.body.prol.preheader ]
%vec.phi.prol = phi <8 x float> [ %9, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ]
%vec.phi4.prol = phi <8 x float> [ %10, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ]
%prol.iter = phi i64 [ %prol.iter.sub, %vector.body.prol ], [ %xtraiter, %vector.body.prol.preheader ]
%5 = getelementptr inbounds float, float* %a, i64 %index.prol
%6 = bitcast float* %5 to <8 x float>*
%wide.load.prol = load <8 x float>, <8 x float>* %6, align 4, !tbaa !7
%7 = getelementptr float, float* %5, i64 8
%8 = bitcast float* %7 to <8 x float>*
%wide.load5.prol = load <8 x float>, <8 x float>* %8, align 4, !tbaa !7
%9 = fadd fast <8 x float> %wide.load.prol, %vec.phi.prol
%10 = fadd fast <8 x float> %wide.load5.prol, %vec.phi4.prol
%index.next.prol = add i64 %index.prol, 16
%prol.iter.sub = add i64 %prol.iter, -1
%prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0
br i1 %prol.iter.cmp, label %vector.body.preheader.split.loopexit, label %vector.body.prol, !llvm.loop !9
vector.body.preheader.split.loopexit: ; preds = %vector.body.prol
%index.next.prol.lcssa = phi i64 [ %index.next.prol, %vector.body.prol ]
%.lcssa19 = phi <8 x float> [ %10, %vector.body.prol ]
%.lcssa18 = phi <8 x float> [ %9, %vector.body.prol ]
br label %vector.body.preheader.split
; Merge prologue results; skip the 8x-unrolled body if fewer than
; 8 vector iterations remain (%2 < 112).
vector.body.preheader.split: ; preds = %vector.body.preheader.split.loopexit, %vector.body.preheader
%.lcssa15.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa19, %vector.body.preheader.split.loopexit ]
%.lcssa14.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa18, %vector.body.preheader.split.loopexit ]
%index.unr = phi i64 [ 0, %vector.body.preheader ], [ %index.next.prol.lcssa, %vector.body.preheader.split.loopexit ]
%vec.phi.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa18, %vector.body.preheader.split.loopexit ]
%vec.phi4.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa19, %vector.body.preheader.split.loopexit ]
%11 = icmp ult i64 %2, 112
br i1 %11, label %middle.block, label %vector.body.preheader.split.split
vector.body.preheader.split.split: ; preds = %vector.body.preheader.split
br label %vector.body
; Main body: 8x unrolled, 128 floats per trip (2 accumulators x 8 steps).
vector.body: ; preds = %vector.body, %vector.body.preheader.split.split
%index = phi i64 [ %index.unr, %vector.body.preheader.split.split ], [ %index.next.7, %vector.body ]
%vec.phi = phi <8 x float> [ %vec.phi.unr, %vector.body.preheader.split.split ], [ %58, %vector.body ]
%vec.phi4 = phi <8 x float> [ %vec.phi4.unr, %vector.body.preheader.split.split ], [ %59, %vector.body ]
%12 = getelementptr inbounds float, float* %a, i64 %index
%13 = bitcast float* %12 to <8 x float>*
%wide.load = load <8 x float>, <8 x float>* %13, align 4, !tbaa !7
%14 = getelementptr float, float* %12, i64 8
%15 = bitcast float* %14 to <8 x float>*
%wide.load5 = load <8 x float>, <8 x float>* %15, align 4, !tbaa !7
%16 = fadd fast <8 x float> %wide.load, %vec.phi
%17 = fadd fast <8 x float> %wide.load5, %vec.phi4
%index.next = add i64 %index, 16
%18 = getelementptr inbounds float, float* %a, i64 %index.next
%19 = bitcast float* %18 to <8 x float>*
%wide.load.1 = load <8 x float>, <8 x float>* %19, align 4, !tbaa !7
%20 = getelementptr float, float* %18, i64 8
%21 = bitcast float* %20 to <8 x float>*
%wide.load5.1 = load <8 x float>, <8 x float>* %21, align 4, !tbaa !7
%22 = fadd fast <8 x float> %wide.load.1, %16
%23 = fadd fast <8 x float> %wide.load5.1, %17
%index.next.1 = add i64 %index, 32
%24 = getelementptr inbounds float, float* %a, i64 %index.next.1
%25 = bitcast float* %24 to <8 x float>*
%wide.load.2 = load <8 x float>, <8 x float>* %25, align 4, !tbaa !7
%26 = getelementptr float, float* %24, i64 8
%27 = bitcast float* %26 to <8 x float>*
%wide.load5.2 = load <8 x float>, <8 x float>* %27, align 4, !tbaa !7
%28 = fadd fast <8 x float> %wide.load.2, %22
%29 = fadd fast <8 x float> %wide.load5.2, %23
%index.next.2 = add i64 %index, 48
%30 = getelementptr inbounds float, float* %a, i64 %index.next.2
%31 = bitcast float* %30 to <8 x float>*
%wide.load.3 = load <8 x float>, <8 x float>* %31, align 4, !tbaa !7
%32 = getelementptr float, float* %30, i64 8
%33 = bitcast float* %32 to <8 x float>*
%wide.load5.3 = load <8 x float>, <8 x float>* %33, align 4, !tbaa !7
%34 = fadd fast <8 x float> %wide.load.3, %28
%35 = fadd fast <8 x float> %wide.load5.3, %29
%index.next.3 = add i64 %index, 64
%36 = getelementptr inbounds float, float* %a, i64 %index.next.3
%37 = bitcast float* %36 to <8 x float>*
%wide.load.4 = load <8 x float>, <8 x float>* %37, align 4, !tbaa !7
%38 = getelementptr float, float* %36, i64 8
%39 = bitcast float* %38 to <8 x float>*
%wide.load5.4 = load <8 x float>, <8 x float>* %39, align 4, !tbaa !7
%40 = fadd fast <8 x float> %wide.load.4, %34
%41 = fadd fast <8 x float> %wide.load5.4, %35
%index.next.4 = add i64 %index, 80
%42 = getelementptr inbounds float, float* %a, i64 %index.next.4
%43 = bitcast float* %42 to <8 x float>*
%wide.load.5 = load <8 x float>, <8 x float>* %43, align 4, !tbaa !7
%44 = getelementptr float, float* %42, i64 8
%45 = bitcast float* %44 to <8 x float>*
%wide.load5.5 = load <8 x float>, <8 x float>* %45, align 4, !tbaa !7
%46 = fadd fast <8 x float> %wide.load.5, %40
%47 = fadd fast <8 x float> %wide.load5.5, %41
%index.next.5 = add i64 %index, 96
%48 = getelementptr inbounds float, float* %a, i64 %index.next.5
%49 = bitcast float* %48 to <8 x float>*
%wide.load.6 = load <8 x float>, <8 x float>* %49, align 4, !tbaa !7
%50 = getelementptr float, float* %48, i64 8
%51 = bitcast float* %50 to <8 x float>*
%wide.load5.6 = load <8 x float>, <8 x float>* %51, align 4, !tbaa !7
%52 = fadd fast <8 x float> %wide.load.6, %46
%53 = fadd fast <8 x float> %wide.load5.6, %47
%index.next.6 = add i64 %index, 112
%54 = getelementptr inbounds float, float* %a, i64 %index.next.6
%55 = bitcast float* %54 to <8 x float>*
%wide.load.7 = load <8 x float>, <8 x float>* %55, align 4, !tbaa !7
%56 = getelementptr float, float* %54, i64 8
%57 = bitcast float* %56 to <8 x float>*
%wide.load5.7 = load <8 x float>, <8 x float>* %57, align 4, !tbaa !7
%58 = fadd fast <8 x float> %wide.load.7, %52
%59 = fadd fast <8 x float> %wide.load5.7, %53
%index.next.7 = add i64 %index, 128
%60 = icmp eq i64 %index.next.7, %n.vec
br i1 %60, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !11
middle.block.unr-lcssa: ; preds = %vector.body
%.lcssa17 = phi <8 x float> [ %59, %vector.body ]
%.lcssa16 = phi <8 x float> [ %58, %vector.body ]
br label %middle.block
; Reduction: fold both accumulators, then a shuffle tree reduces the
; 8 lanes to the scalar partial sum %61.
middle.block: ; preds = %vector.body.preheader.split, %middle.block.unr-lcssa
%.lcssa15 = phi <8 x float> [ %.lcssa15.unr, %vector.body.preheader.split ], [ %.lcssa17, %middle.block.unr-lcssa ]
%.lcssa14 = phi <8 x float> [ %.lcssa14.unr, %vector.body.preheader.split ], [ %.lcssa16, %middle.block.unr-lcssa ]
%bin.rdx = fadd fast <8 x float> %.lcssa15, %.lcssa14
%rdx.shuf = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = fadd fast <8 x float> %bin.rdx, %rdx.shuf
%rdx.shuf9 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx10 = fadd fast <8 x float> %bin.rdx8, %rdx.shuf9
%rdx.shuf11 = shufflevector <8 x float> %bin.rdx10, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx12 = fadd fast <8 x float> %bin.rdx10, %rdx.shuf11
%61 = extractelement <8 x float> %bin.rdx12, i32 0
%cmp.n = icmp eq i64 %n.vec, %n
br i1 %cmp.n, label %._crit_edge, label %.lr.ph.preheader13
._crit_edge.loopexit: ; preds = %.lr.ph
%.lcssa = phi float [ %64, %.lr.ph ]
br label %._crit_edge
; Exit: the empty inline asm is the source-level compiler barrier.
._crit_edge: ; preds = %._crit_edge.loopexit, %middle.block, %0
%s.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %61, %middle.block ], [ %.lcssa, %._crit_edge.loopexit ]
tail call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() #4, !srcloc !14
ret float %s.0.lcssa
; Scalar tail: handles the last n mod 16 elements one float at a time.
.lr.ph: ; preds = %.lr.ph.preheader13, %.lr.ph
%i.02 = phi i64 [ %65, %.lr.ph ], [ %i.02.ph, %.lr.ph.preheader13 ]
%s.01 = phi float [ %64, %.lr.ph ], [ %s.01.ph, %.lr.ph.preheader13 ]
%62 = getelementptr inbounds float, float* %a, i64 %i.02
%63 = load float, float* %62, align 4, !tbaa !7
%64 = fadd fast float %63, %s.01
%65 = add nuw i64 %i.02, 1
%exitcond = icmp eq i64 %65, %n
br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph, !llvm.loop !15
}
# ======================================================================
# sum32 -- LLVM 3.8 output (AT&T syntax, SysV AMD64).
# In:   %rdi = a (float*), %rsi = n
# Out:  %xmm0 = sum
# Unlike the 3.7 version above, this uses only TWO <8 x float>
# accumulators (%ymm0/%ymm1, 16 floats per vector iteration) with the
# main loop .LBB1_11 unrolled 8x (128 floats per trip); a min-iters
# guard sends n <= 15 straight to the scalar loop .LBB1_3.
# NOTE(review): pasted compiler output kept for codegen comparison --
# do not hand-edit the instructions.
# ======================================================================
sum32: # @sum32
.cfi_startproc
# BB#0:
vxorps %xmm0, %xmm0, %xmm0
testq %rsi, %rsi
je .LBB1_13
# BB#1: # %.lr.ph.preheader
vxorps %xmm0, %xmm0, %xmm0
xorl %ecx, %ecx
cmpq $15, %rsi
jbe .LBB1_2
# BB#4: # %min.iters.checked
xorl %ecx, %ecx
movq %rsi, %rax
andq $-16, %rax
je .LBB1_2
# BB#5: # %vector.body.preheader
# ecx = ((n-16)>>4)+1 vector iterations; low 3 bits = prologue count.
leaq -16(%rsi), %r8
movl %r8d, %ecx
shrl $4, %ecx
addl $1, %ecx
xorl %edx, %edx
testb $7, %cl
je .LBB1_6
# BB#7: # %vector.body.prol.preheader
leal -16(%rsi), %ecx
shrl $4, %ecx
addl $1, %ecx
andl $7, %ecx
negq %rcx
vxorps %ymm0, %ymm0, %ymm0
xorl %edx, %edx
vxorps %ymm1, %ymm1, %ymm1
.align 16, 0x90
# Prologue loop: one 16-float step per trip until rcx reaches zero.
.LBB1_8: # %vector.body.prol
# =>This Inner Loop Header: Depth=1
vaddps (%rdi,%rdx,4), %ymm0, %ymm0
vaddps 32(%rdi,%rdx,4), %ymm1, %ymm1
addq $16, %rdx
addq $1, %rcx
jne .LBB1_8
jmp .LBB1_9
.LBB1_6:
vxorps %ymm0, %ymm0, %ymm0
vxorps %ymm1, %ymm1, %ymm1
.LBB1_9: # %vector.body.preheader.split
cmpq $112, %r8
jb .LBB1_12
# BB#10: # %vector.body.preheader.split.split
movq %rsi, %rcx
andq $-16, %rcx
subq %rdx, %rcx
leaq 480(%rdi,%rdx,4), %rdx
.align 16, 0x90
# Main loop: 8x unrolled, 16 vaddps = 128 floats per trip, but only
# two accumulators, so adds into each ymm form a serial dependency chain.
.LBB1_11: # %vector.body
# =>This Inner Loop Header: Depth=1
vaddps -480(%rdx), %ymm0, %ymm0
vaddps -448(%rdx), %ymm1, %ymm1
vaddps -416(%rdx), %ymm0, %ymm0
vaddps -384(%rdx), %ymm1, %ymm1
vaddps -352(%rdx), %ymm0, %ymm0
vaddps -320(%rdx), %ymm1, %ymm1
vaddps -288(%rdx), %ymm0, %ymm0
vaddps -256(%rdx), %ymm1, %ymm1
vaddps -224(%rdx), %ymm0, %ymm0
vaddps -192(%rdx), %ymm1, %ymm1
vaddps -160(%rdx), %ymm0, %ymm0
vaddps -128(%rdx), %ymm1, %ymm1
vaddps -96(%rdx), %ymm0, %ymm0
vaddps -64(%rdx), %ymm1, %ymm1
vaddps -32(%rdx), %ymm0, %ymm0
vaddps (%rdx), %ymm1, %ymm1
addq $512, %rdx # imm = 0x200
addq $-128, %rcx
jne .LBB1_11
# Reduction: fold both accumulators, then reduce 8 lanes to a scalar.
.LBB1_12: # %middle.block
vaddps %ymm0, %ymm1, %ymm0
vextractf128 $1, %ymm0, %xmm1
vaddps %ymm1, %ymm0, %ymm0
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddps %ymm1, %ymm0, %ymm0
vhaddps %ymm0, %ymm0, %ymm0
movq %rax, %rcx
cmpq %rsi, %rax
je .LBB1_13
.LBB1_2: # %.lr.ph.preheader13
leaq (%rdi,%rcx,4), %rax
subq %rcx, %rsi
.align 16, 0x90
# Scalar loop: also the sole path for n <= 15 (rcx = 0 in that case).
.LBB1_3: # %.lr.ph
# =>This Inner Loop Header: Depth=1
vaddss (%rax), %xmm0, %xmm0
addq $4, %rax
addq $-1, %rsi
jne .LBB1_3
.LBB1_13: # %._crit_edge
#APP
#NO_APP
vzeroupper
retq
.Lfunc_end1:
.size sum32, .Lfunc_end1-sum32
.cfi_endproc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment