Skip to content

Instantly share code, notes, and snippets.

@yuyichao
Last active June 4, 2016 17:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yuyichao/5b07f71c1f19248ec5511d758532a4b0 to your computer and use it in GitHub Desktop.
Save yuyichao/5b07f71c1f19248ec5511d758532a4b0 to your computer and use it in GitHub Desktop.
--- llvm37.s 2016-06-04 13:23:34.947819989 -0400
+++ llvm38.s 2016-06-04 13:14:25.455283889 -0400
@@ -4,103 +4,95 @@
vxorps %xmm0, %xmm0, %xmm0
testq %rsi, %rsi
je .LBB1_13
-# BB#1: # %overflow.checked
+# BB#1: # %.lr.ph.preheader
+ vxorps %xmm0, %xmm0, %xmm0
+ xorl %ecx, %ecx
+ cmpq $15, %rsi
+ jbe .LBB1_2
+# BB#4: # %min.iters.checked
xorl %ecx, %ecx
movq %rsi, %rax
- vxorps %ymm0, %ymm0, %ymm0
- vxorps %ymm1, %ymm1, %ymm1
- vxorps %ymm2, %ymm2, %ymm2
- vxorps %ymm3, %ymm3, %ymm3
- andq $-32, %rax
- je .LBB1_10
-# BB#2: # %vector.body.preheader
- leaq -32(%rsi), %r8
+ andq $-16, %rax
+ je .LBB1_2
+# BB#5: # %vector.body.preheader
+ leaq -16(%rsi), %r8
movl %r8d, %ecx
- shrl $5, %ecx
+ shrl $4, %ecx
addl $1, %ecx
xorl %edx, %edx
- testb $3, %cl
- je .LBB1_3
-# BB#4: # %vector.body.prol.preheader
- leal -32(%rsi), %ecx
- shrl $5, %ecx
+ testb $7, %cl
+ je .LBB1_6
+# BB#7: # %vector.body.prol.preheader
+ leal -16(%rsi), %ecx
+ shrl $4, %ecx
addl $1, %ecx
- andl $3, %ecx
+ andl $7, %ecx
negq %rcx
vxorps %ymm0, %ymm0, %ymm0
xorl %edx, %edx
vxorps %ymm1, %ymm1, %ymm1
- vxorps %ymm2, %ymm2, %ymm2
- vxorps %ymm3, %ymm3, %ymm3
.align 16, 0x90
-.LBB1_5: # %vector.body.prol
+.LBB1_8: # %vector.body.prol
# =>This Inner Loop Header: Depth=1
vaddps (%rdi,%rdx,4), %ymm0, %ymm0
vaddps 32(%rdi,%rdx,4), %ymm1, %ymm1
- vaddps 64(%rdi,%rdx,4), %ymm2, %ymm2
- vaddps 96(%rdi,%rdx,4), %ymm3, %ymm3
- addq $32, %rdx
+ addq $16, %rdx
addq $1, %rcx
- jne .LBB1_5
- jmp .LBB1_6
-.LBB1_3:
+ jne .LBB1_8
+ jmp .LBB1_9
+.LBB1_6:
vxorps %ymm0, %ymm0, %ymm0
vxorps %ymm1, %ymm1, %ymm1
- vxorps %ymm2, %ymm2, %ymm2
- vxorps %ymm3, %ymm3, %ymm3
-.LBB1_6: # %vector.body.preheader.split
- cmpq $96, %r8
- jb .LBB1_9
-# BB#7: # %vector.body.preheader.split.split
+.LBB1_9: # %vector.body.preheader.split
+ cmpq $112, %r8
+ jb .LBB1_12
+# BB#10: # %vector.body.preheader.split.split
movq %rsi, %rcx
- andq $-32, %rcx
+ andq $-16, %rcx
subq %rdx, %rcx
leaq 480(%rdi,%rdx,4), %rdx
.align 16, 0x90
-.LBB1_8: # %vector.body
+.LBB1_11: # %vector.body
# =>This Inner Loop Header: Depth=1
vaddps -480(%rdx), %ymm0, %ymm0
vaddps -448(%rdx), %ymm1, %ymm1
- vaddps -416(%rdx), %ymm2, %ymm2
- vaddps -384(%rdx), %ymm3, %ymm3
+ vaddps -416(%rdx), %ymm0, %ymm0
+ vaddps -384(%rdx), %ymm1, %ymm1
vaddps -352(%rdx), %ymm0, %ymm0
vaddps -320(%rdx), %ymm1, %ymm1
- vaddps -288(%rdx), %ymm2, %ymm2
- vaddps -256(%rdx), %ymm3, %ymm3
+ vaddps -288(%rdx), %ymm0, %ymm0
+ vaddps -256(%rdx), %ymm1, %ymm1
vaddps -224(%rdx), %ymm0, %ymm0
vaddps -192(%rdx), %ymm1, %ymm1
- vaddps -160(%rdx), %ymm2, %ymm2
- vaddps -128(%rdx), %ymm3, %ymm3
+ vaddps -160(%rdx), %ymm0, %ymm0
+ vaddps -128(%rdx), %ymm1, %ymm1
vaddps -96(%rdx), %ymm0, %ymm0
vaddps -64(%rdx), %ymm1, %ymm1
- vaddps -32(%rdx), %ymm2, %ymm2
- vaddps (%rdx), %ymm3, %ymm3
+ vaddps -32(%rdx), %ymm0, %ymm0
+ vaddps (%rdx), %ymm1, %ymm1
addq $512, %rdx # imm = 0x200
addq $-128, %rcx
- jne .LBB1_8
-.LBB1_9:
- movq %rax, %rcx
-.LBB1_10: # %middle.block
+ jne .LBB1_11
+.LBB1_12: # %middle.block
vaddps %ymm0, %ymm1, %ymm0
- vaddps %ymm0, %ymm2, %ymm0
- vaddps %ymm0, %ymm3, %ymm0
vextractf128 $1, %ymm0, %xmm1
vaddps %ymm1, %ymm0, %ymm0
- vpermilpd $1, %ymm0, %ymm1 # ymm1 = ymm0[1,0,2,2]
+ vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddps %ymm1, %ymm0, %ymm0
vhaddps %ymm0, %ymm0, %ymm0
- cmpq %rsi, %rcx
+ movq %rax, %rcx
+ cmpq %rsi, %rax
je .LBB1_13
-# BB#11: # %.lr.ph.preheader
+.LBB1_2: # %.lr.ph.preheader13
leaq (%rdi,%rcx,4), %rax
subq %rcx, %rsi
.align 16, 0x90
-.LBB1_12: # %.lr.ph
+.LBB1_3: # %.lr.ph
# =>This Inner Loop Header: Depth=1
vaddss (%rax), %xmm0, %xmm0
addq $4, %rax
addq $-1, %rsi
- jne .LBB1_12
+ jne .LBB1_3
.LBB1_13: # %._crit_edge
#APP
#NO_APP
//
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <stdio.h>
#include <string.h>
/*
 * Read the monotonic clock and return it as nanoseconds (uint64_t)
 * since an arbitrary epoch.  Used only to time the benchmark loop.
 */
uint64_t gettime_ns()
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * UINT64_C(1000000000) + (uint64_t)ts.tv_nsec;
}
/*
 * Sum the n floats in a[] in order and return the total.
 * Kept out-of-line (noinline) and followed by a compiler memory barrier
 * so that repeated calls in the benchmark loop cannot be folded away.
 */
__attribute__((noinline)) float sum32(float *a, size_t n)
{
    /* a = (float*)__builtin_assume_aligned(a, 64); */
    float total = 0.0f;
    size_t idx = 0;
    while (idx < n) {
        total += a[idx];
        ++idx;
    }
    __asm__ volatile ("" ::: "memory");
    return total;
}
/*
 * Benchmark driver: allocate a 64-byte-aligned buffer of 1024 floats,
 * zero it, call sum32() on it 1024*1024 times, and print the average
 * wall-clock nanoseconds per call.
 *
 * Fixes vs. the original:
 *  - aligned_alloc() can return NULL; the result is now checked before
 *    memset() dereferences it.
 *  - the end timestamp is taken before free(), so deallocation time is
 *    no longer folded into the measured interval.
 */
int main()
{
    enum { NFLOATS = 1024, ITERS = 1024 * 1024 };
    float *p = aligned_alloc(64, sizeof(float) * NFLOATS);
    if (p == NULL) {
        fprintf(stderr, "aligned_alloc failed\n");
        return 1;
    }
    memset(p, 0, sizeof(float) * NFLOATS);
    uint64_t start = gettime_ns();
    for (int i = 0; i < ITERS; i++)
        sum32(p, NFLOATS);
    uint64_t end = gettime_ns();   /* stop the clock before freeing */
    free(p);
    printf("%f\n", (end - start) / (double)ITERS);
    return 0;
}
# ----------------------------------------------------------------------
# sum32.constprop.0 -- GCC output (AT&T syntax, SysV AMD64) for sum32()
# with n const-propagated to 1024.
# In:   %rdi = a (float*)
# Out:  %xmm0 = sum of the 1024 floats
# Shape: scalar-peel 0..7 leading floats into %xmm2 until the pointer is
# 32-byte aligned, run one AVX loop accumulating 8 floats/iteration into
# %ymm1, horizontally reduce, then a fully unrolled scalar tail of up to
# 7 trailing floats.
# NOTE(review): pasted compiler output kept for codegen comparison --
# do not hand-edit the instructions.
# ----------------------------------------------------------------------
sum32.constprop.0:
.LFB36:
.cfi_startproc
# rax = (-(a >> 2)) & 7 : number of floats to peel before a is 32B-aligned
movq %rdi, %rax
shrq $2, %rax
negq %rax
andl $7, %eax
je .L9
# Peel chain: accumulate up to 7 leading elements into %xmm2, jumping to
# the matching setup stub (.L10..).L15) once the peel count is consumed.
vmovss (%rdi), %xmm2
cmpq $1, %rax
je .L10
vaddss 4(%rdi), %xmm2, %xmm2
cmpq $2, %rax
je .L11
vaddss 8(%rdi), %xmm2, %xmm2
cmpq $3, %rax
je .L12
vaddss 12(%rdi), %xmm2, %xmm2
cmpq $4, %rax
je .L13
vaddss 16(%rdi), %xmm2, %xmm2
cmpq $5, %rax
je .L14
vaddss 20(%rdi), %xmm2, %xmm2
cmpq $7, %rax
jne .L15
vaddss 24(%rdi), %xmm2, %xmm2
movl $1017, %ecx
movl $7, %edx
# Common setup for all peeled entries (rcx/rdx set by the stub above).
.L3:
movl $1024, %esi
movl $1016, %r8d
movl $127, %r10d
subq %rax, %rsi
# Vector-loop setup: r9 = first aligned element, ymm1 = 0, rax = counter.
.L2:
leaq (%rdi,%rax,4), %r9
vxorps %xmm1, %xmm1, %xmm1
xorl %eax, %eax
# Main AVX loop: one 8-float vaddps per iteration, r10 iterations.
.L4:
addq $1, %rax
vaddps (%r9), %ymm1, %ymm1
addq $32, %r9
cmpq %r10, %rax
jb .L4
# Horizontal reduction of %ymm1, then fold in the peel sum (%xmm2).
vhaddps %ymm1, %ymm1, %ymm1
leaq (%rdx,%r8), %rax
movq %rcx, %rdx
subq %r8, %rdx
vhaddps %ymm1, %ymm1, %ymm0
vperm2f128 $1, %ymm0, %ymm0, %ymm1
vaddps %ymm0, %ymm1, %ymm0
vaddss %xmm2, %xmm0, %xmm0
cmpq %r8, %rsi
je .L7
# Fully unrolled scalar tail: up to 7 trailing floats (rdx = tail count).
vaddss (%rdi,%rax,4), %xmm0, %xmm0
leaq 1(%rax), %rcx
cmpq $1, %rdx
je .L7
vaddss (%rdi,%rcx,4), %xmm0, %xmm0
leaq 2(%rax), %rcx
cmpq $2, %rdx
je .L7
vaddss (%rdi,%rcx,4), %xmm0, %xmm0
leaq 3(%rax), %rcx
cmpq $3, %rdx
je .L7
vaddss (%rdi,%rcx,4), %xmm0, %xmm0
leaq 4(%rax), %rcx
cmpq $4, %rdx
je .L7
vaddss (%rdi,%rcx,4), %xmm0, %xmm0
leaq 5(%rax), %rcx
cmpq $5, %rdx
je .L7
vaddss (%rdi,%rcx,4), %xmm0, %xmm0
addq $6, %rax
cmpq $6, %rdx
je .L7
vaddss (%rdi,%rax,4), %xmm0, %xmm0
# Exit: clear upper YMM state before returning to (possibly SSE) caller.
.L7:
vzeroupper
ret
.p2align 4,,10
.p2align 3
# Entry case: pointer already 32-byte aligned, no peel (xmm2 = 0).
.L9:
movl $1024, %esi
movl $1024, %ecx
vxorps %xmm2, %xmm2, %xmm2
xorl %edx, %edx
movl $1024, %r8d
movl $128, %r10d
jmp .L2
.p2align 4,,10
.p2align 3
# Setup stubs: per-peel-count loop bounds feeding .L3/.L2.
.L15:
movl $1018, %ecx
movl $6, %edx
jmp .L3
.p2align 4,,10
.p2align 3
.L10:
movl $1023, %ecx
movl $1, %edx
jmp .L3
.p2align 4,,10
.p2align 3
.L11:
movl $2, %edx
movl $1022, %ecx
jmp .L3
.p2align 4,,10
.p2align 3
.L12:
movl $3, %edx
movl $1021, %ecx
jmp .L3
.p2align 4,,10
.p2align 3
.L13:
movl $4, %edx
movl $1020, %ecx
jmp .L3
.p2align 4,,10
.p2align 3
.L14:
movl $5, %edx
movl $1019, %ecx
jmp .L3
.cfi_endproc
; ======================================================================
; LLVM 3.7 IR for sum32() after optimization.
; Vectorized at VF=8 (<8 x float>) with interleave factor 4, i.e. four
; independent accumulators and 32 floats per vector iteration; the
; runtime unroller then splits the loop into a prologue (remainder of a
; 4x unroll) plus a 4x-unrolled main body (128 floats per trip).
; A scalar loop (.lr.ph) handles the final n mod 32 elements.
; NOTE(review): pasted compiler output kept for comparison -- do not
; hand-edit.  Metadata (!7, !9, !11, !14, !15) and attribute groups
; (#1, #3) are defined outside this excerpt.
; ======================================================================
; Function Attrs: noinline nounwind uwtable
define float @sum32(float* nocapture readonly %a, i64 %n) #3 {
%1 = icmp eq i64 %n, 0
br i1 %1, label %._crit_edge, label %overflow.checked
; n.vec = n rounded down to a multiple of 32 (the vectorized trip count).
overflow.checked: ; preds = %0
%n.vec = and i64 %n, -32
%cmp.zero = icmp eq i64 %n.vec, 0
br i1 %cmp.zero, label %middle.block, label %vector.body.preheader
; xtraiter = (#vector iterations) mod 4 -> prologue trip count.
vector.body.preheader: ; preds = %overflow.checked
%2 = add i64 %n, -32
%3 = lshr i64 %2, 5
%4 = add nuw nsw i64 %3, 1
%xtraiter = and i64 %4, 3
%lcmp.mod = icmp eq i64 %xtraiter, 0
br i1 %lcmp.mod, label %vector.body.preheader.split, label %vector.body.prol.preheader
vector.body.prol.preheader: ; preds = %vector.body.preheader
br label %vector.body.prol
; Prologue loop: one 32-float step per trip, 4 accumulators.
vector.body.prol: ; preds = %vector.body.prol.preheader, %vector.body.prol
%index.prol = phi i64 [ %index.next.prol, %vector.body.prol ], [ 0, %vector.body.prol.preheader ]
%vec.phi.prol = phi <8 x float> [ %13, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ]
%vec.phi6.prol = phi <8 x float> [ %14, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ]
%vec.phi7.prol = phi <8 x float> [ %15, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ]
%vec.phi8.prol = phi <8 x float> [ %16, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ]
%prol.iter = phi i64 [ %prol.iter.sub, %vector.body.prol ], [ %xtraiter, %vector.body.prol.preheader ]
%5 = getelementptr inbounds float, float* %a, i64 %index.prol
%6 = bitcast float* %5 to <8 x float>*
%wide.load.prol = load <8 x float>, <8 x float>* %6, align 4, !tbaa !7
%7 = getelementptr float, float* %5, i64 8
%8 = bitcast float* %7 to <8 x float>*
%wide.load9.prol = load <8 x float>, <8 x float>* %8, align 4, !tbaa !7
%9 = getelementptr float, float* %5, i64 16
%10 = bitcast float* %9 to <8 x float>*
%wide.load10.prol = load <8 x float>, <8 x float>* %10, align 4, !tbaa !7
%11 = getelementptr float, float* %5, i64 24
%12 = bitcast float* %11 to <8 x float>*
%wide.load11.prol = load <8 x float>, <8 x float>* %12, align 4, !tbaa !7
%13 = fadd fast <8 x float> %wide.load.prol, %vec.phi.prol
%14 = fadd fast <8 x float> %wide.load9.prol, %vec.phi6.prol
%15 = fadd fast <8 x float> %wide.load10.prol, %vec.phi7.prol
%16 = fadd fast <8 x float> %wide.load11.prol, %vec.phi8.prol
%index.next.prol = add i64 %index.prol, 32
%prol.iter.sub = add i64 %prol.iter, -1
%prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0
br i1 %prol.iter.cmp, label %vector.body.preheader.split.loopexit, label %vector.body.prol, !llvm.loop !9
vector.body.preheader.split.loopexit: ; preds = %vector.body.prol
%index.next.prol.lcssa = phi i64 [ %index.next.prol, %vector.body.prol ]
%.lcssa35 = phi <8 x float> [ %16, %vector.body.prol ]
%.lcssa34 = phi <8 x float> [ %15, %vector.body.prol ]
%.lcssa33 = phi <8 x float> [ %14, %vector.body.prol ]
%.lcssa32 = phi <8 x float> [ %13, %vector.body.prol ]
br label %vector.body.preheader.split
; Merge prologue results; skip the 4x-unrolled body if fewer than
; 4 vector iterations remain (%2 < 96).
vector.body.preheader.split: ; preds = %vector.body.preheader.split.loopexit, %vector.body.preheader
%.lcssa27.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa35, %vector.body.preheader.split.loopexit ]
%.lcssa26.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa34, %vector.body.preheader.split.loopexit ]
%.lcssa25.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa33, %vector.body.preheader.split.loopexit ]
%.lcssa24.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa32, %vector.body.preheader.split.loopexit ]
%index.unr = phi i64 [ 0, %vector.body.preheader ], [ %index.next.prol.lcssa, %vector.body.preheader.split.loopexit ]
%vec.phi.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa32, %vector.body.preheader.split.loopexit ]
%vec.phi6.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa33, %vector.body.preheader.split.loopexit ]
%vec.phi7.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa34, %vector.body.preheader.split.loopexit ]
%vec.phi8.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa35, %vector.body.preheader.split.loopexit ]
%17 = icmp ult i64 %2, 96
br i1 %17, label %middle.block.loopexit, label %vector.body.preheader.split.split
vector.body.preheader.split.split: ; preds = %vector.body.preheader.split
br label %vector.body
; Main body: 4x unrolled, 128 floats per trip (4 accumulators x 4 steps).
vector.body: ; preds = %vector.body, %vector.body.preheader.split.split
%index = phi i64 [ %index.unr, %vector.body.preheader.split.split ], [ %index.next.3, %vector.body ]
%vec.phi = phi <8 x float> [ %vec.phi.unr, %vector.body.preheader.split.split ], [ %62, %vector.body ]
%vec.phi6 = phi <8 x float> [ %vec.phi6.unr, %vector.body.preheader.split.split ], [ %63, %vector.body ]
%vec.phi7 = phi <8 x float> [ %vec.phi7.unr, %vector.body.preheader.split.split ], [ %64, %vector.body ]
%vec.phi8 = phi <8 x float> [ %vec.phi8.unr, %vector.body.preheader.split.split ], [ %65, %vector.body ]
%18 = getelementptr inbounds float, float* %a, i64 %index
%19 = bitcast float* %18 to <8 x float>*
%wide.load = load <8 x float>, <8 x float>* %19, align 4, !tbaa !7
%20 = getelementptr float, float* %18, i64 8
%21 = bitcast float* %20 to <8 x float>*
%wide.load9 = load <8 x float>, <8 x float>* %21, align 4, !tbaa !7
%22 = getelementptr float, float* %18, i64 16
%23 = bitcast float* %22 to <8 x float>*
%wide.load10 = load <8 x float>, <8 x float>* %23, align 4, !tbaa !7
%24 = getelementptr float, float* %18, i64 24
%25 = bitcast float* %24 to <8 x float>*
%wide.load11 = load <8 x float>, <8 x float>* %25, align 4, !tbaa !7
%26 = fadd fast <8 x float> %wide.load, %vec.phi
%27 = fadd fast <8 x float> %wide.load9, %vec.phi6
%28 = fadd fast <8 x float> %wide.load10, %vec.phi7
%29 = fadd fast <8 x float> %wide.load11, %vec.phi8
%index.next = add i64 %index, 32
%30 = getelementptr inbounds float, float* %a, i64 %index.next
%31 = bitcast float* %30 to <8 x float>*
%wide.load.1 = load <8 x float>, <8 x float>* %31, align 4, !tbaa !7
%32 = getelementptr float, float* %30, i64 8
%33 = bitcast float* %32 to <8 x float>*
%wide.load9.1 = load <8 x float>, <8 x float>* %33, align 4, !tbaa !7
%34 = getelementptr float, float* %30, i64 16
%35 = bitcast float* %34 to <8 x float>*
%wide.load10.1 = load <8 x float>, <8 x float>* %35, align 4, !tbaa !7
%36 = getelementptr float, float* %30, i64 24
%37 = bitcast float* %36 to <8 x float>*
%wide.load11.1 = load <8 x float>, <8 x float>* %37, align 4, !tbaa !7
%38 = fadd fast <8 x float> %wide.load.1, %26
%39 = fadd fast <8 x float> %wide.load9.1, %27
%40 = fadd fast <8 x float> %wide.load10.1, %28
%41 = fadd fast <8 x float> %wide.load11.1, %29
%index.next.1 = add i64 %index, 64
%42 = getelementptr inbounds float, float* %a, i64 %index.next.1
%43 = bitcast float* %42 to <8 x float>*
%wide.load.2 = load <8 x float>, <8 x float>* %43, align 4, !tbaa !7
%44 = getelementptr float, float* %42, i64 8
%45 = bitcast float* %44 to <8 x float>*
%wide.load9.2 = load <8 x float>, <8 x float>* %45, align 4, !tbaa !7
%46 = getelementptr float, float* %42, i64 16
%47 = bitcast float* %46 to <8 x float>*
%wide.load10.2 = load <8 x float>, <8 x float>* %47, align 4, !tbaa !7
%48 = getelementptr float, float* %42, i64 24
%49 = bitcast float* %48 to <8 x float>*
%wide.load11.2 = load <8 x float>, <8 x float>* %49, align 4, !tbaa !7
%50 = fadd fast <8 x float> %wide.load.2, %38
%51 = fadd fast <8 x float> %wide.load9.2, %39
%52 = fadd fast <8 x float> %wide.load10.2, %40
%53 = fadd fast <8 x float> %wide.load11.2, %41
%index.next.2 = add i64 %index, 96
%54 = getelementptr inbounds float, float* %a, i64 %index.next.2
%55 = bitcast float* %54 to <8 x float>*
%wide.load.3 = load <8 x float>, <8 x float>* %55, align 4, !tbaa !7
%56 = getelementptr float, float* %54, i64 8
%57 = bitcast float* %56 to <8 x float>*
%wide.load9.3 = load <8 x float>, <8 x float>* %57, align 4, !tbaa !7
%58 = getelementptr float, float* %54, i64 16
%59 = bitcast float* %58 to <8 x float>*
%wide.load10.3 = load <8 x float>, <8 x float>* %59, align 4, !tbaa !7
%60 = getelementptr float, float* %54, i64 24
%61 = bitcast float* %60 to <8 x float>*
%wide.load11.3 = load <8 x float>, <8 x float>* %61, align 4, !tbaa !7
%62 = fadd fast <8 x float> %wide.load.3, %50
%63 = fadd fast <8 x float> %wide.load9.3, %51
%64 = fadd fast <8 x float> %wide.load10.3, %52
%65 = fadd fast <8 x float> %wide.load11.3, %53
%index.next.3 = add i64 %index, 128
%66 = icmp eq i64 %index.next.3, %n.vec
br i1 %66, label %middle.block.loopexit.unr-lcssa, label %vector.body, !llvm.loop !11
middle.block.loopexit.unr-lcssa: ; preds = %vector.body
%.lcssa31 = phi <8 x float> [ %65, %vector.body ]
%.lcssa30 = phi <8 x float> [ %64, %vector.body ]
%.lcssa29 = phi <8 x float> [ %63, %vector.body ]
%.lcssa28 = phi <8 x float> [ %62, %vector.body ]
br label %middle.block.loopexit
middle.block.loopexit: ; preds = %vector.body.preheader.split, %middle.block.loopexit.unr-lcssa
%.lcssa27 = phi <8 x float> [ %.lcssa27.unr, %vector.body.preheader.split ], [ %.lcssa31, %middle.block.loopexit.unr-lcssa ]
%.lcssa26 = phi <8 x float> [ %.lcssa26.unr, %vector.body.preheader.split ], [ %.lcssa30, %middle.block.loopexit.unr-lcssa ]
%.lcssa25 = phi <8 x float> [ %.lcssa25.unr, %vector.body.preheader.split ], [ %.lcssa29, %middle.block.loopexit.unr-lcssa ]
%.lcssa24 = phi <8 x float> [ %.lcssa24.unr, %vector.body.preheader.split ], [ %.lcssa28, %middle.block.loopexit.unr-lcssa ]
br label %middle.block
; Reduction: fold 4 accumulators into one vector, then a shuffle tree
; reduces the 8 lanes to the scalar partial sum %67.
middle.block: ; preds = %middle.block.loopexit, %overflow.checked
%resume.val = phi i64 [ 0, %overflow.checked ], [ %n.vec, %middle.block.loopexit ]
%rdx.vec.exit.phi = phi <8 x float> [ zeroinitializer, %overflow.checked ], [ %.lcssa24, %middle.block.loopexit ]
%rdx.vec.exit.phi14 = phi <8 x float> [ zeroinitializer, %overflow.checked ], [ %.lcssa25, %middle.block.loopexit ]
%rdx.vec.exit.phi15 = phi <8 x float> [ zeroinitializer, %overflow.checked ], [ %.lcssa26, %middle.block.loopexit ]
%rdx.vec.exit.phi16 = phi <8 x float> [ zeroinitializer, %overflow.checked ], [ %.lcssa27, %middle.block.loopexit ]
%bin.rdx = fadd fast <8 x float> %rdx.vec.exit.phi14, %rdx.vec.exit.phi
%bin.rdx17 = fadd fast <8 x float> %rdx.vec.exit.phi15, %bin.rdx
%bin.rdx18 = fadd fast <8 x float> %rdx.vec.exit.phi16, %bin.rdx17
%rdx.shuf = shufflevector <8 x float> %bin.rdx18, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx19 = fadd fast <8 x float> %bin.rdx18, %rdx.shuf
%rdx.shuf20 = shufflevector <8 x float> %bin.rdx19, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx21 = fadd fast <8 x float> %bin.rdx19, %rdx.shuf20
%rdx.shuf22 = shufflevector <8 x float> %bin.rdx21, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx23 = fadd fast <8 x float> %bin.rdx21, %rdx.shuf22
%67 = extractelement <8 x float> %bin.rdx23, i32 0
%cmp.n = icmp eq i64 %resume.val, %n
br i1 %cmp.n, label %._crit_edge, label %.lr.ph.preheader
.lr.ph.preheader: ; preds = %middle.block
br label %.lr.ph
._crit_edge.loopexit: ; preds = %.lr.ph
%.lcssa = phi float [ %70, %.lr.ph ]
br label %._crit_edge
; Exit: the empty inline asm is the source-level compiler barrier.
._crit_edge: ; preds = %._crit_edge.loopexit, %middle.block, %0
%s.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %67, %middle.block ], [ %.lcssa, %._crit_edge.loopexit ]
tail call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() #1, !srcloc !14
ret float %s.0.lcssa
; Scalar tail: handles the last n mod 32 elements one float at a time.
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph
%i.02 = phi i64 [ %71, %.lr.ph ], [ %resume.val, %.lr.ph.preheader ]
%s.01 = phi float [ %70, %.lr.ph ], [ %67, %.lr.ph.preheader ]
%68 = getelementptr inbounds float, float* %a, i64 %i.02
%69 = load float, float* %68, align 4, !tbaa !7
%70 = fadd fast float %69, %s.01
%71 = add nuw i64 %i.02, 1
%exitcond = icmp eq i64 %71, %n
br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph, !llvm.loop !15
}
# ======================================================================
# sum32 -- LLVM 3.7 output (AT&T syntax, SysV AMD64).
# In:   %rdi = a (float*), %rsi = n
# Out:  %xmm0 = sum
# Four <8 x float> accumulators (%ymm0-%ymm3, 32 floats per vector
# iteration); .LBB1_5 is the unroll-prologue loop, .LBB1_8 the 4x
# unrolled main loop (128 floats per trip), .LBB1_12 the scalar tail.
# NOTE(review): pasted compiler output kept for codegen comparison --
# do not hand-edit the instructions.
# ======================================================================
sum32: # @sum32
.cfi_startproc
# BB#0:
vxorps %xmm0, %xmm0, %xmm0
testq %rsi, %rsi
je .LBB1_13
# BB#1: # %overflow.checked
xorl %ecx, %ecx
movq %rsi, %rax
vxorps %ymm0, %ymm0, %ymm0
vxorps %ymm1, %ymm1, %ymm1
vxorps %ymm2, %ymm2, %ymm2
vxorps %ymm3, %ymm3, %ymm3
andq $-32, %rax
je .LBB1_10
# BB#2: # %vector.body.preheader
# ecx = ((n-32)>>5)+1 vector iterations; low 2 bits = prologue count.
leaq -32(%rsi), %r8
movl %r8d, %ecx
shrl $5, %ecx
addl $1, %ecx
xorl %edx, %edx
testb $3, %cl
je .LBB1_3
# BB#4: # %vector.body.prol.preheader
leal -32(%rsi), %ecx
shrl $5, %ecx
addl $1, %ecx
andl $3, %ecx
negq %rcx
vxorps %ymm0, %ymm0, %ymm0
xorl %edx, %edx
vxorps %ymm1, %ymm1, %ymm1
vxorps %ymm2, %ymm2, %ymm2
vxorps %ymm3, %ymm3, %ymm3
.align 16, 0x90
# Prologue loop: one 32-float step per trip until rcx reaches zero.
.LBB1_5: # %vector.body.prol
# =>This Inner Loop Header: Depth=1
vaddps (%rdi,%rdx,4), %ymm0, %ymm0
vaddps 32(%rdi,%rdx,4), %ymm1, %ymm1
vaddps 64(%rdi,%rdx,4), %ymm2, %ymm2
vaddps 96(%rdi,%rdx,4), %ymm3, %ymm3
addq $32, %rdx
addq $1, %rcx
jne .LBB1_5
jmp .LBB1_6
.LBB1_3:
vxorps %ymm0, %ymm0, %ymm0
vxorps %ymm1, %ymm1, %ymm1
vxorps %ymm2, %ymm2, %ymm2
vxorps %ymm3, %ymm3, %ymm3
.LBB1_6: # %vector.body.preheader.split
cmpq $96, %r8
jb .LBB1_9
# BB#7: # %vector.body.preheader.split.split
movq %rsi, %rcx
andq $-32, %rcx
subq %rdx, %rcx
leaq 480(%rdi,%rdx,4), %rdx
.align 16, 0x90
# Main loop: 4x unrolled, 16 vaddps = 128 floats per trip.
.LBB1_8: # %vector.body
# =>This Inner Loop Header: Depth=1
vaddps -480(%rdx), %ymm0, %ymm0
vaddps -448(%rdx), %ymm1, %ymm1
vaddps -416(%rdx), %ymm2, %ymm2
vaddps -384(%rdx), %ymm3, %ymm3
vaddps -352(%rdx), %ymm0, %ymm0
vaddps -320(%rdx), %ymm1, %ymm1
vaddps -288(%rdx), %ymm2, %ymm2
vaddps -256(%rdx), %ymm3, %ymm3
vaddps -224(%rdx), %ymm0, %ymm0
vaddps -192(%rdx), %ymm1, %ymm1
vaddps -160(%rdx), %ymm2, %ymm2
vaddps -128(%rdx), %ymm3, %ymm3
vaddps -96(%rdx), %ymm0, %ymm0
vaddps -64(%rdx), %ymm1, %ymm1
vaddps -32(%rdx), %ymm2, %ymm2
vaddps (%rdx), %ymm3, %ymm3
addq $512, %rdx # imm = 0x200
addq $-128, %rcx
jne .LBB1_8
.LBB1_9:
movq %rax, %rcx
# Reduction: fold 4 accumulators, then reduce 8 lanes to a scalar.
.LBB1_10: # %middle.block
vaddps %ymm0, %ymm1, %ymm0
vaddps %ymm0, %ymm2, %ymm0
vaddps %ymm0, %ymm3, %ymm0
vextractf128 $1, %ymm0, %xmm1
vaddps %ymm1, %ymm0, %ymm0
vpermilpd $1, %ymm0, %ymm1 # ymm1 = ymm0[1,0,2,2]
vaddps %ymm1, %ymm0, %ymm0
vhaddps %ymm0, %ymm0, %ymm0
cmpq %rsi, %rcx
je .LBB1_13
# BB#11: # %.lr.ph.preheader
leaq (%rdi,%rcx,4), %rax
subq %rcx, %rsi
.align 16, 0x90
# Scalar tail: one float per iteration for the last n mod 32 elements.
.LBB1_12: # %.lr.ph
# =>This Inner Loop Header: Depth=1
vaddss (%rax), %xmm0, %xmm0
addq $4, %rax
addq $-1, %rsi
jne .LBB1_12
.LBB1_13: # %._crit_edge
#APP
#NO_APP
vzeroupper
retq
.Lfunc_end1:
.size sum32, .Lfunc_end1-sum32
.cfi_endproc
; ======================================================================
; LLVM 3.8 IR for sum32() after optimization.
; Vectorized at VF=8 with interleave factor 2 (two <8 x float>
; accumulators, 16 floats per vector iteration), then 8x runtime
; unrolled (128 floats per main-loop trip).  A min-iters guard sends
; short inputs (n < 16) straight to the scalar loop.
; NOTE(review): pasted compiler output kept for comparison -- do not
; hand-edit.  Metadata (!7, !9, !11, !14, !15) and attribute groups
; (#3, #4) are defined outside this excerpt.
; ======================================================================
; Function Attrs: noinline nounwind uwtable
define float @sum32(float* nocapture readonly %a, i64 %n) #3 {
%1 = icmp eq i64 %n, 0
br i1 %1, label %._crit_edge, label %.lr.ph.preheader
; Guard: n < 16 -> skip vectorization entirely.
.lr.ph.preheader: ; preds = %0
%min.iters.check = icmp ult i64 %n, 16
br i1 %min.iters.check, label %.lr.ph.preheader13, label %min.iters.checked
; Shared scalar-loop entry: start index/partial sum come from whichever
; predecessor reached it (fresh start or post-vector resume).
.lr.ph.preheader13: ; preds = %middle.block, %min.iters.checked, %.lr.ph.preheader
%i.02.ph = phi i64 [ 0, %min.iters.checked ], [ 0, %.lr.ph.preheader ], [ %n.vec, %middle.block ]
%s.01.ph = phi float [ 0.000000e+00, %min.iters.checked ], [ 0.000000e+00, %.lr.ph.preheader ], [ %61, %middle.block ]
br label %.lr.ph
min.iters.checked: ; preds = %.lr.ph.preheader
%n.vec = and i64 %n, -16
%cmp.zero = icmp eq i64 %n.vec, 0
br i1 %cmp.zero, label %.lr.ph.preheader13, label %vector.body.preheader
; xtraiter = (#vector iterations) mod 8 -> prologue trip count.
vector.body.preheader: ; preds = %min.iters.checked
%2 = add i64 %n, -16
%3 = lshr i64 %2, 4
%4 = add nuw nsw i64 %3, 1
%xtraiter = and i64 %4, 7
%lcmp.mod = icmp eq i64 %xtraiter, 0
br i1 %lcmp.mod, label %vector.body.preheader.split, label %vector.body.prol.preheader
vector.body.prol.preheader: ; preds = %vector.body.preheader
br label %vector.body.prol
; Prologue loop: one 16-float step per trip, 2 accumulators.
vector.body.prol: ; preds = %vector.body.prol.preheader, %vector.body.prol
%index.prol = phi i64 [ %index.next.prol, %vector.body.prol ], [ 0, %vector.body.prol.preheader ]
%vec.phi.prol = phi <8 x float> [ %9, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ]
%vec.phi4.prol = phi <8 x float> [ %10, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ]
%prol.iter = phi i64 [ %prol.iter.sub, %vector.body.prol ], [ %xtraiter, %vector.body.prol.preheader ]
%5 = getelementptr inbounds float, float* %a, i64 %index.prol
%6 = bitcast float* %5 to <8 x float>*
%wide.load.prol = load <8 x float>, <8 x float>* %6, align 4, !tbaa !7
%7 = getelementptr float, float* %5, i64 8
%8 = bitcast float* %7 to <8 x float>*
%wide.load5.prol = load <8 x float>, <8 x float>* %8, align 4, !tbaa !7
%9 = fadd fast <8 x float> %wide.load.prol, %vec.phi.prol
%10 = fadd fast <8 x float> %wide.load5.prol, %vec.phi4.prol
%index.next.prol = add i64 %index.prol, 16
%prol.iter.sub = add i64 %prol.iter, -1
%prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0
br i1 %prol.iter.cmp, label %vector.body.preheader.split.loopexit, label %vector.body.prol, !llvm.loop !9
vector.body.preheader.split.loopexit: ; preds = %vector.body.prol
%index.next.prol.lcssa = phi i64 [ %index.next.prol, %vector.body.prol ]
%.lcssa19 = phi <8 x float> [ %10, %vector.body.prol ]
%.lcssa18 = phi <8 x float> [ %9, %vector.body.prol ]
br label %vector.body.preheader.split
; Merge prologue results; skip the 8x-unrolled body if fewer than
; 8 vector iterations remain (%2 < 112).
vector.body.preheader.split: ; preds = %vector.body.preheader.split.loopexit, %vector.body.preheader
%.lcssa15.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa19, %vector.body.preheader.split.loopexit ]
%.lcssa14.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa18, %vector.body.preheader.split.loopexit ]
%index.unr = phi i64 [ 0, %vector.body.preheader ], [ %index.next.prol.lcssa, %vector.body.preheader.split.loopexit ]
%vec.phi.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa18, %vector.body.preheader.split.loopexit ]
%vec.phi4.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa19, %vector.body.preheader.split.loopexit ]
%11 = icmp ult i64 %2, 112
br i1 %11, label %middle.block, label %vector.body.preheader.split.split
vector.body.preheader.split.split: ; preds = %vector.body.preheader.split
br label %vector.body
; Main body: 8x unrolled, 128 floats per trip (2 accumulators x 8 steps).
vector.body: ; preds = %vector.body, %vector.body.preheader.split.split
%index = phi i64 [ %index.unr, %vector.body.preheader.split.split ], [ %index.next.7, %vector.body ]
%vec.phi = phi <8 x float> [ %vec.phi.unr, %vector.body.preheader.split.split ], [ %58, %vector.body ]
%vec.phi4 = phi <8 x float> [ %vec.phi4.unr, %vector.body.preheader.split.split ], [ %59, %vector.body ]
%12 = getelementptr inbounds float, float* %a, i64 %index
%13 = bitcast float* %12 to <8 x float>*
%wide.load = load <8 x float>, <8 x float>* %13, align 4, !tbaa !7
%14 = getelementptr float, float* %12, i64 8
%15 = bitcast float* %14 to <8 x float>*
%wide.load5 = load <8 x float>, <8 x float>* %15, align 4, !tbaa !7
%16 = fadd fast <8 x float> %wide.load, %vec.phi
%17 = fadd fast <8 x float> %wide.load5, %vec.phi4
%index.next = add i64 %index, 16
%18 = getelementptr inbounds float, float* %a, i64 %index.next
%19 = bitcast float* %18 to <8 x float>*
%wide.load.1 = load <8 x float>, <8 x float>* %19, align 4, !tbaa !7
%20 = getelementptr float, float* %18, i64 8
%21 = bitcast float* %20 to <8 x float>*
%wide.load5.1 = load <8 x float>, <8 x float>* %21, align 4, !tbaa !7
%22 = fadd fast <8 x float> %wide.load.1, %16
%23 = fadd fast <8 x float> %wide.load5.1, %17
%index.next.1 = add i64 %index, 32
%24 = getelementptr inbounds float, float* %a, i64 %index.next.1
%25 = bitcast float* %24 to <8 x float>*
%wide.load.2 = load <8 x float>, <8 x float>* %25, align 4, !tbaa !7
%26 = getelementptr float, float* %24, i64 8
%27 = bitcast float* %26 to <8 x float>*
%wide.load5.2 = load <8 x float>, <8 x float>* %27, align 4, !tbaa !7
%28 = fadd fast <8 x float> %wide.load.2, %22
%29 = fadd fast <8 x float> %wide.load5.2, %23
%index.next.2 = add i64 %index, 48
%30 = getelementptr inbounds float, float* %a, i64 %index.next.2
%31 = bitcast float* %30 to <8 x float>*
%wide.load.3 = load <8 x float>, <8 x float>* %31, align 4, !tbaa !7
%32 = getelementptr float, float* %30, i64 8
%33 = bitcast float* %32 to <8 x float>*
%wide.load5.3 = load <8 x float>, <8 x float>* %33, align 4, !tbaa !7
%34 = fadd fast <8 x float> %wide.load.3, %28
%35 = fadd fast <8 x float> %wide.load5.3, %29
%index.next.3 = add i64 %index, 64
%36 = getelementptr inbounds float, float* %a, i64 %index.next.3
%37 = bitcast float* %36 to <8 x float>*
%wide.load.4 = load <8 x float>, <8 x float>* %37, align 4, !tbaa !7
%38 = getelementptr float, float* %36, i64 8
%39 = bitcast float* %38 to <8 x float>*
%wide.load5.4 = load <8 x float>, <8 x float>* %39, align 4, !tbaa !7
%40 = fadd fast <8 x float> %wide.load.4, %34
%41 = fadd fast <8 x float> %wide.load5.4, %35
%index.next.4 = add i64 %index, 80
%42 = getelementptr inbounds float, float* %a, i64 %index.next.4
%43 = bitcast float* %42 to <8 x float>*
%wide.load.5 = load <8 x float>, <8 x float>* %43, align 4, !tbaa !7
%44 = getelementptr float, float* %42, i64 8
%45 = bitcast float* %44 to <8 x float>*
%wide.load5.5 = load <8 x float>, <8 x float>* %45, align 4, !tbaa !7
%46 = fadd fast <8 x float> %wide.load.5, %40
%47 = fadd fast <8 x float> %wide.load5.5, %41
%index.next.5 = add i64 %index, 96
%48 = getelementptr inbounds float, float* %a, i64 %index.next.5
%49 = bitcast float* %48 to <8 x float>*
%wide.load.6 = load <8 x float>, <8 x float>* %49, align 4, !tbaa !7
%50 = getelementptr float, float* %48, i64 8
%51 = bitcast float* %50 to <8 x float>*
%wide.load5.6 = load <8 x float>, <8 x float>* %51, align 4, !tbaa !7
%52 = fadd fast <8 x float> %wide.load.6, %46
%53 = fadd fast <8 x float> %wide.load5.6, %47
%index.next.6 = add i64 %index, 112
%54 = getelementptr inbounds float, float* %a, i64 %index.next.6
%55 = bitcast float* %54 to <8 x float>*
%wide.load.7 = load <8 x float>, <8 x float>* %55, align 4, !tbaa !7
%56 = getelementptr float, float* %54, i64 8
%57 = bitcast float* %56 to <8 x float>*
%wide.load5.7 = load <8 x float>, <8 x float>* %57, align 4, !tbaa !7
%58 = fadd fast <8 x float> %wide.load.7, %52
%59 = fadd fast <8 x float> %wide.load5.7, %53
%index.next.7 = add i64 %index, 128
%60 = icmp eq i64 %index.next.7, %n.vec
br i1 %60, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !11
middle.block.unr-lcssa: ; preds = %vector.body
%.lcssa17 = phi <8 x float> [ %59, %vector.body ]
%.lcssa16 = phi <8 x float> [ %58, %vector.body ]
br label %middle.block
; Reduction: fold both accumulators, then a shuffle tree reduces the
; 8 lanes to the scalar partial sum %61.
middle.block: ; preds = %vector.body.preheader.split, %middle.block.unr-lcssa
%.lcssa15 = phi <8 x float> [ %.lcssa15.unr, %vector.body.preheader.split ], [ %.lcssa17, %middle.block.unr-lcssa ]
%.lcssa14 = phi <8 x float> [ %.lcssa14.unr, %vector.body.preheader.split ], [ %.lcssa16, %middle.block.unr-lcssa ]
%bin.rdx = fadd fast <8 x float> %.lcssa15, %.lcssa14
%rdx.shuf = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = fadd fast <8 x float> %bin.rdx, %rdx.shuf
%rdx.shuf9 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx10 = fadd fast <8 x float> %bin.rdx8, %rdx.shuf9
%rdx.shuf11 = shufflevector <8 x float> %bin.rdx10, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx12 = fadd fast <8 x float> %bin.rdx10, %rdx.shuf11
%61 = extractelement <8 x float> %bin.rdx12, i32 0
%cmp.n = icmp eq i64 %n.vec, %n
br i1 %cmp.n, label %._crit_edge, label %.lr.ph.preheader13
._crit_edge.loopexit: ; preds = %.lr.ph
%.lcssa = phi float [ %64, %.lr.ph ]
br label %._crit_edge
; Exit: the empty inline asm is the source-level compiler barrier.
._crit_edge: ; preds = %._crit_edge.loopexit, %middle.block, %0
%s.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %61, %middle.block ], [ %.lcssa, %._crit_edge.loopexit ]
tail call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() #4, !srcloc !14
ret float %s.0.lcssa
; Scalar tail: handles the last n mod 16 elements one float at a time.
.lr.ph: ; preds = %.lr.ph.preheader13, %.lr.ph
%i.02 = phi i64 [ %65, %.lr.ph ], [ %i.02.ph, %.lr.ph.preheader13 ]
%s.01 = phi float [ %64, %.lr.ph ], [ %s.01.ph, %.lr.ph.preheader13 ]
%62 = getelementptr inbounds float, float* %a, i64 %i.02
%63 = load float, float* %62, align 4, !tbaa !7
%64 = fadd fast float %63, %s.01
%65 = add nuw i64 %i.02, 1
%exitcond = icmp eq i64 %65, %n
br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph, !llvm.loop !15
}
# ======================================================================
# sum32 -- LLVM 3.8 output (AT&T syntax, SysV AMD64).
# In:   %rdi = a (float*), %rsi = n
# Out:  %xmm0 = sum
# Unlike the 3.7 version above, this uses only TWO <8 x float>
# accumulators (%ymm0/%ymm1, 16 floats per vector iteration) with the
# main loop .LBB1_11 unrolled 8x (128 floats per trip); a min-iters
# guard sends n <= 15 straight to the scalar loop .LBB1_3.
# NOTE(review): pasted compiler output kept for codegen comparison --
# do not hand-edit the instructions.
# ======================================================================
sum32: # @sum32
.cfi_startproc
# BB#0:
vxorps %xmm0, %xmm0, %xmm0
testq %rsi, %rsi
je .LBB1_13
# BB#1: # %.lr.ph.preheader
vxorps %xmm0, %xmm0, %xmm0
xorl %ecx, %ecx
cmpq $15, %rsi
jbe .LBB1_2
# BB#4: # %min.iters.checked
xorl %ecx, %ecx
movq %rsi, %rax
andq $-16, %rax
je .LBB1_2
# BB#5: # %vector.body.preheader
# ecx = ((n-16)>>4)+1 vector iterations; low 3 bits = prologue count.
leaq -16(%rsi), %r8
movl %r8d, %ecx
shrl $4, %ecx
addl $1, %ecx
xorl %edx, %edx
testb $7, %cl
je .LBB1_6
# BB#7: # %vector.body.prol.preheader
leal -16(%rsi), %ecx
shrl $4, %ecx
addl $1, %ecx
andl $7, %ecx
negq %rcx
vxorps %ymm0, %ymm0, %ymm0
xorl %edx, %edx
vxorps %ymm1, %ymm1, %ymm1
.align 16, 0x90
# Prologue loop: one 16-float step per trip until rcx reaches zero.
.LBB1_8: # %vector.body.prol
# =>This Inner Loop Header: Depth=1
vaddps (%rdi,%rdx,4), %ymm0, %ymm0
vaddps 32(%rdi,%rdx,4), %ymm1, %ymm1
addq $16, %rdx
addq $1, %rcx
jne .LBB1_8
jmp .LBB1_9
.LBB1_6:
vxorps %ymm0, %ymm0, %ymm0
vxorps %ymm1, %ymm1, %ymm1
.LBB1_9: # %vector.body.preheader.split
cmpq $112, %r8
jb .LBB1_12
# BB#10: # %vector.body.preheader.split.split
movq %rsi, %rcx
andq $-16, %rcx
subq %rdx, %rcx
leaq 480(%rdi,%rdx,4), %rdx
.align 16, 0x90
# Main loop: 8x unrolled, 16 vaddps = 128 floats per trip, but only
# two accumulators, so adds into each ymm form a serial dependency chain.
.LBB1_11: # %vector.body
# =>This Inner Loop Header: Depth=1
vaddps -480(%rdx), %ymm0, %ymm0
vaddps -448(%rdx), %ymm1, %ymm1
vaddps -416(%rdx), %ymm0, %ymm0
vaddps -384(%rdx), %ymm1, %ymm1
vaddps -352(%rdx), %ymm0, %ymm0
vaddps -320(%rdx), %ymm1, %ymm1
vaddps -288(%rdx), %ymm0, %ymm0
vaddps -256(%rdx), %ymm1, %ymm1
vaddps -224(%rdx), %ymm0, %ymm0
vaddps -192(%rdx), %ymm1, %ymm1
vaddps -160(%rdx), %ymm0, %ymm0
vaddps -128(%rdx), %ymm1, %ymm1
vaddps -96(%rdx), %ymm0, %ymm0
vaddps -64(%rdx), %ymm1, %ymm1
vaddps -32(%rdx), %ymm0, %ymm0
vaddps (%rdx), %ymm1, %ymm1
addq $512, %rdx # imm = 0x200
addq $-128, %rcx
jne .LBB1_11
# Reduction: fold both accumulators, then reduce 8 lanes to a scalar.
.LBB1_12: # %middle.block
vaddps %ymm0, %ymm1, %ymm0
vextractf128 $1, %ymm0, %xmm1
vaddps %ymm1, %ymm0, %ymm0
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddps %ymm1, %ymm0, %ymm0
vhaddps %ymm0, %ymm0, %ymm0
movq %rax, %rcx
cmpq %rsi, %rax
je .LBB1_13
.LBB1_2: # %.lr.ph.preheader13
leaq (%rdi,%rcx,4), %rax
subq %rcx, %rsi
.align 16, 0x90
# Scalar loop: also the sole path for n <= 15 (rcx = 0 in that case).
.LBB1_3: # %.lr.ph
# =>This Inner Loop Header: Depth=1
vaddss (%rax), %xmm0, %xmm0
addq $4, %rax
addq $-1, %rsi
jne .LBB1_3
.LBB1_13: # %._crit_edge
#APP
#NO_APP
vzeroupper
retq
.Lfunc_end1:
.size sum32, .Lfunc_end1-sum32
.cfi_endproc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment