-
-
Save yuyichao/5b07f71c1f19248ec5511d758532a4b0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- llvm37.s 2016-06-04 13:23:34.947819989 -0400 | |
+++ llvm38.s 2016-06-04 13:14:25.455283889 -0400 | |
@@ -4,103 +4,95 @@ | |
vxorps %xmm0, %xmm0, %xmm0 | |
testq %rsi, %rsi | |
je .LBB1_13 | |
-# BB#1: # %overflow.checked | |
+# BB#1: # %.lr.ph.preheader | |
+ vxorps %xmm0, %xmm0, %xmm0 | |
+ xorl %ecx, %ecx | |
+ cmpq $15, %rsi | |
+ jbe .LBB1_2 | |
+# BB#4: # %min.iters.checked | |
xorl %ecx, %ecx | |
movq %rsi, %rax | |
- vxorps %ymm0, %ymm0, %ymm0 | |
- vxorps %ymm1, %ymm1, %ymm1 | |
- vxorps %ymm2, %ymm2, %ymm2 | |
- vxorps %ymm3, %ymm3, %ymm3 | |
- andq $-32, %rax | |
- je .LBB1_10 | |
-# BB#2: # %vector.body.preheader | |
- leaq -32(%rsi), %r8 | |
+ andq $-16, %rax | |
+ je .LBB1_2 | |
+# BB#5: # %vector.body.preheader | |
+ leaq -16(%rsi), %r8 | |
movl %r8d, %ecx | |
- shrl $5, %ecx | |
+ shrl $4, %ecx | |
addl $1, %ecx | |
xorl %edx, %edx | |
- testb $3, %cl | |
- je .LBB1_3 | |
-# BB#4: # %vector.body.prol.preheader | |
- leal -32(%rsi), %ecx | |
- shrl $5, %ecx | |
+ testb $7, %cl | |
+ je .LBB1_6 | |
+# BB#7: # %vector.body.prol.preheader | |
+ leal -16(%rsi), %ecx | |
+ shrl $4, %ecx | |
addl $1, %ecx | |
- andl $3, %ecx | |
+ andl $7, %ecx | |
negq %rcx | |
vxorps %ymm0, %ymm0, %ymm0 | |
xorl %edx, %edx | |
vxorps %ymm1, %ymm1, %ymm1 | |
- vxorps %ymm2, %ymm2, %ymm2 | |
- vxorps %ymm3, %ymm3, %ymm3 | |
.align 16, 0x90 | |
-.LBB1_5: # %vector.body.prol | |
+.LBB1_8: # %vector.body.prol | |
# =>This Inner Loop Header: Depth=1 | |
vaddps (%rdi,%rdx,4), %ymm0, %ymm0 | |
vaddps 32(%rdi,%rdx,4), %ymm1, %ymm1 | |
- vaddps 64(%rdi,%rdx,4), %ymm2, %ymm2 | |
- vaddps 96(%rdi,%rdx,4), %ymm3, %ymm3 | |
- addq $32, %rdx | |
+ addq $16, %rdx | |
addq $1, %rcx | |
- jne .LBB1_5 | |
- jmp .LBB1_6 | |
-.LBB1_3: | |
+ jne .LBB1_8 | |
+ jmp .LBB1_9 | |
+.LBB1_6: | |
vxorps %ymm0, %ymm0, %ymm0 | |
vxorps %ymm1, %ymm1, %ymm1 | |
- vxorps %ymm2, %ymm2, %ymm2 | |
- vxorps %ymm3, %ymm3, %ymm3 | |
-.LBB1_6: # %vector.body.preheader.split | |
- cmpq $96, %r8 | |
- jb .LBB1_9 | |
-# BB#7: # %vector.body.preheader.split.split | |
+.LBB1_9: # %vector.body.preheader.split | |
+ cmpq $112, %r8 | |
+ jb .LBB1_12 | |
+# BB#10: # %vector.body.preheader.split.split | |
movq %rsi, %rcx | |
- andq $-32, %rcx | |
+ andq $-16, %rcx | |
subq %rdx, %rcx | |
leaq 480(%rdi,%rdx,4), %rdx | |
.align 16, 0x90 | |
-.LBB1_8: # %vector.body | |
+.LBB1_11: # %vector.body | |
# =>This Inner Loop Header: Depth=1 | |
vaddps -480(%rdx), %ymm0, %ymm0 | |
vaddps -448(%rdx), %ymm1, %ymm1 | |
- vaddps -416(%rdx), %ymm2, %ymm2 | |
- vaddps -384(%rdx), %ymm3, %ymm3 | |
+ vaddps -416(%rdx), %ymm0, %ymm0 | |
+ vaddps -384(%rdx), %ymm1, %ymm1 | |
vaddps -352(%rdx), %ymm0, %ymm0 | |
vaddps -320(%rdx), %ymm1, %ymm1 | |
- vaddps -288(%rdx), %ymm2, %ymm2 | |
- vaddps -256(%rdx), %ymm3, %ymm3 | |
+ vaddps -288(%rdx), %ymm0, %ymm0 | |
+ vaddps -256(%rdx), %ymm1, %ymm1 | |
vaddps -224(%rdx), %ymm0, %ymm0 | |
vaddps -192(%rdx), %ymm1, %ymm1 | |
- vaddps -160(%rdx), %ymm2, %ymm2 | |
- vaddps -128(%rdx), %ymm3, %ymm3 | |
+ vaddps -160(%rdx), %ymm0, %ymm0 | |
+ vaddps -128(%rdx), %ymm1, %ymm1 | |
vaddps -96(%rdx), %ymm0, %ymm0 | |
vaddps -64(%rdx), %ymm1, %ymm1 | |
- vaddps -32(%rdx), %ymm2, %ymm2 | |
- vaddps (%rdx), %ymm3, %ymm3 | |
+ vaddps -32(%rdx), %ymm0, %ymm0 | |
+ vaddps (%rdx), %ymm1, %ymm1 | |
addq $512, %rdx # imm = 0x200 | |
addq $-128, %rcx | |
- jne .LBB1_8 | |
-.LBB1_9: | |
- movq %rax, %rcx | |
-.LBB1_10: # %middle.block | |
+ jne .LBB1_11 | |
+.LBB1_12: # %middle.block | |
vaddps %ymm0, %ymm1, %ymm0 | |
- vaddps %ymm0, %ymm2, %ymm0 | |
- vaddps %ymm0, %ymm3, %ymm0 | |
vextractf128 $1, %ymm0, %xmm1 | |
vaddps %ymm1, %ymm0, %ymm0 | |
- vpermilpd $1, %ymm0, %ymm1 # ymm1 = ymm0[1,0,2,2] | |
+ vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0] | |
vaddps %ymm1, %ymm0, %ymm0 | |
vhaddps %ymm0, %ymm0, %ymm0 | |
- cmpq %rsi, %rcx | |
+ movq %rax, %rcx | |
+ cmpq %rsi, %rax | |
je .LBB1_13 | |
-# BB#11: # %.lr.ph.preheader | |
+.LBB1_2: # %.lr.ph.preheader13 | |
leaq (%rdi,%rcx,4), %rax | |
subq %rcx, %rsi | |
.align 16, 0x90 | |
-.LBB1_12: # %.lr.ph | |
+.LBB1_3: # %.lr.ph | |
# =>This Inner Loop Header: Depth=1 | |
vaddss (%rax), %xmm0, %xmm0 | |
addq $4, %rax | |
addq $-1, %rsi | |
- jne .LBB1_12 | |
+ jne .LBB1_3 | |
.LBB1_13: # %._crit_edge | |
#APP | |
#NO_APP |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <time.h> | |
#include <stdio.h> | |
#include <string.h> | |
uint64_t gettime_ns() | |
{ | |
struct timespec t; | |
clock_gettime(CLOCK_MONOTONIC, &t); | |
return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec; | |
} | |
/*
 * Add the n floats stored at a, in increasing index order, and return the
 * total (0 when n is 0). Marked noinline so it is emitted as its own function.
 */
__attribute__((noinline)) float sum32(float *a, size_t n)
{
    /* a = (float*)__builtin_assume_aligned(a, 64); */
    float total = 0;
    const float *cursor = a;

    for (size_t remaining = n; remaining != 0; --remaining)
        total += *cursor++;

    /* Empty asm statement with a "memory" clobber: compiler-level barrier. */
    __asm__ volatile ("" ::: "memory");
    return total;
}
/*
 * Benchmark driver: allocates a zeroed, 64-byte-aligned array of 1024 floats,
 * calls sum32() on it 1024 * 1024 times, and prints the average time per call
 * in nanoseconds.
 *
 * Returns 0 on success, 1 if the buffer cannot be allocated.
 */
int main()
{
    enum { NUM_ELEMS = 1024, NUM_CALLS = 1024 * 1024 };

    float *p = aligned_alloc(64, sizeof(float) * NUM_ELEMS);
    if (p == NULL) {
        /* memset() on a null pointer would be undefined behaviour. */
        fprintf(stderr, "aligned_alloc failed\n");
        return 1;
    }
    memset(p, 0, sizeof(float) * NUM_ELEMS);

    uint64_t start = gettime_ns();
    for (int i = 0; i < NUM_CALLS; i++)
        sum32(p, NUM_ELEMS);
    /* Take the end timestamp before free() so deallocation is not timed. */
    uint64_t end = gettime_ns();

    free(p);
    printf("%f\n", (end - start) / (1024.0 * 1024.0));
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# sum32.constprop.0 -- sums 1024 floats starting at (%rdi). The element count
# is not read from a register; it appears only as the immediate 1024 (and the
# derived constants 1016..1023 / 127 / 128) below.
# NOTE(review): the ".constprop" suffix and label style look like GCC output
# for a constant-propagated clone of sum32() -- confirm against the build.
#
# In:   %rdi  = pointer to the float array
# Out:  %xmm0 = sum, as a scalar float in the low lane
# Uses: %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, %ymm0-%ymm2, flags
#
# Shape: (1) scalar peel of 0..7 elements, (2) loop adding one 8-float vector
# per iteration, (3) horizontal reduction, (4) scalar tail of 0..7 elements.
sum32.constprop.0: | |
.LFB36: | |
.cfi_startproc | |
# rax = (-(a / 4)) & 7: number of leading floats to peel so that the vector
# loop starts on a 32-byte boundary. Zero means no peel is needed.
movq %rdi, %rax | |
shrq $2, %rax | |
negq %rax | |
andl $7, %eax | |
je .L9 | |
# Peel chain: xmm2 accumulates a[0..peel-1]; each compare exits to the stub
# that records this peel count in %edx and (1024 - peel) in %ecx.
vmovss (%rdi), %xmm2 | |
cmpq $1, %rax | |
je .L10 | |
vaddss 4(%rdi), %xmm2, %xmm2 | |
cmpq $2, %rax | |
je .L11 | |
vaddss 8(%rdi), %xmm2, %xmm2 | |
cmpq $3, %rax | |
je .L12 | |
vaddss 12(%rdi), %xmm2, %xmm2 | |
cmpq $4, %rax | |
je .L13 | |
vaddss 16(%rdi), %xmm2, %xmm2 | |
cmpq $5, %rax | |
je .L14 | |
vaddss 20(%rdi), %xmm2, %xmm2 | |
# Only 6 or 7 remain possible here; anything other than 7 goes to .L15 (6).
cmpq $7, %rax | |
jne .L15 | |
vaddss 24(%rdi), %xmm2, %xmm2 | |
movl $1017, %ecx | |
movl $7, %edx | |
# .L3: common setup for every non-zero peel count.
#   rsi = 1024 - peel (elements left), r8 = 1016 elements handled by the
#   vector loop, r10 = 127 vector iterations.
.L3: | |
movl $1024, %esi | |
movl $1016, %r8d | |
movl $127, %r10d | |
subq %rax, %rsi | |
# .L2: r9 = &a[peel]; clear the vector accumulator and the iteration counter.
.L2: | |
leaq (%rdi,%rax,4), %r9 | |
vxorps %xmm1, %xmm1, %xmm1 | |
xorl %eax, %eax | |
# .L4: vector loop -- ymm1 += 8 floats at (r9); runs r10 times.
.L4: | |
addq $1, %rax | |
vaddps (%r9), %ymm1, %ymm1 | |
addq $32, %r9 | |
cmpq %r10, %rax | |
jb .L4 | |
# Horizontal reduction of ymm1 (two vhaddps, then add the swapped 128-bit
# halves), interleaved with the tail bookkeeping:
#   rax = peel + r8  -> index of the first tail element
#   rdx = rcx - r8   -> number of tail elements
vhaddps %ymm1, %ymm1, %ymm1 | |
leaq (%rdx,%r8), %rax | |
movq %rcx, %rdx | |
subq %r8, %rdx | |
vhaddps %ymm1, %ymm1, %ymm0 | |
vperm2f128 $1, %ymm0, %ymm0, %ymm1 | |
vaddps %ymm0, %ymm1, %ymm0 | |
# Fold in the peeled prefix sum held in xmm2.
vaddss %xmm2, %xmm0, %xmm0 | |
# No tail when the vector loop consumed everything that was left.
cmpq %r8, %rsi | |
je .L7 | |
# Tail chain: add a[rax], a[rax+1], ... stopping once rdx elements are done
# (at most 7).
vaddss (%rdi,%rax,4), %xmm0, %xmm0 | |
leaq 1(%rax), %rcx | |
cmpq $1, %rdx | |
je .L7 | |
vaddss (%rdi,%rcx,4), %xmm0, %xmm0 | |
leaq 2(%rax), %rcx | |
cmpq $2, %rdx | |
je .L7 | |
vaddss (%rdi,%rcx,4), %xmm0, %xmm0 | |
leaq 3(%rax), %rcx | |
cmpq $3, %rdx | |
je .L7 | |
vaddss (%rdi,%rcx,4), %xmm0, %xmm0 | |
leaq 4(%rax), %rcx | |
cmpq $4, %rdx | |
je .L7 | |
vaddss (%rdi,%rcx,4), %xmm0, %xmm0 | |
leaq 5(%rax), %rcx | |
cmpq $5, %rdx | |
je .L7 | |
vaddss (%rdi,%rcx,4), %xmm0, %xmm0 | |
addq $6, %rax | |
cmpq $6, %rdx | |
je .L7 | |
vaddss (%rdi,%rax,4), %xmm0, %xmm0 | |
# .L7: common exit; result is in xmm0.
.L7: | |
vzeroupper | |
ret | |
.p2align 4,,10 | |
.p2align 3 | |
# .L9: peel == 0. All 1024 elements go through the vector loop
# (r8 = 1024, r10 = 128 iterations); prefix sum xmm2 = 0, rdx = 0.
.L9: | |
movl $1024, %esi | |
movl $1024, %ecx | |
vxorps %xmm2, %xmm2, %xmm2 | |
xorl %edx, %edx | |
movl $1024, %r8d | |
movl $128, %r10d | |
jmp .L2 | |
.p2align 4,,10 | |
.p2align 3 | |
# Stubs below: each records edx = peel count and ecx = 1024 - peel, then
# joins the common setup at .L3.
# .L15: peel == 6
.L15: | |
movl $1018, %ecx | |
movl $6, %edx | |
jmp .L3 | |
.p2align 4,,10 | |
.p2align 3 | |
# .L10: peel == 1
.L10: | |
movl $1023, %ecx | |
movl $1, %edx | |
jmp .L3 | |
.p2align 4,,10 | |
.p2align 3 | |
# .L11: peel == 2
.L11: | |
movl $2, %edx | |
movl $1022, %ecx | |
jmp .L3 | |
.p2align 4,,10 | |
.p2align 3 | |
# .L12: peel == 3
.L12: | |
movl $3, %edx | |
movl $1021, %ecx | |
jmp .L3 | |
.p2align 4,,10 | |
.p2align 3 | |
# .L13: peel == 4
.L13: | |
movl $4, %edx | |
movl $1020, %ecx | |
jmp .L3 | |
.p2align 4,,10 | |
.p2align 3 | |
# .L14: peel == 5
.L14: | |
movl $5, %edx | |
movl $1019, %ecx | |
jmp .L3 | |
.cfi_endproc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; @sum32(%a, %n): returns the sum of the %n floats at %a (0.0 when %n == 0).
;
; Structure visible below:
;   * the vectorised path keeps four <8 x float> accumulators, so one vector
;     step consumes 32 elements; %n.vec = %n rounded down to a multiple of 32
;   * vector.body.prol runs (number of vector steps) mod 4 single steps
;   * vector.body is that step repeated four times (128 elements per trip)
;   * middle.block reduces the accumulators to one scalar
;   * .lr.ph adds the remaining %n - %n.vec elements one at a time
; Every fadd carries the "fast" flag.
; Function Attrs: noinline nounwind uwtable | |
define float @sum32(float* nocapture readonly %a, i64 %n) #3 { | |
; Entry: nothing to add when %n is zero.
%1 = icmp eq i64 %n, 0 | |
br i1 %1, label %._crit_edge, label %overflow.checked | |
; %n.vec = %n & -32; when it is zero (%n < 32) go straight to middle.block
; with zero accumulators and let the scalar loop do all the work.
overflow.checked: ; preds = %0 | |
%n.vec = and i64 %n, -32 | |
%cmp.zero = icmp eq i64 %n.vec, 0 | |
br i1 %cmp.zero, label %middle.block, label %vector.body.preheader | |
; %4 = ((%n - 32) >> 5) + 1 = number of 32-element vector steps;
; %xtraiter = %4 & 3 = steps to run in the prologue loop.
vector.body.preheader: ; preds = %overflow.checked | |
%2 = add i64 %n, -32 | |
%3 = lshr i64 %2, 5 | |
%4 = add nuw nsw i64 %3, 1 | |
%xtraiter = and i64 %4, 3 | |
%lcmp.mod = icmp eq i64 %xtraiter, 0 | |
br i1 %lcmp.mod, label %vector.body.preheader.split, label %vector.body.prol.preheader | |
vector.body.prol.preheader: ; preds = %vector.body.preheader | |
br label %vector.body.prol | |
; Prologue loop: one 32-element step per trip (four 8-float loads, one per
; accumulator); %prol.iter counts down from %xtraiter to 0.
vector.body.prol: ; preds = %vector.body.prol.preheader, %vector.body.prol | |
%index.prol = phi i64 [ %index.next.prol, %vector.body.prol ], [ 0, %vector.body.prol.preheader ] | |
%vec.phi.prol = phi <8 x float> [ %13, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ] | |
%vec.phi6.prol = phi <8 x float> [ %14, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ] | |
%vec.phi7.prol = phi <8 x float> [ %15, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ] | |
%vec.phi8.prol = phi <8 x float> [ %16, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ] | |
%prol.iter = phi i64 [ %prol.iter.sub, %vector.body.prol ], [ %xtraiter, %vector.body.prol.preheader ] | |
%5 = getelementptr inbounds float, float* %a, i64 %index.prol | |
%6 = bitcast float* %5 to <8 x float>* | |
%wide.load.prol = load <8 x float>, <8 x float>* %6, align 4, !tbaa !7 | |
%7 = getelementptr float, float* %5, i64 8 | |
%8 = bitcast float* %7 to <8 x float>* | |
%wide.load9.prol = load <8 x float>, <8 x float>* %8, align 4, !tbaa !7 | |
%9 = getelementptr float, float* %5, i64 16 | |
%10 = bitcast float* %9 to <8 x float>* | |
%wide.load10.prol = load <8 x float>, <8 x float>* %10, align 4, !tbaa !7 | |
%11 = getelementptr float, float* %5, i64 24 | |
%12 = bitcast float* %11 to <8 x float>* | |
%wide.load11.prol = load <8 x float>, <8 x float>* %12, align 4, !tbaa !7 | |
%13 = fadd fast <8 x float> %wide.load.prol, %vec.phi.prol | |
%14 = fadd fast <8 x float> %wide.load9.prol, %vec.phi6.prol | |
%15 = fadd fast <8 x float> %wide.load10.prol, %vec.phi7.prol | |
%16 = fadd fast <8 x float> %wide.load11.prol, %vec.phi8.prol | |
%index.next.prol = add i64 %index.prol, 32 | |
%prol.iter.sub = add i64 %prol.iter, -1 | |
%prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0 | |
br i1 %prol.iter.cmp, label %vector.body.preheader.split.loopexit, label %vector.body.prol, !llvm.loop !9 | |
; Single-entry phis carrying the prologue results out of the loop.
vector.body.preheader.split.loopexit: ; preds = %vector.body.prol | |
%index.next.prol.lcssa = phi i64 [ %index.next.prol, %vector.body.prol ] | |
%.lcssa35 = phi <8 x float> [ %16, %vector.body.prol ] | |
%.lcssa34 = phi <8 x float> [ %15, %vector.body.prol ] | |
%.lcssa33 = phi <8 x float> [ %14, %vector.body.prol ] | |
%.lcssa32 = phi <8 x float> [ %13, %vector.body.prol ] | |
br label %vector.body.preheader.split | |
; Join of "prologue skipped" and "prologue done". The *.unr phis feed the
; unrolled loop. The %.lcssaNN.unr phis are the values used if the unrolled
; loop is skipped; their undef incoming is for the edge where %xtraiter == 0,
; i.e. at least four steps exist, so the unrolled loop cannot be skipped.
; %2 < 96 means fewer than four vector steps in total.
vector.body.preheader.split: ; preds = %vector.body.preheader.split.loopexit, %vector.body.preheader | |
%.lcssa27.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa35, %vector.body.preheader.split.loopexit ] | |
%.lcssa26.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa34, %vector.body.preheader.split.loopexit ] | |
%.lcssa25.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa33, %vector.body.preheader.split.loopexit ] | |
%.lcssa24.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa32, %vector.body.preheader.split.loopexit ] | |
%index.unr = phi i64 [ 0, %vector.body.preheader ], [ %index.next.prol.lcssa, %vector.body.preheader.split.loopexit ] | |
%vec.phi.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa32, %vector.body.preheader.split.loopexit ] | |
%vec.phi6.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa33, %vector.body.preheader.split.loopexit ] | |
%vec.phi7.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa34, %vector.body.preheader.split.loopexit ] | |
%vec.phi8.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa35, %vector.body.preheader.split.loopexit ] | |
%17 = icmp ult i64 %2, 96 | |
br i1 %17, label %middle.block.loopexit, label %vector.body.preheader.split.split | |
vector.body.preheader.split.split: ; preds = %vector.body.preheader.split | |
br label %vector.body | |
; Main loop: four consecutive 32-element steps per trip (offsets +0, +32,
; +64, +96 from %index), i.e. 128 elements; exits when the index reaches
; %n.vec.
vector.body: ; preds = %vector.body, %vector.body.preheader.split.split | |
%index = phi i64 [ %index.unr, %vector.body.preheader.split.split ], [ %index.next.3, %vector.body ] | |
%vec.phi = phi <8 x float> [ %vec.phi.unr, %vector.body.preheader.split.split ], [ %62, %vector.body ] | |
%vec.phi6 = phi <8 x float> [ %vec.phi6.unr, %vector.body.preheader.split.split ], [ %63, %vector.body ] | |
%vec.phi7 = phi <8 x float> [ %vec.phi7.unr, %vector.body.preheader.split.split ], [ %64, %vector.body ] | |
%vec.phi8 = phi <8 x float> [ %vec.phi8.unr, %vector.body.preheader.split.split ], [ %65, %vector.body ] | |
%18 = getelementptr inbounds float, float* %a, i64 %index | |
%19 = bitcast float* %18 to <8 x float>* | |
%wide.load = load <8 x float>, <8 x float>* %19, align 4, !tbaa !7 | |
%20 = getelementptr float, float* %18, i64 8 | |
%21 = bitcast float* %20 to <8 x float>* | |
%wide.load9 = load <8 x float>, <8 x float>* %21, align 4, !tbaa !7 | |
%22 = getelementptr float, float* %18, i64 16 | |
%23 = bitcast float* %22 to <8 x float>* | |
%wide.load10 = load <8 x float>, <8 x float>* %23, align 4, !tbaa !7 | |
%24 = getelementptr float, float* %18, i64 24 | |
%25 = bitcast float* %24 to <8 x float>* | |
%wide.load11 = load <8 x float>, <8 x float>* %25, align 4, !tbaa !7 | |
%26 = fadd fast <8 x float> %wide.load, %vec.phi | |
%27 = fadd fast <8 x float> %wide.load9, %vec.phi6 | |
%28 = fadd fast <8 x float> %wide.load10, %vec.phi7 | |
%29 = fadd fast <8 x float> %wide.load11, %vec.phi8 | |
; second step (elements %index+32 .. %index+63)
%index.next = add i64 %index, 32 | |
%30 = getelementptr inbounds float, float* %a, i64 %index.next | |
%31 = bitcast float* %30 to <8 x float>* | |
%wide.load.1 = load <8 x float>, <8 x float>* %31, align 4, !tbaa !7 | |
%32 = getelementptr float, float* %30, i64 8 | |
%33 = bitcast float* %32 to <8 x float>* | |
%wide.load9.1 = load <8 x float>, <8 x float>* %33, align 4, !tbaa !7 | |
%34 = getelementptr float, float* %30, i64 16 | |
%35 = bitcast float* %34 to <8 x float>* | |
%wide.load10.1 = load <8 x float>, <8 x float>* %35, align 4, !tbaa !7 | |
%36 = getelementptr float, float* %30, i64 24 | |
%37 = bitcast float* %36 to <8 x float>* | |
%wide.load11.1 = load <8 x float>, <8 x float>* %37, align 4, !tbaa !7 | |
%38 = fadd fast <8 x float> %wide.load.1, %26 | |
%39 = fadd fast <8 x float> %wide.load9.1, %27 | |
%40 = fadd fast <8 x float> %wide.load10.1, %28 | |
%41 = fadd fast <8 x float> %wide.load11.1, %29 | |
; third step (elements %index+64 .. %index+95)
%index.next.1 = add i64 %index, 64 | |
%42 = getelementptr inbounds float, float* %a, i64 %index.next.1 | |
%43 = bitcast float* %42 to <8 x float>* | |
%wide.load.2 = load <8 x float>, <8 x float>* %43, align 4, !tbaa !7 | |
%44 = getelementptr float, float* %42, i64 8 | |
%45 = bitcast float* %44 to <8 x float>* | |
%wide.load9.2 = load <8 x float>, <8 x float>* %45, align 4, !tbaa !7 | |
%46 = getelementptr float, float* %42, i64 16 | |
%47 = bitcast float* %46 to <8 x float>* | |
%wide.load10.2 = load <8 x float>, <8 x float>* %47, align 4, !tbaa !7 | |
%48 = getelementptr float, float* %42, i64 24 | |
%49 = bitcast float* %48 to <8 x float>* | |
%wide.load11.2 = load <8 x float>, <8 x float>* %49, align 4, !tbaa !7 | |
%50 = fadd fast <8 x float> %wide.load.2, %38 | |
%51 = fadd fast <8 x float> %wide.load9.2, %39 | |
%52 = fadd fast <8 x float> %wide.load10.2, %40 | |
%53 = fadd fast <8 x float> %wide.load11.2, %41 | |
; fourth step (elements %index+96 .. %index+127)
%index.next.2 = add i64 %index, 96 | |
%54 = getelementptr inbounds float, float* %a, i64 %index.next.2 | |
%55 = bitcast float* %54 to <8 x float>* | |
%wide.load.3 = load <8 x float>, <8 x float>* %55, align 4, !tbaa !7 | |
%56 = getelementptr float, float* %54, i64 8 | |
%57 = bitcast float* %56 to <8 x float>* | |
%wide.load9.3 = load <8 x float>, <8 x float>* %57, align 4, !tbaa !7 | |
%58 = getelementptr float, float* %54, i64 16 | |
%59 = bitcast float* %58 to <8 x float>* | |
%wide.load10.3 = load <8 x float>, <8 x float>* %59, align 4, !tbaa !7 | |
%60 = getelementptr float, float* %54, i64 24 | |
%61 = bitcast float* %60 to <8 x float>* | |
%wide.load11.3 = load <8 x float>, <8 x float>* %61, align 4, !tbaa !7 | |
%62 = fadd fast <8 x float> %wide.load.3, %50 | |
%63 = fadd fast <8 x float> %wide.load9.3, %51 | |
%64 = fadd fast <8 x float> %wide.load10.3, %52 | |
%65 = fadd fast <8 x float> %wide.load11.3, %53 | |
%index.next.3 = add i64 %index, 128 | |
%66 = icmp eq i64 %index.next.3, %n.vec | |
br i1 %66, label %middle.block.loopexit.unr-lcssa, label %vector.body, !llvm.loop !11 | |
; Single-entry phis carrying the four accumulators out of the main loop.
middle.block.loopexit.unr-lcssa: ; preds = %vector.body | |
%.lcssa31 = phi <8 x float> [ %65, %vector.body ] | |
%.lcssa30 = phi <8 x float> [ %64, %vector.body ] | |
%.lcssa29 = phi <8 x float> [ %63, %vector.body ] | |
%.lcssa28 = phi <8 x float> [ %62, %vector.body ] | |
br label %middle.block.loopexit | |
; Select the accumulators from whichever path ran last (prologue only, or the
; unrolled loop).
middle.block.loopexit: ; preds = %vector.body.preheader.split, %middle.block.loopexit.unr-lcssa | |
%.lcssa27 = phi <8 x float> [ %.lcssa27.unr, %vector.body.preheader.split ], [ %.lcssa31, %middle.block.loopexit.unr-lcssa ] | |
%.lcssa26 = phi <8 x float> [ %.lcssa26.unr, %vector.body.preheader.split ], [ %.lcssa30, %middle.block.loopexit.unr-lcssa ] | |
%.lcssa25 = phi <8 x float> [ %.lcssa25.unr, %vector.body.preheader.split ], [ %.lcssa29, %middle.block.loopexit.unr-lcssa ] | |
%.lcssa24 = phi <8 x float> [ %.lcssa24.unr, %vector.body.preheader.split ], [ %.lcssa28, %middle.block.loopexit.unr-lcssa ] | |
br label %middle.block | |
; Reduction: add the four accumulators together, then fold the vector in
; halves (lanes 4-7 onto 0-3, 2-3 onto 0-1, 1 onto 0) and take lane 0.
; %resume.val is where the scalar loop starts; if it already equals %n the
; function is done.
middle.block: ; preds = %middle.block.loopexit, %overflow.checked | |
%resume.val = phi i64 [ 0, %overflow.checked ], [ %n.vec, %middle.block.loopexit ] | |
%rdx.vec.exit.phi = phi <8 x float> [ zeroinitializer, %overflow.checked ], [ %.lcssa24, %middle.block.loopexit ] | |
%rdx.vec.exit.phi14 = phi <8 x float> [ zeroinitializer, %overflow.checked ], [ %.lcssa25, %middle.block.loopexit ] | |
%rdx.vec.exit.phi15 = phi <8 x float> [ zeroinitializer, %overflow.checked ], [ %.lcssa26, %middle.block.loopexit ] | |
%rdx.vec.exit.phi16 = phi <8 x float> [ zeroinitializer, %overflow.checked ], [ %.lcssa27, %middle.block.loopexit ] | |
%bin.rdx = fadd fast <8 x float> %rdx.vec.exit.phi14, %rdx.vec.exit.phi | |
%bin.rdx17 = fadd fast <8 x float> %rdx.vec.exit.phi15, %bin.rdx | |
%bin.rdx18 = fadd fast <8 x float> %rdx.vec.exit.phi16, %bin.rdx17 | |
%rdx.shuf = shufflevector <8 x float> %bin.rdx18, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> | |
%bin.rdx19 = fadd fast <8 x float> %bin.rdx18, %rdx.shuf | |
%rdx.shuf20 = shufflevector <8 x float> %bin.rdx19, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | |
%bin.rdx21 = fadd fast <8 x float> %bin.rdx19, %rdx.shuf20 | |
%rdx.shuf22 = shufflevector <8 x float> %bin.rdx21, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | |
%bin.rdx23 = fadd fast <8 x float> %bin.rdx21, %rdx.shuf22 | |
%67 = extractelement <8 x float> %bin.rdx23, i32 0 | |
%cmp.n = icmp eq i64 %resume.val, %n | |
br i1 %cmp.n, label %._crit_edge, label %.lr.ph.preheader | |
.lr.ph.preheader: ; preds = %middle.block | |
br label %.lr.ph | |
._crit_edge.loopexit: ; preds = %.lr.ph | |
%.lcssa = phi float [ %70, %.lr.ph ] | |
br label %._crit_edge | |
; Common exit: pick the result for the path taken, execute the empty inline
; asm (memory clobber) and return.
._crit_edge: ; preds = %._crit_edge.loopexit, %middle.block, %0 | |
%s.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %67, %middle.block ], [ %.lcssa, %._crit_edge.loopexit ] | |
tail call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() #1, !srcloc !14 | |
ret float %s.0.lcssa | |
; Scalar remainder loop: adds a[i] for i = %resume.val .. %n-1 onto the
; reduced vector sum.
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph | |
%i.02 = phi i64 [ %71, %.lr.ph ], [ %resume.val, %.lr.ph.preheader ] | |
%s.01 = phi float [ %70, %.lr.ph ], [ %67, %.lr.ph.preheader ] | |
%68 = getelementptr inbounds float, float* %a, i64 %i.02 | |
%69 = load float, float* %68, align 4, !tbaa !7 | |
%70 = fadd fast float %69, %s.01 | |
%71 = add nuw i64 %i.02, 1 | |
%exitcond = icmp eq i64 %71, %n | |
br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph, !llvm.loop !15 | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# sum32 -- returns the sum of %rsi floats starting at (%rdi).
#
# In:   %rdi  = pointer to the float array, %rsi = element count (n)
# Out:  %xmm0 = sum, as a scalar float in the low lane (0.0 when n == 0)
# Uses: %rax, %rcx, %rdx, %r8, %rsi, %ymm0-%ymm3, flags
#
# Shape: four 8-float accumulators (ymm0..ymm3), so one vector step covers 32
# elements. A prologue loop runs (steps mod 4) single steps, the main loop
# runs four steps (128 elements) per trip, then the accumulators are reduced
# and a scalar loop adds the n - (n & -32) leftover elements.
sum32: # @sum32 | |
.cfi_startproc | |
# BB#0: | |
# Result defaults to 0.0; return immediately when n == 0.
vxorps %xmm0, %xmm0, %xmm0 | |
testq %rsi, %rsi | |
je .LBB1_13 | |
# BB#1: # %overflow.checked | |
# rcx = 0 is the scalar-loop start index used when the vector path is skipped.
# rax = n & -32 (elements the vector path covers); zero means n < 32.
xorl %ecx, %ecx | |
movq %rsi, %rax | |
vxorps %ymm0, %ymm0, %ymm0 | |
vxorps %ymm1, %ymm1, %ymm1 | |
vxorps %ymm2, %ymm2, %ymm2 | |
vxorps %ymm3, %ymm3, %ymm3 | |
andq $-32, %rax | |
je .LBB1_10 | |
# BB#2: # %vector.body.preheader | |
# r8 = n - 32 (kept for the check at .LBB1_6); ecx = ((n - 32) >> 5) + 1 is
# the vector step count (computed in 32 bits). Low two bits clear means no
# prologue steps are needed.
leaq -32(%rsi), %r8 | |
movl %r8d, %ecx | |
shrl $5, %ecx | |
addl $1, %ecx | |
xorl %edx, %edx | |
testb $3, %cl | |
je .LBB1_3 | |
# BB#4: # %vector.body.prol.preheader | |
# rcx = -(step count & 3): negative counter that the prologue loop counts up
# to zero. rdx = element index, starting at 0.
leal -32(%rsi), %ecx | |
shrl $5, %ecx | |
addl $1, %ecx | |
andl $3, %ecx | |
negq %rcx | |
vxorps %ymm0, %ymm0, %ymm0 | |
xorl %edx, %edx | |
vxorps %ymm1, %ymm1, %ymm1 | |
vxorps %ymm2, %ymm2, %ymm2 | |
vxorps %ymm3, %ymm3, %ymm3 | |
.align 16, 0x90 | |
.LBB1_5: # %vector.body.prol | |
# =>This Inner Loop Header: Depth=1 | |
# One 32-element step: each accumulator takes one 8-float vector.
vaddps (%rdi,%rdx,4), %ymm0, %ymm0 | |
vaddps 32(%rdi,%rdx,4), %ymm1, %ymm1 | |
vaddps 64(%rdi,%rdx,4), %ymm2, %ymm2 | |
vaddps 96(%rdi,%rdx,4), %ymm3, %ymm3 | |
addq $32, %rdx | |
addq $1, %rcx | |
jne .LBB1_5 | |
jmp .LBB1_6 | |
# No prologue needed: start the main loop with cleared accumulators.
.LBB1_3: | |
vxorps %ymm0, %ymm0, %ymm0 | |
vxorps %ymm1, %ymm1, %ymm1 | |
vxorps %ymm2, %ymm2, %ymm2 | |
vxorps %ymm3, %ymm3, %ymm3 | |
.LBB1_6: # %vector.body.preheader.split | |
# n - 32 < 96 means fewer than four vector steps in total, all of which the
# prologue already handled; skip the unrolled loop.
cmpq $96, %r8 | |
jb .LBB1_9 | |
# BB#7: # %vector.body.preheader.split.split | |
# rcx = (n & -32) - rdx = elements left for the unrolled loop.
# rdx becomes a pointer biased by +480 bytes so the sixteen loads below use
# displacements -480 .. 0.
movq %rsi, %rcx | |
andq $-32, %rcx | |
subq %rdx, %rcx | |
leaq 480(%rdi,%rdx,4), %rdx | |
.align 16, 0x90 | |
.LBB1_8: # %vector.body | |
# =>This Inner Loop Header: Depth=1 | |
# Four 32-element steps per trip: 16 vector adds, 128 floats (512 bytes).
vaddps -480(%rdx), %ymm0, %ymm0 | |
vaddps -448(%rdx), %ymm1, %ymm1 | |
vaddps -416(%rdx), %ymm2, %ymm2 | |
vaddps -384(%rdx), %ymm3, %ymm3 | |
vaddps -352(%rdx), %ymm0, %ymm0 | |
vaddps -320(%rdx), %ymm1, %ymm1 | |
vaddps -288(%rdx), %ymm2, %ymm2 | |
vaddps -256(%rdx), %ymm3, %ymm3 | |
vaddps -224(%rdx), %ymm0, %ymm0 | |
vaddps -192(%rdx), %ymm1, %ymm1 | |
vaddps -160(%rdx), %ymm2, %ymm2 | |
vaddps -128(%rdx), %ymm3, %ymm3 | |
vaddps -96(%rdx), %ymm0, %ymm0 | |
vaddps -64(%rdx), %ymm1, %ymm1 | |
vaddps -32(%rdx), %ymm2, %ymm2 | |
vaddps (%rdx), %ymm3, %ymm3 | |
addq $512, %rdx # imm = 0x200 | |
addq $-128, %rcx | |
jne .LBB1_8 | |
# Vector path finished: the scalar loop resumes at index n & -32.
.LBB1_9: | |
movq %rax, %rcx | |
.LBB1_10: # %middle.block | |
# Reduce: sum the four accumulators, add the upper 128-bit half onto the
# lower, add the swapped 64-bit pairs, then a horizontal add leaves the total
# in the low lane of xmm0.
vaddps %ymm0, %ymm1, %ymm0 | |
vaddps %ymm0, %ymm2, %ymm0 | |
vaddps %ymm0, %ymm3, %ymm0 | |
vextractf128 $1, %ymm0, %xmm1 | |
vaddps %ymm1, %ymm0, %ymm0 | |
vpermilpd $1, %ymm0, %ymm1 # ymm1 = ymm0[1,0,2,2] | |
vaddps %ymm1, %ymm0, %ymm0 | |
vhaddps %ymm0, %ymm0, %ymm0 | |
# Done when the resume index already equals n.
cmpq %rsi, %rcx | |
je .LBB1_13 | |
# BB#11: # %.lr.ph.preheader | |
# rax = &a[rcx]; rsi = number of leftover elements.
leaq (%rdi,%rcx,4), %rax | |
subq %rcx, %rsi | |
.align 16, 0x90 | |
.LBB1_12: # %.lr.ph | |
# =>This Inner Loop Header: Depth=1 | |
# Scalar remainder loop: one float per iteration.
vaddss (%rax), %xmm0, %xmm0 | |
addq $4, %rax | |
addq $-1, %rsi | |
jne .LBB1_12 | |
.LBB1_13: # %._crit_edge | |
# The #APP/#NO_APP pair brackets an inline-asm region that contains no
# instructions.
#APP | |
#NO_APP | |
vzeroupper | |
retq | |
.Lfunc_end1: | |
.size sum32, .Lfunc_end1-sum32 | |
.cfi_endproc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; Function Attrs: noinline nounwind uwtable | |
; sum32(a, n): adds up the n floats starting at %a using `fadd fast`
; (reassociation allowed) and returns the total; returns 0.0 when n == 0.
; Layout of the function:
;   * vector path  - processes 16 floats per step (two <8 x float> loads into
;                    two independent accumulators); the step loop is unrolled
;                    8x, with a prologue loop that peels (steps mod 8) steps.
;   * middle.block - folds the two accumulators down to one scalar.
;   * .lr.ph       - scalar loop for the n mod 16 leftover elements, and for
;                    every element when n < 16.
define float @sum32(float* nocapture readonly %a, i64 %n) #3 { | |
; Entry: nothing to add when n == 0.
%1 = icmp eq i64 %n, 0 | |
br i1 %1, label %._crit_edge, label %.lr.ph.preheader | |
.lr.ph.preheader: ; preds = %0 | |
; Fewer than 16 elements: skip the vector path entirely.
%min.iters.check = icmp ult i64 %n, 16 | |
br i1 %min.iters.check, label %.lr.ph.preheader13, label %min.iters.checked | |
.lr.ph.preheader13: ; preds = %middle.block, %min.iters.checked, %.lr.ph.preheader | |
; Scalar-loop preheader. Start index / running sum are (0, 0.0) when the
; vector path was skipped, or (%n.vec, %61) when resuming after it.
%i.02.ph = phi i64 [ 0, %min.iters.checked ], [ 0, %.lr.ph.preheader ], [ %n.vec, %middle.block ] | |
%s.01.ph = phi float [ 0.000000e+00, %min.iters.checked ], [ 0.000000e+00, %.lr.ph.preheader ], [ %61, %middle.block ] | |
br label %.lr.ph | |
min.iters.checked: ; preds = %.lr.ph.preheader | |
; %n.vec = n rounded down to a multiple of 16 = number of elements handled by
; the vector path. n >= 16 on this path, so %n.vec is never 0 here and the
; branch to the scalar preheader below is never taken.
%n.vec = and i64 %n, -16 | |
%cmp.zero = icmp eq i64 %n.vec, 0 | |
br i1 %cmp.zero, label %.lr.ph.preheader13, label %vector.body.preheader | |
vector.body.preheader: ; preds = %min.iters.checked | |
; %4 = ((n - 16) >> 4) + 1 = number of 16-float vector steps.
; %xtraiter = %4 mod 8 = steps to run in the prologue so that the 8x-unrolled
; main loop is left with a multiple of 8 steps.
%2 = add i64 %n, -16 | |
%3 = lshr i64 %2, 4 | |
%4 = add nuw nsw i64 %3, 1 | |
%xtraiter = and i64 %4, 7 | |
%lcmp.mod = icmp eq i64 %xtraiter, 0 | |
br i1 %lcmp.mod, label %vector.body.preheader.split, label %vector.body.prol.preheader | |
vector.body.prol.preheader: ; preds = %vector.body.preheader | |
br label %vector.body.prol | |
vector.body.prol: ; preds = %vector.body.prol.preheader, %vector.body.prol | |
; Prologue loop: one 16-float step per iteration, %xtraiter iterations.
; %vec.phi.prol / %vec.phi4.prol are the two accumulators (start at zero);
; %prol.iter counts down from %xtraiter to 0.
%index.prol = phi i64 [ %index.next.prol, %vector.body.prol ], [ 0, %vector.body.prol.preheader ] | |
%vec.phi.prol = phi <8 x float> [ %9, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ] | |
%vec.phi4.prol = phi <8 x float> [ %10, %vector.body.prol ], [ zeroinitializer, %vector.body.prol.preheader ] | |
%prol.iter = phi i64 [ %prol.iter.sub, %vector.body.prol ], [ %xtraiter, %vector.body.prol.preheader ] | |
; Load a[index .. index+7] and a[index+8 .. index+15] (align 4, i.e. only
; element alignment is assumed) and add each into its own accumulator.
%5 = getelementptr inbounds float, float* %a, i64 %index.prol | |
%6 = bitcast float* %5 to <8 x float>* | |
%wide.load.prol = load <8 x float>, <8 x float>* %6, align 4, !tbaa !7 | |
%7 = getelementptr float, float* %5, i64 8 | |
%8 = bitcast float* %7 to <8 x float>* | |
%wide.load5.prol = load <8 x float>, <8 x float>* %8, align 4, !tbaa !7 | |
%9 = fadd fast <8 x float> %wide.load.prol, %vec.phi.prol | |
%10 = fadd fast <8 x float> %wide.load5.prol, %vec.phi4.prol | |
%index.next.prol = add i64 %index.prol, 16 | |
%prol.iter.sub = add i64 %prol.iter, -1 | |
%prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0 | |
br i1 %prol.iter.cmp, label %vector.body.preheader.split.loopexit, label %vector.body.prol, !llvm.loop !9 | |
vector.body.preheader.split.loopexit: ; preds = %vector.body.prol | |
; Single-predecessor phis carrying the prologue's final index and accumulators
; out of the loop.
%index.next.prol.lcssa = phi i64 [ %index.next.prol, %vector.body.prol ] | |
%.lcssa19 = phi <8 x float> [ %10, %vector.body.prol ] | |
%.lcssa18 = phi <8 x float> [ %9, %vector.body.prol ] | |
br label %vector.body.preheader.split | |
vector.body.preheader.split: ; preds = %vector.body.preheader.split.loopexit, %vector.body.preheader | |
; Join point after the (possibly skipped) prologue.
; The `undef` incoming values below feed only %middle.block via the direct
; edge from this block. That edge is taken only when %2 < 112, i.e. %4 < 8,
; which implies %xtraiter != 0, so the prologue did run and the undef
; operands are never the ones selected on that path.
%.lcssa15.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa19, %vector.body.preheader.split.loopexit ] | |
%.lcssa14.unr = phi <8 x float> [ undef, %vector.body.preheader ], [ %.lcssa18, %vector.body.preheader.split.loopexit ] | |
%index.unr = phi i64 [ 0, %vector.body.preheader ], [ %index.next.prol.lcssa, %vector.body.preheader.split.loopexit ] | |
%vec.phi.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa18, %vector.body.preheader.split.loopexit ] | |
%vec.phi4.unr = phi <8 x float> [ zeroinitializer, %vector.body.preheader ], [ %.lcssa19, %vector.body.preheader.split.loopexit ] | |
; (n - 16) < 112 means fewer than 8 vector steps in total, all of which were
; done by the prologue: skip the unrolled main loop.
%11 = icmp ult i64 %2, 112 | |
br i1 %11, label %middle.block, label %vector.body.preheader.split.split | |
vector.body.preheader.split.split: ; preds = %vector.body.preheader.split | |
br label %vector.body | |
vector.body: ; preds = %vector.body, %vector.body.preheader.split.split | |
; Main loop: the 16-float step unrolled 8 times (128 floats per iteration).
; Even-numbered results (%16, %22, ... %58) chain through accumulator
; %vec.phi, odd-numbered ones (%17, %23, ... %59) through %vec.phi4, so there
; are only two independent fadd dependency chains.
%index = phi i64 [ %index.unr, %vector.body.preheader.split.split ], [ %index.next.7, %vector.body ] | |
%vec.phi = phi <8 x float> [ %vec.phi.unr, %vector.body.preheader.split.split ], [ %58, %vector.body ] | |
%vec.phi4 = phi <8 x float> [ %vec.phi4.unr, %vector.body.preheader.split.split ], [ %59, %vector.body ] | |
; Unrolled step 0: elements index .. index+15.
%12 = getelementptr inbounds float, float* %a, i64 %index | |
%13 = bitcast float* %12 to <8 x float>* | |
%wide.load = load <8 x float>, <8 x float>* %13, align 4, !tbaa !7 | |
%14 = getelementptr float, float* %12, i64 8 | |
%15 = bitcast float* %14 to <8 x float>* | |
%wide.load5 = load <8 x float>, <8 x float>* %15, align 4, !tbaa !7 | |
%16 = fadd fast <8 x float> %wide.load, %vec.phi | |
%17 = fadd fast <8 x float> %wide.load5, %vec.phi4 | |
; Unrolled step 1: elements index+16 .. index+31.
%index.next = add i64 %index, 16 | |
%18 = getelementptr inbounds float, float* %a, i64 %index.next | |
%19 = bitcast float* %18 to <8 x float>* | |
%wide.load.1 = load <8 x float>, <8 x float>* %19, align 4, !tbaa !7 | |
%20 = getelementptr float, float* %18, i64 8 | |
%21 = bitcast float* %20 to <8 x float>* | |
%wide.load5.1 = load <8 x float>, <8 x float>* %21, align 4, !tbaa !7 | |
%22 = fadd fast <8 x float> %wide.load.1, %16 | |
%23 = fadd fast <8 x float> %wide.load5.1, %17 | |
; Unrolled step 2: elements index+32 .. index+47.
%index.next.1 = add i64 %index, 32 | |
%24 = getelementptr inbounds float, float* %a, i64 %index.next.1 | |
%25 = bitcast float* %24 to <8 x float>* | |
%wide.load.2 = load <8 x float>, <8 x float>* %25, align 4, !tbaa !7 | |
%26 = getelementptr float, float* %24, i64 8 | |
%27 = bitcast float* %26 to <8 x float>* | |
%wide.load5.2 = load <8 x float>, <8 x float>* %27, align 4, !tbaa !7 | |
%28 = fadd fast <8 x float> %wide.load.2, %22 | |
%29 = fadd fast <8 x float> %wide.load5.2, %23 | |
; Unrolled step 3: elements index+48 .. index+63.
%index.next.2 = add i64 %index, 48 | |
%30 = getelementptr inbounds float, float* %a, i64 %index.next.2 | |
%31 = bitcast float* %30 to <8 x float>* | |
%wide.load.3 = load <8 x float>, <8 x float>* %31, align 4, !tbaa !7 | |
%32 = getelementptr float, float* %30, i64 8 | |
%33 = bitcast float* %32 to <8 x float>* | |
%wide.load5.3 = load <8 x float>, <8 x float>* %33, align 4, !tbaa !7 | |
%34 = fadd fast <8 x float> %wide.load.3, %28 | |
%35 = fadd fast <8 x float> %wide.load5.3, %29 | |
; Unrolled step 4: elements index+64 .. index+79.
%index.next.3 = add i64 %index, 64 | |
%36 = getelementptr inbounds float, float* %a, i64 %index.next.3 | |
%37 = bitcast float* %36 to <8 x float>* | |
%wide.load.4 = load <8 x float>, <8 x float>* %37, align 4, !tbaa !7 | |
%38 = getelementptr float, float* %36, i64 8 | |
%39 = bitcast float* %38 to <8 x float>* | |
%wide.load5.4 = load <8 x float>, <8 x float>* %39, align 4, !tbaa !7 | |
%40 = fadd fast <8 x float> %wide.load.4, %34 | |
%41 = fadd fast <8 x float> %wide.load5.4, %35 | |
; Unrolled step 5: elements index+80 .. index+95.
%index.next.4 = add i64 %index, 80 | |
%42 = getelementptr inbounds float, float* %a, i64 %index.next.4 | |
%43 = bitcast float* %42 to <8 x float>* | |
%wide.load.5 = load <8 x float>, <8 x float>* %43, align 4, !tbaa !7 | |
%44 = getelementptr float, float* %42, i64 8 | |
%45 = bitcast float* %44 to <8 x float>* | |
%wide.load5.5 = load <8 x float>, <8 x float>* %45, align 4, !tbaa !7 | |
%46 = fadd fast <8 x float> %wide.load.5, %40 | |
%47 = fadd fast <8 x float> %wide.load5.5, %41 | |
; Unrolled step 6: elements index+96 .. index+111.
%index.next.5 = add i64 %index, 96 | |
%48 = getelementptr inbounds float, float* %a, i64 %index.next.5 | |
%49 = bitcast float* %48 to <8 x float>* | |
%wide.load.6 = load <8 x float>, <8 x float>* %49, align 4, !tbaa !7 | |
%50 = getelementptr float, float* %48, i64 8 | |
%51 = bitcast float* %50 to <8 x float>* | |
%wide.load5.6 = load <8 x float>, <8 x float>* %51, align 4, !tbaa !7 | |
%52 = fadd fast <8 x float> %wide.load.6, %46 | |
%53 = fadd fast <8 x float> %wide.load5.6, %47 | |
; Unrolled step 7: elements index+112 .. index+127.
%index.next.6 = add i64 %index, 112 | |
%54 = getelementptr inbounds float, float* %a, i64 %index.next.6 | |
%55 = bitcast float* %54 to <8 x float>* | |
%wide.load.7 = load <8 x float>, <8 x float>* %55, align 4, !tbaa !7 | |
%56 = getelementptr float, float* %54, i64 8 | |
%57 = bitcast float* %56 to <8 x float>* | |
%wide.load5.7 = load <8 x float>, <8 x float>* %57, align 4, !tbaa !7 | |
%58 = fadd fast <8 x float> %wide.load.7, %52 | |
%59 = fadd fast <8 x float> %wide.load5.7, %53 | |
; Advance by 128 elements; the loop ends when the index reaches %n.vec.
%index.next.7 = add i64 %index, 128 | |
%60 = icmp eq i64 %index.next.7, %n.vec | |
br i1 %60, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !11 | |
middle.block.unr-lcssa: ; preds = %vector.body | |
; Single-predecessor phis carrying the main loop's final accumulators out.
%.lcssa17 = phi <8 x float> [ %59, %vector.body ] | |
%.lcssa16 = phi <8 x float> [ %58, %vector.body ] | |
br label %middle.block | |
middle.block: ; preds = %vector.body.preheader.split, %middle.block.unr-lcssa | |
; Reduction: pick the accumulators from whichever loop ran last, add them
; together, then fold 8 lanes -> 4 -> 2 -> 1 with shuffle+add; lane 0 of
; %bin.rdx12 is the sum of all elements handled by the vector path.
%.lcssa15 = phi <8 x float> [ %.lcssa15.unr, %vector.body.preheader.split ], [ %.lcssa17, %middle.block.unr-lcssa ] | |
%.lcssa14 = phi <8 x float> [ %.lcssa14.unr, %vector.body.preheader.split ], [ %.lcssa16, %middle.block.unr-lcssa ] | |
%bin.rdx = fadd fast <8 x float> %.lcssa15, %.lcssa14 | |
%rdx.shuf = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> | |
%bin.rdx8 = fadd fast <8 x float> %bin.rdx, %rdx.shuf | |
%rdx.shuf9 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | |
%bin.rdx10 = fadd fast <8 x float> %bin.rdx8, %rdx.shuf9 | |
%rdx.shuf11 = shufflevector <8 x float> %bin.rdx10, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | |
%bin.rdx12 = fadd fast <8 x float> %bin.rdx10, %rdx.shuf11 | |
%61 = extractelement <8 x float> %bin.rdx12, i32 0 | |
; If n was a multiple of 16 there is no scalar remainder to process.
%cmp.n = icmp eq i64 %n.vec, %n | |
br i1 %cmp.n, label %._crit_edge, label %.lr.ph.preheader13 | |
._crit_edge.loopexit: ; preds = %.lr.ph | |
%.lcssa = phi float [ %64, %.lr.ph ] | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit, %middle.block, %0 | |
; Common exit: result is 0.0 (n == 0), the vector-only sum, or the sum after
; the scalar loop.
%s.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %61, %middle.block ], [ %.lcssa, %._crit_edge.loopexit ] | |
; Empty inline asm with side effects and a memory clobber, executed just
; before returning.
tail call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() #4, !srcloc !14 | |
ret float %s.0.lcssa | |
.lr.ph: ; preds = %.lr.ph.preheader13, %.lr.ph | |
; Scalar loop: s += a[i] for i from %i.02.ph up to n - 1.
%i.02 = phi i64 [ %65, %.lr.ph ], [ %i.02.ph, %.lr.ph.preheader13 ] | |
%s.01 = phi float [ %64, %.lr.ph ], [ %s.01.ph, %.lr.ph.preheader13 ] | |
%62 = getelementptr inbounds float, float* %a, i64 %i.02 | |
%63 = load float, float* %62, align 4, !tbaa !7 | |
%64 = fadd fast float %63, %s.01 | |
%65 = add nuw i64 %i.02, 1 | |
%exitcond = icmp eq i64 %65, %n | |
br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph, !llvm.loop !15 | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-----------------------------------------------------------------------
# sum32: adds up an array of 32-bit floats (x86-64, AT&T syntax, AVX).
# In:    rdi = base address of the float array
#        rsi = number of elements
#        (presumably the SysV AMD64 argument registers - confirm with caller)
# Out:   xmm0 lane 0 = running/total sum (0.0 when rsi == 0)
# Uses:  rax, rcx, rdx, r8, rsi, ymm0, ymm1, flags
# Shape: a prologue loop and an 8x-unrolled main loop process 16 floats per
#        step in two ymm accumulators; a scalar loop handles what is left.
#-----------------------------------------------------------------------
sum32: # @sum32 | |
.cfi_startproc | |
# BB#0: | |
# sum = 0.0; return immediately when the element count is zero.
vxorps %xmm0, %xmm0, %xmm0 | |
testq %rsi, %rsi | |
je .LBB1_13 | |
# BB#1: # %.lr.ph.preheader | |
# xmm0 is zeroed a second time here (already zero from BB#0).
# rcx = 0 is the start index used by the scalar loop at .LBB1_2.
# Counts of 15 or fewer go straight to the scalar loop.
vxorps %xmm0, %xmm0, %xmm0 | |
xorl %ecx, %ecx | |
cmpq $15, %rsi | |
jbe .LBB1_2 | |
# BB#4: # %min.iters.checked | |
# rcx is zeroed again (still zero from BB#1).
# rax = count rounded down to a multiple of 16 = elements for the vector path.
xorl %ecx, %ecx | |
movq %rsi, %rax | |
andq $-16, %rax | |
je .LBB1_2 | |
# BB#5: # %vector.body.preheader | |
# r8 = count - 16. ecx = (low 32 bits of r8 >> 4) + 1; only its low three
# bits are tested: (number of 16-float steps) mod 8 == 0 means no prologue.
# rdx = 0 is the element index where vector processing starts.
leaq -16(%rsi), %r8 | |
movl %r8d, %ecx | |
shrl $4, %ecx | |
addl $1, %ecx | |
xorl %edx, %edx | |
testb $7, %cl | |
je .LBB1_6 | |
# BB#7: # %vector.body.prol.preheader | |
# Recompute (steps mod 8) and negate it: rcx counts up from -(steps mod 8)
# to 0 in the prologue loop. Both accumulators and the index start at zero.
leal -16(%rsi), %ecx | |
shrl $4, %ecx | |
addl $1, %ecx | |
andl $7, %ecx | |
negq %rcx | |
vxorps %ymm0, %ymm0, %ymm0 | |
xorl %edx, %edx | |
vxorps %ymm1, %ymm1, %ymm1 | |
.align 16, 0x90 | |
.LBB1_8: # %vector.body.prol | |
# =>This Inner Loop Header: Depth=1 | |
# Prologue loop: 16 floats per iteration (8 into ymm0, 8 into ymm1).
vaddps (%rdi,%rdx,4), %ymm0, %ymm0 | |
vaddps 32(%rdi,%rdx,4), %ymm1, %ymm1 | |
addq $16, %rdx | |
addq $1, %rcx | |
jne .LBB1_8 | |
jmp .LBB1_9 | |
.LBB1_6: | |
# No prologue needed: just clear both accumulators.
vxorps %ymm0, %ymm0, %ymm0 | |
vxorps %ymm1, %ymm1, %ymm1 | |
.LBB1_9: # %vector.body.preheader.split | |
# count - 16 < 112 means fewer than 8 steps in total, so the prologue already
# did all of them: skip the unrolled loop.
cmpq $112, %r8 | |
jb .LBB1_12 | |
# BB#10: # %vector.body.preheader.split.split | |
# rcx = (count & -16) - rdx = elements left for the unrolled loop.
# rdx = address of element rdx plus 480 bytes, so the 16 loads below can use
# displacements -480 .. 0.
movq %rsi, %rcx | |
andq $-16, %rcx | |
subq %rdx, %rcx | |
leaq 480(%rdi,%rdx,4), %rdx | |
.align 16, 0x90 | |
.LBB1_11: # %vector.body | |
# =>This Inner Loop Header: Depth=1 | |
# Main loop: 16 x 32-byte loads = 128 floats per iteration, alternating
# between the two accumulators ymm0 and ymm1.
vaddps -480(%rdx), %ymm0, %ymm0 | |
vaddps -448(%rdx), %ymm1, %ymm1 | |
vaddps -416(%rdx), %ymm0, %ymm0 | |
vaddps -384(%rdx), %ymm1, %ymm1 | |
vaddps -352(%rdx), %ymm0, %ymm0 | |
vaddps -320(%rdx), %ymm1, %ymm1 | |
vaddps -288(%rdx), %ymm0, %ymm0 | |
vaddps -256(%rdx), %ymm1, %ymm1 | |
vaddps -224(%rdx), %ymm0, %ymm0 | |
vaddps -192(%rdx), %ymm1, %ymm1 | |
vaddps -160(%rdx), %ymm0, %ymm0 | |
vaddps -128(%rdx), %ymm1, %ymm1 | |
vaddps -96(%rdx), %ymm0, %ymm0 | |
vaddps -64(%rdx), %ymm1, %ymm1 | |
vaddps -32(%rdx), %ymm0, %ymm0 | |
vaddps (%rdx), %ymm1, %ymm1 | |
# Advance the pointer by 512 bytes and the remaining count by 128 elements.
addq $512, %rdx # imm = 0x200 | |
addq $-128, %rcx | |
jne .LBB1_11 | |
.LBB1_12: # %middle.block | |
# Reduce to a scalar: ymm0 += ymm1, add the upper 128 bits onto the lower,
# add the swapped 64-bit halves, then a horizontal add leaves the total of
# the vector-processed elements in lane 0 of xmm0.
vaddps %ymm0, %ymm1, %ymm0 | |
vextractf128 $1, %ymm0, %xmm1 | |
vaddps %ymm1, %ymm0, %ymm0 | |
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0] | |
vaddps %ymm1, %ymm0, %ymm0 | |
vhaddps %ymm0, %ymm0, %ymm0 | |
# rcx = number of elements already summed; done if that is all of them.
movq %rax, %rcx | |
cmpq %rsi, %rax | |
je .LBB1_13 | |
.LBB1_2: # %.lr.ph.preheader13 | |
# Scalar remainder: rax = address of element rcx, rsi = elements left.
leaq (%rdi,%rcx,4), %rax | |
subq %rcx, %rsi | |
.align 16, 0x90 | |
.LBB1_3: # %.lr.ph | |
# =>This Inner Loop Header: Depth=1 | |
# One float per iteration added into lane 0 of xmm0.
vaddss (%rax), %xmm0, %xmm0 | |
addq $4, %rax | |
addq $-1, %rsi | |
jne .LBB1_3 | |
.LBB1_13: # %._crit_edge | |
# The #APP/#NO_APP pair below encloses no instructions (an empty inline-asm
# region). vzeroupper clears the upper ymm halves before returning.
#APP | |
#NO_APP | |
vzeroupper | |
retq | |
.Lfunc_end1: | |
.size sum32, .Lfunc_end1-sum32 | |
.cfi_endproc |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment