Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Yeppp!: Clang-generated assembly (Intel syntax) for a trivial test
# Excerpt of clang output (GAS, Intel syntax), scalar build: a timed benchmark
# loop that repeats an inlined std::inner_product over doubles (see the .loc
# and DEBUG_VALUE annotations below) 1000000 times.
# NOTE(review): r15, rbx and ebp are initialised before this excerpt; r15/rbx
# presumably hold the data pointers of the two vectors -- confirm in basics.cpp.
#
# Start timestamp: std::chrono::system_clock::now(); the value returned in rax
# is kept in r14 across the loop.
call _ZNSt6chrono3_V212system_clock3nowEv
.Ltmp100:
mov r14, rax
.Ltmp101:
#DEBUG_VALUE: i <- 0
.align 16, 0x90
.LBB0_6: # %.lr.ph.i.preheader
# =>This Loop Header: Depth=1
# Child Loop BB0_7 Depth 2
#DEBUG_VALUE: size <- 1024
#DEBUG_VALUE: r <- [RSP+24]
# Per outer iteration: reset the running sum (xmm0) and the byte offset (rax).
xorps xmm0, xmm0
xor eax, eax
.align 16, 0x90
.LBB0_7: # %.lr.ph.i
# Parent Loop BB0_6 Depth=1
# => This Inner Loop Header: Depth=2
#DEBUG_VALUE: size <- 1024
#DEBUG_VALUE: r <- [RSP+24]
.loc 23 183 0 # /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/bits/stl_numeric.h:183:0
.Ltmp102:
# One element per iteration: xmm0 += [r15 + rax] * [rbx + rax] (scalar doubles).
# Every addsd depends on the previous one through xmm0.
movsd xmm1, qword ptr [r15 + rax]
mulsd xmm1, qword ptr [rbx + rax]
addsd xmm0, xmm1
.Ltmp103:
#DEBUG_VALUE: inner_product<__gnu_cxx::__normal_iterator<double *, std::vector<double, std::allocator<double> > >, __gnu_cxx::__normal_iterator<double *, std::vector<double, std::allocator<double> > >, double>:__init <- XMM0
#DEBUG_VALUE: r <- [XMM0+0]
.loc 21 825 0 # /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/bits/stl_iterator.h:825:0
# Advance by one double (8 bytes); 8192 bytes = 1024 elements * 8 bytes.
add rax, 8
cmp rax, 8192
.Ltmp104:
.loc 23 182 14 # /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/bits/stl_numeric.h:182:14
jne .LBB0_7
.Ltmp105:
# BB#8: # %_ZSt13inner_productIN9__gnu_cxx17__normal_iteratorIPdSt6vectorIdSaIdEEEES6_dET1_T_S8_T0_S7_.exit
# in Loop: Header=BB0_6 Depth=1
#DEBUG_VALUE: size <- 1024
#DEBUG_VALUE: r <- [XMM0+0]
.loc 1 25 13 # basics.cpp:25:13
# Store the sum into r's stack slot on every outer iteration.
movsd qword ptr [rsp + 24], xmm0
.loc 1 24 0 # basics.cpp:24:0
# Outer counter i lives in ebp; repeat the whole inner product 1000000 times.
inc ebp
.Ltmp106:
#DEBUG_VALUE: i <- EBP
cmp ebp, 1000000
jne .LBB0_6
.Ltmp107:
# BB#9:
#DEBUG_VALUE: size <- 1024
.loc 1 26 0 # basics.cpp:26:0
# End timestamp.
call _ZNSt6chrono3_V212system_clock3nowEv
# Excerpt of clang output (GAS, Intel syntax), SSE2 auto-vectorized build of a
# timed benchmark loop: an inlined std::inner_product over doubles (see the
# .loc annotation) repeated 1000000 times, two elements per inner iteration.
# NOTE(review): r15, rbx and ebp are initialised before this excerpt; r15/rbx
# presumably hold the data pointers of the two vectors -- confirm in basics.cpp.
#
# Start timestamp: std::chrono::system_clock::now(); the value returned in rax
# is kept in r14 across the loop.
call _ZNSt6chrono3_V212system_clock3nowEv
.Ltmp100:
mov r14, rax
.Ltmp101:
#DEBUG_VALUE: i <- 0
.align 16, 0x90
.LBB0_6: # %.lr.ph.i.preheader
# =>This Loop Header: Depth=1
# Child Loop BB0_7 Depth 2
#DEBUG_VALUE: size <- 1024
#DEBUG_VALUE: r <- [RSP+24]
# Per outer iteration: clear the two-lane accumulator (xmm0) and the element
# index (rax).
xorpd xmm0, xmm0
xor eax, eax
.align 16, 0x90
.LBB0_7: # %vector.body
# Parent Loop BB0_6 Depth=1
# => This Inner Loop Header: Depth=2
#DEBUG_VALUE: size <- 1024
#DEBUG_VALUE: r <- [RSP+24]
.loc 23 183 0 # /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/bits/stl_numeric.h:183:0
.Ltmp102:
# Two doubles per iteration, loaded with unaligned moves:
# xmm0[k] += [rbx + 8*rax][k] * [r15 + 8*rax][k] for k = 0, 1.
movupd xmm1, xmmword ptr [rbx + 8*rax]
movupd xmm2, xmmword ptr [r15 + 8*rax]
mulpd xmm2, xmm1
addpd xmm0, xmm2
# rax counts elements here (scaled by 8 in the addresses); 1024 = size.
add rax, 2
cmp rax, 1024
jne .LBB0_7
# BB#8: # %middle.block
# in Loop: Header=BB0_6 Depth=1
#DEBUG_VALUE: size <- 1024
#DEBUG_VALUE: r <- [RSP+24]
# Horizontal reduction: copy the accumulator, broadcast its high lane to both
# lanes, add the original back. The low lane of xmm1 is then lane0 + lane1.
movapd xmm1, xmm0
unpckhpd xmm1, xmm1 # xmm1 = xmm1[1,1]
addpd xmm1, xmm0
.Ltmp103:
.loc 1 25 13 # basics.cpp:25:13
# Store only the low lane (the full sum) into r's stack slot.
movlpd qword ptr [rsp + 24], xmm1
.loc 1 24 0 # basics.cpp:24:0
# Outer counter i lives in ebp; repeat the whole inner product 1000000 times.
inc ebp
.Ltmp104:
#DEBUG_VALUE: i <- EBP
cmp ebp, 1000000
jne .LBB0_6
.Ltmp105:
# BB#9:
#DEBUG_VALUE: size <- 1024
#DEBUG_VALUE: r <- [RSP+24]
.loc 1 26 0 # basics.cpp:26:0
# End timestamp.
call _ZNSt6chrono3_V212system_clock3nowEv
# Excerpt of clang output (GAS, Intel syntax), AVX auto-vectorized build of a
# timed benchmark loop: an inlined std::inner_product over doubles (see the
# .loc annotation) repeated 1000000 times, four elements per inner iteration.
# NOTE(review): r15, rbx and ebp are initialised before this excerpt; r15/rbx
# presumably hold the data pointers of the two vectors -- confirm in basics.cpp.
#
# Start timestamp: std::chrono::system_clock::now(); the value returned in rax
# is kept in r14 across the loop.
call _ZNSt6chrono3_V212system_clock3nowEv
.Ltmp100:
mov r14, rax
.Ltmp101:
#DEBUG_VALUE: i <- 0
.align 16, 0x90
.LBB0_6: # %.lr.ph.i.preheader
# =>This Loop Header: Depth=1
# Child Loop BB0_7 Depth 2
#DEBUG_VALUE: size <- 1024
#DEBUG_VALUE: r <- [RSP+24]
# Per outer iteration: clear the four-lane accumulator (ymm0) and the element
# index (rax).
vxorpd ymm0, ymm0, ymm0
xor eax, eax
.align 16, 0x90
.LBB0_7: # %vector.body
# Parent Loop BB0_6 Depth=1
# => This Inner Loop Header: Depth=2
#DEBUG_VALUE: size <- 1024
#DEBUG_VALUE: r <- [RSP+24]
.loc 23 183 0 # /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/bits/stl_numeric.h:183:0
.Ltmp102:
# Each 256-bit operand is assembled from two unaligned 128-bit loads: the low
# half from offset +0, the high half (offset +16) inserted with vinsertf128.
# ymm1 = four doubles at [rbx + 8*rax]
vmovupd xmm1, xmmword ptr [rbx + 8*rax + 16]
vmovupd xmm2, xmmword ptr [rbx + 8*rax]
vinsertf128 ymm1, ymm2, xmm1, 1
# ymm2 = four doubles at [r15 + 8*rax]
vmovupd xmm2, xmmword ptr [r15 + 8*rax + 16]
vmovupd xmm3, xmmword ptr [r15 + 8*rax]
vinsertf128 ymm2, ymm3, xmm2, 1
# ymm0[k] += ymm2[k] * ymm1[k] for k = 0..3
vmulpd ymm1, ymm2, ymm1
vaddpd ymm0, ymm0, ymm1
# rax counts elements here (scaled by 8 in the addresses); 1024 = size.
add rax, 4
cmp rax, 1024
jne .LBB0_7
# BB#8: # %middle.block
# in Loop: Header=BB0_6 Depth=1
#DEBUG_VALUE: size <- 1024
#DEBUG_VALUE: r <- [RSP+24]
# Horizontal reduction: add the upper 128-bit half onto the lower half, then
# vhaddpd adds the two low lanes, so lane 0 of ymm0 holds the sum of all four.
vextractf128 xmm1, ymm0, 1
vaddpd ymm0, ymm0, ymm1
vhaddpd ymm0, ymm0, ymm0
.Ltmp103:
.loc 1 25 13 # basics.cpp:25:13
# Store only the low lane (the full sum) into r's stack slot.
vmovlpd qword ptr [rsp + 24], xmm0
.loc 1 24 0 # basics.cpp:24:0
# Outer counter i lives in ebp; repeat the whole inner product 1000000 times.
inc ebp
.Ltmp104:
#DEBUG_VALUE: i <- EBP
cmp ebp, 1000000
jne .LBB0_6
.Ltmp105:
# BB#9:
#DEBUG_VALUE: size <- 1024
#DEBUG_VALUE: r <- [RSP+24]
.loc 1 26 0 # basics.cpp:26:0
# Clear the upper halves of the ymm registers before calling out, then take
# the end timestamp.
vzeroupper
call _ZNSt6chrono3_V212system_clock3nowEv
;-----------------------------------------------------------------------
; _yepCore_DotProduct_V64fV64f_S64f_SandyBridge  (NASM syntax, AVX)
;
; Dot product of two arrays of doubles, written to a single double.
;
; In:    rdi = first source array  (non-null, 8-byte aligned)
;        rsi = second source array (non-null, 8-byte aligned)
;        rdx = destination for the result (non-null, 8-byte aligned)
;        rcx = number of elements (0 is allowed: stores 0.0)
;        NOTE(review): this matches the System V AMD64 integer-argument
;        order -- confirm against the C prototype.
; Out:   eax = 0 on success, 1 if any pointer is null, 2 if any pointer
;        is not 8-byte aligned. [rdx] is written only on success.
; Clobb: rdi, rsi, rcx, ymm0-ymm15, flags. No stack usage.
;
; Structure: scalar head loop until rsi is 32-byte aligned, then a
; software-pipelined loop over batches of 32 doubles (8 ymm vectors) with
; eight independent accumulators (ymm15, ymm7..ymm1), then a scalar tail
; loop, then a reduction of the accumulators to one double.
;-----------------------------------------------------------------------
%ifidn __OUTPUT_FORMAT__, elf64
section .text.SandyBridge progbits alloc exec nowrite align=16
global _yepCore_DotProduct_V64fV64f_S64f_SandyBridge
_yepCore_DotProduct_V64fV64f_S64f_SandyBridge:
%else
; Any other output format: plain .text section and one extra leading
; underscore on the symbol.
section .text
global __yepCore_DotProduct_V64fV64f_S64f_SandyBridge
__yepCore_DotProduct_V64fV64f_S64f_SandyBridge:
%endif
.ENTRY:
; --- Argument validation: each pointer must be non-null and 8-byte aligned.
TEST rdi, rdi
JZ .return_null_pointer
TEST rdi, 7
JNZ .return_misaligned_pointer
TEST rsi, rsi
JZ .return_null_pointer
TEST rsi, 7
JNZ .return_misaligned_pointer
TEST rdx, rdx
JZ .return_null_pointer
TEST rdx, 7
JNZ .return_misaligned_pointer
; Main accumulator. The VEX-encoded xmm write also zeroes the upper half
; of ymm15, so a zero-length input stores 0.0 via .return_ok.
VXORPD xmm15, xmm15, xmm15
TEST rcx, rcx
JZ .return_ok
; Zero the remaining seven accumulators (full ymm width, as above).
VXORPD xmm7, xmm7, xmm7
VXORPD xmm6, xmm6, xmm6
VXORPD xmm5, xmm5, xmm5
VXORPD xmm4, xmm4, xmm4
VXORPD xmm3, xmm3, xmm3
VXORPD xmm2, xmm2, xmm2
VXORPD xmm1, xmm1, xmm1
; --- Head: process one element at a time until rsi is 32-byte aligned.
; This terminates because rsi was checked to be 8-byte aligned above.
TEST rsi, 31
JZ .source_y_32b_aligned
.source_y_32b_misaligned:
; The scalar load/multiply leave lanes 1..3 of ymm0 zero, so the packed
; add only changes lane 0 of ymm15.
VMOVSD xmm0, [rdi]
VMULSD xmm0, xmm0, [rsi]
VADDPD ymm15, ymm15, ymm0
ADD rdi, 8
ADD rsi, 8
SUB rcx, 1
; Ran out of elements before reaching alignment.
JZ .reduce_batch
TEST rsi, 31
JNZ .source_y_32b_misaligned
.source_y_32b_aligned:
; rcx is kept biased by -32 from here on: a borrow (JB) means fewer than
; 32 elements remain. .batch_process_finish removes the bias again.
SUB rcx, 32
JB .batch_process_finish
.process_batch_prologue:
; Pipeline fill for the first batch: load all eight vectors from rdi, but
; issue only the first seven multiplies and the first four adds. The
; remaining multiply ([rsi + 224]) and four adds are completed either at
; the top of .process_batch or in .process_batch_epilogue.
; The byte/dword keywords request a specific displacement size in the
; instruction encoding.
VMOVUPD ymm0, [rdi]
VMOVUPD ymm8, [byte rdi + 32]
VMOVUPD ymm9, [byte rdi + 64]
VMULPD ymm0, ymm0, [rsi]
VMOVUPD ymm12, [byte rdi + 96]
VMULPD ymm8, ymm8, [byte rsi + 32]
VMOVUPD ymm14, [dword rdi + 128]
VMULPD ymm9, ymm9, [byte rsi + 64]
VMOVUPD ymm10, [dword rdi + 160]
VMULPD ymm12, ymm12, [byte rsi + 96]
VADDPD ymm15, ymm15, ymm0
VMOVUPD ymm11, [dword rdi + 192]
VMULPD ymm14, ymm14, [dword rsi + 128]
VADDPD ymm7, ymm7, ymm8
VMOVUPD ymm13, [dword rdi + 224]
VMULPD ymm10, ymm10, [dword rsi + 160]
VADDPD ymm6, ymm6, ymm9
; All rdi loads for this batch are done; rsi is advanced later because
; [rsi + 224] of this batch is still unread.
ADD rdi, 256
VMULPD ymm11, ymm11, [dword rsi + 192]
VADDPD ymm5, ymm5, ymm12
; Is there another full batch of 32 after the one in flight?
SUB rcx, 32
JB .process_batch_epilogue
align 16
.process_batch:
; Steady state: the first instructions finish the previous batch (last
; multiply, adds into ymm4..ymm1) interleaved with the loads and
; multiplies of the next batch. rsi still addresses the previous batch
; until the ADD rsi, 256 below.
VMOVUPD ymm0, [rdi]
VMULPD ymm13, ymm13, [dword rsi + 224]
VADDPD ymm4, ymm4, ymm14
VMOVUPD ymm8, [byte rdi + 32]
ADD rsi, 256
VADDPD ymm3, ymm3, ymm10
VMOVUPD ymm9, [byte rdi + 64]
VMULPD ymm0, ymm0, [rsi]
VADDPD ymm2, ymm2, ymm11
VMOVUPD ymm12, [byte rdi + 96]
VMULPD ymm8, ymm8, [byte rsi + 32]
VADDPD ymm1, ymm1, ymm13
VMOVUPD ymm14, [dword rdi + 128]
VMULPD ymm9, ymm9, [byte rsi + 64]
VMOVUPD ymm10, [dword rdi + 160]
VMULPD ymm12, ymm12, [byte rsi + 96]
VADDPD ymm15, ymm15, ymm0
VMOVUPD ymm11, [dword rdi + 192]
VMULPD ymm14, ymm14, [dword rsi + 128]
VADDPD ymm7, ymm7, ymm8
VMOVUPD ymm13, [dword rdi + 224]
VMULPD ymm10, ymm10, [dword rsi + 160]
VADDPD ymm6, ymm6, ymm9
ADD rdi, 256
VMULPD ymm11, ymm11, [dword rsi + 192]
VADDPD ymm5, ymm5, ymm12
SUB rcx, 32
JAE .process_batch
.process_batch_epilogue:
; Pipeline drain: finish the last multiply and the four pending adds of
; the final batch, and bring rsi level with rdi.
VMULPD ymm13, ymm13, [dword rsi + 224]
VADDPD ymm4, ymm4, ymm14
ADD rsi, 256
VADDPD ymm3, ymm3, ymm10
VADDPD ymm2, ymm2, ymm11
VADDPD ymm1, ymm1, ymm13
.batch_process_finish:
; Undo the -32 bias: rcx = number of leftover elements (0..31).
ADD rcx, 32
JZ .reduce_batch
.process_single:
; --- Tail: one element at a time into lane 0 of ymm15 (lanes 1..3 of
; ymm8 are zero after the scalar load/multiply).
VMOVSD xmm8, [rdi]
VMULSD xmm8, xmm8, [rsi]
VADDPD ymm15, ymm15, ymm8
ADD rdi, 8
ADD rsi, 8
SUB rcx, 1
JNZ .process_single
.reduce_batch:
; --- Reduction: pairwise tree over the eight accumulators into ymm15 ...
VADDPD ymm15, ymm15, ymm7
VADDPD ymm6, ymm6, ymm5
VADDPD ymm4, ymm4, ymm3
VADDPD ymm2, ymm2, ymm1
VADDPD ymm15, ymm15, ymm6
VADDPD ymm4, ymm4, ymm2
VADDPD ymm15, ymm15, ymm4
; ... then fold the four lanes: upper 128-bit half onto the lower half,
; then the high lane onto the low lane.
VEXTRACTF128 xmm8, ymm15, 1
VADDPD xmm15, xmm15, xmm8
VUNPCKHPD xmm8, xmm15, xmm15
VADDSD xmm15, xmm15, xmm8
.return_ok:
; Store the result (0.0 when rcx was 0 on entry) and report success.
VMOVSD [rdx], xmm15
XOR eax, eax
.return:
; Shared exit for success and error paths: clear the upper ymm halves
; before returning to the caller.
VZEROUPPER
RET
.return_null_pointer:
; Status 1: one of the three pointers was null.
MOV eax, 1
JMP .return
.return_misaligned_pointer:
; Status 2: one of the three pointers was not 8-byte aligned.
MOV eax, 2
JMP .return
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment