Last active
December 11, 2016 12:16
-
-
Save bluss/bf695d2405a4ea36fc5ab78609e57809 to your computer and use it in GitHub Desktop.
Compiler output using `rustc -Copt-level=3 -Ctarget-cpu=native --emit=asm float_fast.rs -Cllvm-args=-x86-asm-syntax=intel`
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# dot3_fast -- sum of squares over three f32 arrays (rustc/LLVM output).
#
# Toolchain: GNU as / LLVM MC, Intel syntax (noprefix), x86-64 ELF, AVX.
#
# Register interface as used by the code below. It matches the System V
# AMD64 integer-argument order for three (pointer, length) pairs --
# presumably three Rust `&[f32]` slices; confirm against float_fast.rs:
#
#     float dot3_fast(const float *a, size_t a_len,     (rdi, rsi)
#                     const float *b, size_t b_len,     (rdx, rcx)
#                     const float *c, size_t c_len);    (r8,  r9)
#
# Result (xmm0, lane 0):
#     sum over i in [0, n) of a[i]^2 + b[i]^2 + c[i]^2,
#     where n = min(a_len, b_len, c_len) compared as unsigned values.
#     Returns 0.0 when n == 0.
#
# The vector path keeps four independent partial sums and reduces them at
# the end, so additions are not performed in strict index order.
#
# Clobbers: rax, rcx, rsi, r9, r10, r11, xmm0-xmm15, flags.
#           Upper halves of all ymm registers are zeroed on return.
# Stack:    not used.
        .text
        .intel_syntax noprefix
        .file   "float_fast.cgu-0.rs"
        .section        .text.dot3_fast,"ax",@progbits
        .globl  dot3_fast
        .p2align        4, 0x90
        .type   dot3_fast,@function
dot3_fast:
        .cfi_startproc
        # r9 = n = min(b_len, c_len), then min with a_len (unsigned).
        cmp     rcx, r9
        cmovbe  r9, rcx
        cmp     r9, rsi
        cmovae  r9, rsi
        vxorps  xmm0, xmm0, xmm0                # result = 0.0 (returned as-is if n == 0)
        test    r9, r9
        je      .LBB0_7                         # n == 0: nothing to sum
        vxorps  xmm0, xmm0, xmm0                # redundant re-zero, kept as emitted
        xor     eax, eax                        # rax = i = 0 for the scalar loop
        cmp     r9, 31
        jbe     .LBB0_6                         # n < 32: scalar loop only
        xor     eax, eax                        # redundant, kept as emitted
        mov     r10, r9
        and     r10, -32                        # r10 = n rounded down to a multiple of 32
        je      .LBB0_6                         # never taken here (n >= 32); kept as emitted
        # Bias each base pointer by 96 bytes so the loop can address the four
        # 32-byte vectors of a 128-byte chunk as [p-96], [p-64], [p-32], [p].
        lea     rcx, [rdi + 96]                 # rcx walks a
        lea     rsi, [r8 + 96]                  # rsi walks c
        lea     rax, [rdx + 96]                 # rax walks b
        vxorps  ymm0, ymm0, ymm0                # ymm0..ymm3 = four partial-sum vectors
        mov     r11, r10                        # r11 = elements left for the vector loop
        vxorps  ymm1, ymm1, ymm1
        vxorps  ymm2, ymm2, ymm2
        vxorps  ymm3, ymm3, ymm3
        .p2align        4, 0x90
.LBB0_4:
        # Vector loop: consumes 32 floats from each array per iteration.
        # Every 256-bit vector is assembled from two unaligned 128-bit loads:
        # low half with vmovups, high half with vinsertf128.

        # a[i .. i+32) -> ymm4..ymm7
        vmovups xmm4, xmmword ptr [rcx - 96]
        vmovups xmm5, xmmword ptr [rcx - 64]
        vmovups xmm6, xmmword ptr [rcx - 32]
        vmovups xmm7, xmmword ptr [rcx]
        vinsertf128     ymm4, ymm4, xmmword ptr [rcx - 80], 1
        vinsertf128     ymm5, ymm5, xmmword ptr [rcx - 48], 1
        vinsertf128     ymm6, ymm6, xmmword ptr [rcx - 16], 1
        vinsertf128     ymm7, ymm7, xmmword ptr [rcx + 16], 1
        # b[i .. i+32) -> ymm8..ymm11
        vmovups xmm8, xmmword ptr [rax - 96]
        vmovups xmm9, xmmword ptr [rax - 64]
        vmovups xmm10, xmmword ptr [rax - 32]
        vmovups xmm11, xmmword ptr [rax]
        vinsertf128     ymm8, ymm8, xmmword ptr [rax - 80], 1
        vinsertf128     ymm9, ymm9, xmmword ptr [rax - 48], 1
        vinsertf128     ymm10, ymm10, xmmword ptr [rax - 16], 1
        vinsertf128     ymm11, ymm11, xmmword ptr [rax + 16], 1
        # c[i .. i+32) -> ymm12..ymm15
        vmovups xmm12, xmmword ptr [rsi - 96]
        vmovups xmm13, xmmword ptr [rsi - 64]
        vmovups xmm14, xmmword ptr [rsi - 32]
        vmovups xmm15, xmmword ptr [rsi]
        vinsertf128     ymm12, ymm12, xmmword ptr [rsi - 80], 1
        vinsertf128     ymm13, ymm13, xmmword ptr [rsi - 48], 1
        vinsertf128     ymm14, ymm14, xmmword ptr [rsi - 16], 1
        vinsertf128     ymm15, ymm15, xmmword ptr [rsi + 16], 1
        # Square every element in place.
        vmulps  ymm4, ymm4, ymm4
        vmulps  ymm5, ymm5, ymm5
        vmulps  ymm6, ymm6, ymm6
        vmulps  ymm7, ymm7, ymm7
        vmulps  ymm8, ymm8, ymm8
        vmulps  ymm9, ymm9, ymm9
        vmulps  ymm10, ymm10, ymm10
        vmulps  ymm11, ymm11, ymm11
        vmulps  ymm12, ymm12, ymm12
        vmulps  ymm13, ymm13, ymm13
        vmulps  ymm14, ymm14, ymm14
        vmulps  ymm15, ymm15, ymm15
        # Accumulate a^2, then b^2, then c^2 into the four partial sums.
        vaddps  ymm0, ymm4, ymm0
        vaddps  ymm1, ymm5, ymm1
        vaddps  ymm2, ymm6, ymm2
        vaddps  ymm3, ymm7, ymm3
        vaddps  ymm0, ymm0, ymm8
        vaddps  ymm1, ymm1, ymm9
        vaddps  ymm2, ymm2, ymm10
        vaddps  ymm3, ymm3, ymm11
        vaddps  ymm0, ymm0, ymm12
        vaddps  ymm1, ymm1, ymm13
        vaddps  ymm2, ymm2, ymm14
        vaddps  ymm3, ymm3, ymm15
        # Advance by 128 bytes. "sub p, -128" is used because -128 fits in a
        # sign-extended 8-bit immediate while +128 does not.
        sub     rcx, -128
        sub     rsi, -128
        sub     rax, -128
        add     r11, -32                        # 32 fewer elements to go
        jne     .LBB0_4
        # Horizontal reduction: fold the four accumulators into ymm0, then
        # fold its eight lanes down into lane 0 of xmm0.
        vaddps  ymm0, ymm1, ymm0
        vaddps  ymm0, ymm2, ymm0
        vaddps  ymm0, ymm3, ymm0
        vextractf128    xmm1, ymm0, 1           # xmm1 = lanes 4..7
        vaddps  ymm0, ymm0, ymm1                # lanes 0..3 += lanes 4..7
        vpermilpd       xmm1, xmm0, 1           # xmm1 = xmm0 with 64-bit halves swapped
        vaddps  ymm0, ymm0, ymm1                # lanes 0..1 += lanes 2..3
        vhaddps ymm0, ymm0, ymm0                # lane 0 = lane 0 + lane 1
        mov     rax, r10                        # i = elements already consumed
        cmp     r9, r10
        je      .LBB0_7                         # n was a multiple of 32: no tail
        .p2align        4, 0x90
.LBB0_6:
        # Scalar loop: the whole input when n < 32, otherwise the n mod 32
        # tail. rax = i, r9 = n, xmm0 lane 0 = running sum.
        vmovss  xmm1, dword ptr [rdi + 4*rax]   # a[i]
        vmovss  xmm2, dword ptr [rdx + 4*rax]   # b[i]
        vmovss  xmm3, dword ptr [r8 + 4*rax]    # c[i]
        inc     rax
        vmulss  xmm1, xmm1, xmm1
        vmulss  xmm2, xmm2, xmm2
        vmulss  xmm3, xmm3, xmm3
        vaddss  xmm0, xmm1, xmm0
        vaddss  xmm0, xmm0, xmm2
        vaddss  xmm0, xmm0, xmm3
        cmp     rax, r9
        jb      .LBB0_6                         # unsigned compare: i < n
.LBB0_7:
        vzeroupper                              # clear ymm upper halves before returning
        ret
.Lfunc_end0:
        .size   dot3_fast, .Lfunc_end0-dot3_fast
        .cfi_endproc
        .section        ".note.GNU-stack","",@progbits
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment