Skip to content

Instantly share code, notes, and snippets.

@bluss
Last active December 11, 2016 12:16
Show Gist options
  • Save bluss/bf695d2405a4ea36fc5ab78609e57809 to your computer and use it in GitHub Desktop.
Save bluss/bf695d2405a4ea36fc5ab78609e57809 to your computer and use it in GitHub Desktop.
Compiler output using `rustc -Copt-level=3 -Ctarget-cpu=native --emit=asm float_fast.rs -Cllvm-args=-x86-asm-syntax=intel`
# Assembler dialect: GNU as, Intel operand order without register prefixes
# (selected by the .intel_syntax directive below). Target: x86-64 with AVX.
.text
.intel_syntax noprefix
.file "float_fast.cgu-0.rs"
# ----------------------------------------------------------------------
# dot3_fast
#
# Computes, in single precision, the sum over i in [0, n) of
#     a[i]*a[i] + b[i]*b[i] + c[i]*c[i]
# with a = [rdi], b = [rdx], c = [r8] and n = min(rsi, rcx, r9), unsigned.
#
# In:    rdi, rdx, r8 = base addresses of arrays of 4-byte floats
#        rsi, rcx, r9 = unsigned element counts
#        (presumably three (ptr, len) f32 slices in SysV AMD64 argument
#        registers -- TODO confirm against float_fast.rs)
# Out:   xmm0 element 0 = accumulated sum (0.0 when n == 0)
# Clobb: rax, rcx, rsi, r9, r10, r11, ymm0-ymm15, flags
# Stack: not used.
#
# Structure: a main loop handling 32 elements per array per iteration with
# four independent 8-lane accumulators, a horizontal reduction, then a
# scalar loop for the remaining n mod 32 elements (or for all of them when
# n < 32). Because of the four accumulators, additions are performed in a
# different order than a strictly sequential sum would use.
#
# NOTE(review): every element is multiplied by itself; no product is formed
# between elements of different arrays. Confirm this matches the intent of
# the Rust source.
# ----------------------------------------------------------------------
.section .text.dot3_fast,"ax",@progbits
.globl dot3_fast
.p2align 4, 0x90
.type dot3_fast,@function
dot3_fast:
.cfi_startproc
# r9 = n = min(rsi, rcx, r9), using unsigned comparisons.
cmp rcx, r9
cmovbe r9, rcx
cmp r9, rsi
cmovae r9, rsi
# Result register starts at 0.0; with n == 0 that is what gets returned.
vxorps xmm0, xmm0, xmm0
test r9, r9
je .LBB0_7
# xmm0 is already zero here; this second zeroing is redundant as emitted.
vxorps xmm0, xmm0, xmm0
# rax = scalar element index, starting at 0.
xor eax, eax
# Fewer than 32 elements: skip the vector loop and do everything scalar.
cmp r9, 31
jbe .LBB0_6
# eax is still zero from above; redundant as emitted.
xor eax, eax
# r10 = n rounded down to a multiple of 32 = elements the vector loop covers.
mov r10, r9
and r10, -32
# r9 >= 32 on this path, so r10 is non-zero and this branch is not taken.
je .LBB0_6
# Set up moving pointers biased by +96 bytes so that one 128-byte chunk is
# addressed with displacements -96 .. +16. rcx and rsi are reused here; their
# original count values were already folded into r9.
#   rcx -> a (rdi), rax -> b (rdx), rsi -> c (r8)
lea rcx, [rdi + 96]
lea rsi, [r8 + 96]
lea rax, [rdx + 96]
# ymm0..ymm3 = four independent 8-lane accumulators, all zero.
# r11 = elements left for the vector loop.
vxorps ymm0, ymm0, ymm0
mov r11, r10
vxorps ymm1, ymm1, ymm1
vxorps ymm2, ymm2, ymm2
vxorps ymm3, ymm3, ymm3
.p2align 4, 0x90
.LBB0_4:
# --- Vector loop body: 32 floats from each of the three arrays ---
# Each 256-bit vector is assembled from two unaligned 128-bit loads:
# vmovups fills the low half, vinsertf128 (imm 1) fills the high half.
# ymm4..ymm7 = a[i .. i+31]
vmovups xmm4, xmmword ptr [rcx - 96]
vmovups xmm5, xmmword ptr [rcx - 64]
vmovups xmm6, xmmword ptr [rcx - 32]
vmovups xmm7, xmmword ptr [rcx]
vinsertf128 ymm4, ymm4, xmmword ptr [rcx - 80], 1
vinsertf128 ymm5, ymm5, xmmword ptr [rcx - 48], 1
vinsertf128 ymm6, ymm6, xmmword ptr [rcx - 16], 1
vinsertf128 ymm7, ymm7, xmmword ptr [rcx + 16], 1
# ymm8..ymm11 = b[i .. i+31]
vmovups xmm8, xmmword ptr [rax - 96]
vmovups xmm9, xmmword ptr [rax - 64]
vmovups xmm10, xmmword ptr [rax - 32]
vmovups xmm11, xmmword ptr [rax]
vinsertf128 ymm8, ymm8, xmmword ptr [rax - 80], 1
vinsertf128 ymm9, ymm9, xmmword ptr [rax - 48], 1
vinsertf128 ymm10, ymm10, xmmword ptr [rax - 16], 1
vinsertf128 ymm11, ymm11, xmmword ptr [rax + 16], 1
# ymm12..ymm15 = c[i .. i+31]
vmovups xmm12, xmmword ptr [rsi - 96]
vmovups xmm13, xmmword ptr [rsi - 64]
vmovups xmm14, xmmword ptr [rsi - 32]
vmovups xmm15, xmmword ptr [rsi]
vinsertf128 ymm12, ymm12, xmmword ptr [rsi - 80], 1
vinsertf128 ymm13, ymm13, xmmword ptr [rsi - 48], 1
vinsertf128 ymm14, ymm14, xmmword ptr [rsi - 16], 1
vinsertf128 ymm15, ymm15, xmmword ptr [rsi + 16], 1
# Square every loaded element in place (each register times itself).
vmulps ymm4, ymm4, ymm4
vmulps ymm5, ymm5, ymm5
vmulps ymm6, ymm6, ymm6
vmulps ymm7, ymm7, ymm7
vmulps ymm8, ymm8, ymm8
vmulps ymm9, ymm9, ymm9
vmulps ymm10, ymm10, ymm10
vmulps ymm11, ymm11, ymm11
vmulps ymm12, ymm12, ymm12
vmulps ymm13, ymm13, ymm13
vmulps ymm14, ymm14, ymm14
vmulps ymm15, ymm15, ymm15
# Accumulate: acc[k] += a^2, then += b^2, then += c^2, for k = 0..3.
vaddps ymm0, ymm4, ymm0
vaddps ymm1, ymm5, ymm1
vaddps ymm2, ymm6, ymm2
vaddps ymm3, ymm7, ymm3
vaddps ymm0, ymm0, ymm8
vaddps ymm1, ymm1, ymm9
vaddps ymm2, ymm2, ymm10
vaddps ymm3, ymm3, ymm11
vaddps ymm0, ymm0, ymm12
vaddps ymm1, ymm1, ymm13
vaddps ymm2, ymm2, ymm14
vaddps ymm3, ymm3, ymm15
# Advance all three pointers by 128 bytes (32 floats). "sub reg, -128" is
# add 128; -128 fits a sign-extended 8-bit immediate whereas +128 does not.
sub rcx, -128
sub rsi, -128
sub rax, -128
# r11 -= 32; keep looping while vector-loop elements remain.
add r11, -32
jne .LBB0_4
# --- Horizontal reduction of the four accumulators into xmm0[0] ---
# ymm0 = ymm0 + ymm1 + ymm2 + ymm3 (lane-wise).
vaddps ymm0, ymm1, ymm0
vaddps ymm0, ymm2, ymm0
vaddps ymm0, ymm3, ymm0
# Fold the high 128-bit half onto the low half.
vextractf128 xmm1, ymm0, 1
vaddps ymm0, ymm0, ymm1
# Swap the two 64-bit halves of xmm0 and add: lanes 0,1 now hold
# (e0+e2), (e1+e3) of the folded vector.
vpermilpd xmm1, xmm0, 1
vaddps ymm0, ymm0, ymm1
# Add adjacent lanes; element 0 of xmm0 is now the sum of all lanes.
vhaddps ymm0, ymm0, ymm0
# Scalar tail resumes at index r10; if n was a multiple of 32 we are done.
mov rax, r10
cmp r9, r10
je .LBB0_7
.p2align 4, 0x90
.LBB0_6:
# --- Scalar loop: one element per array per iteration ---
# Invariant: rax = current index, rax < r9; xmm0[0] = running sum.
vmovss xmm1, dword ptr [rdi + 4*rax]
vmovss xmm2, dword ptr [rdx + 4*rax]
vmovss xmm3, dword ptr [r8 + 4*rax]
inc rax
vmulss xmm1, xmm1, xmm1
vmulss xmm2, xmm2, xmm2
vmulss xmm3, xmm3, xmm3
# sum += a[i]^2; sum += b[i]^2; sum += c[i]^2 (in this order).
vaddss xmm0, xmm1, xmm0
vaddss xmm0, xmm0, xmm2
vaddss xmm0, xmm0, xmm3
# Unsigned compare: continue while index < n.
cmp rax, r9
jb .LBB0_6
.LBB0_7:
# Common exit: clear the upper halves of the ymm registers before returning.
vzeroupper
ret
.Lfunc_end0:
.size dot3_fast, .Lfunc_end0-dot3_fast
.cfi_endproc
# Empty .note.GNU-stack section with no "x" flag: marks the object as not
# needing an executable stack.
.section ".note.GNU-stack","",@progbits
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment