Skip to content

Instantly share code, notes, and snippets.

@9il
Created August 17, 2013 19:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 9il/6258443 to your computer and use it in GitHub Desktop.
Save 9il/6258443 to your computer and use it in GitHub Desktop.
GDC-4.8.1 assembler output for dot product function.
# ----------------------------------------------------------------------
# dotProduct!float  (D template instance, GDC 4.8.1 compiler output)
# Symbol: _D4simd10dotproduct18__T10dotProductTfZ10dotProductFNexAfxAfZf
#
# Syntax : GAS with Intel operand order (dst, src) -- see directive below.
# ISA    : needs AVX and FMA (vfmadd231ps / vfmadd231ss).
# ABI    : presumably System V AMD64 with D slices passed as
#          (length, pointer) pairs -- TODO confirm against the GDC ABI.
#
# In (as actually consumed by the code):
#   rdi = number of float elements to process
#   rsi = pointer to the first float array  ("a")
#   rcx = pointer to the second float array ("b")
#   rdx = overwritten before being read; if it carries the length of
#         "b", that length is never checked here.
# Out:
#   xmm0[31:0] = sum of a[i] * b[i] for i in [0, rdi)
# Clobbers: rax, rcx, rdx, rsi, rdi, r8-r11, ymm0-ymm15, flags.
# Stack   : untouched (leaf function, no calls).
#
# Shape of the algorithm:
#   (1) 32 floats per iteration into four independent accumulators,
#   (2) 8 floats per iteration into one accumulator,
#   (3) horizontal reduction of the vector accumulator,
#   (4) one float per iteration for the remaining tail.
# All loads are unaligned (vmovups), so no alignment is assumed.
# ----------------------------------------------------------------------
        .intel_syntax noprefix
        .section .text._D4simd10dotproduct18__T10dotProductTfZ10dotProductFNexAfxAfZf,"axG",@progbits,_D4simd10dotproduct18__T10dotProductTfZ10dotProductFNexAfxAfZf,comdat
        .p2align 4,,15
        .weak _D4simd10dotproduct18__T10dotProductTfZ10dotProductFNexAfxAfZf
        .type _D4simd10dotproduct18__T10dotProductTfZ10dotProductFNexAfxAfZf, @function
_D4simd10dotproduct18__T10dotProductTfZ10dotProductFNexAfxAfZf:
.LFB412:
        .cfi_startproc
        # --- Compute the three end pointers into "a" -------------------
        mov rdx, rdi                    # rdx = n (incoming rdx is discarded)
        lea r10, [rsi+rdi*4]            # r10 = a + n*4            : end of all data
        and rdi, -32                    # n rounded down to a multiple of 32
        lea rdi, [rsi+rdi*4]            # rdi = end of the 32-float-block region
        and rdx, -8                     # n rounded down to a multiple of 8
        mov rax, rsi                    # rax = cursor into a
        mov r8, rcx                     # r8  = cursor into b
        lea r9, [rsi+rdx*4]             # r9  = end of the 8-float-block region
        cmp rsi, rdi
        jae .L2454                      # fewer than 32 elements: skip stage (1)

        # --- Stage (1): 32 floats per iteration, 4 accumulators --------
        vxorps xmm1, xmm1, xmm1         # VEX write also clears the upper half of ymm1
        mov r11, rcx                    # r11 = private cursor into b for this loop
        vmovaps ymm2, ymm1              # ymm0..ymm3 = 0.0 (independent partial sums)
        vmovaps ymm0, ymm1
        vmovaps ymm3, ymm1
        .p2align 4,,10
        .p2align 3
.L2448:
        vmovups ymm5, YMMWORD PTR [rax]
        sub rax, -128                   # advance a by 128 bytes (32 floats)
        sub r11, -128                   # advance b by 128 bytes
        vmovups ymm4, YMMWORD PTR [r11-128]
        vmovups ymm6, YMMWORD PTR [rax-96]
        vmovups ymm7, YMMWORD PTR [r11-96]
        vfmadd231ps ymm3, ymm5, ymm4    # ymm3 += a[0..7]   * b[0..7]
        vmovups ymm8, YMMWORD PTR [rax-64]
        vmovups ymm9, YMMWORD PTR [r11-64]
        vfmadd231ps ymm0, ymm6, ymm7    # ymm0 += a[8..15]  * b[8..15]
        vmovups ymm10, YMMWORD PTR [rax-32]
        vmovups ymm11, YMMWORD PTR [r11-32]
        cmp rdi, rax                    # flags survive the FMAs below
        vfmadd231ps ymm2, ymm8, ymm9    # ymm2 += a[16..23] * b[16..23]
        vfmadd231ps ymm1, ymm10, ymm11  # ymm1 += a[24..31] * b[24..31]
        ja .L2448

        # Fold the four accumulators into ymm14 and recompute both
        # cursors from the original base pointers (rsi = a, rcx = b).
        vaddps ymm12, ymm3, ymm0
        mov rax, rsi
        not rax                         # rax = -a - 1
        vaddps ymm13, ymm2, ymm1
        add rdi, rax                    # rdi = (block_end - a) - 1
        and rdi, -128
        lea r8, [rdi+128]               # r8  = bytes consumed by stage (1)
        lea rax, [rsi+r8]               # rax = a + consumed
        add r8, rcx                     # r8  = b + consumed
        vaddps ymm14, ymm12, ymm13      # ymm14 = combined vector accumulator

        # --- Stage (2): 8 floats per iteration -------------------------
        # Invariant on entry: ymm14 = running vector sum, rax/r8 = cursors
        # into a/b, r9 = end of 8-float region, r10 = end of a.
.L2446:
        cmp r9, rax
        jbe .L2449                      # no full 8-float block left
        mov rcx, r8                     # rcx = loop cursor into b
        mov rsi, rax                    # rsi = loop cursor into a
        .p2align 4,,10
        .p2align 3
.L2451:
        vmovups ymm15, YMMWORD PTR [rsi]
        add rsi, 32
        add rcx, 32
        vmovups ymm1, YMMWORD PTR [rcx-32]
        cmp r9, rsi
        vfmadd231ps ymm14, ymm15, ymm1  # ymm14 += a[0..7] * b[0..7]
        ja .L2451
        mov rdx, rax
        not rdx
        add rdx, r9                     # rdx = (r9 - rax) - 1
        and rdx, -32
        add rdx, 32                     # rdx = bytes consumed by stage (2)
        add rax, rdx                    # step both cursors past the consumed bytes
        add r8, rdx

        # --- Stage (3): horizontal reduction of ymm14 ------------------
.L2449:
        vhaddps ymm2, ymm14, ymm14      # pairwise sums within each 128-bit lane
        cmp r10, rax                    # tail test; vector ops below keep flags
        vhaddps ymm0, ymm2, ymm2        # every element = sum of its own lane
        vperm2f128 ymm3, ymm0, ymm0, 17 # imm 0x11: copy the high lane into both lanes
        vaddps ymm0, ymm0, ymm3         # low lane = low sum + high sum = total
        jbe .L2455                      # no scalar tail

        # --- Stage (4): scalar tail, one float per iteration -----------
        .p2align 4,,10
        .p2align 3
.L2453:
        vmovss xmm5, DWORD PTR [rax]
        add rax, 4
        vfmadd231ss xmm0, xmm5, DWORD PTR [r8]  # xmm0 += a[i] * b[i]
        add r8, 4
        cmp r10, rax
        ja .L2453
.L2455:
        vzeroupper                      # leave AVX upper state clean for the caller
        ret

        # --- Cold path: input shorter than 32 elements -----------------
        # This target of the early `jae` was missing from the listing.
        # rax, r8, r9 and r10 are already set up at the branch; the only
        # state stage (2) still needs is a zeroed accumulator.
.L2454:
        vxorps xmm14, xmm14, xmm14      # ymm14 = 0.0 (VEX write clears upper half)
        jmp .L2446
        .cfi_endproc
        .size _D4simd10dotproduct18__T10dotProductTfZ10dotProductFNexAfxAfZf, .-_D4simd10dotproduct18__T10dotProductTfZ10dotProductFNexAfxAfZf
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment