Created
August 17, 2013 19:53
-
-
Save 9il/6258443 to your computer and use it in GitHub Desktop.
GDC-4.8.1 assembler output for dot product function.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.section .text._D4simd10dotproduct18__T10dotProductTfZ10dotProductFNexAfxAfZf,"axG",@progbits,_D4simd10dotproduct18__T10dotProductTfZ10dotProductFNexAfxAfZf,comdat | |
.p2align 4,,15 | |
.weak _D4simd10dotproduct18__T10dotProductTfZ10dotProductFNexAfxAfZf | |
.type _D4simd10dotproduct18__T10dotProductTfZ10dotProductFNexAfxAfZf, @function | |
_D4simd10dotproduct18__T10dotProductTfZ10dotProductFNexAfxAfZf: | |
.LFB412: | |
.cfi_startproc | |
mov rdx, rdi | |
lea r10, [rsi+rdi*4] | |
and rdi, -32 | |
lea rdi, [rsi+rdi*4] | |
and rdx, -8 | |
mov rax, rsi | |
mov r8, rcx | |
lea r9, [rsi+rdx*4] | |
cmp rsi, rdi | |
jae .L2454 | |
vxorps xmm1, xmm1, xmm1 | |
mov r11, rcx | |
vmovaps ymm2, ymm1 | |
vmovaps ymm0, ymm1 | |
vmovaps ymm3, ymm1 | |
.p2align 4,,10 | |
.p2align 3 | |
.L2448: | |
vmovups ymm5, YMMWORD PTR [rax] | |
sub rax, -128 | |
sub r11, -128 | |
vmovups ymm4, YMMWORD PTR [r11-128] | |
vmovups ymm6, YMMWORD PTR [rax-96] | |
vmovups ymm7, YMMWORD PTR [r11-96] | |
vfmadd231ps ymm3, ymm5, ymm4 | |
vmovups ymm8, YMMWORD PTR [rax-64] | |
vmovups ymm9, YMMWORD PTR [r11-64] | |
vfmadd231ps ymm0, ymm6, ymm7 | |
vmovups ymm10, YMMWORD PTR [rax-32] | |
vmovups ymm11, YMMWORD PTR [r11-32] | |
cmp rdi, rax | |
vfmadd231ps ymm2, ymm8, ymm9 | |
vfmadd231ps ymm1, ymm10, ymm11 | |
ja .L2448 | |
vaddps ymm12, ymm3, ymm0 | |
mov rax, rsi | |
not rax | |
vaddps ymm13, ymm2, ymm1 | |
add rdi, rax | |
and rdi, -128 | |
lea r8, [rdi+128] | |
lea rax, [rsi+r8] | |
add r8, rcx | |
vaddps ymm14, ymm12, ymm13 | |
.L2446: | |
cmp r9, rax | |
jbe .L2449 | |
mov rcx, r8 | |
mov rsi, rax | |
.p2align 4,,10 | |
.p2align 3 | |
.L2451: | |
vmovups ymm15, YMMWORD PTR [rsi] | |
add rsi, 32 | |
add rcx, 32 | |
vmovups ymm1, YMMWORD PTR [rcx-32] | |
cmp r9, rsi | |
vfmadd231ps ymm14, ymm15, ymm1 | |
ja .L2451 | |
mov rdx, rax | |
not rdx | |
add rdx, r9 | |
and rdx, -32 | |
add rdx, 32 | |
add rax, rdx | |
add r8, rdx | |
.L2449: | |
vhaddps ymm2, ymm14, ymm14 | |
cmp r10, rax | |
vhaddps ymm0, ymm2, ymm2 | |
vperm2f128 ymm3, ymm0, ymm0, 17 | |
vaddps ymm0, ymm0, ymm3 | |
jbe .L2455 | |
.p2align 4,,10 | |
.p2align 3 | |
.L2453: | |
vmovss xmm5, DWORD PTR [rax] | |
add rax, 4 | |
vfmadd231ss xmm0, xmm5, DWORD PTR [r8] | |
add r8, 4 | |
cmp r10, rax | |
ja .L2453 | |
.L2455: | |
vzeroupper | |
ret |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment