Created
July 3, 2024 08:53
-
-
Save ChillFish8/cbe593135ccc04a46e2b28dababc64f3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -----------------------------------------------------------------------
# blogpost::avx2_dot::avx2_dot_product  (compiler output, Intel syntax)
#
# Accumulates the sum of a[i] * b[i] over 4-byte single-precision
# elements and leaves the scalar result in xmm0 at `ret`.
#
# Register use as seen in this listing:
#   rcx = base pointer of the first operand  (indexed as [rcx + 4*rax])
#   rdx = element count of the first operand
#   r8  = base pointer of the second operand (indexed as [r8 + 4*rax])
#   r9  = element count of the second operand (reused as a counter later)
#   rax = element index, r10 = loop counter, ymm0 = running sum
# NOTE(review): this matches two (ptr, len) slice arguments under the
# Microsoft x64 convention (the .seh_* / CodeView directives point the
# same way) -- confirm against the Rust signature.
# NOTE(review): no .seh_endproc is visible in this listing; the tail of
# the function's directives may have been cut off.
# -----------------------------------------------------------------------
.section .text,"xr",one_only,blogpost::avx2_dot::avx2_dot_product | |
.globl blogpost::avx2_dot::avx2_dot_product | |
.p2align 4, 0x90 | |
blogpost::avx2_dot::avx2_dot_product: | |
.cv_func_id 6 | |
# The label above is shown demangled; .seh_proc still uses the mangled name.
.seh_proc _ZN8blogpost8avx2_dot16avx2_dot_product17hffb9005f074b96fbE | |
# Reserve 104 bytes of stack. Only the assertion-failure path at
# .LBB4_16 reads or writes this area.
sub rsp, 104 | |
.seh_stackalloc 104 | |
.seh_endprologue | |
# Spill both lengths so the failure path can pass their addresses.
mov qword ptr [rsp + 40], rdx | |
mov qword ptr [rsp + 48], r9 | |
# Lengths must match; otherwise go to the assert_failed call.
cmp rdx, r9 | |
jne .LBB4_16 | |
# rax = len rounded down to a multiple of 8 (elements covered by whole
# 8-float vectors). Zero means fewer than 8 elements: skip the vector loops.
mov rax, rdx | |
and rax, -8 | |
je .LBB4_2 | |
# r10 = ((rax - 1) >> 3) + 1 = number of 8-float chunks.
lea r10, [rax - 1] | |
shr r10, 3 | |
inc r10 | |
# r9 = chunks mod 8 = chunks left over after the 8x-unrolled loop.
# The 32-bit write zero-extends, so the 64-bit r9 is valid below.
mov r9d, r10d | |
and r9d, 7 | |
# rax is a multiple of 8, so ">= 57" means ">= 64": at least one full
# pass of the 8x-unrolled loop is available.
cmp rax, 57 | |
jae .LBB4_14 | |
# Fewer than 64 vectorisable elements: zero the accumulator and index,
# then run only the single-chunk loop.
vxorps xmm0, xmm0, xmm0 | |
xor eax, eax | |
jmp .LBB4_5 | |
.LBB4_2: | |
# No full vector available: zero index and accumulator, go straight to
# the reduction and scalar tail.
xor eax, eax | |
vxorps xmm0, xmm0, xmm0 | |
jmp .LBB4_7 | |
.LBB4_14: | |
# r10 = chunk count rounded down to a multiple of 8 (counter for the
# unrolled loop); accumulator and element index start at zero.
and r10, -8 | |
vxorps xmm0, xmm0, xmm0 | |
xor eax, eax | |
.p2align 4, 0x90 | |
.LBB4_15: | |
# Main loop: 8 chunks (64 floats) per iteration. Each chunk is a
# separate vmulps followed by vaddps into ymm0, so all eight additions
# are chained through the single accumulator register.
.cv_inline_site_id 7 within 6 inlined_at 6 24 0 | |
.cv_inline_site_id 8 within 7 inlined_at 8 1465 0 | |
vmovups ymm1, ymmword ptr [rcx + 4*rax] | |
vmovups ymm2, ymmword ptr [rcx + 4*rax + 32] | |
vmovups ymm3, ymmword ptr [rcx + 4*rax + 64] | |
vmovups ymm4, ymmword ptr [rcx + 4*rax + 96] | |
.cv_inline_site_id 9 within 6 inlined_at 6 27 0 | |
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax] | |
.cv_inline_site_id 10 within 6 inlined_at 6 28 0 | |
vaddps ymm0, ymm0, ymm1 | |
vmulps ymm1, ymm2, ymmword ptr [r8 + 4*rax + 32] | |
vaddps ymm0, ymm0, ymm1 | |
vmulps ymm1, ymm3, ymmword ptr [r8 + 4*rax + 64] | |
vaddps ymm0, ymm0, ymm1 | |
vmulps ymm1, ymm4, ymmword ptr [r8 + 4*rax + 96] | |
vaddps ymm0, ymm0, ymm1 | |
vmovups ymm1, ymmword ptr [rcx + 4*rax + 128] | |
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 128] | |
vaddps ymm0, ymm0, ymm1 | |
vmovups ymm1, ymmword ptr [rcx + 4*rax + 160] | |
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 160] | |
vaddps ymm0, ymm0, ymm1 | |
vmovups ymm1, ymmword ptr [rcx + 4*rax + 192] | |
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 192] | |
vaddps ymm0, ymm0, ymm1 | |
vmovups ymm1, ymmword ptr [rcx + 4*rax + 224] | |
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 224] | |
# Advance 64 elements and count down 8 chunks. The vaddps between the
# add and the jne leaves the flags alone, so jne tests "r10 != 0".
add rax, 64 | |
add r10, -8 | |
vaddps ymm0, ymm0, ymm1 | |
jne .LBB4_15 | |
.LBB4_5: | |
# r9 = number of leftover 8-float chunks (0..7); nothing to do if zero.
test r9, r9 | |
je .LBB4_7 | |
.p2align 4, 0x90 | |
.LBB4_6: | |
# Remainder loop: one 8-float chunk per iteration. jne consumes the
# flags from `dec r9`; vaddps does not modify them.
vmovups ymm1, ymmword ptr [rcx + 4*rax] | |
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax] | |
add rax, 8 | |
dec r9 | |
vaddps ymm0, ymm0, ymm1 | |
jne .LBB4_6 | |
.LBB4_7: | |
# Horizontal reduction of the 8 lanes of ymm0 into lane 0 of xmm0,
# interleaved with r9 = rax - rdx (elements done minus length). The
# flags from that `sub` survive the vector instructions and are
# consumed by the `jae` at the end of this block.
.cv_inline_site_id 11 within 6 inlined_at 6 42 0 | |
.cv_inline_site_id 12 within 11 inlined_at 6 61 0 | |
# xmm1 = upper 128 bits of the accumulator.
vextractf128 xmm1, ymm0, 1 | |
mov r9, rax | |
sub r9, rdx | |
.cv_inline_site_id 13 within 11 inlined_at 6 63 0 | |
# 8 lanes -> 4 lanes.
vaddps xmm0, xmm1, xmm0 | |
.cv_inline_site_id 14 within 11 inlined_at 6 66 0 | |
# Swap the two 64-bit halves so the add folds 4 lanes -> 2.
vshufpd xmm1, xmm0, xmm0, 1 | |
.cv_inline_site_id 15 within 11 inlined_at 6 67 0 | |
vaddps xmm0, xmm0, xmm1 | |
.cv_inline_site_id 16 within 11 inlined_at 6 71 0 | |
# Bring lane 1 down to lane 0 and add: lane 0 now holds the full sum.
vmovshdup xmm1, xmm0 | |
vaddss xmm0, xmm0, xmm1 | |
# Unsigned rax >= rdx: every element has been consumed, no scalar tail.
jae .LBB4_13 | |
# r10 = (len - index) mod 8 = iterations of the one-at-a-time loop.
# The 32-bit ops zero-extend, so `dec r10` below is safe.
mov r10d, edx | |
sub r10d, eax | |
and r10d, 7 | |
je .LBB4_10 | |
.p2align 4, 0x90 | |
.LBB4_9: | |
# Scalar tail: one multiply-add per iteration; jne tests `dec r10`.
vmovss xmm1, dword ptr [rcx + 4*rax] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax] | |
inc rax | |
dec r10 | |
vaddss xmm0, xmm0, xmm1 | |
jne .LBB4_9 | |
.LBB4_10: | |
# r9 still holds (index - len) from before the scalar loop. Unsigned
# r9 > -8 means fewer than 8 elements were outstanding, so the loop
# above already finished them.
# NOTE(review): on every path into .LBB4_7, rax appears to equal
# len & -8 (or 0 when len < 8), so fewer than 8 elements remain and the
# unrolled loop below looks unreachable in practice -- confirm.
cmp r9, -8 | |
ja .LBB4_13 | |
# rdx = -len; bias rax by 7 so the 8 elements of each group sit at byte
# offsets -28..0 from [base + 4*rax].
neg rdx | |
add rax, 7 | |
.p2align 4, 0x90 | |
.LBB4_12: | |
# 8x-unrolled scalar multiply-add.
vmovss xmm1, dword ptr [rcx + 4*rax - 28] | |
vmovss xmm2, dword ptr [rcx + 4*rax - 24] | |
# r9 = rax + 8 - len, computed from this iteration's rax (before the
# `add rax, 8` below); compared against 7 at the bottom of the loop.
lea r9, [rdx + rax + 8] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 28] | |
vaddss xmm0, xmm0, xmm1 | |
vmulss xmm1, xmm2, dword ptr [r8 + 4*rax - 24] | |
vaddss xmm0, xmm0, xmm1 | |
vmovss xmm1, dword ptr [rcx + 4*rax - 20] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 20] | |
vaddss xmm0, xmm0, xmm1 | |
vmovss xmm1, dword ptr [rcx + 4*rax - 16] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 16] | |
vaddss xmm0, xmm0, xmm1 | |
vmovss xmm1, dword ptr [rcx + 4*rax - 12] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 12] | |
vaddss xmm0, xmm0, xmm1 | |
vmovss xmm1, dword ptr [rcx + 4*rax - 8] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 8] | |
vaddss xmm0, xmm0, xmm1 | |
vmovss xmm1, dword ptr [rcx + 4*rax - 4] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 4] | |
vaddss xmm0, xmm0, xmm1 | |
vmovss xmm1, dword ptr [rcx + 4*rax] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax] | |
add rax, 8 | |
vaddss xmm0, xmm0, xmm1 | |
# r9 == 7 means the group just processed ended exactly at len.
cmp r9, 7 | |
jne .LBB4_12 | |
.LBB4_13: | |
# Epilogue: release the frame and clear the upper ymm halves before
# returning; the result stays in xmm0.
add rsp, 104 | |
vzeroupper | |
ret | |
.LBB4_16: | |
# Length mismatch: set up the call to core::panicking::assert_failed.
#   rcx -> spilled first length   ([rsp + 40])
#   rdx -> spilled second length  ([rsp + 48])
#   r8  -> 40-byte block at [rsp + 56]:
#          { &__unnamed_2, 1, &__unnamed_3, 16 zero bytes }
#   r9  =  &__unnamed_5
# NOTE(review): the block is presumably the panic message arguments and
# __unnamed_5 the source location -- their definitions are not visible
# here, so confirm.
lea rax, [rip + __unnamed_2] | |
lea rcx, [rip + __unnamed_3] | |
lea r9, [rip + __unnamed_5] | |
lea rdx, [rsp + 48] | |
lea r8, [rsp + 56] | |
vxorps xmm0, xmm0, xmm0 | |
mov qword ptr [rsp + 56], rax | |
mov qword ptr [rsp + 64], 1 | |
mov qword ptr [rsp + 72], rcx | |
lea rcx, [rsp + 40] | |
vmovups xmmword ptr [rsp + 80], xmm0 | |
call core::panicking::assert_failed | |
# Trap if the call ever returns (it is presumably non-returning).
int3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -----------------------------------------------------------------------
# blogpost::avx2_dot::avx2_dot_product  (trimmed compiler listing)
#
# Accumulates the sum of a[i] * b[i] over 4-byte single-precision
# elements; the scalar result is in xmm0 at `ret`.
#
# Register use as seen in this listing:
#   rcx / r8 = base pointers of the two operands ([reg + 4*rax])
#   rdx / r9 = their element counts (r9 is reused as a counter later)
#   rax = element index, r10 = loop counter, ymm0 = running sum
# NOTE(review): looks like two (ptr, len) slice arguments under the
# Microsoft x64 convention -- confirm against the Rust signature.
# NOTE(review): this listing has most directives removed, but a
# `.p2align` and several `.cv_inline_site_id ... within 6` lines remain
# although no `.cv_func_id 6` appears in it.
# -----------------------------------------------------------------------
blogpost::avx2_dot::avx2_dot_product: | |
# Reserve 104 bytes of stack; only the failure path at .LBB4_16 uses it.
sub rsp, 104 | |
# Spill both lengths so the failure path can pass their addresses.
mov qword ptr [rsp + 40], rdx | |
mov qword ptr [rsp + 48], r9 | |
# Lengths must match; otherwise go to the assert_failed call.
cmp rdx, r9 | |
jne .LBB4_16 | |
# rax = len rounded down to a multiple of 8; zero means there is no
# whole 8-float vector to process.
mov rax, rdx | |
and rax, -8 | |
je .LBB4_2 | |
# r10 = ((rax - 1) >> 3) + 1 = number of 8-float chunks.
lea r10, [rax - 1] | |
shr r10, 3 | |
inc r10 | |
# r9 = chunks mod 8 (zero-extended by the 32-bit write).
mov r9d, r10d | |
and r9d, 7 | |
# rax is a multiple of 8, so ">= 57" is equivalent to ">= 64".
cmp rax, 57 | |
jae .LBB4_14 | |
# Under 64 vectorisable elements: only the single-chunk loop runs.
vxorps xmm0, xmm0, xmm0 | |
xor eax, eax | |
jmp .LBB4_5 | |
.LBB4_2: | |
# Fewer than 8 elements: zero index and accumulator, skip to the tail.
xor eax, eax | |
vxorps xmm0, xmm0, xmm0 | |
jmp .LBB4_7 | |
.LBB4_14: | |
# r10 = chunk count rounded down to a multiple of 8; zero the
# accumulator and the element index.
and r10, -8 | |
vxorps xmm0, xmm0, xmm0 | |
xor eax, eax | |
.LBB4_15: | |
# Main loop: 8 chunks (64 floats) per iteration, each a vmulps followed
# by a vaddps into the single accumulator ymm0.
vmovups ymm1, ymmword ptr [rcx + 4*rax] | |
vmovups ymm2, ymmword ptr [rcx + 4*rax + 32] | |
vmovups ymm3, ymmword ptr [rcx + 4*rax + 64] | |
vmovups ymm4, ymmword ptr [rcx + 4*rax + 96] | |
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax] | |
vaddps ymm0, ymm0, ymm1 | |
vmulps ymm1, ymm2, ymmword ptr [r8 + 4*rax + 32] | |
vaddps ymm0, ymm0, ymm1 | |
vmulps ymm1, ymm3, ymmword ptr [r8 + 4*rax + 64] | |
vaddps ymm0, ymm0, ymm1 | |
vmulps ymm1, ymm4, ymmword ptr [r8 + 4*rax + 96] | |
vaddps ymm0, ymm0, ymm1 | |
vmovups ymm1, ymmword ptr [rcx + 4*rax + 128] | |
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 128] | |
vaddps ymm0, ymm0, ymm1 | |
vmovups ymm1, ymmword ptr [rcx + 4*rax + 160] | |
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 160] | |
vaddps ymm0, ymm0, ymm1 | |
vmovups ymm1, ymmword ptr [rcx + 4*rax + 192] | |
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 192] | |
vaddps ymm0, ymm0, ymm1 | |
vmovups ymm1, ymmword ptr [rcx + 4*rax + 224] | |
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 224] | |
# Advance 64 elements, count down 8 chunks; the trailing vaddps does
# not touch the flags, so jne tests the result of `add r10, -8`.
add rax, 64 | |
add r10, -8 | |
vaddps ymm0, ymm0, ymm1 | |
jne .LBB4_15 | |
.LBB4_5: | |
# r9 = leftover 8-float chunks (0..7).
test r9, r9 | |
je .LBB4_7 | |
.p2align 4, 0x90 | |
.LBB4_6: | |
# One 8-float chunk per iteration; jne tests the result of `dec r9`.
vmovups ymm1, ymmword ptr [rcx + 4*rax] | |
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax] | |
add rax, 8 | |
dec r9 | |
vaddps ymm0, ymm0, ymm1 | |
jne .LBB4_6 | |
.LBB4_7: | |
# Reduce the 8 lanes of ymm0 to a scalar in xmm0 while computing
# r9 = rax - rdx; the flags of that `sub` are consumed by the `jae`
# after the reduction (the vector instructions leave them intact).
.cv_inline_site_id 11 within 6 inlined_at 6 42 0 | |
.cv_inline_site_id 12 within 11 inlined_at 6 61 0 | |
vextractf128 xmm1, ymm0, 1 | |
mov r9, rax | |
sub r9, rdx | |
.cv_inline_site_id 13 within 11 inlined_at 6 63 0 | |
# 8 lanes -> 4 lanes.
vaddps xmm0, xmm1, xmm0 | |
.cv_inline_site_id 14 within 11 inlined_at 6 66 0 | |
# Swap the 64-bit halves so the next add folds 4 lanes -> 2.
vshufpd xmm1, xmm0, xmm0, 1 | |
.cv_inline_site_id 15 within 11 inlined_at 6 67 0 | |
vaddps xmm0, xmm0, xmm1 | |
.cv_inline_site_id 16 within 11 inlined_at 6 71 0 | |
# Add lane 1 into lane 0: lane 0 now holds the full vector sum.
vmovshdup xmm1, xmm0 | |
vaddss xmm0, xmm0, xmm1 | |
# Unsigned rax >= rdx: nothing left for the scalar tail.
jae .LBB4_13 | |
# r10 = (len - index) mod 8, zero-extended.
mov r10d, edx | |
sub r10d, eax | |
and r10d, 7 | |
je .LBB4_10 | |
.LBB4_9: | |
# Scalar tail: one multiply-add per iteration, r10 times.
vmovss xmm1, dword ptr [rcx + 4*rax] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax] | |
inc rax | |
dec r10 | |
vaddss xmm0, xmm0, xmm1 | |
jne .LBB4_9 | |
.LBB4_10: | |
# r9 = (index - len) from before the scalar loop; unsigned r9 > -8
# means fewer than 8 elements were outstanding, so the work is done.
# NOTE(review): rax appears to equal len & -8 on every path into
# .LBB4_7, which would make the unrolled loop below unreachable in
# practice -- confirm.
cmp r9, -8 | |
ja .LBB4_13 | |
# rdx = -len; bias rax by 7 so each group of 8 sits at offsets -28..0.
neg rdx | |
add rax, 7 | |
.LBB4_12: | |
# 8x-unrolled scalar multiply-add.
vmovss xmm1, dword ptr [rcx + 4*rax - 28] | |
vmovss xmm2, dword ptr [rcx + 4*rax - 24] | |
# r9 = rax + 8 - len, taken before rax is advanced below.
lea r9, [rdx + rax + 8] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 28] | |
vaddss xmm0, xmm0, xmm1 | |
vmulss xmm1, xmm2, dword ptr [r8 + 4*rax - 24] | |
vaddss xmm0, xmm0, xmm1 | |
vmovss xmm1, dword ptr [rcx + 4*rax - 20] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 20] | |
vaddss xmm0, xmm0, xmm1 | |
vmovss xmm1, dword ptr [rcx + 4*rax - 16] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 16] | |
vaddss xmm0, xmm0, xmm1 | |
vmovss xmm1, dword ptr [rcx + 4*rax - 12] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 12] | |
vaddss xmm0, xmm0, xmm1 | |
vmovss xmm1, dword ptr [rcx + 4*rax - 8] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 8] | |
vaddss xmm0, xmm0, xmm1 | |
vmovss xmm1, dword ptr [rcx + 4*rax - 4] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 4] | |
vaddss xmm0, xmm0, xmm1 | |
vmovss xmm1, dword ptr [rcx + 4*rax] | |
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax] | |
add rax, 8 | |
vaddss xmm0, xmm0, xmm1 | |
# r9 == 7 means the group just processed ended exactly at len.
cmp r9, 7 | |
jne .LBB4_12 | |
.LBB4_13: | |
# Epilogue: release the frame, clear the upper ymm halves, return with
# the result in xmm0.
add rsp, 104 | |
vzeroupper | |
ret | |
.LBB4_16: | |
# Length mismatch: call core::panicking::assert_failed with
#   rcx -> spilled first length   ([rsp + 40])
#   rdx -> spilled second length  ([rsp + 48])
#   r8  -> block at [rsp + 56]: { &__unnamed_2, 1, &__unnamed_3, 16 zero bytes }
#   r9  =  &__unnamed_5
# NOTE(review): presumably the panic message arguments and the source
# location; their definitions are not visible here -- confirm.
lea rax, [rip + __unnamed_2] | |
lea rcx, [rip + __unnamed_3] | |
lea r9, [rip + __unnamed_5] | |
lea rdx, [rsp + 48] | |
lea r8, [rsp + 56] | |
vxorps xmm0, xmm0, xmm0 | |
mov qword ptr [rsp + 56], rax | |
mov qword ptr [rsp + 64], 1 | |
mov qword ptr [rsp + 72], rcx | |
lea rcx, [rsp + 40] | |
vmovups xmmword ptr [rsp + 80], xmm0 | |
call core::panicking::assert_failed | |
# Trap if the call ever returns (it is presumably non-returning).
int3 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment