Skip to content

Instantly share code, notes, and snippets.

@ChillFish8
Created July 3, 2024 08:53
Show Gist options
  • Save ChillFish8/cbe593135ccc04a46e2b28dababc64f3 to your computer and use it in GitHub Desktop.
Save ChillFish8/cbe593135ccc04a46e2b28dababc64f3 to your computer and use it in GitHub Desktop.
# -----------------------------------------------------------------------
# blogpost::avx2_dot::avx2_dot_product
#
# Purpose: multiplies corresponding 4-byte float elements of two arrays
#          and accumulates the products; the scalar total is left in the
#          low lane of xmm0 when the routine returns.
# Syntax:  GAS directives, Intel operand order (op dst, src).
# Target:  COFF object with SEH unwind info and CodeView debug directives
#          (.seh_* / .cv_*), i.e. a Windows x64 build.
# In:      rcx = base address of the first array
#          rdx = element count of the first array
#          r8  = base address of the second array
#          r9  = element count of the second array
#          (presumably the (ptr, len) pairs of two f32 slices - TODO
#          confirm against the Rust source)
# Out:     xmm0 (low lane) = sum of products
# Uses:    rax (element index), r9/r10 (loop counters), ymm0 (accumulator),
#          ymm1-ymm4 (temporaries), flags. No callee-saved register of the
#          Microsoft x64 convention is written.
# Stack:   104 bytes. [rsp+40] and [rsp+48] hold copies of the two counts;
#          [rsp+56 .. rsp+96) is only filled in on the failure path. The
#          lowest 32 bytes are never written here, which is consistent with
#          shadow space for the outgoing call.
# Failure: if the two counts differ, control goes to .LBB4_16, which calls
#          core::panicking::assert_failed.
# Note:    products are formed with vmulps and added with a separate vaddps
#          (no fused multiply-add), and every add targets ymm0, so the
#          unrolled adds form a single dependency chain.
# -----------------------------------------------------------------------
.section .text,"xr",one_only,blogpost::avx2_dot::avx2_dot_product
.globl blogpost::avx2_dot::avx2_dot_product
# Align the entry point to 16 bytes, padding with 0x90 (NOP).
.p2align 4, 0x90
blogpost::avx2_dot::avx2_dot_product:
.cv_func_id 6
.seh_proc _ZN8blogpost8avx2_dot16avx2_dot_product17hffb9005f074b96fbE
# Prologue: reserve the 104-byte frame and describe it to the unwinder.
sub rsp, 104
.seh_stackalloc 104
.seh_endprologue
# Spill both counts to memory: the failure path passes their addresses.
mov qword ptr [rsp + 40], rdx
mov qword ptr [rsp + 48], r9
# Length check: unequal counts take the out-of-line failure path.
cmp rdx, r9
jne .LBB4_16
# rax = count rounded down to a multiple of 8 (floats per 256-bit vector).
# Zero means there is not even one full vector: skip all vector loops.
mov rax, rdx
and rax, -8
je .LBB4_2
# r10 = ((rax - 1) shifted right by 3) + 1 = number of full 8-float vectors.
lea r10, [rax - 1]
shr r10, 3
inc r10
# r9 = vector count modulo 8 = vectors left over after the 8x-unrolled loop.
# (The 32-bit write zero-extends into the full r9.)
mov r9d, r10d
and r9d, 7
# rax is a nonzero multiple of 8 here, so rax at or above 57 means at least
# 64 elements, i.e. at least one full pass of the 8x-unrolled loop.
cmp rax, 57
jae .LBB4_14
# Fewer than 8 vectors: clear accumulator and index, go to leftover loop.
vxorps xmm0, xmm0, xmm0
xor eax, eax
jmp .LBB4_5
.LBB4_2:
# No full vector at all: index = 0, accumulator = 0, go straight to the
# horizontal reduction (which then falls into the scalar tail).
xor eax, eax
vxorps xmm0, xmm0, xmm0
jmp .LBB4_7
.LBB4_14:
# r10 = vector count rounded down to a multiple of 8: the unrolled loop's
# down-counter. The VEX-encoded xmm xor also clears the upper half of ymm0.
and r10, -8
vxorps xmm0, xmm0, xmm0
xor eax, eax
.p2align 4, 0x90
.LBB4_15:
# 8x-unrolled vector loop. Each pass handles 8 vectors (byte offsets 0..224
# in steps of 32) = 64 floats: unaligned load from the first array, multiply
# by the matching vector of the second array, add into ymm0.
# Invariant: rax = elements consumed so far, r10 = vectors still to do here.
.cv_inline_site_id 7 within 6 inlined_at 6 24 0
.cv_inline_site_id 8 within 7 inlined_at 8 1465 0
vmovups ymm1, ymmword ptr [rcx + 4*rax]
vmovups ymm2, ymmword ptr [rcx + 4*rax + 32]
vmovups ymm3, ymmword ptr [rcx + 4*rax + 64]
vmovups ymm4, ymmword ptr [rcx + 4*rax + 96]
.cv_inline_site_id 9 within 6 inlined_at 6 27 0
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax]
.cv_inline_site_id 10 within 6 inlined_at 6 28 0
vaddps ymm0, ymm0, ymm1
vmulps ymm1, ymm2, ymmword ptr [r8 + 4*rax + 32]
vaddps ymm0, ymm0, ymm1
vmulps ymm1, ymm3, ymmword ptr [r8 + 4*rax + 64]
vaddps ymm0, ymm0, ymm1
vmulps ymm1, ymm4, ymmword ptr [r8 + 4*rax + 96]
vaddps ymm0, ymm0, ymm1
vmovups ymm1, ymmword ptr [rcx + 4*rax + 128]
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 128]
vaddps ymm0, ymm0, ymm1
vmovups ymm1, ymmword ptr [rcx + 4*rax + 160]
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 160]
vaddps ymm0, ymm0, ymm1
vmovups ymm1, ymmword ptr [rcx + 4*rax + 192]
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 192]
vaddps ymm0, ymm0, ymm1
vmovups ymm1, ymmword ptr [rcx + 4*rax + 224]
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 224]
# Advance the index by 64 elements and count down 8 vectors. The vaddps
# between the counter update and the jne does not modify flags, so the
# branch still tests the result of "add r10, -8".
add rax, 64
add r10, -8
vaddps ymm0, ymm0, ymm1
jne .LBB4_15
.LBB4_5:
# Leftover vectors (0..7) that did not fill a whole unrolled pass.
test r9, r9
je .LBB4_7
.p2align 4, 0x90
.LBB4_6:
# One 8-float vector per pass; r9 counts the remaining vectors. As above,
# jne consumes the flags of "dec r9" across the flag-neutral vaddps.
vmovups ymm1, ymmword ptr [rcx + 4*rax]
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax]
add rax, 8
dec r9
vaddps ymm0, ymm0, ymm1
jne .LBB4_6
.LBB4_7:
# Horizontal reduction of the 8 lanes of ymm0 to one scalar, interleaved
# with the "any scalar tail left?" comparison.
.cv_inline_site_id 11 within 6 inlined_at 6 42 0
.cv_inline_site_id 12 within 11 inlined_at 6 61 0
# xmm1 = upper 128 bits of the accumulator.
vextractf128 xmm1, ymm0, 1
# r9 = index - count. The flags set by this sub are consumed by the jae
# further down; none of the vector instructions in between touch flags.
mov r9, rax
sub r9, rdx
.cv_inline_site_id 13 within 11 inlined_at 6 63 0
# 8 lanes -> 4 lanes.
vaddps xmm0, xmm1, xmm0
.cv_inline_site_id 14 within 11 inlined_at 6 66 0
# Swap the two 64-bit halves so the high pair lines up with the low pair.
vshufpd xmm1, xmm0, xmm0, 1
.cv_inline_site_id 15 within 11 inlined_at 6 67 0
# 4 lanes -> 2 lanes (in the low half).
vaddps xmm0, xmm0, xmm1
.cv_inline_site_id 16 within 11 inlined_at 6 71 0
# Copy lane 1 down to lane 0 and add: the low lane of xmm0 is now the total.
vmovshdup xmm1, xmm0
vaddss xmm0, xmm0, xmm1
# Unsigned index at or above count: every element was covered by vectors.
jae .LBB4_13
# Scalar tail. r10 = (count - index) modulo 8 = elements to peel one at a
# time before the 8x-unrolled scalar loop.
mov r10d, edx
sub r10d, eax
and r10d, 7
je .LBB4_10
.p2align 4, 0x90
.LBB4_9:
# One element per pass: load, multiply, add into the scalar total.
vmovss xmm1, dword ptr [rcx + 4*rax]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax]
inc rax
dec r10
vaddss xmm0, xmm0, xmm1
jne .LBB4_9
.LBB4_10:
# r9 still holds (index before the tail) - count, a negative value. If it
# is above -8 (unsigned), fewer than 8 elements were left and the peel loop
# already handled all of them.
# NOTE(review): on every path reaching here the index equals the count
# rounded down to a multiple of 8, so at most 7 elements remain and the
# unrolled loop below looks unreachable in practice - confirm.
cmp r9, -8
ja .LBB4_13
# rdx = -count and rax = index + 7, so the loop can address 8 elements with
# displacements -28..0 and detect the end with a single lea/cmp.
neg rdx
add rax, 7
.p2align 4, 0x90
.LBB4_12:
# 8x-unrolled scalar loop: elements (rax-7)..rax of both arrays per pass.
vmovss xmm1, dword ptr [rcx + 4*rax - 28]
vmovss xmm2, dword ptr [rcx + 4*rax - 24]
# r9 = rax + 8 - count, computed with lea (no flag change) before rax is
# advanced; it equals 7 exactly when this pass covers the last 8 elements.
lea r9, [rdx + rax + 8]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 28]
vaddss xmm0, xmm0, xmm1
vmulss xmm1, xmm2, dword ptr [r8 + 4*rax - 24]
vaddss xmm0, xmm0, xmm1
vmovss xmm1, dword ptr [rcx + 4*rax - 20]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 20]
vaddss xmm0, xmm0, xmm1
vmovss xmm1, dword ptr [rcx + 4*rax - 16]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 16]
vaddss xmm0, xmm0, xmm1
vmovss xmm1, dword ptr [rcx + 4*rax - 12]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 12]
vaddss xmm0, xmm0, xmm1
vmovss xmm1, dword ptr [rcx + 4*rax - 8]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 8]
vaddss xmm0, xmm0, xmm1
vmovss xmm1, dword ptr [rcx + 4*rax - 4]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 4]
vaddss xmm0, xmm0, xmm1
vmovss xmm1, dword ptr [rcx + 4*rax]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax]
add rax, 8
vaddss xmm0, xmm0, xmm1
cmp r9, 7
jne .LBB4_12
.LBB4_13:
# Epilogue: release the frame, clear the upper halves of the ymm registers
# before returning to the caller, and return with the total in xmm0.
add rsp, 104
vzeroupper
ret
.LBB4_16:
# Failure path for unequal counts. Argument registers at the call:
#   rcx = address of the saved first count   ([rsp+40])
#   rdx = address of the saved second count  ([rsp+48])
#   r8  = address of a 40-byte record built at [rsp+56]:
#         { address of __unnamed_2, 1, address of __unnamed_3, 0, 0 }
#         (presumably a core::fmt::Arguments value - TODO confirm)
#   r9  = address of __unnamed_5 (presumably the source location - confirm)
lea rax, [rip + __unnamed_2]
lea rcx, [rip + __unnamed_3]
lea r9, [rip + __unnamed_5]
lea rdx, [rsp + 48]
lea r8, [rsp + 56]
# xmm0 = 0, used below to zero the last 16 bytes of the record.
vxorps xmm0, xmm0, xmm0
mov qword ptr [rsp + 56], rax
mov qword ptr [rsp + 64], 1
mov qword ptr [rsp + 72], rcx
# rcx was only a temporary above; now load the real first argument.
lea rcx, [rsp + 40]
vmovups xmmword ptr [rsp + 80], xmm0
call core::panicking::assert_failed
# Breakpoint trap after the call; presumably the callee never returns.
int3
# -----------------------------------------------------------------------
# blogpost::avx2_dot::avx2_dot_product  (listing with most assembler
# directives removed)
#
# NOTE(review): this label is also defined earlier in the file; the two
# definitions could not be assembled together as one unit - presumably this
# is a second, trimmed rendering of the same routine. Confirm.
#
# Purpose: multiplies corresponding 4-byte float elements of two arrays
#          and sums the products; the total is in the low lane of xmm0 at
#          the ret.
# Syntax:  Intel operand order (op dst, src).
# In:      rcx = base address of the first array,  rdx = its element count
#          r8  = base address of the second array, r9  = its element count
#          (presumably two f32 slices passed as (ptr, len) - TODO confirm)
# Out:     xmm0 (low lane) = sum of products
# Uses:    rax (element index), r9/r10 (counters), ymm0 (accumulator),
#          ymm1-ymm4 (temporaries), flags.
# Stack:   104-byte frame; [rsp+40]/[rsp+48] hold the two counts,
#          [rsp+56 .. rsp+96) is written only on the failure path.
# Failure: unequal counts branch to .LBB4_16, which calls
#          core::panicking::assert_failed.
# -----------------------------------------------------------------------
blogpost::avx2_dot::avx2_dot_product:
# Reserve the frame and keep addressable copies of both counts for the
# failure path.
sub rsp, 104
mov qword ptr [rsp + 40], rdx
mov qword ptr [rsp + 48], r9
# Counts must match, otherwise take the failure path.
cmp rdx, r9
jne .LBB4_16
# rax = count rounded down to a multiple of 8; zero means no full vector.
mov rax, rdx
and rax, -8
je .LBB4_2
# r10 = number of full 8-float vectors = ((rax - 1) shifted right 3) + 1.
lea r10, [rax - 1]
shr r10, 3
inc r10
# r9 = vectors left over after the 8x-unrolled loop (vector count mod 8).
mov r9d, r10d
and r9d, 7
# rax is a nonzero multiple of 8, so this tests for at least 64 elements,
# i.e. at least one full unrolled pass.
cmp rax, 57
jae .LBB4_14
# Too short for the unrolled loop: zero accumulator and index.
vxorps xmm0, xmm0, xmm0
xor eax, eax
jmp .LBB4_5
.LBB4_2:
# No full vector: index = 0, accumulator = 0, go to the reduction/tail.
xor eax, eax
vxorps xmm0, xmm0, xmm0
jmp .LBB4_7
.LBB4_14:
# r10 becomes the down-counter of vectors handled by the unrolled loop.
and r10, -8
vxorps xmm0, xmm0, xmm0
xor eax, eax
.LBB4_15:
# 8x-unrolled vector loop: 8 vectors (64 floats, 256 bytes per array) per
# pass. Each step is load, multiply by the second array, add into ymm0.
vmovups ymm1, ymmword ptr [rcx + 4*rax]
vmovups ymm2, ymmword ptr [rcx + 4*rax + 32]
vmovups ymm3, ymmword ptr [rcx + 4*rax + 64]
vmovups ymm4, ymmword ptr [rcx + 4*rax + 96]
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax]
vaddps ymm0, ymm0, ymm1
vmulps ymm1, ymm2, ymmword ptr [r8 + 4*rax + 32]
vaddps ymm0, ymm0, ymm1
vmulps ymm1, ymm3, ymmword ptr [r8 + 4*rax + 64]
vaddps ymm0, ymm0, ymm1
vmulps ymm1, ymm4, ymmword ptr [r8 + 4*rax + 96]
vaddps ymm0, ymm0, ymm1
vmovups ymm1, ymmword ptr [rcx + 4*rax + 128]
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 128]
vaddps ymm0, ymm0, ymm1
vmovups ymm1, ymmword ptr [rcx + 4*rax + 160]
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 160]
vaddps ymm0, ymm0, ymm1
vmovups ymm1, ymmword ptr [rcx + 4*rax + 192]
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 192]
vaddps ymm0, ymm0, ymm1
vmovups ymm1, ymmword ptr [rcx + 4*rax + 224]
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 224]
# Index += 64 elements, counter -= 8 vectors. The jne tests the result of
# "add r10, -8"; the vaddps in between leaves the flags untouched.
add rax, 64
add r10, -8
vaddps ymm0, ymm0, ymm1
jne .LBB4_15
.LBB4_5:
# Any leftover vectors (0..7)?
test r9, r9
je .LBB4_7
.p2align 4, 0x90
.LBB4_6:
# One vector per pass until r9 reaches zero.
vmovups ymm1, ymmword ptr [rcx + 4*rax]
vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax]
add rax, 8
dec r9
vaddps ymm0, ymm0, ymm1
jne .LBB4_6
.LBB4_7:
# Reduce the 8 accumulator lanes to a scalar while also computing
# r9 = index - count for the tail decision.
.cv_inline_site_id 11 within 6 inlined_at 6 42 0
.cv_inline_site_id 12 within 11 inlined_at 6 61 0
# Upper 128 bits of ymm0 into xmm1.
vextractf128 xmm1, ymm0, 1
# The flags from this sub are read by the jae below; the vector
# instructions in between do not modify them.
mov r9, rax
sub r9, rdx
.cv_inline_site_id 13 within 11 inlined_at 6 63 0
# 8 lanes folded to 4.
vaddps xmm0, xmm1, xmm0
.cv_inline_site_id 14 within 11 inlined_at 6 66 0
# Exchange the 64-bit halves.
vshufpd xmm1, xmm0, xmm0, 1
.cv_inline_site_id 15 within 11 inlined_at 6 67 0
# 4 lanes folded to 2.
vaddps xmm0, xmm0, xmm1
.cv_inline_site_id 16 within 11 inlined_at 6 71 0
# Bring lane 1 down to lane 0 and add: low lane of xmm0 = vector total.
vmovshdup xmm1, xmm0
vaddss xmm0, xmm0, xmm1
# Index at or above count (unsigned): nothing left for the scalar tail.
jae .LBB4_13
# r10 = (count - index) mod 8 = elements to handle singly first.
mov r10d, edx
sub r10d, eax
and r10d, 7
je .LBB4_10
.LBB4_9:
# Scalar peel loop: one multiply-and-add per pass.
vmovss xmm1, dword ptr [rcx + 4*rax]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax]
inc rax
dec r10
vaddss xmm0, xmm0, xmm1
jne .LBB4_9
.LBB4_10:
# r9 = (index before the tail) - count. Above -8 (unsigned) means fewer
# than 8 elements were left, all already handled by the peel loop.
# NOTE(review): the index is always the count rounded down to a multiple
# of 8 when the tail starts, so the loop below looks unreachable in
# practice - confirm.
cmp r9, -8
ja .LBB4_13
# Bias the registers so one pass can use displacements -28..0 and a
# single lea/cmp end test: rdx = -count, rax = index + 7.
neg rdx
add rax, 7
.LBB4_12:
# 8x-unrolled scalar loop over elements (rax-7)..rax.
vmovss xmm1, dword ptr [rcx + 4*rax - 28]
vmovss xmm2, dword ptr [rcx + 4*rax - 24]
# r9 = rax + 8 - count (lea leaves flags alone); 7 marks the final pass.
lea r9, [rdx + rax + 8]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 28]
vaddss xmm0, xmm0, xmm1
vmulss xmm1, xmm2, dword ptr [r8 + 4*rax - 24]
vaddss xmm0, xmm0, xmm1
vmovss xmm1, dword ptr [rcx + 4*rax - 20]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 20]
vaddss xmm0, xmm0, xmm1
vmovss xmm1, dword ptr [rcx + 4*rax - 16]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 16]
vaddss xmm0, xmm0, xmm1
vmovss xmm1, dword ptr [rcx + 4*rax - 12]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 12]
vaddss xmm0, xmm0, xmm1
vmovss xmm1, dword ptr [rcx + 4*rax - 8]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 8]
vaddss xmm0, xmm0, xmm1
vmovss xmm1, dword ptr [rcx + 4*rax - 4]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 4]
vaddss xmm0, xmm0, xmm1
vmovss xmm1, dword ptr [rcx + 4*rax]
vmulss xmm1, xmm1, dword ptr [r8 + 4*rax]
add rax, 8
vaddss xmm0, xmm0, xmm1
cmp r9, 7
jne .LBB4_12
.LBB4_13:
# Common exit: free the frame, clear upper ymm halves, return xmm0.
add rsp, 104
vzeroupper
ret
.LBB4_16:
# Failure path for unequal counts. At the call:
#   rcx = address of saved first count ([rsp+40])
#   rdx = address of saved second count ([rsp+48])
#   r8  = address of the record at [rsp+56]:
#         { address of __unnamed_2, 1, address of __unnamed_3, 0, 0 }
#         (presumably a formatted-message descriptor - TODO confirm)
#   r9  = address of __unnamed_5 (presumably the source location - confirm)
lea rax, [rip + __unnamed_2]
lea rcx, [rip + __unnamed_3]
lea r9, [rip + __unnamed_5]
lea rdx, [rsp + 48]
lea r8, [rsp + 56]
# Zero source for the last 16 bytes of the record.
vxorps xmm0, xmm0, xmm0
mov qword ptr [rsp + 56], rax
mov qword ptr [rsp + 64], 1
mov qword ptr [rsp + 72], rcx
# rcx held a temporary until now; load the actual first argument.
lea rcx, [rsp + 40]
vmovups xmmword ptr [rsp + 80], xmm0
call core::panicking::assert_failed
# Trap placed after the call; presumably the callee does not return.
int3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment