Created
April 12, 2024 21:11
-
-
Save bjacob/f7c3844deacf87909a74cfdc79080583 to your computer and use it in GitHub Desktop.
turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack disassembly
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -----------------------------------------------------------------------------
# turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack
#
# Compiler-generated x86-64 code (GAS directives, Intel operand order).
# Copies 16-bit elements from a source buffer into a destination buffer in
# tiles: for each of 64 consecutive columns it gathers one word from each of
# 16 source rows (rows are 6400 bytes apart) and writes those 16 words
# contiguously (32 bytes) to the destination, i.e. 2048 destination bytes per
# 16x64 tile.
#
# Inputs as used by the code (the incoming rdi is overwritten without being
# read):
#   rsi -> structure read at offsets 12, 16, 20, 24, 32
#   rdx -> structure read at offsets 0, 4, 8
#   NOTE(review): presumably SysV AMD64 arguments 2 and 3 carrying dispatch /
#   workgroup state (ids in the rdx structure, counts in the rsi structure) -
#   confirm against the caller.
# Returns: eax = 0.
# Callee-saved registers rbx, r12-r15, rbp are pushed and restored.
#
# Stack frame (rsp is aligned down to 64 bytes, then 256 bytes are reserved):
#   [rsp+8]    current outermost index (starts at word[rdx+8] << 6)
#   [rsp+16]   destination pointer for the current outermost iteration
#   [rsp+24]   destination pointer for the current middle iteration
#   [rsp+32]   64-bit outermost bound (high dword [p+12], low dword [p+8],
#              where p = qword [rsi+24])
#   [rsp+40]   middle-loop start value   = 4 * dword[rdx+4]
#   [rsp+48]   outermost index step      = word[rsi+20] << 6
#   [rsp+56]   outermost dest byte step  = word[rsi+20] * 3538944000
#   [rsp+64]   dword[rdx+4]
#   [rsp+72]   saved rbx (bound minus current outermost index)
#   [rsp+80]   column-loop start value   = dword[rdx] << 6
#   [rsp+88]   middle dest byte step     = dword[rsi+16] * 409600
#   [rsp+96]   middle-loop step          = 4 * dword[rsi+16]
#   [rsp+104]  dword[rdx]
#   [rsp+112]  column-loop step          = dword[rsi+12] << 6
#   [rsp+120]  column dest byte step     = dword[rsi+12] << 11
#   [rsp+128]  32-byte staging buffer for one assembled ymm vector
#
# Constant relationships (plain arithmetic on the immediates below):
#   102400 = 16 * 6400, 409600 = 4 * 102400, 3538944000 = 64 * 55296000,
#   2048 = 64 * 32.
#
# NOTE(review): the trailing "| |" on each line appears to be residue from the
# web page this listing was copied from, not assembler syntax.
# -----------------------------------------------------------------------------
.section .text.turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack,"ax",@progbits | |
.p2align 4, 0x90 | |
.type turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack,@function | |
turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack: | |
.Lfunc_begin1: | |
.loc 1 1 0 is_stmt 1 | |
.cfi_startproc | |
# --- Prologue: frame pointer, callee-saved pushes, 64-byte aligned frame ---
push rbp | |
.cfi_def_cfa_offset 16 | |
.cfi_offset rbp, -16 | |
mov rbp, rsp | |
.cfi_def_cfa_register rbp | |
.Ltmp2: | |
push r15 | |
push r14 | |
push r13 | |
push r12 | |
push rbx | |
and rsp, -64 | |
sub rsp, 256 | |
.cfi_offset rbx, -56 | |
.cfi_offset r12, -48 | |
.cfi_offset r13, -40 | |
.cfi_offset r14, -32 | |
.cfi_offset r15, -24 | |
.loc 1 4 3 prologue_end | |
# --- Outermost range: start = word[rdx+8] << 6, bound = 64-bit value built
#     from two dwords at p+8 (low) and p+12 (high), p = qword [rsi+24] ---
mov r9, qword ptr [rsi + 24] | |
movzx edi, word ptr [rdx + 8] | |
mov ebx, dword ptr [r9 + 12] | |
mov eax, dword ptr [r9 + 8] | |
mov r8d, edi | |
shl r8d, 6 | |
mov qword ptr [rsp + 8], r8 | |
shl rbx, 32 | |
lea rcx, [rbx + rax] | |
mov qword ptr [rsp + 32], rcx | |
# Nothing to do when start >= bound (signed compare): go straight to exit.
cmp r8, rcx | |
jge .LBB1_18 | |
.loc 1 0 3 is_stmt 0 | |
# 15361 = 0x3C01 is the BEXTR control word: start bit 1, length 60.
mov r11d, 15361 | |
movzx r10d, word ptr [rsi + 20] | |
# 3538944000 = 64 * 55296000; 55296000 is the per-index destination step
# used in the .LBB1_8 loop below.
mov r15d, 3538944000 | |
.loc 1 4 3 | |
# rbx = bound (same value as rcx above; the two halves have disjoint bits).
or rbx, rax | |
mov r8, qword ptr [rsi + 32] | |
mov ecx, dword ptr [rsi + 16] | |
mov r14d, dword ptr [rsi + 12] | |
# rbx = bound - start.
sub rbx, qword ptr [rsp + 8] | |
# r9 = bits 1..60 of qword [p]; later scaled by 2 to form a byte offset.
bextr r9, qword ptr [r9], r11 | |
mov r11d, dword ptr [rdx] | |
mov edx, dword ptr [rdx + 4] | |
imul rdi, r15 | |
imul r15, r10 | |
# rsi = source base pointer (first qword of the table at [old rsi + 32]).
mov rsi, qword ptr [r8] | |
shl r10d, 6 | |
imul rax, rdx, 409600 | |
mov qword ptr [rsp + 64], rdx | |
mov qword ptr [rsp + 104], r11 | |
mov qword ptr [rsp + 48], r10 | |
mov qword ptr [rsp + 56], r15 | |
lea r15, [4*rdx] | |
mov rdx, r11 | |
shl rdx, 11 | |
shl r11, 6 | |
add rax, rdi | |
mov qword ptr [rsp + 80], r11 | |
add rdx, rax | |
mov qword ptr [rsp + 40], r15 | |
lea r15, [4*rcx] | |
imul rcx, rcx, 409600 | |
# rax = initial destination pointer:
#   qword [r8+8] + 2*r9 + word[rdx+8]*3538944000
#                + dword[rdx+4]*409600 + dword[rdx]*2048
# (rdx here denotes the incoming rdx structure).
lea rax, [rdx + 2*r9] | |
add rax, qword ptr [r8 + 8] | |
mov rdx, r14 | |
shl rdx, 6 | |
shl r14, 11 | |
mov qword ptr [rsp + 96], r15 | |
mov qword ptr [rsp + 112], rdx | |
mov qword ptr [rsp + 120], r14 | |
mov qword ptr [rsp + 88], rcx | |
mov qword ptr [rsp + 16], rax | |
jmp .LBB1_2 | |
.p2align 4, 0x90 | |
# --- Outermost loop latch: advance index, destination and remaining count by
#     the steps in [rsp+48] / [rsp+56]; exit once index >= bound ---
.LBB1_17: | |
.loc 1 0 3 | |
mov rdx, qword ptr [rsp + 16] | |
mov rax, qword ptr [rsp + 8] | |
mov rcx, qword ptr [rsp + 48] | |
mov rbx, qword ptr [rsp + 72] | |
.loc 1 4 3 | |
add rdx, qword ptr [rsp + 56] | |
add rax, rcx | |
sub rbx, rcx | |
mov qword ptr [rsp + 16], rdx | |
mov qword ptr [rsp + 8], rax | |
cmp rax, qword ptr [rsp + 32] | |
jge .LBB1_18 | |
# --- Outermost loop body ---
.LBB1_2: | |
# rdx = remaining count clamped to [1, 64]: trip count of the .LBB1_8 loop.
# rbx is spilled because it is reused as a scratch pointer in .LBB1_9.
cmp rbx, 64 | |
mov edx, 64 | |
mov eax, 1 | |
mov qword ptr [rsp + 72], rbx | |
cmovl rdx, rbx | |
cmp rdx, 2 | |
cmovl rdx, rax | |
# Skip the middle loop when dword[rdx+4] > 134 (unsigned), i.e. when its
# start value 4*dword[rdx+4] would already be >= 540.
cmp dword ptr [rsp + 64], 134 | |
ja .LBB1_17 | |
.loc 1 0 3 | |
mov rcx, qword ptr [rsp + 32] | |
mov rax, qword ptr [rsp + 16] | |
mov r11, qword ptr [rsp + 40] | |
.loc 1 4 3 | |
# rcx = bound - current outermost index (tested at .LBB1_6).
sub rcx, qword ptr [rsp + 8] | |
mov qword ptr [rsp + 24], rax | |
jmp .LBB1_4 | |
.p2align 4, 0x90 | |
# --- Middle loop latch: r11 steps by [rsp+96] while r11 < 540; the
#     destination pointer steps by [rsp+88] ---
.LBB1_16: | |
.loc 1 0 3 | |
mov rax, qword ptr [rsp + 24] | |
.loc 1 4 3 | |
add r11, qword ptr [rsp + 96] | |
add rax, qword ptr [rsp + 88] | |
mov qword ptr [rsp + 24], rax | |
cmp r11, 540 | |
jge .LBB1_17 | |
.LBB1_4: | |
.loc 1 0 3 | |
# Skip the column loop when dword[rdx] > 49 (unsigned), i.e. when its start
# value dword[rdx] << 6 would already be >= 3200.
cmp dword ptr [rsp + 104], 49 | |
.loc 1 4 3 | |
ja .LBB1_16 | |
.loc 1 0 3 | |
mov r9, qword ptr [rsp + 24] | |
mov r15, qword ptr [rsp + 80] | |
jmp .LBB1_6 | |
.p2align 4, 0x90 | |
# --- Column loop latch: r15 (column offset, always a multiple of 64) steps by
#     [rsp+112] while r15 < 3200; destination r9 steps by [rsp+120] ---
.LBB1_15: | |
.loc 1 4 3 | |
add r15, qword ptr [rsp + 112] | |
add r9, qword ptr [rsp + 120] | |
cmp r15, 3200 | |
jge .LBB1_16 | |
.LBB1_6: | |
.loc 1 0 3 | |
# No work for this column block when the remaining outermost count is <= 0.
test rcx, rcx | |
.loc 1 4 3 | |
jle .LBB1_15 | |
.loc 1 0 3 | |
mov rdi, r9 | |
xor eax, eax | |
.p2align 4, 0x90 | |
# --- Loop over rax = 0 .. rdx-1; destination rdi advances 55296000 bytes per
#     iteration. The source address computed below does not depend on rax. ---
.LBB1_8: | |
mov r10, rdi | |
xor r14d, r14d | |
.p2align 4, 0x90 | |
# --- Loop over r14 = 0..3: source block index r8 = r14 + r11, each block is
#     102400 bytes; destination r10 advances 102400 bytes per iteration ---
.LBB1_9: | |
lea r8, [r14 + r11] | |
mov r12, r10 | |
xor r13d, r13d | |
imul rbx, r8, 102400 | |
add rbx, rsi | |
.p2align 4, 0x90 | |
# --- Loop over r13 = 0..63 (column within the 64-wide block) ---
.LBB1_10: | |
.loc 1 4 3 | |
# r8 = column index; OR equals ADD here because r15 is a multiple of 64 and
# r13 < 64.
mov r8, r13 | |
or r8, r15 | |
# Load one 16-bit element from each of 16 rows (row pitch 6400 bytes) into
# word 0 of xmm0..xmm15.
vpinsrw xmm0, xmm0, word ptr [rbx + 2*r8], 0 | |
vpinsrw xmm1, xmm0, word ptr [rbx + 2*r8 + 6400], 0 | |
vpinsrw xmm2, xmm0, word ptr [rbx + 2*r8 + 12800], 0 | |
vpinsrw xmm3, xmm0, word ptr [rbx + 2*r8 + 19200], 0 | |
vpinsrw xmm4, xmm0, word ptr [rbx + 2*r8 + 25600], 0 | |
vpinsrw xmm5, xmm0, word ptr [rbx + 2*r8 + 32000], 0 | |
vpinsrw xmm6, xmm0, word ptr [rbx + 2*r8 + 38400], 0 | |
vpinsrw xmm7, xmm0, word ptr [rbx + 2*r8 + 44800], 0 | |
vpinsrw xmm8, xmm0, word ptr [rbx + 2*r8 + 51200], 0 | |
vpinsrw xmm9, xmm0, word ptr [rbx + 2*r8 + 57600], 0 | |
vpinsrw xmm10, xmm0, word ptr [rbx + 2*r8 + 64000], 0 | |
vpinsrw xmm11, xmm0, word ptr [rbx + 2*r8 + 70400], 0 | |
vpinsrw xmm12, xmm0, word ptr [rbx + 2*r8 + 76800], 0 | |
vpinsrw xmm13, xmm0, word ptr [rbx + 2*r8 + 83200], 0 | |
vpinsrw xmm14, xmm0, word ptr [rbx + 2*r8 + 89600], 0 | |
vpinsrw xmm15, xmm0, word ptr [rbx + 2*r8 + 96000], 0 | |
# r8 is reused as the element counter of the copy loop at .LBB1_11.
xor r8d, r8d | |
# Merge the 16 single-word registers into ymm0 = rows 0..15 in order:
# word pairs -> 128-bit halves (rows 0-7 low, 8-15 high) -> dword interleave
# -> qword interleave.
vpunpcklwd xmm6, xmm6, xmm7 | |
vpunpcklwd xmm14, xmm14, xmm15 | |
vpunpcklwd xmm4, xmm4, xmm5 | |
vpunpcklwd xmm7, xmm12, xmm13 | |
vpunpcklwd xmm2, xmm2, xmm3 | |
vpunpcklwd xmm5, xmm10, xmm11 | |
vpunpcklwd xmm3, xmm8, xmm9 | |
vpunpcklwd xmm0, xmm0, xmm1 | |
vinserti128 ymm6, ymm6, xmm14, 1 | |
vinserti128 ymm4, ymm4, xmm7, 1 | |
vinserti128 ymm2, ymm2, xmm5, 1 | |
vinserti128 ymm0, ymm0, xmm3, 1 | |
vpunpckldq ymm4, ymm4, ymm6 | |
vpunpckldq ymm0, ymm0, ymm2 | |
vpunpcklqdq ymm0, ymm0, ymm4 | |
# Spill the assembled vector to the 32-byte staging buffer on the stack.
vmovdqa ymmword ptr [rsp + 128], ymm0 | |
.p2align 4, 0x90 | |
# --- Copy the 16 staged words one at a time to the destination at r12 ---
# NOTE(review): this appears equivalent to a single 32-byte store of ymm0 to
# [r12]; kept exactly as generated.
.LBB1_11: | |
vpinsrw xmm0, xmm0, word ptr [rsp + 2*r8 + 128], 0 | |
vpextrw word ptr [r12 + 2*r8], xmm0, 0 | |
inc r8 | |
cmp r8, 16 | |
jne .LBB1_11 | |
# Next column: destination advances by 32 bytes (16 words).
inc r13 | |
add r12, 32 | |
cmp r13, 64 | |
jne .LBB1_10 | |
inc r14 | |
add r10, 102400 | |
cmp r14, 4 | |
jne .LBB1_9 | |
inc rax | |
add rdi, 55296000 | |
cmp rax, rdx | |
jne .LBB1_8 | |
jmp .LBB1_15 | |
# --- Epilogue: return 0, restore callee-saved registers, clear upper ymm ---
.LBB1_18: | |
xor eax, eax | |
# rbp-40 is the address of the last pushed register (rbx): five 8-byte pushes
# below the saved rbp.
lea rsp, [rbp - 40] | |
.loc 1 4 3 epilogue_begin | |
pop rbx | |
pop r12 | |
pop r13 | |
pop r14 | |
pop r15 | |
pop rbp | |
.cfi_def_cfa rsp, 8 | |
vzeroupper | |
ret | |
.Ltmp3: | |
.Lfunc_end1: | |
.size turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack, .Lfunc_end1-turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack | |
.cfi_endproc |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment