Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bjacob/f7c3844deacf87909a74cfdc79080583 to your computer and use it in GitHub Desktop.
Save bjacob/f7c3844deacf87909a74cfdc79080583 to your computer and use it in GitHub Desktop.
turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack disassembly
# -----------------------------------------------------------------------------
# turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack
#
# Compiler-generated x86-64 code (GNU assembler, Intel operand order). Uses
# AVX2 (ymm integer ops) and BMI1 (bextr).
#
# What the code does: it repeatedly gathers 16 halfwords that lie 6400 bytes
# apart in a source buffer and writes them as 32 contiguous bytes into a
# destination buffer, iterating over columns, groups of 16 source rows, and
# an outer extent read at run time.
#
# Inputs, as used below:
#   rdi  not read (it is overwritten by the first instruction that names it)
#   rsi  pointer to a record; fields read:
#          dword [rsi+12] (n0), dword [rsi+16] (n1), word [rsi+20] (n2),
#          qword [rsi+24] (pointer P), qword [rsi+32] (pointer B)
#        P: qword [P+0] supplies a byte offset, dword [P+8]/[P+12] are the
#           low/high halves of the 64-bit outer extent D
#        B: qword [B+0] = source base, qword [B+8] = destination base
#   rdx  pointer to a record; fields read:
#          dword [rdx+0] (id0), dword [rdx+4] (id1), word [rdx+8] (id2)
#   NOTE(review): this looks like an IREE-style dispatch-state /
#   workgroup-state argument pair, with id0..id2 as workgroup ids and n0..n2
#   as workgroup counts. Confirm against the runtime headers.
#
# Output: eax = 0. Callee-saved rbx, r12-r15 and rbp are saved and restored.
#
# Stack frame after the prologue (rsp is 64-byte aligned, 256 bytes reserved):
#   [rsp+8]    current outer start index (starts at 64*id2, step 64*n2)
#   [rsp+16]   destination pointer for the current outer block
#   [rsp+24]   destination pointer for the current row-group iteration
#   [rsp+32]   D, the outer extent
#   [rsp+40]   first row-group index = 4*id1
#   [rsp+48]   outer index step = 64*n2
#   [rsp+56]   outer destination byte step = 3538944000*n2
#   [rsp+64]   id1
#   [rsp+72]   spill of rbx = D - current outer start index
#   [rsp+80]   first column = 64*id0
#   [rsp+88]   row-group destination byte step = 409600*n1
#   [rsp+96]   row-group index step = 4*n1
#   [rsp+104]  id0
#   [rsp+112]  column step = 64*n0
#   [rsp+120]  column destination byte step = 2048*n0
#   [rsp+128]  32-byte staging buffer for one gathered vector
#
# Constants: 6400 = 3200*2, 102400 = 16*6400, 409600 = 4*102400,
# 55296000 = 540*102400, 3538944000 = 64*55296000.
# NOTE(review): with 2-byte elements these match a 8640x3200 source
# (540*16 = 8640 rows) as the symbol name suggests. Confirm element type.
# -----------------------------------------------------------------------------
.section .text.turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack,"ax",@progbits
.p2align 4, 0x90
.type turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack,@function
turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack:
.Lfunc_begin1:
.loc 1 1 0 is_stmt 1
.cfi_startproc
# Prologue: frame pointer, five callee-saved pushes (40 bytes below rbp),
# then realign rsp to 64 bytes and reserve 256 bytes of locals.
push rbp
.cfi_def_cfa_offset 16
.cfi_offset rbp, -16
mov rbp, rsp
.cfi_def_cfa_register rbp
.Ltmp2:
push r15
push r14
push r13
push r12
push rbx
and rsp, -64
sub rsp, 256
.cfi_offset rbx, -56
.cfi_offset r12, -48
.cfi_offset r13, -40
.cfi_offset r14, -32
.cfi_offset r15, -24
.loc 1 4 3 prologue_end
# r9 = P, edi = id2 (zero-extended word).
mov r9, qword ptr [rsi + 24]
movzx edi, word ptr [rdx + 8]
# ebx:eax = high:low dwords of D.
mov ebx, dword ptr [r9 + 12]
mov eax, dword ptr [r9 + 8]
# r8 = 64*id2 = first outer index handled here; spilled to [rsp+8].
mov r8d, edi
shl r8d, 6
mov qword ptr [rsp + 8], r8
# rcx = D = (high << 32) + low; spilled to [rsp+32].
shl rbx, 32
lea rcx, [rbx + rax]
mov qword ptr [rsp + 32], rcx
# Nothing to do when the first outer index is already >= D (signed compare).
cmp r8, rcx
jge .LBB1_18
.loc 1 0 3 is_stmt 0
# bextr control word 15361 = 0x3C01: start bit 1, length 60.
mov r11d, 15361
# r10d = n2.
movzx r10d, word ptr [rsi + 20]
# r15 = 3538944000 = 64*55296000; the 32-bit move zero-extends into r15.
mov r15d, 3538944000
.loc 1 4 3
# rbx = D again (the shifted high half OR the low half).
or rbx, rax
# r8 = B, ecx = n1, r14d = n0.
mov r8, qword ptr [rsi + 32]
mov ecx, dword ptr [rsi + 16]
mov r14d, dword ptr [rsi + 12]
# rbx = D - first outer index = outer indices still to be covered.
sub rbx, qword ptr [rsp + 8]
# r9 = bits 1..60 of qword [P]; it is scaled by 2 below, so the result is
# that qword with bit 0 and bits 61..63 cleared, used as a byte offset.
bextr r9, qword ptr [r9], r11
# r11d = id0, edx = id1 (rdx no longer points at the id record after this).
mov r11d, dword ptr [rdx]
mov edx, dword ptr [rdx + 4]
# rdi = id2*3538944000, r15 = n2*3538944000.
imul rdi, r15
imul r15, r10
# rsi = source base pointer (qword [B+0]).
mov rsi, qword ptr [r8]
# r10d = 64*n2.
shl r10d, 6
# rax = id1*409600.
imul rax, rdx, 409600
mov qword ptr [rsp + 64], rdx
mov qword ptr [rsp + 104], r11
mov qword ptr [rsp + 48], r10
mov qword ptr [rsp + 56], r15
# r15 = 4*id1 (first row-group index).
lea r15, [4*rdx]
# rdx = 2048*id0, r11 = 64*id0 (first column).
mov rdx, r11
shl rdx, 11
shl r11, 6
add rax, rdi
mov qword ptr [rsp + 80], r11
# rdx = 2048*id0 + 409600*id1 + 3538944000*id2 = initial destination offset.
add rdx, rax
mov qword ptr [rsp + 40], r15
# r15 = 4*n1, rcx = 409600*n1.
lea r15, [4*rcx]
imul rcx, rcx, 409600
# rax = destination base (qword [B+8]) + initial offset + 2*r9.
lea rax, [rdx + 2*r9]
add rax, qword ptr [r8 + 8]
# rdx = 64*n0, r14 = 2048*n0.
mov rdx, r14
shl rdx, 6
shl r14, 11
mov qword ptr [rsp + 96], r15
mov qword ptr [rsp + 112], rdx
mov qword ptr [rsp + 120], r14
mov qword ptr [rsp + 88], rcx
mov qword ptr [rsp + 16], rax
jmp .LBB1_2
.p2align 4, 0x90
# Outer loop latch: advance the outer start index by 64*n2, the destination
# pointer by 3538944000*n2, and shrink the remaining extent by 64*n2.
# rbx is reloaded from its spill because the inner loops reuse it.
.LBB1_17:
.loc 1 0 3
mov rdx, qword ptr [rsp + 16]
mov rax, qword ptr [rsp + 8]
mov rcx, qword ptr [rsp + 48]
mov rbx, qword ptr [rsp + 72]
.loc 1 4 3
add rdx, qword ptr [rsp + 56]
add rax, rcx
sub rbx, rcx
mov qword ptr [rsp + 16], rdx
mov qword ptr [rsp + 8], rax
cmp rax, qword ptr [rsp + 32]
jge .LBB1_18
# Outer loop body: rdx = min(rbx, 64), raised to 1 if below 2. This is the
# trip count of the .LBB1_8 loop. rbx is spilled to [rsp+72].
.LBB1_2:
cmp rbx, 64
mov edx, 64
mov eax, 1
mov qword ptr [rsp + 72], rbx
cmovl rdx, rbx
cmp rdx, 2
cmovl rdx, rax
# Skip the row-group loop when id1 > 134 (unsigned); 4*135 = 540.
cmp dword ptr [rsp + 64], 134
ja .LBB1_17
.loc 1 0 3
# rcx = D - current outer start (the same quantity rbx tracked), rax = block
# destination pointer, r11 = first row-group index.
mov rcx, qword ptr [rsp + 32]
mov rax, qword ptr [rsp + 16]
mov r11, qword ptr [rsp + 40]
.loc 1 4 3
sub rcx, qword ptr [rsp + 8]
mov qword ptr [rsp + 24], rax
jmp .LBB1_4
.p2align 4, 0x90
# Row-group loop latch: r11 += 4*n1, destination += 409600*n1, until the
# row-group index reaches 540.
.LBB1_16:
.loc 1 0 3
mov rax, qword ptr [rsp + 24]
.loc 1 4 3
add r11, qword ptr [rsp + 96]
add rax, qword ptr [rsp + 88]
mov qword ptr [rsp + 24], rax
cmp r11, 540
jge .LBB1_17
# Row-group loop body: skip the column loop when id0 > 49 (unsigned);
# 64*50 = 3200.
.LBB1_4:
.loc 1 0 3
cmp dword ptr [rsp + 104], 49
.loc 1 4 3
ja .LBB1_16
.loc 1 0 3
# r9 = destination pointer for this column block, r15 = first column.
mov r9, qword ptr [rsp + 24]
mov r15, qword ptr [rsp + 80]
jmp .LBB1_6
.p2align 4, 0x90
# Column loop latch: r15 += 64*n0, destination += 2048*n0, until the column
# reaches 3200.
.LBB1_15:
.loc 1 4 3
add r15, qword ptr [rsp + 112]
add r9, qword ptr [rsp + 120]
cmp r15, 3200
jge .LBB1_16
# Column loop body: nothing to copy when the remaining outer extent is <= 0.
.LBB1_6:
.loc 1 0 3
test rcx, rcx
.loc 1 4 3
jle .LBB1_15
.loc 1 0 3
# rdi = destination pointer for outer slab 0 of this block, rax = slab counter.
mov rdi, r9
xor eax, eax
.p2align 4, 0x90
# Loop over rdx outer slabs; each slab is 55296000 destination bytes apart.
# The source address computed below depends only on the row group (r11+r14)
# and the column (r13|r15), not on rax, so every slab receives the same
# source data.
.LBB1_8:
mov r10, rdi
xor r14d, r14d
.p2align 4, 0x90
# Loop over 4 consecutive row groups (r14 = 0..3).
# rbx = source base + (r11 + r14)*102400; r12 = destination cursor.
.LBB1_9:
lea r8, [r14 + r11]
mov r12, r10
xor r13d, r13d
imul rbx, r8, 102400
add rbx, rsi
.p2align 4, 0x90
# Loop over 64 columns (r13 = 0..63).
.LBB1_10:
.loc 1 4 3
# r8 = column index. r15 is always a multiple of 64 (it starts at 64*id0 and
# steps by 64*n0) and r13 < 64, so the OR acts as an addition.
mov r8, r13
or r8, r15
# Load 16 halfwords x0..x15 from the same column of 16 rows 6400 bytes apart.
# Each goes into word 0 of its own register; the other words are copied from
# xmm0 and are not used by the result.
vpinsrw xmm0, xmm0, word ptr [rbx + 2*r8], 0
vpinsrw xmm1, xmm0, word ptr [rbx + 2*r8 + 6400], 0
vpinsrw xmm2, xmm0, word ptr [rbx + 2*r8 + 12800], 0
vpinsrw xmm3, xmm0, word ptr [rbx + 2*r8 + 19200], 0
vpinsrw xmm4, xmm0, word ptr [rbx + 2*r8 + 25600], 0
vpinsrw xmm5, xmm0, word ptr [rbx + 2*r8 + 32000], 0
vpinsrw xmm6, xmm0, word ptr [rbx + 2*r8 + 38400], 0
vpinsrw xmm7, xmm0, word ptr [rbx + 2*r8 + 44800], 0
vpinsrw xmm8, xmm0, word ptr [rbx + 2*r8 + 51200], 0
vpinsrw xmm9, xmm0, word ptr [rbx + 2*r8 + 57600], 0
vpinsrw xmm10, xmm0, word ptr [rbx + 2*r8 + 64000], 0
vpinsrw xmm11, xmm0, word ptr [rbx + 2*r8 + 70400], 0
vpinsrw xmm12, xmm0, word ptr [rbx + 2*r8 + 76800], 0
vpinsrw xmm13, xmm0, word ptr [rbx + 2*r8 + 83200], 0
vpinsrw xmm14, xmm0, word ptr [rbx + 2*r8 + 89600], 0
vpinsrw xmm15, xmm0, word ptr [rbx + 2*r8 + 96000], 0
# r8 becomes the word index for the copy loop at .LBB1_11.
xor r8d, r8d
# Pair up neighbours: word 0/1 of each result = (x6,x7), (x14,x15), (x4,x5),
# (x12,x13), (x2,x3), (x10,x11), (x8,x9), (x0,x1).
vpunpcklwd xmm6, xmm6, xmm7
vpunpcklwd xmm14, xmm14, xmm15
vpunpcklwd xmm4, xmm4, xmm5
vpunpcklwd xmm7, xmm12, xmm13
vpunpcklwd xmm2, xmm2, xmm3
vpunpcklwd xmm5, xmm10, xmm11
vpunpcklwd xmm3, xmm8, xmm9
vpunpcklwd xmm0, xmm0, xmm1
# Put the x8..x15 pairs into the upper 128-bit lanes.
vinserti128 ymm6, ymm6, xmm14, 1
vinserti128 ymm4, ymm4, xmm7, 1
vinserti128 ymm2, ymm2, xmm5, 1
vinserti128 ymm0, ymm0, xmm3, 1
# Merge pairs into quads, then quads into octets per lane:
# ymm0 = x0..x7 in the low lane and x8..x15 in the high lane.
vpunpckldq ymm4, ymm4, ymm6
vpunpckldq ymm0, ymm0, ymm2
vpunpcklqdq ymm0, ymm0, ymm4
# Spill the vector to the staging buffer; rsp is 64-byte aligned, so
# rsp+128 satisfies the aligned store.
vmovdqa ymmword ptr [rsp + 128], ymm0
.p2align 4, 0x90
# Copy the 16 staged halfwords to [r12 .. r12+31], one halfword per iteration.
# NOTE(review): these 16 iterations write the same 32 bytes that one
# unaligned 32-byte store of ymm0 to [r12] would write. Looks like a missed
# vectorization of the store; confirm before changing the generator.
.LBB1_11:
vpinsrw xmm0, xmm0, word ptr [rsp + 2*r8 + 128], 0
vpextrw word ptr [r12 + 2*r8], xmm0, 0
inc r8
cmp r8, 16
jne .LBB1_11
# Next column: destination advances 32 bytes.
inc r13
add r12, 32
cmp r13, 64
jne .LBB1_10
# Next row group: destination advances 102400 bytes.
inc r14
add r10, 102400
cmp r14, 4
jne .LBB1_9
# Next outer slab: destination advances 55296000 bytes.
inc rax
add rdi, 55296000
cmp rax, rdx
jne .LBB1_8
jmp .LBB1_15
# Exit: return 0. rsp is restored to just below the five saved registers
# (5*8 = 40 bytes under rbp) before popping them.
.LBB1_18:
xor eax, eax
lea rsp, [rbp - 40]
.loc 1 4 3 epilogue_begin
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
.cfi_def_cfa rsp, 8
# Clear the upper ymm halves before returning, since ymm registers were used.
vzeroupper
ret
.Ltmp3:
.Lfunc_end1:
.size turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack, .Lfunc_end1-turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack
.cfi_endproc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment