Skip to content

Instantly share code, notes, and snippets.

@haampie
Last active February 1, 2021 23:19
Show Gist options
  • Save haampie/f6d0fc711d83d3d66506f666ee5c839b to your computer and use it in GitHub Desktop.
Save haampie/f6d0fc711d83d3d66506f666ee5c839b to your computer and use it in GitHub Desktop.
Tiny Transpose {5,6,7,8} x 8
# ----------------------------------------------------------------------------
# Unlabeled routine (its symbol/label is not part of this listing).
# Straight-line AVX2 shuffle network over 32-bit lanes: reads five 32-byte
# vectors and writes 160 bytes of rearranged data.
# NOTE(review): presumably the 5-vector case of the gist's
# "Tiny Transpose {5,6,7,8} x 8" -- TODO confirm against the generating source.
#
# In:    rsi = source, bytes [rsi, rsi+160) are read.
#        The vmovaps xmm loads from [rsi] and [rsi + 80] need rsi 16-byte aligned.
# Out:   rdi = destination, bytes [rdi, rdi+160) are written with vmovaps, so
#        rdi must be 32-byte aligned. rax = rdi on return.
# Uses:  rcx, ymm0-ymm10, one 32-byte constant at .rodata.cst32.
# NOTE(review): rdi/rsi as arguments and unsaved xmm6+ look like System V
#        AMD64 -- confirm the target ABI.
# ----------------------------------------------------------------------------
# Load the five 32-byte input vectors (unaligned loads).
vmovups ymm0, ymmword ptr [rsi]
vmovups ymm1, ymmword ptr [rsi + 32]
vmovups ymm2, ymmword ptr [rsi + 64]
vmovups ymm6, ymmword ptr [rsi + 96]
vmovups ymm7, ymmword ptr [rsi + 128]
# rcx -> 32-byte constant; it becomes the vpermps index vector in ymm10.
movabs rcx, offset .rodata.cst32
# Return value: the destination pointer.
mov rax, rdi
vmovaps ymm10, ymmword ptr [rcx]
# Shuffle network. The trailing comment on each instruction gives the exact
# lane selection; instruction order matters because registers are reused.
vperm2f128 ymm3, ymm0, ymm1, 33 # ymm3 = ymm0[2,3],ymm1[0,1]
vperm2f128 ymm5, ymm1, ymm2, 33 # ymm5 = ymm1[2,3],ymm2[0,1]
vperm2f128 ymm8, ymm6, ymm7, 33 # ymm8 = ymm6[2,3],ymm7[0,1]
vshufps ymm4, ymm3, ymm1, 41 # ymm4 = ymm3[1,2],ymm1[2,0],ymm3[5,6],ymm1[6,4]
vshufpd ymm1, ymm1, ymm5, 5 # ymm1 = ymm1[1],ymm5[0],ymm1[3],ymm5[2]
vshufps ymm5, ymm5, ymm2, 3 # ymm5 = ymm5[3,0],ymm2[0,0],ymm5[7,4],ymm2[4,4]
vshufps ymm9, ymm6, ymm8, 41 # ymm9 = ymm6[1,2],ymm8[2,0],ymm6[5,6],ymm8[6,4]
vshufpd ymm8, ymm8, ymm7, 5 # ymm8 = ymm8[1],ymm7[0],ymm8[3],ymm7[2]
# Lane permutation of ymm7 driven by the constant index vector in ymm10.
vpermps ymm7, ymm10, ymm7
vblendps ymm0, ymm0, ymm3, 170 # ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
# Re-read the first 16 source bytes (aligned load) into xmm3.
vmovaps xmm3, xmmword ptr [rsi]
vshufps ymm5, ymm5, ymm2, 152 # ymm5 = ymm5[0,2],ymm2[1,2],ymm5[4,6],ymm2[5,6]
vperm2f128 ymm2, ymm2, ymm6, 33 # ymm2 = ymm2[2,3],ymm6[0,1]
vshufps xmm3, xmm3, xmm4, 221 # xmm3 = xmm3[1,3],xmm4[1,3]
vshufps ymm4, ymm1, ymm5, 136 # ymm4 = ymm1[0,2],ymm5[0,2],ymm1[4,6],ymm5[4,6]
vshufps xmm1, xmm1, xmm5, 221 # xmm1 = xmm1[1,3],xmm5[1,3]
# Re-read source bytes 80..95 (aligned load) into xmm5.
vmovaps xmm5, xmmword ptr [rsi + 80]
vpermilps ymm4, ymm4, 216 # ymm4 = ymm4[0,2,1,3,4,6,5,7]
vshufps xmm5, xmm5, xmm9, 221 # xmm5 = xmm5[1,3],xmm9[1,3]
vblendps ymm2, ymm2, ymm6, 170 # ymm2 = ymm2[0],ymm6[1],ymm2[2],ymm6[3],ymm2[4],ymm6[5],ymm2[6],ymm6[7]
vshufps ymm6, ymm8, ymm7, 136 # ymm6 = ymm8[0,2],ymm7[0,2],ymm8[4,6],ymm7[4,6]
vshufps xmm7, xmm8, xmm7, 221 # xmm7 = xmm8[1,3],xmm7[1,3]
vunpcklpd ymm8, ymm0, ymm4 # ymm8 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
vunpckhpd xmm0, xmm0, xmm4 # xmm0 = xmm0[1],xmm4[1]
vshufps xmm4, xmm3, xmm1, 136 # xmm4 = xmm3[0,2],xmm1[0,2]
vshufps xmm1, xmm3, xmm1, 221 # xmm1 = xmm3[1,3],xmm1[1,3]
vpermilps ymm6, ymm6, 216 # ymm6 = ymm6[0,2,1,3,4,6,5,7]
vunpcklpd ymm3, ymm2, ymm6 # ymm3 = ymm2[0],ymm6[0],ymm2[2],ymm6[2]
vunpckhpd xmm2, xmm2, xmm6 # xmm2 = xmm2[1],xmm6[1]
vshufps xmm6, xmm5, xmm7, 136 # xmm6 = xmm5[0,2],xmm7[0,2]
vshufps xmm5, xmm5, xmm7, 221 # xmm5 = xmm5[1,3],xmm7[1,3]
# ymm7 = low halves of ymm8 and ymm3; ymm3 = their high halves.
vinsertf128 ymm7, ymm8, xmm3, 1
vperm2f128 ymm3, ymm8, ymm3, 49 # ymm3 = ymm8[2,3],ymm3[2,3]
# Write the 160-byte result: ymm at +0, six xmm pieces covering +32..+127,
# ymm at +128 (all aligned stores).
vmovaps ymmword ptr [rdi], ymm7
vmovaps xmmword ptr [rdi + 48], xmm6
vmovaps xmmword ptr [rdi + 32], xmm4
vmovaps xmmword ptr [rdi + 80], xmm2
vmovaps xmmword ptr [rdi + 64], xmm0
vmovaps xmmword ptr [rdi + 112], xmm5
vmovaps xmmword ptr [rdi + 96], xmm1
vmovaps ymmword ptr [rdi + 128], ymm3
# Clear the upper 128 bits of every ymm register before returning.
vzeroupper
ret
# Follows ret, so never reached by fall-through; presumably padding before the
# next routine.
nop word ptr cs:[rax + rax]
# ----------------------------------------------------------------------------
# Unlabeled routine (its symbol/label is not part of this listing).
# Straight-line AVX2 shuffle network: reads six 32-byte vectors and writes
# 192 bytes of rearranged data. No memory constants are used.
# NOTE(review): presumably the 6-vector case of the gist's
# "Tiny Transpose {5,6,7,8} x 8" -- TODO confirm against the generating source.
#
# In:    rsi = source, bytes [rsi, rsi+192) are read with unaligned loads.
# Out:   rdi = destination, bytes [rdi, rdi+192) are written with vmovaps, so
#        rdi must be 32-byte aligned. rax = rdi on return.
# Uses:  ymm0-ymm8.
# ----------------------------------------------------------------------------
# Load five of the six input vectors; [rsi + 128] is loaded further down.
vmovupd ymm0, ymmword ptr [rsi]
vmovupd ymm1, ymmword ptr [rsi + 32]
vmovupd ymm3, ymmword ptr [rsi + 64]
vmovupd ymm4, ymmword ptr [rsi + 96]
vmovupd ymm7, ymmword ptr [rsi + 160]
# Return value: the destination pointer.
mov rax, rdi
# Shuffle network. The trailing comment on each instruction gives the exact
# lane selection; instruction order matters because registers are reused.
vperm2f128 ymm2, ymm0, ymm1, 33 # ymm2 = ymm0[2,3],ymm1[0,1]
vperm2f128 ymm5, ymm3, ymm4, 33 # ymm5 = ymm3[2,3],ymm4[0,1]
vshufpd ymm2, ymm2, ymm1, 5 # ymm2 = ymm2[1],ymm1[0],ymm2[3],ymm1[2]
vperm2f128 ymm1, ymm1, ymm3, 33 # ymm1 = ymm1[2,3],ymm3[0,1]
vshufpd ymm3, ymm3, ymm5, 5 # ymm3 = ymm3[1],ymm5[0],ymm3[3],ymm5[2]
# Sixth input vector; ymm5 is free again at this point.
vmovupd ymm5, ymmword ptr [rsi + 128]
vshufps ymm8, ymm0, ymm2, 136 # ymm8 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
vshufps ymm0, ymm0, ymm2, 221 # ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
vpermilps ymm8, ymm8, 216 # ymm8 = ymm8[0,2,1,3,4,6,5,7]
vpermilps ymm0, ymm0, 216 # ymm0 = ymm0[0,2,1,3,4,6,5,7]
vperm2f128 ymm6, ymm4, ymm5, 33 # ymm6 = ymm4[2,3],ymm5[0,1]
vshufps ymm2, ymm1, ymm3, 136 # ymm2 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6]
vshufps ymm1, ymm1, ymm3, 221 # ymm1 = ymm1[1,3],ymm3[1,3],ymm1[5,7],ymm3[5,7]
vpermilps ymm2, ymm2, 216 # ymm2 = ymm2[0,2,1,3,4,6,5,7]
vpermilps ymm1, ymm1, 216 # ymm1 = ymm1[0,2,1,3,4,6,5,7]
vshufpd ymm6, ymm6, ymm5, 5 # ymm6 = ymm6[1],ymm5[0],ymm6[3],ymm5[2]
vperm2f128 ymm5, ymm5, ymm7, 33 # ymm5 = ymm5[2,3],ymm7[0,1]
vpermpd ymm7, ymm7, 57 # ymm7 = ymm7[1,2,3,0]
vshufps ymm3, ymm4, ymm6, 136 # ymm3 = ymm4[0,2],ymm6[0,2],ymm4[4,6],ymm6[4,6]
vshufps ymm4, ymm4, ymm6, 221 # ymm4 = ymm4[1,3],ymm6[1,3],ymm4[5,7],ymm6[5,7]
vpermilps ymm3, ymm3, 216 # ymm3 = ymm3[0,2,1,3,4,6,5,7]
vpermilps ymm4, ymm4, 216 # ymm4 = ymm4[0,2,1,3,4,6,5,7]
vshufps ymm6, ymm5, ymm7, 136 # ymm6 = ymm5[0,2],ymm7[0,2],ymm5[4,6],ymm7[4,6]
vshufps ymm5, ymm5, ymm7, 221 # ymm5 = ymm5[1,3],ymm7[1,3],ymm5[5,7],ymm7[5,7]
vunpcklpd ymm7, ymm8, ymm2 # ymm7 = ymm8[0],ymm2[0],ymm8[2],ymm2[2]
vunpckhpd xmm2, xmm8, xmm2 # xmm2 = xmm8[1],xmm2[1]
vunpcklpd ymm8, ymm0, ymm1 # ymm8 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
vunpckhpd xmm0, xmm0, xmm1 # xmm0 = xmm0[1],xmm1[1]
vpermilps ymm6, ymm6, 216 # ymm6 = ymm6[0,2,1,3,4,6,5,7]
vpermilps ymm5, ymm5, 216 # ymm5 = ymm5[0,2,1,3,4,6,5,7]
vunpcklpd ymm1, ymm3, ymm6 # ymm1 = ymm3[0],ymm6[0],ymm3[2],ymm6[2]
vunpckhpd xmm3, xmm3, xmm6 # xmm3 = xmm3[1],xmm6[1]
vunpcklpd ymm6, ymm4, ymm5 # ymm6 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
vunpckhpd xmm4, xmm4, xmm5 # xmm4 = xmm4[1],xmm5[1]
# Pair up 128-bit halves: vinsertf128 joins the two low halves,
# vperm2f128 (imm 49) joins the two high halves.
vinsertf128 ymm5, ymm7, xmm1, 1
vperm2f128 ymm1, ymm7, ymm1, 49 # ymm1 = ymm7[2,3],ymm1[2,3]
vinsertf128 ymm7, ymm8, xmm6, 1
vperm2f128 ymm6, ymm8, ymm6, 49 # ymm6 = ymm8[2,3],ymm6[2,3]
# Write the 192-byte result: ymm at +0/+32, four xmm pieces covering
# +64..+127, ymm at +128/+160 (all aligned stores).
vmovaps ymmword ptr [rdi], ymm5
vmovaps ymmword ptr [rdi + 32], ymm7
vmovaps xmmword ptr [rdi + 80], xmm3
vmovaps xmmword ptr [rdi + 64], xmm2
vmovaps xmmword ptr [rdi + 112], xmm4
vmovaps xmmword ptr [rdi + 96], xmm0
vmovaps ymmword ptr [rdi + 128], ymm1
vmovaps ymmword ptr [rdi + 160], ymm6
# Clear the upper 128 bits of every ymm register before returning.
vzeroupper
ret
# Follows ret, so never reached by fall-through; presumably padding before the
# next routine.
nop word ptr [rax + rax]
# ----------------------------------------------------------------------------
# Unlabeled routine (its symbol/label is not part of this listing).
# Straight-line AVX2 shuffle network over 32-bit lanes: reads seven 32-byte
# vectors and writes 224 bytes of rearranged data.
# NOTE(review): presumably the 7-vector case of the gist's
# "Tiny Transpose {5,6,7,8} x 8" -- TODO confirm against the generating source.
#
# In:    rsi = source, bytes [rsi, rsi+224) are read with unaligned loads.
# Out:   rdi = destination, bytes [rdi, rdi+224) are written with vmovaps, so
#        rdi must be 32-byte aligned. rax = rdi on return.
# Uses:  rcx, ymm0-ymm11, one 32-byte constant at .rodata.cst32.
# ----------------------------------------------------------------------------
# Load the seven 32-byte input vectors (unaligned loads).
vmovupd ymm1, ymmword ptr [rsi + 32]
vmovupd ymm3, ymmword ptr [rsi + 64]
vmovupd ymm8, ymmword ptr [rsi + 160]
vmovupd ymm10, ymmword ptr [rsi + 192]
vmovupd ymm5, ymmword ptr [rsi + 96]
vmovupd ymm7, ymmword ptr [rsi + 128]
vmovupd ymm0, ymmword ptr [rsi]
# rcx -> 32-byte constant, loaded into ymm11 further down.
movabs rcx, offset .rodata.cst32
# Return value: the destination pointer.
mov rax, rdi
# Shuffle network. The trailing comment on each instruction gives the exact
# lane selection; instruction order matters because registers are reused.
vperm2f128 ymm4, ymm1, ymm3, 33 # ymm4 = ymm1[2,3],ymm3[0,1]
vperm2f128 ymm11, ymm8, ymm10, 33 # ymm11 = ymm8[2,3],ymm10[0,1]
vperm2f128 ymm9, ymm7, ymm8, 33 # ymm9 = ymm7[2,3],ymm8[0,1]
vperm2f128 ymm2, ymm0, ymm1, 33 # ymm2 = ymm0[2,3],ymm1[0,1]
vshufpd ymm4, ymm4, ymm3, 5 # ymm4 = ymm4[1],ymm3[0],ymm4[3],ymm3[2]
vperm2f128 ymm3, ymm3, ymm5, 33 # ymm3 = ymm3[2,3],ymm5[0,1]
vshufpd ymm8, ymm8, ymm11, 5 # ymm8 = ymm8[1],ymm11[0],ymm8[3],ymm11[2]
# ymm11 now holds the constant; it is the vpermps index vector below.
vmovapd ymm11, ymmword ptr [rcx]
vshufps ymm2, ymm2, ymm1, 3 # ymm2 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4]
vshufps ymm2, ymm2, ymm1, 152 # ymm2 = ymm2[0,2],ymm1[1,2],ymm2[4,6],ymm1[5,6]
vshufps ymm2, ymm0, ymm2, 136 # ymm2 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
vshufps ymm0, ymm0, ymm1, 141 # ymm0 = ymm0[1,3],ymm1[0,2],ymm0[5,7],ymm1[4,6]
vpermilps ymm0, ymm0, 216 # ymm0 = ymm0[0,2,1,3,4,6,5,7]
# Lane permutation of ymm10 (the vector from [rsi + 192]) by the constant
# indices; the result replaces the indices in ymm11.
vpermps ymm11, ymm11, ymm10
vshufps ymm6, ymm3, ymm5, 41 # ymm6 = ymm3[1,2],ymm5[2,0],ymm3[5,6],ymm5[6,4]
vperm2f128 ymm5, ymm5, ymm7, 33 # ymm5 = ymm5[2,3],ymm7[0,1]
vshufps ymm7, ymm7, ymm9, 3 # ymm7 = ymm7[3,0],ymm9[0,0],ymm7[7,4],ymm9[4,4]
vshufps ymm1, ymm4, ymm3, 216 # ymm1 = ymm4[0,2],ymm3[1,3],ymm4[4,6],ymm3[5,7]
vshufps ymm7, ymm7, ymm9, 152 # ymm7 = ymm7[0,2],ymm9[1,2],ymm7[4,6],ymm9[5,6]
vshufps ymm3, ymm4, ymm6, 221 # ymm3 = ymm4[1,3],ymm6[1,3],ymm4[5,7],ymm6[5,7]
vshufps ymm6, ymm8, ymm10, 216 # ymm6 = ymm8[0,2],ymm10[1,3],ymm8[4,6],ymm10[5,7]
vpermilps ymm3, ymm3, 216 # ymm3 = ymm3[0,2,1,3,4,6,5,7]
vshufps ymm4, ymm5, ymm7, 136 # ymm4 = ymm5[0,2],ymm7[0,2],ymm5[4,6],ymm7[4,6]
vshufps ymm7, ymm8, ymm11, 221 # ymm7 = ymm8[1,3],ymm11[1,3],ymm8[5,7],ymm11[5,7]
vshufps ymm5, ymm5, ymm9, 141 # ymm5 = ymm5[1,3],ymm9[0,2],ymm5[5,7],ymm9[4,6]
vshufps ymm8, ymm2, ymm1, 136 # ymm8 = ymm2[0,2],ymm1[0,2],ymm2[4,6],ymm1[4,6]
vshufps ymm1, ymm2, ymm1, 221 # ymm1 = ymm2[1,3],ymm1[1,3],ymm2[5,7],ymm1[5,7]
vunpcklpd ymm2, ymm0, ymm3 # ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
vunpckhpd xmm0, xmm0, xmm3 # xmm0 = xmm0[1],xmm3[1]
vpermilps ymm5, ymm5, 216 # ymm5 = ymm5[0,2,1,3,4,6,5,7]
vpermilps ymm7, ymm7, 216 # ymm7 = ymm7[0,2,1,3,4,6,5,7]
vshufps ymm3, ymm4, ymm6, 136 # ymm3 = ymm4[0,2],ymm6[0,2],ymm4[4,6],ymm6[4,6]
vshufps ymm4, ymm4, ymm6, 221 # ymm4 = ymm4[1,3],ymm6[1,3],ymm4[5,7],ymm6[5,7]
vunpcklpd ymm6, ymm5, ymm7 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2]
vunpckhpd xmm5, xmm5, xmm7 # xmm5 = xmm5[1],xmm7[1]
# Pair up 128-bit halves: vinsertf128 joins the two low halves,
# vperm2f128 (imm 49) joins the two high halves.
vinsertf128 ymm7, ymm8, xmm3, 1
vperm2f128 ymm3, ymm8, ymm3, 49 # ymm3 = ymm8[2,3],ymm3[2,3]
vinsertf128 ymm8, ymm2, xmm6, 1
vperm2f128 ymm2, ymm2, ymm6, 49 # ymm2 = ymm2[2,3],ymm6[2,3]
vinsertf128 ymm6, ymm1, xmm4, 1
vperm2f128 ymm1, ymm1, ymm4, 49 # ymm1 = ymm1[2,3],ymm4[2,3]
# Write the 224-byte result: ymm at +0/+32/+64, two xmm pieces covering
# +96..+127, ymm at +128/+160/+192 (all aligned stores).
vmovaps ymmword ptr [rdi], ymm7
vmovaps ymmword ptr [rdi + 32], ymm8
vmovaps ymmword ptr [rdi + 64], ymm6
vmovaps xmmword ptr [rdi + 112], xmm5
vmovaps xmmword ptr [rdi + 96], xmm0
vmovaps ymmword ptr [rdi + 128], ymm3
vmovaps ymmword ptr [rdi + 160], ymm2
vmovaps ymmword ptr [rdi + 192], ymm1
# Clear the upper 128 bits of every ymm register before returning.
vzeroupper
ret
# Follows ret, so never reached by fall-through; presumably padding before the
# next routine.
nop word ptr [rax + rax]
# ----------------------------------------------------------------------------
# Unlabeled routine (its symbol/label is not part of this listing).
# Straight-line AVX shuffle network over 32-bit lanes: reads eight 32-byte
# vectors and writes 256 bytes of rearranged data. No memory constants are used.
# NOTE(review): presumably the 8-vector case of the gist's
# "Tiny Transpose {5,6,7,8} x 8" -- TODO confirm against the generating source.
#
# In:    rsi = source, bytes [rsi, rsi+256) are read with unaligned loads.
# Out:   rdi = destination, bytes [rdi, rdi+256) are written with vmovaps, so
#        rdi must be 32-byte aligned. rax = rdi on return.
# Uses:  ymm0-ymm8.
# ----------------------------------------------------------------------------
# Load six of the eight input vectors; [rsi + 128] and [rsi + 192] are
# loaded further down, once ymm3 / ymm5 are free again.
vmovups ymm0, ymmword ptr [rsi]
vmovups ymm1, ymmword ptr [rsi + 32]
vmovups ymm2, ymmword ptr [rsi + 64]
vmovups ymm3, ymmword ptr [rsi + 96]
vmovups ymm5, ymmword ptr [rsi + 160]
vmovups ymm7, ymmword ptr [rsi + 224]
# Return value: the destination pointer.
mov rax, rdi
# Stage 1: for each adjacent pair of input vectors, split into the
# even-indexed lanes (imm 136) and the odd-indexed lanes (imm 221).
vshufps ymm4, ymm0, ymm1, 136 # ymm4 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
vshufps ymm0, ymm0, ymm1, 221 # ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
vshufps ymm1, ymm2, ymm3, 136 # ymm1 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6]
vshufps ymm2, ymm2, ymm3, 221 # ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7]
vmovups ymm3, ymmword ptr [rsi + 128]
vshufps ymm6, ymm3, ymm5, 136 # ymm6 = ymm3[0,2],ymm5[0,2],ymm3[4,6],ymm5[4,6]
vshufps ymm3, ymm3, ymm5, 221 # ymm3 = ymm3[1,3],ymm5[1,3],ymm3[5,7],ymm5[5,7]
vmovups ymm5, ymmword ptr [rsi + 192]
vshufps ymm8, ymm5, ymm7, 136 # ymm8 = ymm5[0,2],ymm7[0,2],ymm5[4,6],ymm7[4,6]
vshufps ymm5, ymm5, ymm7, 221 # ymm5 = ymm5[1,3],ymm7[1,3],ymm5[5,7],ymm7[5,7]
# Stage 2: the same even/odd split applied to the stage-1 results.
vshufps ymm7, ymm4, ymm1, 136 # ymm7 = ymm4[0,2],ymm1[0,2],ymm4[4,6],ymm1[4,6]
vshufps ymm1, ymm4, ymm1, 221 # ymm1 = ymm4[1,3],ymm1[1,3],ymm4[5,7],ymm1[5,7]
vshufps ymm4, ymm0, ymm2, 136 # ymm4 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
vshufps ymm0, ymm0, ymm2, 221 # ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
vshufps ymm2, ymm6, ymm8, 136 # ymm2 = ymm6[0,2],ymm8[0,2],ymm6[4,6],ymm8[4,6]
vshufps ymm6, ymm6, ymm8, 221 # ymm6 = ymm6[1,3],ymm8[1,3],ymm6[5,7],ymm8[5,7]
vshufps ymm8, ymm3, ymm5, 136 # ymm8 = ymm3[0,2],ymm5[0,2],ymm3[4,6],ymm5[4,6]
vshufps ymm3, ymm3, ymm5, 221 # ymm3 = ymm3[1,3],ymm5[1,3],ymm3[5,7],ymm5[5,7]
# Stage 3: pair up 128-bit halves. vinsertf128 joins the two low halves,
# vperm2f128 (imm 49) joins the two high halves.
vinsertf128 ymm5, ymm7, xmm2, 1
vperm2f128 ymm2, ymm7, ymm2, 49 # ymm2 = ymm7[2,3],ymm2[2,3]
vinsertf128 ymm7, ymm4, xmm8, 1
vperm2f128 ymm4, ymm4, ymm8, 49 # ymm4 = ymm4[2,3],ymm8[2,3]
vinsertf128 ymm8, ymm1, xmm6, 1
vperm2f128 ymm1, ymm1, ymm6, 49 # ymm1 = ymm1[2,3],ymm6[2,3]
vinsertf128 ymm6, ymm0, xmm3, 1
vperm2f128 ymm0, ymm0, ymm3, 49 # ymm0 = ymm0[2,3],ymm3[2,3]
# Write the eight 32-byte output vectors (aligned stores).
vmovaps ymmword ptr [rdi], ymm5
vmovaps ymmword ptr [rdi + 32], ymm7
vmovaps ymmword ptr [rdi + 64], ymm8
vmovaps ymmword ptr [rdi + 96], ymm6
vmovaps ymmword ptr [rdi + 128], ymm2
vmovaps ymmword ptr [rdi + 160], ymm4
vmovaps ymmword ptr [rdi + 192], ymm1
vmovaps ymmword ptr [rdi + 224], ymm0
# Clear the upper 128 bits of every ymm register before returning.
vzeroupper
ret
# Follows ret, so never reached by fall-through; presumably padding before the
# next routine.
nop
# ----------------------------------------------------------------------------
# Unlabeled routine (its symbol/label is not part of this listing).
# Straight-line AVX2 permute/blend network over 32-bit lanes: reads 160 bytes
# and writes five 32-byte vectors of rearranged data.
# NOTE(review): presumably a 5-vector case of the gist's
# "Tiny Transpose {5,6,7,8} x 8" (a second variant next to the earlier
# 160-byte routine) -- TODO confirm against the generating source.
#
# In:    rsi = source, bytes [rsi, rsi+160) are read; none of the source
#        accesses uses an alignment-checking instruction.
# Out:   rdi = destination, bytes [rdi, rdi+160) are written with vmovaps, so
#        rdi must be 32-byte aligned. rax = rdi on return.
# Uses:  rcx, rdx, ymm0-ymm9, constants at .rodata.cst16, .rodata.cst8 and at
#        several raw absolute addresses.
# NOTE(review): immediates such as 140200282622784 are absolute addresses
#        (presumably constant-pool entries fixed up at JIT/link time). Their
#        contents are not visible here, so this listing cannot be
#        reassembled or relocated as-is.
# ----------------------------------------------------------------------------
# Load the five 32-byte input vectors and set up constant pointers.
vmovups ymm5, ymmword ptr [rsi + 96]
movabs rcx, offset .rodata.cst16
vmovups ymm1, ymmword ptr [rsi]
vmovups ymm2, ymmword ptr [rsi + 32]
vmovups ymm4, ymmword ptr [rsi + 64]
vmovups ymm0, ymmword ptr [rsi + 128]
# rdx -> 8-byte constant, broadcast into ymm2 further down.
movabs rdx, 140200282622864
# Return value: the destination pointer.
mov rax, rdi
# 16-byte vpermps index constant; the xmm load zeroes the upper half of ymm6.
vmovaps xmm6, xmmword ptr [rcx]
movabs rcx, 140200282622784
# Permute/blend network. The trailing comment on each instruction gives the
# exact lane selection; rcx is re-pointed at a new constant before each
# constant load, so instruction order matters.
vpermpd ymm3, ymm5, 196 # ymm3 = ymm5[0,1,0,3]
vblendps ymm7, ymm1, ymm2, 204 # ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
vblendps ymm8, ymm4, ymm5, 204 # ymm8 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
vblendps ymm9, ymm1, ymm2, 48 # ymm9 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
vpermps ymm6, ymm6, ymm7
vblendps ymm7, ymm1, ymm2, 12 # ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
vblendps ymm3, ymm3, ymm4, 16 # ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7]
vblendps ymm3, ymm6, ymm3, 240 # ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
# Last lane of the first output vector comes straight from source dword +140.
vbroadcastss ymm6, dword ptr [rsi + 140]
vblendps ymm3, ymm3, ymm6, 128 # ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
vmovaps xmm6, xmmword ptr [rcx]
movabs rcx, 140200282622800
vpermps ymm6, ymm6, ymm7
vbroadcastf128 ymm7, xmmword ptr [rcx] # ymm7 = mem[0,1,0,1]
movabs rcx, 140200282622816
vpermps ymm7, ymm7, ymm8
vmovaps xmm8, xmmword ptr [rcx]
movabs rcx, offset .rodata.cst8
vpermps ymm8, ymm8, ymm9
vshufps ymm9, ymm1, ymm2, 3 # ymm9 = ymm1[3,0],ymm2[0,0],ymm1[7,4],ymm2[4,4]
vblendps ymm1, ymm2, ymm1, 48 # ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
vshufps ymm9, ymm9, ymm2, 216 # ymm9 = ymm9[0,2],ymm2[1,3],ymm9[4,6],ymm2[5,7]
# ymm2 is free now; it receives the 8-byte constant at [rdx] in every qword.
vbroadcastsd ymm2, qword ptr [rdx]
vpermpd ymm9, ymm9, 236 # ymm9 = ymm9[0,3,2,3]
vblendps ymm6, ymm7, ymm6, 7 # ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
# Last lane of the second output vector comes straight from source dword +144.
vbroadcastss ymm7, dword ptr [rsi + 144]
vblendps ymm6, ymm6, ymm7, 128 # ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
# ymm7 = low half of ymm0 (source +128) with source bytes +96..+111 on top.
vinsertf128 ymm7, ymm0, xmmword ptr [rsi + 96], 1
# Output vectors are stored as soon as each one is complete.
vmovaps ymmword ptr [rdi], ymm3
vmovaps ymmword ptr [rdi + 32], ymm6
vshufps ymm7, ymm7, ymm4, 35 # ymm7 = ymm7[3,0],ymm4[2,0],ymm7[7,4],ymm4[6,4]
vshufps ymm7, ymm7, ymm4, 98 # ymm7 = ymm7[2,0],ymm4[2,1],ymm7[6,4],ymm4[6,5]
vblendps ymm7, ymm8, ymm7, 56 # ymm7 = ymm8[0,1,2],ymm7[3,4,5],ymm8[6,7]
vbroadcastsd ymm8, qword ptr [rcx]
movabs rcx, 140200282622856
vpermps ymm8, ymm8, ymm0
vblendps ymm7, ymm7, ymm8, 192 # ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
vshufps ymm8, ymm5, ymm4, 48 # ymm8 = ymm5[0,0],ymm4[3,0],ymm5[4,4],ymm4[7,4]
vshufps ymm8, ymm8, ymm4, 162 # ymm8 = ymm8[2,0],ymm4[2,2],ymm8[6,4],ymm4[6,6]
vperm2f128 ymm4, ymm4, ymm5, 32 # ymm4 = ymm4[0,1],ymm5[0,1]
vmovaps ymmword ptr [rdi + 64], ymm7
vblendps ymm8, ymm9, ymm8, 56 # ymm8 = ymm9[0,1,2],ymm8[3,4,5],ymm9[6,7]
vbroadcastsd ymm9, qword ptr [rcx]
movabs rcx, 140200282622832
vpermps ymm9, ymm9, ymm0
vpermps ymm0, ymm2, ymm0
vblendps ymm4, ymm4, ymm5, 34 # ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
vmovaps xmm5, xmmword ptr [rcx]
vpermps ymm1, ymm5, ymm1
vblendps ymm8, ymm8, ymm9, 192 # ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
vmovaps ymmword ptr [rdi + 96], ymm8
vblendps ymm1, ymm1, ymm4, 56 # ymm1 = ymm1[0,1,2],ymm4[3,4,5],ymm1[6,7]
vblendps ymm0, ymm1, ymm0, 192 # ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
vmovaps ymmword ptr [rdi + 128], ymm0
# Clear the upper 128 bits of every ymm register before returning.
vzeroupper
ret
# Follows ret, so never reached by fall-through; presumably padding before the
# next routine.
nop dword ptr [rax + rax]
# ----------------------------------------------------------------------------
# Unlabeled routine (its symbol/label is not part of this listing).
# Straight-line AVX2 permute/blend network over 32-bit lanes: reads 192 bytes
# and writes six 32-byte vectors of rearranged data.
# NOTE(review): presumably a 6-vector case of the gist's
# "Tiny Transpose {5,6,7,8} x 8" (a second variant next to the earlier
# 192-byte routine) -- TODO confirm against the generating source.
#
# In:    rsi = source, bytes [rsi, rsi+192) are read. The vmovaps xmm load
#        from [rsi + 80] needs rsi 16-byte aligned.
# Out:   rdi = destination, bytes [rdi, rdi+192) are written with vmovaps, so
#        rdi must be 32-byte aligned. rax = rdi on return.
# Uses:  rcx, ymm0-ymm14, constants at .rodata.cst16, .rodata.cst8 and at
#        several raw absolute addresses.
# NOTE(review): immediates such as 140200282654384 are absolute addresses
#        (presumably constant-pool entries fixed up at JIT/link time). Their
#        contents are not visible here, so this listing cannot be
#        reassembled or relocated as-is.
# ----------------------------------------------------------------------------
# Load the six 32-byte input vectors (unaligned loads) and the first constants.
vmovups ymm5, ymmword ptr [rsi + 96]
vmovups ymm6, ymmword ptr [rsi + 64]
movabs rcx, offset .rodata.cst16
vmovups ymm2, ymmword ptr [rsi]
vmovups ymm4, ymmword ptr [rsi + 32]
vmovups ymm0, ymmword ptr [rsi + 128]
vmovups ymm1, ymmword ptr [rsi + 160]
# Return value: the destination pointer.
mov rax, rdi
# 16-byte vpermps index constant; the xmm load zeroes the upper half of ymm8.
vmovaps xmm8, xmmword ptr [rcx]
movabs rcx, offset .rodata.cst8
# ymm14 = 8-byte constant repeated in every qword; used as vpermps indices.
vbroadcastsd ymm14, qword ptr [rcx]
movabs rcx, 140200282654384
# Permute/blend network. The trailing comment on each instruction gives the
# exact lane selection; rcx is re-pointed at a new constant before each
# constant load, so instruction order matters.
vperm2f128 ymm7, ymm6, ymm5, 32 # ymm7 = ymm6[0,1],ymm5[0,1]
vblendps ymm9, ymm2, ymm4, 48 # ymm9 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
vblendps ymm10, ymm1, ymm0, 48 # ymm10 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
vshufps ymm12, ymm2, ymm4, 19 # ymm12 = ymm2[3,0],ymm4[1,0],ymm2[7,4],ymm4[5,4]
vpermps ymm8, ymm8, ymm9
vshufps ymm12, ymm12, ymm4, 248 # ymm12 = ymm12[0,2],ymm4[3,3],ymm12[4,6],ymm4[7,7]
vpermps ymm11, ymm14, ymm10
vpermpd ymm12, ymm12, 236 # ymm12 = ymm12[0,3,2,3]
vshufps ymm3, ymm5, ymm7, 2 # ymm3 = ymm5[2,0],ymm7[0,0],ymm5[6,4],ymm7[4,4]
vshufps ymm3, ymm3, ymm7, 162 # ymm3 = ymm3[2,0],ymm7[2,2],ymm3[6,4],ymm7[6,6]
vblendps ymm3, ymm8, ymm3, 56 # ymm3 = ymm8[0,1,2],ymm3[3,4,5],ymm8[6,7]
vblendps ymm3, ymm3, ymm11, 192 # ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7]
vshufps ymm11, ymm5, ymm7, 19 # ymm11 = ymm5[3,0],ymm7[1,0],ymm5[7,4],ymm7[5,4]
vshufps ymm7, ymm11, ymm7, 226 # ymm7 = ymm11[2,0],ymm7[2,3],ymm11[6,4],ymm7[6,7]
vmovaps xmm11, xmmword ptr [rcx]
movabs rcx, 140200282654440
# ymm13 = second broadcast 8-byte index constant.
vbroadcastsd ymm13, qword ptr [rcx]
movabs rcx, 140200282654400
vpermps ymm9, ymm11, ymm9
vshufps ymm11, ymm2, ymm4, 2 # ymm11 = ymm2[2,0],ymm4[0,0],ymm2[6,4],ymm4[4,4]
vblendps ymm2, ymm4, ymm2, 48 # ymm2 = ymm4[0,1,2,3],ymm2[4,5],ymm4[6,7]
vpermps ymm10, ymm13, ymm10
vshufps ymm11, ymm11, ymm4, 232 # ymm11 = ymm11[0,2],ymm4[2,3],ymm11[4,6],ymm4[6,7]
vpermps ymm4, ymm14, ymm2
vpermps ymm2, ymm13, ymm2
vpermpd ymm11, ymm11, 236 # ymm11 = ymm11[0,3,2,3]
vblendps ymm7, ymm9, ymm7, 56 # ymm7 = ymm9[0,1,2],ymm7[3,4,5],ymm9[6,7]
vblendps ymm7, ymm7, ymm10, 192 # ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
vshufps ymm10, ymm6, ymm5, 32 # ymm10 = ymm6[0,0],ymm5[2,0],ymm6[4,4],ymm5[6,4]
vshufps ymm10, ymm10, ymm5, 226 # ymm10 = ymm10[2,0],ymm5[2,3],ymm10[6,4],ymm5[6,7]
vpermpd ymm10, ymm10, 200 # ymm10 = ymm10[0,2,0,3]
vblendps ymm10, ymm11, ymm10, 24 # ymm10 = ymm11[0,1,2],ymm10[3,4],ymm11[5,6,7]
vshufps ymm11, ymm1, ymm0, 32 # ymm11 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
vshufps ymm11, ymm0, ymm11, 32 # ymm11 = ymm0[0,0],ymm11[2,0],ymm0[4,4],ymm11[6,4]
vpermpd ymm11, ymm11, 196 # ymm11 = ymm11[0,1,0,3]
vblendps ymm10, ymm10, ymm11, 224 # ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7]
vshufps ymm11, ymm6, ymm5, 49 # ymm11 = ymm6[1,0],ymm5[3,0],ymm6[5,4],ymm5[7,4]
vshufps ymm11, ymm11, ymm5, 226 # ymm11 = ymm11[2,0],ymm5[2,3],ymm11[6,4],ymm5[6,7]
vblendps ymm5, ymm6, ymm5, 240 # ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
# Reload source bytes +80..+95 into xmm6 (aligned load); this replaces the
# earlier ymm6 contents and zeroes its upper half.
vmovaps xmm6, xmmword ptr [rsi + 80]
# Output vectors are stored as soon as each one is complete.
vmovaps ymmword ptr [rdi], ymm3
vmovaps ymmword ptr [rdi + 32], ymm7
vmovaps ymmword ptr [rdi + 64], ymm10
vpermpd ymm11, ymm11, 200 # ymm11 = ymm11[0,2,0,3]
vblendps ymm11, ymm12, ymm11, 24 # ymm11 = ymm12[0,1,2],ymm11[3,4],ymm12[5,6,7]
vshufps ymm12, ymm1, ymm0, 49 # ymm12 = ymm1[1,0],ymm0[3,0],ymm1[5,4],ymm0[7,4]
vshufps ymm12, ymm0, ymm12, 36 # ymm12 = ymm0[0,1],ymm12[2,0],ymm0[4,5],ymm12[6,4]
vblendps ymm0, ymm1, ymm0, 12 # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
vbroadcastf128 ymm1, xmmword ptr [rcx] # ymm1 = mem[0,1,0,1]
movabs rcx, 140200282654416
vpermpd ymm12, ymm12, 196 # ymm12 = ymm12[0,1,0,3]
vpermps ymm1, ymm1, ymm0
vblendps ymm11, ymm11, ymm12, 224 # ymm11 = ymm11[0,1,2,3,4],ymm12[5,6,7]
vshufps ymm12, ymm6, ymm5, 2 # ymm12 = ymm6[2,0],ymm5[0,0],ymm6[6,4],ymm5[4,4]
vshufps ymm12, ymm5, ymm12, 36 # ymm12 = ymm5[0,1],ymm12[2,0],ymm5[4,5],ymm12[6,4]
vmovaps ymmword ptr [rdi + 96], ymm11
vblendps ymm4, ymm12, ymm4, 3 # ymm4 = ymm4[0,1],ymm12[2,3,4,5,6,7]
vblendps ymm1, ymm4, ymm1, 224 # ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
vshufps ymm4, ymm6, ymm5, 19 # ymm4 = ymm6[3,0],ymm5[1,0],ymm6[7,4],ymm5[5,4]
vshufps ymm4, ymm5, ymm4, 37 # ymm4 = ymm5[1,1],ymm4[2,0],ymm5[5,5],ymm4[6,4]
vbroadcastf128 ymm5, xmmword ptr [rcx] # ymm5 = mem[0,1,0,1]
vmovaps ymmword ptr [rdi + 128], ymm1
vblendps ymm2, ymm4, ymm2, 3 # ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7]
vpermps ymm0, ymm5, ymm0
vblendps ymm0, ymm2, ymm0, 224 # ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
vmovaps ymmword ptr [rdi + 160], ymm0
# Clear the upper 128 bits of every ymm register before returning.
vzeroupper
ret
# Follows ret, so never reached by fall-through; presumably padding before the
# next routine.
nop dword ptr [rax]
# ----------------------------------------------------------------------------
# Unlabeled routine (its symbol/label is not part of this listing).
# Straight-line AVX2 shuffle network over 32-bit lanes: reads 224 bytes and
# writes eight 28-byte groups, one every 32 bytes.
# NOTE(review): presumably a 7-vector case of the gist's
# "Tiny Transpose {5,6,7,8} x 8", producing 8 rows of 7 elements with a row
# stride of 8 elements -- TODO confirm against the generating source.
#
# In:    rsi = source, bytes [rsi, rsi+224) are read. The vmovaps xmm loads
#        need rsi 16-byte aligned.
# Out:   rdi = destination. For k = 0..7 the bytes [rdi+32k, rdi+32k+28) are
#        written as a 16-byte vmovaps, an 8-byte vmovlps and a 4-byte
#        vmovss/vextractps. Bytes +28..+31 of each 32-byte slot are NOT
#        written. The vmovaps stores need rdi 16-byte aligned.
#        rax = rdi on return.
# Uses:  ymm0-ymm15. No memory constants are used.
# ----------------------------------------------------------------------------
# 128-bit aligned loads: the low 16 bytes at offsets 0,32,...,192, plus
# bytes +208..+223 (xmm12).
vmovaps xmm7, xmmword ptr [rsi + 160]
vmovaps xmm0, xmmword ptr [rsi + 128]
vmovaps xmm4, xmmword ptr [rsi + 96]
vmovaps xmm6, xmmword ptr [rsi]
vmovaps xmm1, xmmword ptr [rsi + 32]
vmovaps xmm3, xmmword ptr [rsi + 64]
vmovaps xmm2, xmmword ptr [rsi + 192]
# Full 256-bit unaligned loads of the first six 32-byte vectors; their upper
# halves feed output groups 4..7 (the vextractf128 ..., 1 steps below).
vmovups ymm11, ymmword ptr [rsi + 160]
vmovups ymm15, ymmword ptr [rsi + 128]
vmovups ymm8, ymmword ptr [rsi + 96]
vmovups ymm13, ymmword ptr [rsi + 64]
vmovups ymm9, ymmword ptr [rsi + 32]
vmovups ymm14, ymmword ptr [rsi]
vmovaps xmm12, xmmword ptr [rsi + 208]
# Return value: the destination pointer.
mov rax, rdi
# Output groups 0..3 (rdi+0 .. rdi+123), built from the xmm loads. Shuffles and
# stores are interleaved; xmm2 supplies the 4-byte tail of each group.
vinsertps xmm10, xmm0, xmm7, 28 # xmm10 = xmm0[0],xmm7[0],zero,zero
vinsertps xmm5, xmm6, xmm1, 28 # xmm5 = xmm6[0],xmm1[0],zero,zero
vmovss dword ptr [rdi + 24], xmm2
vmovlps qword ptr [rdi + 16], xmm10
vshufps xmm10, xmm4, xmm3, 0 # xmm10 = xmm4[0,0],xmm3[0,0]
vshufps xmm5, xmm5, xmm10, 36 # xmm5 = xmm5[0,1],xmm10[2,0]
vinsertps xmm10, xmm7, xmm0, 76 # xmm10 = xmm0[1],xmm7[1],zero,zero
vmovaps xmmword ptr [rdi], xmm5
vmovlps qword ptr [rdi + 48], xmm10
vunpcklps xmm10, xmm3, xmm4 # xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
vinsertps xmm5, xmm1, xmm6, 76 # xmm5 = xmm6[1],xmm1[1],zero,zero
vextractps dword ptr [rdi + 56], xmm2, 1
vblendps xmm5, xmm10, xmm5, 3 # xmm5 = xmm5[0,1],xmm10[2,3]
vunpckhps xmm10, xmm0, xmm7 # xmm10 = xmm0[2],xmm7[2],xmm0[3],xmm7[3]
vmovaps xmmword ptr [rdi + 32], xmm5
vmovlps qword ptr [rdi + 80], xmm10
vinsertps xmm10, xmm3, xmm4, 179 # xmm10 = zero,zero,xmm3[2],xmm4[2]
vunpckhps xmm5, xmm6, xmm1 # xmm5 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
vinsertps xmm3, xmm4, xmm3, 227 # xmm3 = zero,zero,xmm3[3],xmm4[3]
vshufps xmm1, xmm1, xmm6, 51 # xmm1 = xmm1[3,0],xmm6[3,0]
vextractps dword ptr [rdi + 88], xmm2, 2
vblendps xmm5, xmm10, xmm5, 3 # xmm5 = xmm5[0,1],xmm10[2,3]
vshufps xmm1, xmm1, xmm3, 226 # xmm1 = xmm1[2,0],xmm3[2,3]
vmovaps xmmword ptr [rdi + 64], xmm5
vshufps xmm5, xmm7, xmm0, 51 # xmm5 = xmm7[3,0],xmm0[3,0]
vmovaps xmmword ptr [rdi + 96], xmm1
# Output groups 4..7 (rdi+128 .. rdi+251), built from the ymm loads. xmm12
# supplies the 4-byte tail of each group. The first ymm shuffle starts here,
# interleaved with the tail of group 3.
vunpcklps ymm1, ymm15, ymm11 # ymm1 = ymm15[0],ymm11[0],ymm15[1],ymm11[1],ymm15[4],ymm11[4],ymm15[5],ymm11[5]
vextractps dword ptr [rdi + 120], xmm2, 3
vshufps xmm0, xmm5, xmm0, 226 # xmm0 = xmm5[2,0],xmm0[2,3]
vmovlps qword ptr [rdi + 112], xmm0
vextractf128 xmm0, ymm1, 1
vunpcklps ymm1, ymm13, ymm8 # ymm1 = ymm13[0],ymm8[0],ymm13[1],ymm8[1],ymm13[4],ymm8[4],ymm13[5],ymm8[5]
vmovss dword ptr [rdi + 152], xmm12
vpermpd ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3]
vmovlps qword ptr [rdi + 144], xmm0
vunpcklps ymm0, ymm14, ymm9 # ymm0 = ymm14[0],ymm9[0],ymm14[1],ymm9[1],ymm14[4],ymm9[4],ymm14[5],ymm9[5]
vextractf128 xmm0, ymm0, 1
vblendps xmm0, xmm1, xmm0, 3 # xmm0 = xmm0[0,1],xmm1[2,3]
vshufps ymm1, ymm11, ymm15, 17 # ymm1 = ymm11[1,0],ymm15[1,0],ymm11[5,4],ymm15[5,4]
vshufps ymm1, ymm1, ymm15, 226 # ymm1 = ymm1[2,0],ymm15[2,3],ymm1[6,4],ymm15[6,7]
vmovaps xmmword ptr [rdi + 128], xmm0
vextractps dword ptr [rdi + 184], xmm12, 1
vextractf128 xmm0, ymm1, 1
vshufps ymm1, ymm8, ymm13, 17 # ymm1 = ymm8[1,0],ymm13[1,0],ymm8[5,4],ymm13[5,4]
vshufps ymm1, ymm1, ymm13, 226 # ymm1 = ymm1[2,0],ymm13[2,3],ymm1[6,4],ymm13[6,7]
vmovlps qword ptr [rdi + 176], xmm0
vshufps ymm0, ymm9, ymm14, 17 # ymm0 = ymm9[1,0],ymm14[1,0],ymm9[5,4],ymm14[5,4]
vpermpd ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3]
vshufps ymm0, ymm0, ymm14, 226 # ymm0 = ymm0[2,0],ymm14[2,3],ymm0[6,4],ymm14[6,7]
vextractf128 xmm0, ymm0, 1
vblendps xmm0, xmm1, xmm0, 3 # xmm0 = xmm0[0,1],xmm1[2,3]
vunpckhps ymm1, ymm15, ymm11 # ymm1 = ymm15[2],ymm11[2],ymm15[3],ymm11[3],ymm15[6],ymm11[6],ymm15[7],ymm11[7]
vmovaps xmmword ptr [rdi + 160], xmm0
vextractf128 xmm0, ymm1, 1
vunpckhps ymm1, ymm13, ymm8 # ymm1 = ymm13[2],ymm8[2],ymm13[3],ymm8[3],ymm13[6],ymm8[6],ymm13[7],ymm8[7]
vextractps dword ptr [rdi + 216], xmm12, 2
vpermpd ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3]
vmovlps qword ptr [rdi + 208], xmm0
vunpckhps ymm0, ymm14, ymm9 # ymm0 = ymm14[2],ymm9[2],ymm14[3],ymm9[3],ymm14[6],ymm9[6],ymm14[7],ymm9[7]
vextractf128 xmm0, ymm0, 1
vblendps xmm0, xmm1, xmm0, 3 # xmm0 = xmm0[0,1],xmm1[2,3]
vshufps ymm1, ymm8, ymm13, 51 # ymm1 = ymm8[3,0],ymm13[3,0],ymm8[7,4],ymm13[7,4]
vshufps ymm1, ymm1, ymm13, 226 # ymm1 = ymm1[2,0],ymm13[2,3],ymm1[6,4],ymm13[6,7]
vmovaps xmmword ptr [rdi + 192], xmm0
vshufps ymm0, ymm11, ymm15, 51 # ymm0 = ymm11[3,0],ymm15[3,0],ymm11[7,4],ymm15[7,4]
vextractps dword ptr [rdi + 248], xmm12, 3
vpermpd ymm1, ymm1, 232 # ymm1 = ymm1[0,2,2,3]
vshufps ymm0, ymm0, ymm15, 226 # ymm0 = ymm0[2,0],ymm15[2,3],ymm0[6,4],ymm15[6,7]
vextractf128 xmm0, ymm0, 1
vmovlps qword ptr [rdi + 240], xmm0
vshufps ymm0, ymm9, ymm14, 51 # ymm0 = ymm9[3,0],ymm14[3,0],ymm9[7,4],ymm14[7,4]
vshufps ymm0, ymm0, ymm14, 226 # ymm0 = ymm0[2,0],ymm14[2,3],ymm0[6,4],ymm14[6,7]
vextractf128 xmm0, ymm0, 1
vblendps xmm0, xmm1, xmm0, 3 # xmm0 = xmm0[0,1],xmm1[2,3]
vmovaps xmmword ptr [rdi + 224], xmm0
# Clear the upper 128 bits of every ymm register before returning.
vzeroupper
ret
# Follows ret, so never reached by fall-through; presumably padding before the
# next routine.
nop word ptr cs:[rax + rax]
# ----------------------------------------------------------------------------
# Unlabeled routine (its symbol/label is not part of this listing).
# Straight-line AVX2 shuffle network over 32-bit lanes: reads eight 32-byte
# vectors and writes eight 32-byte vectors of rearranged data.
# NOTE(review): presumably an 8-vector case of the gist's
# "Tiny Transpose {5,6,7,8} x 8" (a second variant next to the earlier
# 256-byte routine) -- TODO confirm against the generating source.
#
# In:    rsi = source, bytes [rsi, rsi+256) are read. The vmovaps xmm loads
#        need rsi 16-byte aligned.
# Out:   rdi = destination, bytes [rdi, rdi+256) are written with vmovaps, so
#        rdi must be 32-byte aligned. rax = rdi on return.
# Uses:  ymm0-ymm15. No memory constants are used.
# Stack: two 32-byte spill slots at [rsp - 56] and [rsp - 88], used without
#        adjusting rsp. The routine makes no calls.
# NOTE(review): this relies on memory just below rsp being safe to use
#        (e.g. the System V AMD64 red zone) -- confirm the target ABI.
# ----------------------------------------------------------------------------
# 128-bit aligned loads of the low 16 bytes of seven of the eight input
# vectors; [rsi + 64] is loaded into xmm0 further down.
vmovaps xmm10, xmmword ptr [rsi + 160]
vmovaps xmm3, xmmword ptr [rsi + 128]
vmovaps xmm5, xmmword ptr [rsi + 192]
vmovaps xmm4, xmmword ptr [rsi + 224]
vmovaps xmm12, xmmword ptr [rsi]
vmovaps xmm7, xmmword ptr [rsi + 32]
vmovaps xmm9, xmmword ptr [rsi + 96]
# Full 256-bit unaligned loads of four input vectors; the remaining ones
# ([rsi + 192], [rsi], [rsi + 224], [rsi + 160]) are loaded further down.
vmovups ymm13, ymmword ptr [rsi + 96]
vmovups ymm15, ymmword ptr [rsi + 64]
vmovups ymm11, ymmword ptr [rsi + 128]
vmovups ymm14, ymmword ptr [rsi + 32]
# Return value: the destination pointer.
mov rax, rdi
# Output vectors 0..3 are assembled from the xmm loads: a 128-bit low half
# from the first four inputs and a high half from the last four, joined by
# vinsertf128 / vblendps. The trailing comment on each instruction gives the
# exact lane selection; instruction order matters because registers are reused.
vinsertps xmm0, xmm3, xmm10, 28 # xmm0 = xmm3[0],xmm10[0],zero,zero
vshufps xmm6, xmm4, xmm5, 0 # xmm6 = xmm4[0,0],xmm5[0,0]
vinsertps xmm2, xmm12, xmm7, 28 # xmm2 = xmm12[0],xmm7[0],zero,zero
vinsertf128 ymm0, ymm0, xmm0, 1
vshufps xmm6, xmm5, xmm6, 36 # xmm6 = xmm5[0,1],xmm6[2,0]
vinsertf128 ymm6, ymm0, xmm6, 1
vblendps ymm8, ymm0, ymm6, 192 # ymm8 = ymm0[0,1,2,3,4,5],ymm6[6,7]
vmovaps xmm0, xmmword ptr [rsi + 64]
vshufps xmm6, xmm9, xmm0, 0 # xmm6 = xmm9[0,0],xmm0[0,0]
vshufps xmm2, xmm2, xmm6, 36 # xmm2 = xmm2[0,1],xmm6[2,0]
vinsertps xmm6, xmm10, xmm3, 76 # xmm6 = xmm3[1],xmm10[1],zero,zero
vblendps ymm1, ymm2, ymm8, 240 # ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7]
vunpcklps xmm2, xmm5, xmm4 # xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
vinsertf128 ymm6, ymm0, xmm6, 1
vmovups ymm8, ymmword ptr [rsi + 192]
vinsertf128 ymm2, ymm0, xmm2, 1
# Spill output vector 0; it is reloaded and stored to [rdi] at the end.
vmovups ymmword ptr [rsp - 56], ymm1
vinsertps xmm1, xmm7, xmm12, 76 # xmm1 = xmm12[1],xmm7[1],zero,zero
vblendps ymm2, ymm6, ymm2, 192 # ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
vunpcklps xmm6, xmm0, xmm9 # xmm6 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
vblendps xmm1, xmm6, xmm1, 3 # xmm1 = xmm1[0,1],xmm6[2,3]
vunpckhps xmm6, xmm3, xmm10 # xmm6 = xmm3[2],xmm10[2],xmm3[3],xmm10[3]
vblendps ymm1, ymm1, ymm2, 240 # ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
vinsertf128 ymm6, ymm0, xmm6, 1
vunpckhps xmm2, xmm12, xmm7 # xmm2 = xmm12[2],xmm7[2],xmm12[3],xmm7[3]
# Spill output vector 1; it is reloaded and stored to [rdi + 32] at the end.
vmovups ymmword ptr [rsp - 88], ymm1
vinsertps xmm1, xmm5, xmm4, 179 # xmm1 = zero,zero,xmm5[2],xmm4[2]
vinsertf128 ymm1, ymm0, xmm1, 1
vblendps ymm1, ymm6, ymm1, 192 # ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
vinsertps xmm6, xmm0, xmm9, 179 # xmm6 = zero,zero,xmm0[2],xmm9[2]
vblendps xmm2, xmm6, xmm2, 3 # xmm2 = xmm2[0,1],xmm6[2,3]
vmovups ymm6, ymmword ptr [rsi]
# ymm1 = output vector 2 (stored to [rdi + 64] at the end).
vblendps ymm1, ymm2, ymm1, 240 # ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
vinsertps xmm2, xmm4, xmm5, 227 # xmm2 = zero,zero,xmm5[3],xmm4[3]
vshufps xmm5, xmm10, xmm3, 51 # xmm5 = xmm10[3,0],xmm3[3,0]
vmovups ymm4, ymmword ptr [rsi + 224]
vmovups ymm10, ymmword ptr [rsi + 160]
vshufps xmm3, xmm5, xmm3, 226 # xmm3 = xmm5[2,0],xmm3[2,3]
vinsertf128 ymm2, ymm0, xmm2, 1
vunpckhps ymm5, ymm15, ymm13 # ymm5 = ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[6],ymm13[6],ymm15[7],ymm13[7]
vinsertf128 ymm3, ymm0, xmm3, 1
vinsertps xmm0, xmm9, xmm0, 227 # xmm0 = zero,zero,xmm0[3],xmm9[3]
vpermpd ymm5, ymm5, 232 # ymm5 = ymm5[0,2,2,3]
vblendps ymm2, ymm3, ymm2, 192 # ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
vshufps xmm3, xmm7, xmm12, 51 # xmm3 = xmm7[3,0],xmm12[3,0]
vshufps xmm0, xmm3, xmm0, 226 # xmm0 = xmm3[2,0],xmm0[2,3]
vunpcklps ymm3, ymm15, ymm13 # ymm3 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[4],ymm13[4],ymm15[5],ymm13[5]
vpermpd ymm3, ymm3, 232 # ymm3 = ymm3[0,2,2,3]
# ymm7 = output vector 3 (stored to [rdi + 96] at the end).
vblendps ymm7, ymm0, ymm2, 240 # ymm7 = ymm0[0,1,2,3],ymm2[4,5,6,7]
# Output vectors 4..7 are assembled from the full ymm loads (upper 128-bit
# halves of the first four inputs, high lanes of the last four).
vshufps ymm2, ymm4, ymm8, 0 # ymm2 = ymm4[0,0],ymm8[0,0],ymm4[4,4],ymm8[4,4]
vunpcklps ymm0, ymm11, ymm10 # ymm0 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5]
vshufps ymm2, ymm8, ymm2, 36 # ymm2 = ymm8[0,1],ymm2[2,0],ymm8[4,5],ymm2[6,4]
vblendps ymm0, ymm0, ymm2, 192 # ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
vunpcklps ymm2, ymm6, ymm14 # ymm2 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[4],ymm14[4],ymm6[5],ymm14[5]
vextractf128 xmm2, ymm2, 1
vblendps xmm2, xmm3, xmm2, 3 # xmm2 = xmm2[0,1],xmm3[2,3]
vshufps ymm3, ymm13, ymm15, 17 # ymm3 = ymm13[1,0],ymm15[1,0],ymm13[5,4],ymm15[5,4]
vshufps ymm3, ymm3, ymm15, 226 # ymm3 = ymm3[2,0],ymm15[2,3],ymm3[6,4],ymm15[6,7]
# ymm12 = output vector 4 (stored to [rdi + 128] at the end).
vblendps ymm12, ymm2, ymm0, 240 # ymm12 = ymm2[0,1,2,3],ymm0[4,5,6,7]
vshufps ymm2, ymm10, ymm11, 17 # ymm2 = ymm10[1,0],ymm11[1,0],ymm10[5,4],ymm11[5,4]
vunpcklps ymm0, ymm8, ymm4 # ymm0 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[4],ymm4[4],ymm8[5],ymm4[5]
vpermpd ymm3, ymm3, 232 # ymm3 = ymm3[0,2,2,3]
vshufps ymm2, ymm2, ymm11, 226 # ymm2 = ymm2[2,0],ymm11[2,3],ymm2[6,4],ymm11[6,7]
vblendps ymm0, ymm2, ymm0, 192 # ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
vshufps ymm2, ymm14, ymm6, 17 # ymm2 = ymm14[1,0],ymm6[1,0],ymm14[5,4],ymm6[5,4]
vshufps ymm2, ymm2, ymm6, 226 # ymm2 = ymm2[2,0],ymm6[2,3],ymm2[6,4],ymm6[6,7]
vextractf128 xmm2, ymm2, 1
vblendps xmm2, xmm3, xmm2, 3 # xmm2 = xmm2[0,1],xmm3[2,3]
vshufps ymm3, ymm4, ymm8, 34 # ymm3 = ymm4[2,0],ymm8[2,0],ymm4[6,4],ymm8[6,4]
# ymm0 = output vector 5 (stored to [rdi + 160] at the end).
vblendps ymm0, ymm2, ymm0, 240 # ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
vunpckhps ymm2, ymm11, ymm10 # ymm2 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7]
vshufps ymm3, ymm8, ymm3, 36 # ymm3 = ymm8[0,1],ymm3[2,0],ymm8[4,5],ymm3[6,4]
vblendps ymm2, ymm2, ymm3, 192 # ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
vunpckhps ymm3, ymm6, ymm14 # ymm3 = ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[6],ymm14[6],ymm6[7],ymm14[7]
vextractf128 xmm3, ymm3, 1
vblendps xmm3, xmm5, xmm3, 3 # xmm3 = xmm3[0,1],xmm5[2,3]
vshufps ymm5, ymm13, ymm15, 51 # ymm5 = ymm13[3,0],ymm15[3,0],ymm13[7,4],ymm15[7,4]
vshufps ymm5, ymm5, ymm15, 226 # ymm5 = ymm5[2,0],ymm15[2,3],ymm5[6,4],ymm15[6,7]
# ymm2 = output vector 6 (stored to [rdi + 192] at the end).
vblendps ymm2, ymm3, ymm2, 240 # ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
vunpckhps ymm3, ymm8, ymm4 # ymm3 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7]
vshufps ymm4, ymm10, ymm11, 51 # ymm4 = ymm10[3,0],ymm11[3,0],ymm10[7,4],ymm11[7,4]
vpermpd ymm5, ymm5, 232 # ymm5 = ymm5[0,2,2,3]
vshufps ymm4, ymm4, ymm11, 226 # ymm4 = ymm4[2,0],ymm11[2,3],ymm4[6,4],ymm11[6,7]
vblendps ymm3, ymm4, ymm3, 192 # ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
vshufps ymm4, ymm14, ymm6, 51 # ymm4 = ymm14[3,0],ymm6[3,0],ymm14[7,4],ymm6[7,4]
vshufps ymm4, ymm4, ymm6, 226 # ymm4 = ymm4[2,0],ymm6[2,3],ymm4[6,4],ymm6[6,7]
# Reload spilled output vector 0 (ymm6 is no longer needed as an input).
vmovups ymm6, ymmword ptr [rsp - 56]
vextractf128 xmm4, ymm4, 1
vmovaps ymmword ptr [rdi], ymm6
vblendps xmm4, xmm5, xmm4, 3 # xmm4 = xmm4[0,1],xmm5[2,3]
# Reload spilled output vector 1.
vmovups ymm5, ymmword ptr [rsp - 88]
# ymm3 = output vector 7.
vblendps ymm3, ymm4, ymm3, 240 # ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
# Write the remaining seven 32-byte output vectors (aligned stores).
vmovaps ymmword ptr [rdi + 32], ymm5
vmovaps ymmword ptr [rdi + 64], ymm1
vmovaps ymmword ptr [rdi + 96], ymm7
vmovaps ymmword ptr [rdi + 128], ymm12
vmovaps ymmword ptr [rdi + 160], ymm0
vmovaps ymmword ptr [rdi + 192], ymm2
vmovaps ymmword ptr [rdi + 224], ymm3
# Clear the upper 128 bits of every ymm register before returning.
vzeroupper
ret
# Follows ret, so never reached by fall-through; presumably padding after the
# routine.
nop word ptr [rax + rax]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment