cannonlake code for 512-bit vectors (broken)
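This is LLVM-generated Mach-O assembly (apparently from LDC, the LLVM-based D compiler) for the mir.glas.internal.gemm register kernel dot_reg_basic, instantiated for a 6x2 block of 16-float (512-bit) vectors per the mangled template parameters. The "broken" in the title most plausibly refers to the codegen quality: in the hot loop LBB0_1, each of the six scalars loaded from (%rdx) is splatted across a 512-bit register through a long chain of vinsertps/vextractf32x4/vinsertf32x4 instructions instead of a single broadcast.

A minimal sketch, assuming AVX-512F, of what one broadcast-and-accumulate step appears intended to compute (register names and offsets mirror the loop below; this sketch is not part of the gist):

    vbroadcastss (%rdx), %zmm8                ## splat the scalar into all 16 lanes
    vfmadd231ps  -576(%rcx), %zmm8, %zmm29    ## zmm29 += packed column * scalar

In that form each scalar costs two instructions per iteration, against the multi-instruction insert/extract expansion visible in LBB0_1.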
    .section __TEXT,__text,regular,pure_instructions
    .globl __D3mir4glas8internal4gemm48__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG16fTfZ13dot_reg_basicFNbNiPxG1G2NhG16fPxG6G1fmKG6G1G2NhG16fZPxG1G2NhG16f
    .weak_definition __D3mir4glas8internal4gemm48__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG16fTfZ13dot_reg_basicFNbNiPxG1G2NhG16fPxG6G1fmKG6G1G2NhG16fZPxG1G2NhG16f
    .p2align 4, 0x90
__D3mir4glas8internal4gemm48__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG16fTfZ13dot_reg_basicFNbNiPxG1G2NhG16fPxG6G1fmKG6G1G2NhG16fZPxG1G2NhG16f:
    .cfi_startproc
    subq $376, %rsp
Ltmp0:
    .cfi_def_cfa_offset 384
    movq %rsi, %rax
    shlq $7, %rax
    addq %rcx, %rax
    addq $576, %rcx
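    ## zero the 12 zmm accumulators (6x2 register block)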
    vpxord %zmm0, %zmm0, %zmm0
    vpxord %zmm4, %zmm4, %zmm4
    vpxord %zmm30, %zmm30, %zmm30
    vpxord %zmm31, %zmm31, %zmm31
    vpxord %zmm22, %zmm22, %zmm22
    vpxord %zmm23, %zmm23, %zmm23
    vpxord %zmm24, %zmm24, %zmm24
    vpxord %zmm25, %zmm25, %zmm25
    vpxord %zmm26, %zmm26, %zmm26
    vpxord %zmm27, %zmm27, %zmm27
    vpxord %zmm28, %zmm28, %zmm28
    vpxord %zmm29, %zmm29, %zmm29
    .p2align 4, 0x90
LBB0_1:
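    ## loop body: the six scalars at (%rdx)..20(%rdx) are each splatted across
    ## 512 bits via vinsertps/vextractf32x4/vinsertf32x4 chains rather than a
    ## single vbroadcastss; accumulators are spilled to and reloaded from the stack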
    vmovups %zmm2, -128(%rsp)
    vmovaps %zmm29, %zmm16
    vmovaps %zmm28, %zmm17
    vmovaps %zmm27, %zmm18
    vmovaps %zmm26, %zmm19
    vmovaps %zmm25, %zmm20
    vmovaps %zmm24, %zmm21
    vmovups %zmm23, -64(%rsp)
    vmovups %zmm22, (%rsp)
    vmovups %zmm31, 64(%rsp)
    vmovups %zmm30, 128(%rsp)
    vmovups %zmm4, 192(%rsp)
    vmovups %zmm0, 256(%rsp)
    prefetcht0 -64(%rcx)
    prefetcht0 (%rcx)
    vmovaps -576(%rcx), %zmm11
    vmovaps -512(%rcx), %zmm10
    vmovss (%rdx), %xmm4
    vmovss 4(%rdx), %xmm0
    vblendps $1, %xmm4, %xmm8, %xmm9
    vinsertf32x4 $0, %xmm9, %zmm8, %zmm8
    vinsertps $16, %xmm4, %xmm8, %xmm1
    vinsertf32x4 $0, %xmm1, %zmm8, %zmm1
    vinsertps $32, %xmm4, %xmm1, %xmm2
    vinsertf32x4 $0, %xmm2, %zmm1, %zmm1
    vinsertps $48, %xmm4, %xmm1, %xmm2
    vinsertf32x4 $0, %xmm2, %zmm1, %zmm1
    vextractf32x4 $1, %zmm1, %xmm2
    vblendps $1, %xmm4, %xmm2, %xmm2
    vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
    vextractf32x4 $1, %zmm1, %xmm2
    vinsertps $16, %xmm4, %xmm2, %xmm2
    vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
    vextractf32x4 $1, %zmm1, %xmm2
    vinsertps $32, %xmm4, %xmm2, %xmm2
    vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
    vextractf32x4 $1, %zmm1, %xmm2
    vinsertps $48, %xmm4, %xmm2, %xmm2
    vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
    vblendps $1, %xmm0, %xmm6, %xmm2
    vinsertf32x4 $0, %xmm2, %zmm6, %zmm2
    vextractf32x4 $2, %zmm1, %xmm6
    vinsertps $16, %xmm0, %xmm2, %xmm7
    vinsertf32x4 $0, %xmm7, %zmm2, %zmm2
    vinsertps $32, %xmm0, %xmm2, %xmm7
    vblendps $1, %xmm4, %xmm6, %xmm6
    vinsertf32x4 $0, %xmm7, %zmm2, %zmm2
    vinsertps $48, %xmm0, %xmm2, %xmm7
    vinsertf32x4 $0, %xmm7, %zmm2, %zmm2
    vinsertf32x4 $2, %xmm6, %zmm1, %zmm1
    vextractf32x4 $1, %zmm2, %xmm6
    vblendps $1, %xmm0, %xmm6, %xmm6
    vinsertf32x4 $1, %xmm6, %zmm2, %zmm2
    vextractf32x4 $2, %zmm1, %xmm6
    vextractf32x4 $1, %zmm2, %xmm7
    vinsertps $16, %xmm0, %xmm7, %xmm7
    vinsertf32x4 $1, %xmm7, %zmm2, %zmm2
    vinsertps $16, %xmm4, %xmm6, %xmm6
    vextractf32x4 $1, %zmm2, %xmm7
    vinsertps $32, %xmm0, %xmm7, %xmm7
    vinsertf32x4 $1, %xmm7, %zmm2, %zmm2
    vinsertf32x4 $2, %xmm6, %zmm1, %zmm1
    vextractf32x4 $1, %zmm2, %xmm6
    vinsertps $48, %xmm0, %xmm6, %xmm6
    vinsertf32x4 $1, %xmm6, %zmm2, %zmm2
    vextractf32x4 $2, %zmm1, %xmm6
    vextractf32x4 $2, %zmm2, %xmm7
    vblendps $1, %xmm0, %xmm7, %xmm7
    vinsertf32x4 $2, %xmm7, %zmm2, %zmm2
    vinsertps $32, %xmm4, %xmm6, %xmm6
    vextractf32x4 $2, %zmm2, %xmm7
    vinsertps $16, %xmm0, %xmm7, %xmm7
    vinsertf32x4 $2, %xmm7, %zmm2, %zmm2
    vinsertf32x4 $2, %xmm6, %zmm1, %zmm8
    vextractf32x4 $2, %zmm2, %xmm6
    vinsertps $32, %xmm0, %xmm6, %xmm7
    vmovss 8(%rdx), %xmm6
    vblendps $1, %xmm6, %xmm15, %xmm3
    vinsertf32x4 $2, %xmm7, %zmm2, %zmm2
    vinsertf32x4 $0, %xmm3, %zmm15, %zmm3
    vinsertps $16, %xmm6, %xmm3, %xmm5
    vinsertf32x4 $0, %xmm5, %zmm3, %zmm3
    vextractf32x4 $2, %zmm8, %xmm5
    vinsertps $32, %xmm6, %xmm3, %xmm7
    vinsertf32x4 $0, %xmm7, %zmm3, %zmm3
    vinsertps $48, %xmm6, %xmm3, %xmm7
    vextractf32x4 $2, %zmm2, %xmm1
    vinsertf32x4 $0, %xmm7, %zmm3, %zmm3
    vextractf32x4 $1, %zmm3, %xmm7
    vblendps $1, %xmm6, %xmm7, %xmm7
    vinsertps $48, %xmm4, %xmm5, %xmm5
    vinsertf32x4 $1, %xmm7, %zmm3, %zmm3
    vextractf32x4 $1, %zmm3, %xmm7
    vinsertps $16, %xmm6, %xmm7, %xmm7
    vinsertps $48, %xmm0, %xmm1, %xmm1
    vinsertf32x4 $1, %xmm7, %zmm3, %zmm3
    vextractf32x4 $1, %zmm3, %xmm7
    vinsertps $32, %xmm6, %xmm7, %xmm7
    vinsertf32x4 $2, %xmm5, %zmm8, %zmm8
    vinsertf32x4 $1, %xmm7, %zmm3, %zmm3
    vextractf32x4 $1, %zmm3, %xmm5
    vinsertps $48, %xmm6, %xmm5, %xmm5
    vinsertf32x4 $2, %xmm1, %zmm2, %zmm22
    vinsertf32x4 $1, %xmm5, %zmm3, %zmm2
    vextractf32x4 $2, %zmm2, %xmm3
    vblendps $1, %xmm6, %xmm3, %xmm3
    vextractf32x4 $3, %zmm8, %xmm5
    vinsertf32x4 $2, %xmm3, %zmm2, %zmm2
    vextractf32x4 $2, %zmm2, %xmm3
    vinsertps $16, %xmm6, %xmm3, %xmm3
    vextractf32x4 $3, %zmm22, %xmm7
    vinsertf32x4 $2, %xmm3, %zmm2, %zmm2
    vextractf32x4 $2, %zmm2, %xmm3
    vinsertps $32, %xmm6, %xmm3, %xmm3
    vblendps $1, %xmm4, %xmm5, %xmm9
    vinsertf32x4 $2, %xmm3, %zmm2, %zmm2
    vextractf32x4 $2, %zmm2, %xmm3
    vinsertps $48, %xmm6, %xmm3, %xmm3
    vblendps $1, %xmm0, %xmm7, %xmm7
    vinsertf32x4 $2, %xmm3, %zmm2, %zmm2
    vextractf32x4 $3, %zmm2, %xmm3
    vblendps $1, %xmm6, %xmm3, %xmm12
    vmovss 12(%rdx), %xmm5
    vinsertf32x4 $3, %xmm9, %zmm8, %zmm8
    vblendps $1, %xmm5, %xmm13, %xmm3
    vinsertf32x4 $0, %xmm3, %zmm13, %zmm3
    vinsertps $16, %xmm5, %xmm3, %xmm1
    vinsertf32x4 $3, %xmm7, %zmm22, %zmm9
    vinsertf32x4 $0, %xmm1, %zmm3, %zmm1
    vinsertps $32, %xmm5, %xmm1, %xmm3
    vinsertf32x4 $0, %xmm3, %zmm1, %zmm1
    vinsertf32x4 $3, %xmm12, %zmm2, %zmm12
    vinsertps $48, %xmm5, %xmm1, %xmm3
    vinsertf32x4 $0, %xmm3, %zmm1, %zmm1
    vextractf32x4 $1, %zmm1, %xmm3
    vextractf32x4 $3, %zmm8, %xmm7
    vblendps $1, %xmm5, %xmm3, %xmm3
    vinsertf32x4 $1, %xmm3, %zmm1, %zmm1
    vextractf32x4 $1, %zmm1, %xmm3
    vextractf32x4 $3, %zmm9, %xmm2
    vinsertps $16, %xmm5, %xmm3, %xmm3
    vinsertf32x4 $1, %xmm3, %zmm1, %zmm1
    vextractf32x4 $1, %zmm1, %xmm3
    vextractf32x4 $3, %zmm12, %xmm13
    vinsertps $32, %xmm5, %xmm3, %xmm3
    vinsertf32x4 $1, %xmm3, %zmm1, %zmm1
    vextractf32x4 $1, %zmm1, %xmm3
    vinsertps $16, %xmm4, %xmm7, %xmm7
    vinsertps $48, %xmm5, %xmm3, %xmm3
    vinsertf32x4 $1, %xmm3, %zmm1, %zmm1
    vextractf32x4 $2, %zmm1, %xmm3
    vinsertps $16, %xmm0, %xmm2, %xmm2
    vblendps $1, %xmm5, %xmm3, %xmm3
    vinsertf32x4 $2, %xmm3, %zmm1, %zmm1
    vextractf32x4 $2, %zmm1, %xmm3
    vinsertps $16, %xmm6, %xmm13, %xmm13
    vinsertps $16, %xmm5, %xmm3, %xmm3
    vinsertf32x4 $2, %xmm3, %zmm1, %zmm1
    vextractf32x4 $2, %zmm1, %xmm3
    vinsertf32x4 $3, %xmm7, %zmm8, %zmm23
    vinsertps $32, %xmm5, %xmm3, %xmm3
    vinsertf32x4 $2, %xmm3, %zmm1, %zmm1
    vextractf32x4 $2, %zmm1, %xmm3
    vinsertf32x4 $3, %xmm2, %zmm9, %zmm22
    vinsertps $48, %xmm5, %xmm3, %xmm3
    vinsertf32x4 $2, %xmm3, %zmm1, %zmm1
    vextractf32x4 $3, %zmm1, %xmm3
    vinsertf32x4 $3, %xmm13, %zmm12, %zmm8
    vblendps $1, %xmm5, %xmm3, %xmm3
    vinsertf32x4 $3, %xmm3, %zmm1, %zmm1
    vextractf32x4 $3, %zmm1, %xmm3
    vextractf32x4 $3, %zmm23, %xmm9
    vinsertps $16, %xmm5, %xmm3, %xmm3
    vinsertf32x4 $3, %xmm3, %zmm1, %zmm12
    vmovss 16(%rdx), %xmm3
    vblendps $1, %xmm3, %xmm14, %xmm2
    vextractf32x4 $3, %zmm22, %xmm13
    vinsertf32x4 $0, %xmm2, %zmm14, %zmm2
    vinsertps $16, %xmm3, %xmm2, %xmm1
    vinsertf32x4 $0, %xmm1, %zmm2, %zmm1
    vextractf32x4 $3, %zmm8, %xmm14
    vinsertps $32, %xmm3, %xmm1, %xmm7
    vinsertf32x4 $0, %xmm7, %zmm1, %zmm1
    vinsertps $48, %xmm3, %xmm1, %xmm7
    vinsertps $32, %xmm4, %xmm9, %xmm9
    vinsertf32x4 $0, %xmm7, %zmm1, %zmm1
    vextractf32x4 $3, %zmm12, %xmm7
    vextractf32x4 $1, %zmm1, %xmm2
    vinsertps $32, %xmm0, %xmm13, %xmm13
    vblendps $1, %xmm3, %xmm2, %xmm2
    vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
    vextractf32x4 $1, %zmm1, %xmm2
    vinsertps $32, %xmm6, %xmm14, %xmm14
    vinsertps $16, %xmm3, %xmm2, %xmm2
    vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
    vextractf32x4 $1, %zmm1, %xmm2
    vinsertps $32, %xmm5, %xmm7, %xmm7
    vinsertps $32, %xmm3, %xmm2, %xmm2
    vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
    vextractf32x4 $1, %zmm1, %xmm2
    vinsertf32x4 $3, %xmm9, %zmm23, %zmm9
    vinsertps $48, %xmm3, %xmm2, %xmm2
    vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
    vextractf32x4 $2, %zmm1, %xmm2
    vinsertf32x4 $3, %xmm13, %zmm22, %zmm13
    vblendps $1, %xmm3, %xmm2, %xmm2
    vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
    vextractf32x4 $2, %zmm1, %xmm2
    vinsertf32x4 $3, %xmm14, %zmm8, %zmm14
    vinsertps $16, %xmm3, %xmm2, %xmm2
    vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
    vextractf32x4 $2, %zmm1, %xmm2
    vinsertf32x4 $3, %xmm7, %zmm12, %zmm12
    vinsertps $32, %xmm3, %xmm2, %xmm2
    vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
    vextractf32x4 $2, %zmm1, %xmm2
    vextractf32x4 $3, %zmm9, %xmm8
    vinsertps $48, %xmm3, %xmm2, %xmm2
    vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
    vextractf32x4 $3, %zmm1, %xmm2
    vextractf32x4 $3, %zmm13, %xmm7
    vblendps $1, %xmm3, %xmm2, %xmm2
    vinsertf32x4 $3, %xmm2, %zmm1, %zmm1
    vextractf32x4 $3, %zmm1, %xmm2
    vinsertps $48, %xmm4, %xmm8, %xmm4
    vinsertps $16, %xmm3, %xmm2, %xmm2
    vinsertf32x4 $3, %xmm2, %zmm1, %zmm1
    vextractf32x4 $3, %zmm14, %xmm2
    vinsertps $48, %xmm0, %xmm7, %xmm15
    vextractf32x4 $3, %zmm1, %xmm0
    vinsertps $32, %xmm3, %xmm0, %xmm0
    vinsertf32x4 $3, %xmm0, %zmm1, %zmm1
    vmovss 20(%rdx), %xmm0
    vinsertps $48, %xmm6, %xmm2, %xmm2
    vmovups -128(%rsp), %zmm7
    vblendps $1, %xmm0, %xmm7, %xmm6
    vinsertf32x4 $0, %xmm6, %zmm7, %zmm6
    vextractf32x4 $3, %zmm12, %xmm7
    vinsertps $48, %xmm5, %xmm7, %xmm7
    vinsertps $16, %xmm0, %xmm6, %xmm5
    vinsertf32x4 $0, %xmm5, %zmm6, %zmm5
    vextractf32x4 $3, %zmm1, %xmm6
    vinsertps $48, %xmm3, %xmm6, %xmm3
    vinsertps $32, %xmm0, %xmm5, %xmm6
    vinsertf32x4 $0, %xmm6, %zmm5, %zmm5
    vinsertps $48, %xmm0, %xmm5, %xmm6
    vinsertf32x4 $3, %xmm4, %zmm9, %zmm8
    vinsertf32x4 $0, %xmm6, %zmm5, %zmm4
    vextractf32x4 $1, %zmm4, %xmm5
    vblendps $1, %xmm0, %xmm5, %xmm5
    vinsertf32x4 $3, %xmm15, %zmm13, %zmm6
    vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
    vextractf32x4 $1, %zmm4, %xmm5
    vinsertps $16, %xmm0, %xmm5, %xmm5
    vinsertf32x4 $3, %xmm2, %zmm14, %zmm15
    vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
    vextractf32x4 $1, %zmm2, %xmm4
    vinsertps $32, %xmm0, %xmm4, %xmm4
    vinsertf32x4 $3, %xmm7, %zmm12, %zmm13
    vinsertf32x4 $1, %xmm4, %zmm2, %zmm2
    vextractf32x4 $1, %zmm2, %xmm4
    vinsertps $48, %xmm0, %xmm4, %xmm4
    vinsertf32x4 $3, %xmm3, %zmm1, %zmm14
    vmovaps %zmm8, %zmm29
    vinsertf32x4 $1, %xmm4, %zmm2, %zmm1
    vextractf32x4 $2, %zmm1, %xmm2
    vblendps $1, %xmm0, %xmm2, %xmm2
    vfmadd213ps %zmm16, %zmm11, %zmm29
    vmovaps %zmm8, %zmm28
    vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
    vextractf32x4 $2, %zmm1, %xmm2
    vinsertps $16, %xmm0, %xmm2, %xmm2
    vfmadd213ps %zmm17, %zmm10, %zmm28
    vmovaps %zmm6, %zmm27
    vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
    vextractf32x4 $2, %zmm1, %xmm2
    vinsertps $32, %xmm0, %xmm2, %xmm2
    vfmadd213ps %zmm18, %zmm11, %zmm27
    vmovaps %zmm6, %zmm26
    vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
    vextractf32x4 $2, %zmm1, %xmm2
    vinsertps $48, %xmm0, %xmm2, %xmm2
    vfmadd213ps %zmm19, %zmm10, %zmm26
    vmovaps %zmm15, %zmm25
    vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
    vextractf32x4 $3, %zmm1, %xmm2
    vblendps $1, %xmm0, %xmm2, %xmm2
    vfmadd213ps %zmm20, %zmm11, %zmm25
    vmovaps %zmm15, %zmm24
    vinsertf32x4 $3, %xmm2, %zmm1, %zmm1
    vextractf32x4 $3, %zmm1, %xmm2
    vinsertps $16, %xmm0, %xmm2, %xmm2
    vfmadd213ps %zmm21, %zmm10, %zmm24
    vmovaps %zmm13, %zmm23
    vinsertf32x4 $3, %xmm2, %zmm1, %zmm1
    vextractf32x4 $3, %zmm1, %xmm2
    vinsertps $32, %xmm0, %xmm2, %xmm2
    vmovups -64(%rsp), %zmm3
    vfmadd213ps %zmm3, %zmm11, %zmm23
    vmovaps %zmm13, %zmm22
    vinsertf32x4 $3, %xmm2, %zmm1, %zmm1
    vextractf32x4 $3, %zmm1, %xmm2
    vinsertps $48, %xmm0, %xmm2, %xmm0
    vmovups (%rsp), %zmm2
    vfmadd213ps %zmm2, %zmm10, %zmm22
    vinsertf32x4 $3, %xmm0, %zmm1, %zmm2
    vmovaps %zmm14, %zmm31
    vmovups 64(%rsp), %zmm0
    vfmadd213ps %zmm0, %zmm11, %zmm31
    vmovaps %zmm14, %zmm30
    vmovups 128(%rsp), %zmm0
    vfmadd213ps %zmm0, %zmm10, %zmm30
    vmovaps %zmm2, %zmm4
    vmovups 192(%rsp), %zmm0
    vfmadd213ps %zmm0, %zmm11, %zmm4
    vmovaps %zmm2, %zmm0
    subq $-128, %rcx
    addq $24, %rdx
    addq $-1, %rsi
    vmovups 256(%rsp), %zmm1
    vfmadd213ps %zmm1, %zmm10, %zmm0
    jne LBB0_1
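    ## loop done: store the 12 accumulators to the 768-byte result at (%rdi)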
    vmovaps %zmm29, (%rdi)
    vmovaps %zmm28, 64(%rdi)
    vmovaps %zmm27, 128(%rdi)
    vmovaps %zmm26, 192(%rdi)
    vmovaps %zmm25, 256(%rdi)
    vmovaps %zmm24, 320(%rdi)
    vmovaps %zmm23, 384(%rdi)
    vmovaps %zmm22, 448(%rdi)
    vmovaps %zmm31, 512(%rdi)
    vmovaps %zmm30, 576(%rdi)
    vmovaps %zmm4, 640(%rdi)
    vmovaps %zmm0, 704(%rdi)
    addq $376, %rsp
    retq
    .cfi_endproc
    .globl __D3mir4glas8internal4gemm36__T10prefetch_rVmi128Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv
    .weak_definition __D3mir4glas8internal4gemm36__T10prefetch_rVmi128Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv
    .p2align 4, 0x90
__D3mir4glas8internal4gemm36__T10prefetch_rVmi128Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv:
    .cfi_startproc
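    ## prefetch two consecutive cache lines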
    prefetcht0 512(%rsi)
    prefetcht0 576(%rsi)
    retq
    .cfi_endproc
    .globl __D3mir4glas8internal4copy40__T9load_nanoVmi2Vmi1Vmi6TNhG16fTNhG16fZ9load_nanoFNaNbNiNfKG6G1G2NhG16fKG6G1G2NhG16fZv
    .weak_definition __D3mir4glas8internal4copy40__T9load_nanoVmi2Vmi1Vmi6TNhG16fTNhG16fZ9load_nanoFNaNbNiNfKG6G1G2NhG16fKG6G1G2NhG16fZv
    .p2align 4, 0x90
__D3mir4glas8internal4copy40__T9load_nanoVmi2Vmi1Vmi6TNhG16fTNhG16fZ9load_nanoFNaNbNiNfKG6G1G2NhG16fKG6G1G2NhG16fZv:
    .cfi_startproc
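    ## copy 12 aligned 512-bit vectors (768 bytes) from (%rdi) to (%rsi)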
    vmovaps (%rdi), %zmm0
    vmovaps %zmm0, (%rsi)
    vmovaps 64(%rdi), %zmm0
    vmovaps %zmm0, 64(%rsi)
    vmovaps 128(%rdi), %zmm0
    vmovaps %zmm0, 128(%rsi)
    vmovaps 192(%rdi), %zmm0
    vmovaps %zmm0, 192(%rsi)
    vmovaps 256(%rdi), %zmm0
    vmovaps %zmm0, 256(%rsi)
    vmovaps 320(%rdi), %zmm0
    vmovaps %zmm0, 320(%rsi)
    vmovaps 384(%rdi), %zmm0
    vmovaps %zmm0, 384(%rsi)
    vmovaps 448(%rdi), %zmm0
    vmovaps %zmm0, 448(%rsi)
    vmovaps 512(%rdi), %zmm0
    vmovaps %zmm0, 512(%rsi)
    vmovaps 576(%rdi), %zmm0
    vmovaps %zmm0, 576(%rsi)
    vmovaps 640(%rdi), %zmm0
    vmovaps %zmm0, 640(%rsi)
    vmovaps 704(%rdi), %zmm0
    vmovaps %zmm0, 704(%rsi)
    retq
    .cfi_endproc
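    ## D runtime hook: prepends this module's reference to the global __Dmodule_ref list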
    .p2align 4, 0x90
__D3mir4glas8internal4gemm16__moduleinfoCtorZ:
    movq __Dmodule_ref@GOTPCREL(%rip), %rax
    movq (%rax), %rcx
    movq %rcx, __D3mir4glas8internal4gemm11__moduleRefZ(%rip)
    leaq __D3mir4glas8internal4gemm11__moduleRefZ(%rip), %rcx
    movq %rcx, (%rax)
    retq
    .section __DATA,__data
    .globl __D3mir4glas8internal4gemm12__ModuleInfoZ
    .p2align 4
__D3mir4glas8internal4gemm12__ModuleInfoZ:
    .long 2147484672
    .long 0
    .quad 2
    .quad __D3mir4glas6common12__ModuleInfoZ
    .quad __D3mir4glas8internal12__ModuleInfoZ
    .asciz "mir.glas.internal.gemm"
    .space 1
    .p2align 3
__D3mir4glas8internal4gemm11__moduleRefZ:
    .quad 0
    .quad __D3mir4glas8internal4gemm12__ModuleInfoZ
    .section __DATA,__mod_init_func,mod_init_funcs
    .p2align 3
    .quad __D3mir4glas8internal4gemm16__moduleinfoCtorZ
    .subsections_via_symbols