@9il · Created September 23, 2016 07:02
cannonlake code for 512-bit vectors (broken)
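
For orientation: the mangled symbol below is an instantiation of mir.glas.internal.gemm.dot_reg_basic, and from its template arguments (a 2x6 block of __vector(float[16])) and the instruction stream it appears to compute a register-blocked rank-k update. Here is a minimal C sketch of that computation; the identifiers (dot_reg_basic_sketch, broadcast_lanewise) and the exact argument layout are illustrative assumptions, not mir-glas source.

/* Minimal sketch, assuming a 2x6 register block of 16-float vectors.
 * Identifiers are hypothetical; only the shape of the computation is
 * taken from the assembly below. Uses GCC/Clang vector extensions. */
enum { VLEN = 16, MR = 2, NR = 6 };            /* lanes, A block, B block */

typedef float v16f __attribute__((vector_size(64)));

/* The generated code fills each lane one at a time (the long vinsertps /
 * vextractf32x4 chains below); an efficient kernel would use a single
 * vbroadcastss per scalar instead. */
static v16f broadcast_lanewise(float x)
{
    v16f v;
    for (int i = 0; i < VLEN; ++i)
        v[i] = x;
    return v;
}

/* c[NR][MR] += a[k][MR] * b[k][NR], accumulated over k iterations. */
void dot_reg_basic_sketch(v16f c[NR][MR], const v16f *a, const float *b,
                          unsigned long k)
{
    for (unsigned long i = 0; i < k; ++i)
        for (int j = 0; j < NR; ++j) {
            v16f bj = broadcast_lanewise(b[i * NR + j]);
            for (int m = 0; m < MR; ++m)
                c[j][m] += a[i * MR + m] * bj;  /* 12 FMAs per k step */
        }
}

The lane-by-lane broadcast is presumably why the gist is marked broken: each of the six B scalars costs dozens of shuffle instructions per loop iteration instead of one broadcast.
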
.section __TEXT,__text,regular,pure_instructions
.globl __D3mir4glas8internal4gemm48__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG16fTfZ13dot_reg_basicFNbNiPxG1G2NhG16fPxG6G1fmKG6G1G2NhG16fZPxG1G2NhG16f
.weak_definition __D3mir4glas8internal4gemm48__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG16fTfZ13dot_reg_basicFNbNiPxG1G2NhG16fPxG6G1fmKG6G1G2NhG16fZPxG1G2NhG16f
.p2align 4, 0x90
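# dot_reg_basic: looks like the 2x6 register-blocked GEMM micro-kernel for
# 512-bit float vectors described in the sketch above.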
__D3mir4glas8internal4gemm48__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG16fTfZ13dot_reg_basicFNbNiPxG1G2NhG16fPxG6G1fmKG6G1G2NhG16fZPxG1G2NhG16f:
.cfi_startproc
subq $376, %rsp
Ltmp0:
.cfi_def_cfa_offset 384
movq %rsi, %rax
shlq $7, %rax
addq %rcx, %rax
addq $576, %rcx
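# Zero the 12 accumulator registers holding the 2x6 micro-tile.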
vpxord %zmm0, %zmm0, %zmm0
vpxord %zmm4, %zmm4, %zmm4
vpxord %zmm30, %zmm30, %zmm30
vpxord %zmm31, %zmm31, %zmm31
vpxord %zmm22, %zmm22, %zmm22
vpxord %zmm23, %zmm23, %zmm23
vpxord %zmm24, %zmm24, %zmm24
vpxord %zmm25, %zmm25, %zmm25
vpxord %zmm26, %zmm26, %zmm26
vpxord %zmm27, %zmm27, %zmm27
vpxord %zmm28, %zmm28, %zmm28
vpxord %zmm29, %zmm29, %zmm29
.p2align 4, 0x90
LBB0_1:
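# Loop body: copy the live accumulators into spare registers and stack slots,
# presumably to free temporaries for the scalar-broadcast sequences below.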
vmovups %zmm2, -128(%rsp)
vmovaps %zmm29, %zmm16
vmovaps %zmm28, %zmm17
vmovaps %zmm27, %zmm18
vmovaps %zmm26, %zmm19
vmovaps %zmm25, %zmm20
vmovaps %zmm24, %zmm21
vmovups %zmm23, -64(%rsp)
vmovups %zmm22, (%rsp)
vmovups %zmm31, 64(%rsp)
vmovups %zmm30, 128(%rsp)
vmovups %zmm4, 192(%rsp)
vmovups %zmm0, 256(%rsp)
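# Prefetch ahead in the packed A panel and load two 64-byte A vectors.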
prefetcht0 -64(%rcx)
prefetcht0 (%rcx)
vmovaps -576(%rcx), %zmm11
vmovaps -512(%rcx), %zmm10
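# The long vblendps/vinsertps/vextractf32x4 chains that follow splat each of
# the six B scalars into a zmm register lane by lane instead of using a single
# vbroadcastss; this appears to be the "broken" part of the codegen.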
vmovss (%rdx), %xmm4
vmovss 4(%rdx), %xmm0
vblendps $1, %xmm4, %xmm8, %xmm9
vinsertf32x4 $0, %xmm9, %zmm8, %zmm8
vinsertps $16, %xmm4, %xmm8, %xmm1
vinsertf32x4 $0, %xmm1, %zmm8, %zmm1
vinsertps $32, %xmm4, %xmm1, %xmm2
vinsertf32x4 $0, %xmm2, %zmm1, %zmm1
vinsertps $48, %xmm4, %xmm1, %xmm2
vinsertf32x4 $0, %xmm2, %zmm1, %zmm1
vextractf32x4 $1, %zmm1, %xmm2
vblendps $1, %xmm4, %xmm2, %xmm2
vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
vextractf32x4 $1, %zmm1, %xmm2
vinsertps $16, %xmm4, %xmm2, %xmm2
vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
vextractf32x4 $1, %zmm1, %xmm2
vinsertps $32, %xmm4, %xmm2, %xmm2
vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
vextractf32x4 $1, %zmm1, %xmm2
vinsertps $48, %xmm4, %xmm2, %xmm2
vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
vblendps $1, %xmm0, %xmm6, %xmm2
vinsertf32x4 $0, %xmm2, %zmm6, %zmm2
vextractf32x4 $2, %zmm1, %xmm6
vinsertps $16, %xmm0, %xmm2, %xmm7
vinsertf32x4 $0, %xmm7, %zmm2, %zmm2
vinsertps $32, %xmm0, %xmm2, %xmm7
vblendps $1, %xmm4, %xmm6, %xmm6
vinsertf32x4 $0, %xmm7, %zmm2, %zmm2
vinsertps $48, %xmm0, %xmm2, %xmm7
vinsertf32x4 $0, %xmm7, %zmm2, %zmm2
vinsertf32x4 $2, %xmm6, %zmm1, %zmm1
vextractf32x4 $1, %zmm2, %xmm6
vblendps $1, %xmm0, %xmm6, %xmm6
vinsertf32x4 $1, %xmm6, %zmm2, %zmm2
vextractf32x4 $2, %zmm1, %xmm6
vextractf32x4 $1, %zmm2, %xmm7
vinsertps $16, %xmm0, %xmm7, %xmm7
vinsertf32x4 $1, %xmm7, %zmm2, %zmm2
vinsertps $16, %xmm4, %xmm6, %xmm6
vextractf32x4 $1, %zmm2, %xmm7
vinsertps $32, %xmm0, %xmm7, %xmm7
vinsertf32x4 $1, %xmm7, %zmm2, %zmm2
vinsertf32x4 $2, %xmm6, %zmm1, %zmm1
vextractf32x4 $1, %zmm2, %xmm6
vinsertps $48, %xmm0, %xmm6, %xmm6
vinsertf32x4 $1, %xmm6, %zmm2, %zmm2
vextractf32x4 $2, %zmm1, %xmm6
vextractf32x4 $2, %zmm2, %xmm7
vblendps $1, %xmm0, %xmm7, %xmm7
vinsertf32x4 $2, %xmm7, %zmm2, %zmm2
vinsertps $32, %xmm4, %xmm6, %xmm6
vextractf32x4 $2, %zmm2, %xmm7
vinsertps $16, %xmm0, %xmm7, %xmm7
vinsertf32x4 $2, %xmm7, %zmm2, %zmm2
vinsertf32x4 $2, %xmm6, %zmm1, %zmm8
vextractf32x4 $2, %zmm2, %xmm6
vinsertps $32, %xmm0, %xmm6, %xmm7
vmovss 8(%rdx), %xmm6
vblendps $1, %xmm6, %xmm15, %xmm3
vinsertf32x4 $2, %xmm7, %zmm2, %zmm2
vinsertf32x4 $0, %xmm3, %zmm15, %zmm3
vinsertps $16, %xmm6, %xmm3, %xmm5
vinsertf32x4 $0, %xmm5, %zmm3, %zmm3
vextractf32x4 $2, %zmm8, %xmm5
vinsertps $32, %xmm6, %xmm3, %xmm7
vinsertf32x4 $0, %xmm7, %zmm3, %zmm3
vinsertps $48, %xmm6, %xmm3, %xmm7
vextractf32x4 $2, %zmm2, %xmm1
vinsertf32x4 $0, %xmm7, %zmm3, %zmm3
vextractf32x4 $1, %zmm3, %xmm7
vblendps $1, %xmm6, %xmm7, %xmm7
vinsertps $48, %xmm4, %xmm5, %xmm5
vinsertf32x4 $1, %xmm7, %zmm3, %zmm3
vextractf32x4 $1, %zmm3, %xmm7
vinsertps $16, %xmm6, %xmm7, %xmm7
vinsertps $48, %xmm0, %xmm1, %xmm1
vinsertf32x4 $1, %xmm7, %zmm3, %zmm3
vextractf32x4 $1, %zmm3, %xmm7
vinsertps $32, %xmm6, %xmm7, %xmm7
vinsertf32x4 $2, %xmm5, %zmm8, %zmm8
vinsertf32x4 $1, %xmm7, %zmm3, %zmm3
vextractf32x4 $1, %zmm3, %xmm5
vinsertps $48, %xmm6, %xmm5, %xmm5
vinsertf32x4 $2, %xmm1, %zmm2, %zmm22
vinsertf32x4 $1, %xmm5, %zmm3, %zmm2
vextractf32x4 $2, %zmm2, %xmm3
vblendps $1, %xmm6, %xmm3, %xmm3
vextractf32x4 $3, %zmm8, %xmm5
vinsertf32x4 $2, %xmm3, %zmm2, %zmm2
vextractf32x4 $2, %zmm2, %xmm3
vinsertps $16, %xmm6, %xmm3, %xmm3
vextractf32x4 $3, %zmm22, %xmm7
vinsertf32x4 $2, %xmm3, %zmm2, %zmm2
vextractf32x4 $2, %zmm2, %xmm3
vinsertps $32, %xmm6, %xmm3, %xmm3
vblendps $1, %xmm4, %xmm5, %xmm9
vinsertf32x4 $2, %xmm3, %zmm2, %zmm2
vextractf32x4 $2, %zmm2, %xmm3
vinsertps $48, %xmm6, %xmm3, %xmm3
vblendps $1, %xmm0, %xmm7, %xmm7
vinsertf32x4 $2, %xmm3, %zmm2, %zmm2
vextractf32x4 $3, %zmm2, %xmm3
vblendps $1, %xmm6, %xmm3, %xmm12
vmovss 12(%rdx), %xmm5
vinsertf32x4 $3, %xmm9, %zmm8, %zmm8
vblendps $1, %xmm5, %xmm13, %xmm3
vinsertf32x4 $0, %xmm3, %zmm13, %zmm3
vinsertps $16, %xmm5, %xmm3, %xmm1
vinsertf32x4 $3, %xmm7, %zmm22, %zmm9
vinsertf32x4 $0, %xmm1, %zmm3, %zmm1
vinsertps $32, %xmm5, %xmm1, %xmm3
vinsertf32x4 $0, %xmm3, %zmm1, %zmm1
vinsertf32x4 $3, %xmm12, %zmm2, %zmm12
vinsertps $48, %xmm5, %xmm1, %xmm3
vinsertf32x4 $0, %xmm3, %zmm1, %zmm1
vextractf32x4 $1, %zmm1, %xmm3
vextractf32x4 $3, %zmm8, %xmm7
vblendps $1, %xmm5, %xmm3, %xmm3
vinsertf32x4 $1, %xmm3, %zmm1, %zmm1
vextractf32x4 $1, %zmm1, %xmm3
vextractf32x4 $3, %zmm9, %xmm2
vinsertps $16, %xmm5, %xmm3, %xmm3
vinsertf32x4 $1, %xmm3, %zmm1, %zmm1
vextractf32x4 $1, %zmm1, %xmm3
vextractf32x4 $3, %zmm12, %xmm13
vinsertps $32, %xmm5, %xmm3, %xmm3
vinsertf32x4 $1, %xmm3, %zmm1, %zmm1
vextractf32x4 $1, %zmm1, %xmm3
vinsertps $16, %xmm4, %xmm7, %xmm7
vinsertps $48, %xmm5, %xmm3, %xmm3
vinsertf32x4 $1, %xmm3, %zmm1, %zmm1
vextractf32x4 $2, %zmm1, %xmm3
vinsertps $16, %xmm0, %xmm2, %xmm2
vblendps $1, %xmm5, %xmm3, %xmm3
vinsertf32x4 $2, %xmm3, %zmm1, %zmm1
vextractf32x4 $2, %zmm1, %xmm3
vinsertps $16, %xmm6, %xmm13, %xmm13
vinsertps $16, %xmm5, %xmm3, %xmm3
vinsertf32x4 $2, %xmm3, %zmm1, %zmm1
vextractf32x4 $2, %zmm1, %xmm3
vinsertf32x4 $3, %xmm7, %zmm8, %zmm23
vinsertps $32, %xmm5, %xmm3, %xmm3
vinsertf32x4 $2, %xmm3, %zmm1, %zmm1
vextractf32x4 $2, %zmm1, %xmm3
vinsertf32x4 $3, %xmm2, %zmm9, %zmm22
vinsertps $48, %xmm5, %xmm3, %xmm3
vinsertf32x4 $2, %xmm3, %zmm1, %zmm1
vextractf32x4 $3, %zmm1, %xmm3
vinsertf32x4 $3, %xmm13, %zmm12, %zmm8
vblendps $1, %xmm5, %xmm3, %xmm3
vinsertf32x4 $3, %xmm3, %zmm1, %zmm1
vextractf32x4 $3, %zmm1, %xmm3
vextractf32x4 $3, %zmm23, %xmm9
vinsertps $16, %xmm5, %xmm3, %xmm3
vinsertf32x4 $3, %xmm3, %zmm1, %zmm12
vmovss 16(%rdx), %xmm3
vblendps $1, %xmm3, %xmm14, %xmm2
vextractf32x4 $3, %zmm22, %xmm13
vinsertf32x4 $0, %xmm2, %zmm14, %zmm2
vinsertps $16, %xmm3, %xmm2, %xmm1
vinsertf32x4 $0, %xmm1, %zmm2, %zmm1
vextractf32x4 $3, %zmm8, %xmm14
vinsertps $32, %xmm3, %xmm1, %xmm7
vinsertf32x4 $0, %xmm7, %zmm1, %zmm1
vinsertps $48, %xmm3, %xmm1, %xmm7
vinsertps $32, %xmm4, %xmm9, %xmm9
vinsertf32x4 $0, %xmm7, %zmm1, %zmm1
vextractf32x4 $3, %zmm12, %xmm7
vextractf32x4 $1, %zmm1, %xmm2
vinsertps $32, %xmm0, %xmm13, %xmm13
vblendps $1, %xmm3, %xmm2, %xmm2
vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
vextractf32x4 $1, %zmm1, %xmm2
vinsertps $32, %xmm6, %xmm14, %xmm14
vinsertps $16, %xmm3, %xmm2, %xmm2
vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
vextractf32x4 $1, %zmm1, %xmm2
vinsertps $32, %xmm5, %xmm7, %xmm7
vinsertps $32, %xmm3, %xmm2, %xmm2
vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
vextractf32x4 $1, %zmm1, %xmm2
vinsertf32x4 $3, %xmm9, %zmm23, %zmm9
vinsertps $48, %xmm3, %xmm2, %xmm2
vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
vextractf32x4 $2, %zmm1, %xmm2
vinsertf32x4 $3, %xmm13, %zmm22, %zmm13
vblendps $1, %xmm3, %xmm2, %xmm2
vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
vextractf32x4 $2, %zmm1, %xmm2
vinsertf32x4 $3, %xmm14, %zmm8, %zmm14
vinsertps $16, %xmm3, %xmm2, %xmm2
vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
vextractf32x4 $2, %zmm1, %xmm2
vinsertf32x4 $3, %xmm7, %zmm12, %zmm12
vinsertps $32, %xmm3, %xmm2, %xmm2
vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
vextractf32x4 $2, %zmm1, %xmm2
vextractf32x4 $3, %zmm9, %xmm8
vinsertps $48, %xmm3, %xmm2, %xmm2
vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
vextractf32x4 $3, %zmm1, %xmm2
vextractf32x4 $3, %zmm13, %xmm7
vblendps $1, %xmm3, %xmm2, %xmm2
vinsertf32x4 $3, %xmm2, %zmm1, %zmm1
vextractf32x4 $3, %zmm1, %xmm2
vinsertps $48, %xmm4, %xmm8, %xmm4
vinsertps $16, %xmm3, %xmm2, %xmm2
vinsertf32x4 $3, %xmm2, %zmm1, %zmm1
vextractf32x4 $3, %zmm14, %xmm2
vinsertps $48, %xmm0, %xmm7, %xmm15
vextractf32x4 $3, %zmm1, %xmm0
vinsertps $32, %xmm3, %xmm0, %xmm0
vinsertf32x4 $3, %xmm0, %zmm1, %zmm1
vmovss 20(%rdx), %xmm0
vinsertps $48, %xmm6, %xmm2, %xmm2
vmovups -128(%rsp), %zmm7
vblendps $1, %xmm0, %xmm7, %xmm6
vinsertf32x4 $0, %xmm6, %zmm7, %zmm6
vextractf32x4 $3, %zmm12, %xmm7
vinsertps $48, %xmm5, %xmm7, %xmm7
vinsertps $16, %xmm0, %xmm6, %xmm5
vinsertf32x4 $0, %xmm5, %zmm6, %zmm5
vextractf32x4 $3, %zmm1, %xmm6
vinsertps $48, %xmm3, %xmm6, %xmm3
vinsertps $32, %xmm0, %xmm5, %xmm6
vinsertf32x4 $0, %xmm6, %zmm5, %zmm5
vinsertps $48, %xmm0, %xmm5, %xmm6
vinsertf32x4 $3, %xmm4, %zmm9, %zmm8
vinsertf32x4 $0, %xmm6, %zmm5, %zmm4
vextractf32x4 $1, %zmm4, %xmm5
vblendps $1, %xmm0, %xmm5, %xmm5
vinsertf32x4 $3, %xmm15, %zmm13, %zmm6
vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
vextractf32x4 $1, %zmm4, %xmm5
vinsertps $16, %xmm0, %xmm5, %xmm5
vinsertf32x4 $3, %xmm2, %zmm14, %zmm15
vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
vextractf32x4 $1, %zmm2, %xmm4
vinsertps $32, %xmm0, %xmm4, %xmm4
vinsertf32x4 $3, %xmm7, %zmm12, %zmm13
vinsertf32x4 $1, %xmm4, %zmm2, %zmm2
vextractf32x4 $1, %zmm2, %xmm4
vinsertps $48, %xmm0, %xmm4, %xmm4
vinsertf32x4 $3, %xmm3, %zmm1, %zmm14
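# From here the twelve vfmadd213ps accumulations are interleaved with the tail
# of the broadcast construction: each broadcast B value multiplies both A
# vectors (zmm11, zmm10) into the 2x6 micro-tile.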
vmovaps %zmm8, %zmm29
vinsertf32x4 $1, %xmm4, %zmm2, %zmm1
vextractf32x4 $2, %zmm1, %xmm2
vblendps $1, %xmm0, %xmm2, %xmm2
vfmadd213ps %zmm16, %zmm11, %zmm29
vmovaps %zmm8, %zmm28
vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
vextractf32x4 $2, %zmm1, %xmm2
vinsertps $16, %xmm0, %xmm2, %xmm2
vfmadd213ps %zmm17, %zmm10, %zmm28
vmovaps %zmm6, %zmm27
vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
vextractf32x4 $2, %zmm1, %xmm2
vinsertps $32, %xmm0, %xmm2, %xmm2
vfmadd213ps %zmm18, %zmm11, %zmm27
vmovaps %zmm6, %zmm26
vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
vextractf32x4 $2, %zmm1, %xmm2
vinsertps $48, %xmm0, %xmm2, %xmm2
vfmadd213ps %zmm19, %zmm10, %zmm26
vmovaps %zmm15, %zmm25
vinsertf32x4 $2, %xmm2, %zmm1, %zmm1
vextractf32x4 $3, %zmm1, %xmm2
vblendps $1, %xmm0, %xmm2, %xmm2
vfmadd213ps %zmm20, %zmm11, %zmm25
vmovaps %zmm15, %zmm24
vinsertf32x4 $3, %xmm2, %zmm1, %zmm1
vextractf32x4 $3, %zmm1, %xmm2
vinsertps $16, %xmm0, %xmm2, %xmm2
vfmadd213ps %zmm21, %zmm10, %zmm24
vmovaps %zmm13, %zmm23
vinsertf32x4 $3, %xmm2, %zmm1, %zmm1
vextractf32x4 $3, %zmm1, %xmm2
vinsertps $32, %xmm0, %xmm2, %xmm2
vmovups -64(%rsp), %zmm3
vfmadd213ps %zmm3, %zmm11, %zmm23
vmovaps %zmm13, %zmm22
vinsertf32x4 $3, %xmm2, %zmm1, %zmm1
vextractf32x4 $3, %zmm1, %xmm2
vinsertps $48, %xmm0, %xmm2, %xmm0
vmovups (%rsp), %zmm2
vfmadd213ps %zmm2, %zmm10, %zmm22
vinsertf32x4 $3, %xmm0, %zmm1, %zmm2
vmovaps %zmm14, %zmm31
vmovups 64(%rsp), %zmm0
vfmadd213ps %zmm0, %zmm11, %zmm31
vmovaps %zmm14, %zmm30
vmovups 128(%rsp), %zmm0
vfmadd213ps %zmm0, %zmm10, %zmm30
vmovaps %zmm2, %zmm4
vmovups 192(%rsp), %zmm0
vfmadd213ps %zmm0, %zmm11, %zmm4
vmovaps %zmm2, %zmm0
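# Advance A by 128 bytes (two vectors) and B by 24 bytes (six floats),
# then decrement the trip count.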
subq $-128, %rcx
addq $24, %rdx
addq $-1, %rsi
vmovups 256(%rsp), %zmm1
vfmadd213ps %zmm1, %zmm10, %zmm0
jne LBB0_1
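# Loop done: store the 12 accumulators (768 bytes) to the result tile at %rdi.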
vmovaps %zmm29, (%rdi)
vmovaps %zmm28, 64(%rdi)
vmovaps %zmm27, 128(%rdi)
vmovaps %zmm26, 192(%rdi)
vmovaps %zmm25, 256(%rdi)
vmovaps %zmm24, 320(%rdi)
vmovaps %zmm23, 384(%rdi)
vmovaps %zmm22, 448(%rdi)
vmovaps %zmm31, 512(%rdi)
vmovaps %zmm30, 576(%rdi)
vmovaps %zmm4, 640(%rdi)
vmovaps %zmm0, 704(%rdi)
addq $376, %rsp
retq
.cfi_endproc
.globl __D3mir4glas8internal4gemm36__T10prefetch_rVmi128Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv
.weak_definition __D3mir4glas8internal4gemm36__T10prefetch_rVmi128Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv
.p2align 4, 0x90
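# prefetch_r: prefetches two cache lines at 512 and 576 bytes past %rsi.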
__D3mir4glas8internal4gemm36__T10prefetch_rVmi128Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv:
.cfi_startproc
prefetcht0 512(%rsi)
prefetcht0 576(%rsi)
retq
.cfi_endproc
.globl __D3mir4glas8internal4copy40__T9load_nanoVmi2Vmi1Vmi6TNhG16fTNhG16fZ9load_nanoFNaNbNiNfKG6G1G2NhG16fKG6G1G2NhG16fZv
.weak_definition __D3mir4glas8internal4copy40__T9load_nanoVmi2Vmi1Vmi6TNhG16fTNhG16fZ9load_nanoFNaNbNiNfKG6G1G2NhG16fKG6G1G2NhG16fZv
.p2align 4, 0x90
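# load_nano: copies twelve aligned 64-byte vectors (768 bytes) from %rdi to %rsi.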
__D3mir4glas8internal4copy40__T9load_nanoVmi2Vmi1Vmi6TNhG16fTNhG16fZ9load_nanoFNaNbNiNfKG6G1G2NhG16fKG6G1G2NhG16fZv:
.cfi_startproc
vmovaps (%rdi), %zmm0
vmovaps %zmm0, (%rsi)
vmovaps 64(%rdi), %zmm0
vmovaps %zmm0, 64(%rsi)
vmovaps 128(%rdi), %zmm0
vmovaps %zmm0, 128(%rsi)
vmovaps 192(%rdi), %zmm0
vmovaps %zmm0, 192(%rsi)
vmovaps 256(%rdi), %zmm0
vmovaps %zmm0, 256(%rsi)
vmovaps 320(%rdi), %zmm0
vmovaps %zmm0, 320(%rsi)
vmovaps 384(%rdi), %zmm0
vmovaps %zmm0, 384(%rsi)
vmovaps 448(%rdi), %zmm0
vmovaps %zmm0, 448(%rsi)
vmovaps 512(%rdi), %zmm0
vmovaps %zmm0, 512(%rsi)
vmovaps 576(%rdi), %zmm0
vmovaps %zmm0, 576(%rsi)
vmovaps 640(%rdi), %zmm0
vmovaps %zmm0, 640(%rsi)
vmovaps 704(%rdi), %zmm0
vmovaps %zmm0, 704(%rsi)
retq
.cfi_endproc
.p2align 4, 0x90
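# D runtime module constructor: prepends this module's ModuleRef to the
# __Dmodule_ref linked list.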
__D3mir4glas8internal4gemm16__moduleinfoCtorZ:
movq __Dmodule_ref@GOTPCREL(%rip), %rax
movq (%rax), %rcx
movq %rcx, __D3mir4glas8internal4gemm11__moduleRefZ(%rip)
leaq __D3mir4glas8internal4gemm11__moduleRefZ(%rip), %rcx
movq %rcx, (%rax)
retq
.section __DATA,__data
.globl __D3mir4glas8internal4gemm12__ModuleInfoZ
.p2align 4
__D3mir4glas8internal4gemm12__ModuleInfoZ:
.long 2147484672
.long 0
.quad 2
.quad __D3mir4glas6common12__ModuleInfoZ
.quad __D3mir4glas8internal12__ModuleInfoZ
.asciz "mir.glas.internal.gemm"
.space 1
.p2align 3
__D3mir4glas8internal4gemm11__moduleRefZ:
.quad 0
.quad __D3mir4glas8internal4gemm12__ModuleInfoZ
.section __DATA,__mod_init_func,mod_init_funcs
.p2align 3
.quad __D3mir4glas8internal4gemm16__moduleinfoCtorZ
.subsections_via_symbols