Skip to content

Instantly share code, notes, and snippets.

@9il
Created September 23, 2016 07:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 9il/1d9af6abd167efdfa2d854ac3f0ab2be to your computer and use it in GitHub Desktop.
Save 9il/1d9af6abd167efdfa2d854ac3f0ab2be to your computer and use it in GitHub Desktop.
cannonlake code for 256-bit vectors (broken)
.section __TEXT,__text,regular,pure_instructions
.globl __D3mir4glas8internal4gemm47__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG8fTfZ13dot_reg_basicFNbNiPxG1G2NhG8fPxG6G1fmKG6G1G2NhG8fZPxG1G2NhG8f
.weak_definition __D3mir4glas8internal4gemm47__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG8fTfZ13dot_reg_basicFNbNiPxG1G2NhG8fPxG6G1fmKG6G1G2NhG8fZPxG1G2NhG8f
.p2align 4, 0x90
__D3mir4glas8internal4gemm47__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG8fTfZ13dot_reg_basicFNbNiPxG1G2NhG8fPxG6G1fmKG6G1G2NhG8fZPxG1G2NhG8f:
.cfi_startproc
subq $24, %rsp
Ltmp0:
.cfi_def_cfa_offset 32
movq %rsi, %rax
shlq $6, %rax
addq %rcx, %rax
vpxord %ymm0, %ymm0, %ymm0
vpxord %ymm14, %ymm14, %ymm14
vpxord %ymm2, %ymm2, %ymm2
vpxord %ymm4, %ymm4, %ymm4
vpxord %ymm3, %ymm3, %ymm3
vpxord %ymm5, %ymm5, %ymm5
vpxord %ymm7, %ymm7, %ymm7
vpxord %ymm9, %ymm9, %ymm9
vpxord %ymm6, %ymm6, %ymm6
vpxord %ymm10, %ymm10, %ymm10
vpxord %ymm8, %ymm8, %ymm8
vpxord %ymm11, %ymm11, %ymm11
.p2align 4, 0x90
LBB0_1:
vmovaps %ymm11, %ymm15
vmovdqu %ymm8, -32(%rsp)
vmovaps %ymm10, %ymm12
vmovdqu %ymm6, -64(%rsp)
vmovaps %ymm9, %ymm13
vmovdqu %ymm7, -96(%rsp)
prefetcht0 512(%rcx)
vmovaps (%rcx), %ymm1
vbroadcastss (%rdx), %ymm8
vbroadcastss 4(%rdx), %ymm6
vmovaps %ymm8, %ymm11
vfmadd213ps %ymm15, %ymm1, %ymm11
vmovaps %ymm6, %ymm10
vfmadd213ps %ymm12, %ymm1, %ymm10
vbroadcastss 8(%rdx), %ymm7
vmovaps %ymm7, %ymm9
vfmadd213ps %ymm13, %ymm1, %ymm9
vmovaps %ymm5, %ymm12
vmovdqu %ymm3, -128(%rsp)
vbroadcastss 12(%rdx), %ymm3
vmovaps %ymm3, %ymm5
vfmadd213ps %ymm12, %ymm1, %ymm5
vmovaps %ymm4, %ymm12
vmovaps %ymm2, %ymm15
vbroadcastss 16(%rdx), %ymm2
vmovaps %ymm2, %ymm4
vfmadd213ps %ymm12, %ymm1, %ymm4
vmovaps %ymm14, %ymm12
vmovaps %ymm0, %ymm13
vbroadcastss 20(%rdx), %ymm0
vmovaps %ymm0, %ymm14
vfmadd213ps %ymm12, %ymm1, %ymm14
vmovaps 32(%rcx), %ymm1
vmovups -32(%rsp), %ymm12
vfmadd213ps %ymm12, %ymm1, %ymm8
vmovups -64(%rsp), %ymm12
vfmadd213ps %ymm12, %ymm1, %ymm6
vmovups -96(%rsp), %ymm12
vfmadd213ps %ymm12, %ymm1, %ymm7
vmovups -128(%rsp), %ymm12
vfmadd213ps %ymm12, %ymm1, %ymm3
vfmadd213ps %ymm15, %ymm1, %ymm2
vfmadd213ps %ymm13, %ymm1, %ymm0
addq $64, %rcx
addq $24, %rdx
addq $-1, %rsi
jne LBB0_1
vmovaps %ymm11, (%rdi)
vmovaps %ymm8, 32(%rdi)
vmovaps %ymm10, 64(%rdi)
vmovaps %ymm6, 96(%rdi)
vmovaps %ymm9, 128(%rdi)
vmovaps %ymm7, 160(%rdi)
vmovaps %ymm5, 192(%rdi)
vmovaps %ymm3, 224(%rdi)
vmovaps %ymm4, 256(%rdi)
vmovaps %ymm2, 288(%rdi)
vmovaps %ymm14, 320(%rdi)
vmovaps %ymm0, 352(%rdi)
addq $24, %rsp
retq
.cfi_endproc
.globl __D3mir4glas8internal4gemm35__T10prefetch_rVmi64Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv
.weak_definition __D3mir4glas8internal4gemm35__T10prefetch_rVmi64Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv
.p2align 4, 0x90
__D3mir4glas8internal4gemm35__T10prefetch_rVmi64Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv:
.cfi_startproc
prefetcht0 512(%rsi)
retq
.cfi_endproc
.globl __D3mir4glas8internal4copy38__T9load_nanoVmi2Vmi1Vmi6TNhG8fTNhG8fZ9load_nanoFNaNbNiNfKG6G1G2NhG8fKG6G1G2NhG8fZv
.weak_definition __D3mir4glas8internal4copy38__T9load_nanoVmi2Vmi1Vmi6TNhG8fTNhG8fZ9load_nanoFNaNbNiNfKG6G1G2NhG8fKG6G1G2NhG8fZv
.p2align 4, 0x90
__D3mir4glas8internal4copy38__T9load_nanoVmi2Vmi1Vmi6TNhG8fTNhG8fZ9load_nanoFNaNbNiNfKG6G1G2NhG8fKG6G1G2NhG8fZv:
.cfi_startproc
vmovaps (%rdi), %ymm0
vmovaps %ymm0, (%rsi)
vmovaps 32(%rdi), %ymm0
vmovaps %ymm0, 32(%rsi)
vmovaps 64(%rdi), %ymm0
vmovaps %ymm0, 64(%rsi)
vmovaps 96(%rdi), %ymm0
vmovaps %ymm0, 96(%rsi)
vmovaps 128(%rdi), %ymm0
vmovaps %ymm0, 128(%rsi)
vmovaps 160(%rdi), %ymm0
vmovaps %ymm0, 160(%rsi)
vmovaps 192(%rdi), %ymm0
vmovaps %ymm0, 192(%rsi)
vmovaps 224(%rdi), %ymm0
vmovaps %ymm0, 224(%rsi)
vmovaps 256(%rdi), %ymm0
vmovaps %ymm0, 256(%rsi)
vmovaps 288(%rdi), %ymm0
vmovaps %ymm0, 288(%rsi)
vmovaps 320(%rdi), %ymm0
vmovaps %ymm0, 320(%rsi)
vmovaps 352(%rdi), %ymm0
vmovaps %ymm0, 352(%rsi)
retq
.cfi_endproc
.p2align 4, 0x90
__D3mir4glas8internal4gemm16__moduleinfoCtorZ:
movq __Dmodule_ref@GOTPCREL(%rip), %rax
movq (%rax), %rcx
movq %rcx, __D3mir4glas8internal4gemm11__moduleRefZ(%rip)
leaq __D3mir4glas8internal4gemm11__moduleRefZ(%rip), %rcx
movq %rcx, (%rax)
retq
.section __DATA,__data
.globl __D3mir4glas8internal4gemm12__ModuleInfoZ
.p2align 4
__D3mir4glas8internal4gemm12__ModuleInfoZ:
.long 2147484672
.long 0
.quad 2
.quad __D3mir4glas6common12__ModuleInfoZ
.quad __D3mir4glas8internal12__ModuleInfoZ
.asciz "mir.glas.internal.gemm"
.space 1
.p2align 3
__D3mir4glas8internal4gemm11__moduleRefZ:
.quad 0
.quad __D3mir4glas8internal4gemm12__ModuleInfoZ
.section __DATA,__mod_init_func,mod_init_funcs
.p2align 3
.quad __D3mir4glas8internal4gemm16__moduleinfoCtorZ
.subsections_via_symbols
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment