Skip to content

Instantly share code, notes, and snippets.

@9il
Created September 23, 2016 06:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 9il/bf680d2a07561097158268f7dda15428 to your computer and use it in GitHub Desktop.
Save 9il/bf680d2a07561097158268f7dda15428 to your computer and use it in GitHub Desktop.
haswell code for 256-bit vectors (OK)
## Code below is emitted into the Mach-O __TEXT,__text section.
        .section __TEXT,__text,regular,pure_instructions

## ---------------------------------------------------------------------
## mir.glas.internal.gemm.dot_reg_basic  (D-mangled name, weak definition)
##
## AVX2/FMA multiply-accumulate kernel with a 6 x 2 block of 256-bit
## accumulators held entirely in registers.
##
## Register roles, as read off the instructions below:
##   In:   %rdi = output block, 12 x 32 bytes, written with vmovaps
##                (so it must be 32-byte aligned)
##         %rsi = iteration count
##         %rdx = scalar stream: 6 floats (24 bytes) consumed per iteration
##         %rcx = vector stream: 2 x 32 bytes consumed per iteration, read
##                with vmovaps (so it must be 32-byte aligned)
##   Out:  %rax = %rcx + 64 * %rsi (entry values), i.e. one past the last
##                vector pair consumed
##   Clobbers: %rcx, %rdx, %rsi, %ymm0-%ymm15, flags
##
## Per iteration, with v0/v1 the two loaded vectors and s0..s5 the six
## broadcast scalars:
##         acc[j][i] += v_i * s_j        (i = 0..1, j = 0..5)
## acc[j][i] lives in %ymm(11 - 2*j - i) and is stored at
## %rdi + 32 * (2*j + i).
##
## A count of zero is handled explicitly: the loop is skipped, the output
## block is filled with zeros and %rax equals the incoming %rcx. Without
## that guard the bottom-tested loop would wrap the counter and run off
## the end of both input streams.
## ---------------------------------------------------------------------
        .globl __D3mir4glas8internal4gemm47__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG8fTfZ13dot_reg_basicFNbNiPxG1G2NhG8fPxG6G1fmKG6G1G2NhG8fZPxG1G2NhG8f
        .weak_definition __D3mir4glas8internal4gemm47__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG8fTfZ13dot_reg_basicFNbNiPxG1G2NhG8fPxG6G1fmKG6G1G2NhG8fZPxG1G2NhG8f
        .p2align 4, 0x90
__D3mir4glas8internal4gemm47__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG8fTfZ13dot_reg_basicFNbNiPxG1G2NhG8fPxG6G1fmKG6G1G2NhG8fZPxG1G2NhG8f:
        .cfi_startproc

        ## Return value: end of the vector stream = %rcx + count * 64.
        ## Computed up front because %rcx and %rsi are modified by the loop.
        movq %rsi, %rax
        shlq $6, %rax
        addq %rcx, %rax

        ## Zero all twelve accumulators.
        vxorps %ymm0, %ymm0, %ymm0
        vxorps %ymm1, %ymm1, %ymm1
        vxorps %ymm2, %ymm2, %ymm2
        vxorps %ymm3, %ymm3, %ymm3
        vxorps %ymm4, %ymm4, %ymm4
        vxorps %ymm5, %ymm5, %ymm5
        vxorps %ymm6, %ymm6, %ymm6
        vxorps %ymm7, %ymm7, %ymm7
        vxorps %ymm8, %ymm8, %ymm8
        vxorps %ymm9, %ymm9, %ymm9
        vxorps %ymm10, %ymm10, %ymm10
        vxorps %ymm11, %ymm11, %ymm11

        ## Zero-trip guard: the loop below tests its counter only after the
        ## body has run, so a zero count must bypass it.
        testq %rsi, %rsi
        je LBB0_2

        .p2align 4, 0x90
LBB0_1:
        ## Invariant: %rsi = iterations remaining, always > 0 here.
        prefetcht0 512(%rcx)                    ## hint: vector stream, 8 iterations ahead
        vmovaps (%rcx), %ymm12                  ## v0
        vmovaps 32(%rcx), %ymm13                ## v1

        vbroadcastss (%rdx), %ymm14             ## s0
        vbroadcastss 4(%rdx), %ymm15            ## s1
        vfmadd231ps %ymm12, %ymm14, %ymm11      ## acc[0][0] += v0 * s0
        vfmadd231ps %ymm13, %ymm14, %ymm10      ## acc[0][1] += v1 * s0
        vfmadd231ps %ymm12, %ymm15, %ymm9       ## acc[1][0] += v0 * s1
        vfmadd231ps %ymm13, %ymm15, %ymm8       ## acc[1][1] += v1 * s1

        vbroadcastss 8(%rdx), %ymm14            ## s2
        vbroadcastss 12(%rdx), %ymm15           ## s3
        vfmadd231ps %ymm12, %ymm14, %ymm7       ## acc[2][0] += v0 * s2
        vfmadd231ps %ymm13, %ymm14, %ymm6       ## acc[2][1] += v1 * s2
        vfmadd231ps %ymm12, %ymm15, %ymm5       ## acc[3][0] += v0 * s3
        vfmadd231ps %ymm13, %ymm15, %ymm4       ## acc[3][1] += v1 * s3

        vbroadcastss 16(%rdx), %ymm14           ## s4
        vbroadcastss 20(%rdx), %ymm15           ## s5
        vfmadd231ps %ymm12, %ymm14, %ymm3       ## acc[4][0] += v0 * s4
        vfmadd231ps %ymm13, %ymm14, %ymm2       ## acc[4][1] += v1 * s4
        vfmadd231ps %ymm12, %ymm15, %ymm1       ## acc[5][0] += v0 * s5
        vfmadd231ps %ymm13, %ymm15, %ymm0       ## acc[5][1] += v1 * s5

        addq $64, %rcx                          ## next vector pair
        addq $24, %rdx                          ## next six scalars
        addq $-1, %rsi
        jne LBB0_1

LBB0_2:
        ## Write the accumulator block out in acc[j][i] order.
        vmovaps %ymm11, (%rdi)
        vmovaps %ymm10, 32(%rdi)
        vmovaps %ymm9, 64(%rdi)
        vmovaps %ymm8, 96(%rdi)
        vmovaps %ymm7, 128(%rdi)
        vmovaps %ymm6, 160(%rdi)
        vmovaps %ymm5, 192(%rdi)
        vmovaps %ymm4, 224(%rdi)
        vmovaps %ymm3, 256(%rdi)
        vmovaps %ymm2, 288(%rdi)
        vmovaps %ymm1, 320(%rdi)
        vmovaps %ymm0, 352(%rdi)
        vzeroupper                              ## avoid AVX->SSE transition penalty in the caller
        retq
        .cfi_endproc
## ---------------------------------------------------------------------
## mir.glas.internal.gemm.prefetch_r  (D-mangled name, weak definition)
##
## Issues one T0 prefetch hint for the address %rsi + 512 and returns.
##   In:   %rsi = base address (only used to form the hinted address)
##         %rdi is not read by this instantiation.
##   Out:  none
##   Clobbers: none (no register, flag or memory is written)
## ---------------------------------------------------------------------
.globl __D3mir4glas8internal4gemm35__T10prefetch_rVmi64Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv
.weak_definition __D3mir4glas8internal4gemm35__T10prefetch_rVmi64Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv
.p2align 4, 0x90
__D3mir4glas8internal4gemm35__T10prefetch_rVmi64Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv:
.cfi_startproc
## Same +512 displacement as the prefetch inside the dot_reg_basic loop.
prefetcht0 512(%rsi)
retq
.cfi_endproc
## ---------------------------------------------------------------------
## mir.glas.internal.copy.load_nano  (D-mangled name, weak definition)
##
## Copies a 384-byte block (12 x 32-byte vectors) from (%rdi) to (%rsi).
##   In:   %rdi = source block
##         %rsi = destination block
##         Both are accessed with vmovaps, so both must be 32-byte aligned.
##   Out:  none
##   Clobbers: %ymm0
## ---------------------------------------------------------------------
        .globl __D3mir4glas8internal4copy38__T9load_nanoVmi2Vmi1Vmi6TNhG8fTNhG8fZ9load_nanoFNaNbNiNfKG6G1G2NhG8fKG6G1G2NhG8fZv
        .weak_definition __D3mir4glas8internal4copy38__T9load_nanoVmi2Vmi1Vmi6TNhG8fTNhG8fZ9load_nanoFNaNbNiNfKG6G1G2NhG8fKG6G1G2NhG8fZv
        .p2align 4, 0x90
__D3mir4glas8internal4copy38__T9load_nanoVmi2Vmi1Vmi6TNhG8fTNhG8fZ9load_nanoFNaNbNiNfKG6G1G2NhG8fKG6G1G2NhG8fZv:
        .cfi_startproc

        ## The assembler expands this into twelve load/store pairs, one per
        ## 32-byte vector, in ascending address order. Each vector is stored
        ## before the next one is loaded, and every pair goes through %ymm0.
        .irp off, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352
        vmovaps \off(%rdi), %ymm0
        vmovaps %ymm0, \off(%rsi)
        .endr

        vzeroupper                              ## avoid AVX->SSE transition penalty in the caller
        retq
        .cfi_endproc
## ---------------------------------------------------------------------
## Module-info constructor for mir.glas.internal.gemm (file-local: no .globl).
##
## Inserts this module's __moduleRefZ node at the head of the singly linked
## list whose head pointer is the external symbol __Dmodule_ref:
##         moduleRef.first_quad = *__Dmodule_ref     (old head)
##         *__Dmodule_ref       = &moduleRef         (new head)
##   In:   none
##   Out:  none
##   Clobbers: %rax, %rcx
## No locking is performed around the two stores.
## ---------------------------------------------------------------------
.p2align 4, 0x90
__D3mir4glas8internal4gemm16__moduleinfoCtorZ:
## %rax = address of __Dmodule_ref, fetched through the GOT (external symbol).
movq __Dmodule_ref@GOTPCREL(%rip), %rax
## %rcx = current list head.
movq (%rax), %rcx
## Link our node in front of the current head ...
movq %rcx, __D3mir4glas8internal4gemm11__moduleRefZ(%rip)
## ... then publish our node as the new head.
leaq __D3mir4glas8internal4gemm11__moduleRefZ(%rip), %rcx
movq %rcx, (%rax)
retq
## Writable data: the module's ModuleInfo record and its list node.
.section __DATA,__data
## Exported ModuleInfo record for mir.glas.internal.gemm, 16-byte aligned.
.globl __D3mir4glas8internal4gemm12__ModuleInfoZ
.p2align 4
__D3mir4glas8internal4gemm12__ModuleInfoZ:
## 2147484672 == 0x80000400.
## NOTE(review): presumably the ModuleInfo flags word - confirm bit meanings against druntime.
.long 2147484672
.long 0
## NOTE(review): presumably the count of the module references that follow
## (exactly two .quad entries come next) - confirm against druntime.
.quad 2
.quad __D3mir4glas6common12__ModuleInfoZ
.quad __D3mir4glas8internal12__ModuleInfoZ
## NUL-terminated module name, followed by one extra zero byte.
.asciz "mir.glas.internal.gemm"
.space 1
.p2align 3
## Two-quad list node, file-local (no .globl), 8-byte aligned.
## The first quad starts as 0 and is overwritten by __moduleinfoCtorZ with
## the previous value of __Dmodule_ref; the second points at the record above.
__D3mir4glas8internal4gemm11__moduleRefZ:
.quad 0
.quad __D3mir4glas8internal4gemm12__ModuleInfoZ
## Mach-O module-initializer table (section type mod_init_funcs): one
## 8-byte-aligned function pointer, naming __moduleinfoCtorZ as an
## initializer for this object file.
.section __DATA,__mod_init_func,mod_init_funcs
.p2align 3
.quad __D3mir4glas8internal4gemm16__moduleinfoCtorZ
## Tells the Mach-O linker it may treat each symbol-delimited region as a
## separate unit (e.g. for dead-stripping).
.subsections_via_symbols
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment