Skip to content

Instantly share code, notes, and snippets.

@9il
Created September 23, 2016 07:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 9il/894a35df5bb20a6e9baa100db551e5cd to your computer and use it in GitHub Desktop.
Save 9il/894a35df5bb20a6e9baa100db551e5cd to your computer and use it in GitHub Desktop.
Haswell code for 128-bit vectors (OK)
##-----------------------------------------------------------------------
## mir.glas.internal.gemm.dot_reg_basic
## Syntax: AT&T.  Object format: Mach-O x86-64.  Needs AVX and FMA.
##
## Each step reads two 4-float vectors through %rcx and six floats
## through %rdx, broadcasts every float across a vector, and adds the
## twelve vector*broadcast products into twelve running sums.  After the
## last step the sums are written through %rdi.
##
## In:    %rdi = output block, 192 bytes (12 vectors)
##        %rsi = number of steps
##        %rdx = scalar stream, advanced 24 bytes per step
##        %rcx = vector stream, advanced 32 bytes per step
## Out:   %rax = entry %rcx + 32 * entry %rsi
## Clobb: %rcx, %rdx, %rsi, %xmm0-%xmm15, flags
##
## NOTE(review): every vector access is vmovaps, which faults on an
##   address that is not 16-byte aligned -- confirm callers align the
##   %rcx stream and the %rdi block.
## NOTE(review): the loop is bottom-tested, so the body always runs
##   before the counter is examined; presumably callers never pass a
##   step count of 0 -- confirm.
##-----------------------------------------------------------------------
        .section        __TEXT,__text,regular,pure_instructions
        .globl          __D3mir4glas8internal4gemm47__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG4fTfZ13dot_reg_basicFNbNiPxG1G2NhG4fPxG6G1fmKG6G1G2NhG4fZPxG1G2NhG4f
        .weak_definition __D3mir4glas8internal4gemm47__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG4fTfZ13dot_reg_basicFNbNiPxG1G2NhG4fPxG6G1fmKG6G1G2NhG4fZPxG1G2NhG4f
        .p2align        4, 0x90
__D3mir4glas8internal4gemm47__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG4fTfZ13dot_reg_basicFNbNiPxG1G2NhG4fPxG6G1fmKG6G1G2NhG4fZPxG1G2NhG4f:
        .cfi_startproc

        ## Return value is fixed up front, before %rcx/%rsi are consumed.
        movq            %rsi, %rax
        shlq            $5, %rax                ## steps * 32 bytes
        addq            %rcx, %rax              ## + start of vector stream

        ## Zero the twelve accumulators (xmm4-xmm7 are per-step temporaries).
        vxorps          %xmm0,  %xmm0,  %xmm0
        vxorps          %xmm1,  %xmm1,  %xmm1
        vxorps          %xmm2,  %xmm2,  %xmm2
        vxorps          %xmm3,  %xmm3,  %xmm3
        vxorps          %xmm8,  %xmm8,  %xmm8
        vxorps          %xmm9,  %xmm9,  %xmm9
        vxorps          %xmm10, %xmm10, %xmm10
        vxorps          %xmm11, %xmm11, %xmm11
        vxorps          %xmm12, %xmm12, %xmm12
        vxorps          %xmm13, %xmm13, %xmm13
        vxorps          %xmm14, %xmm14, %xmm14
        vxorps          %xmm15, %xmm15, %xmm15

        .p2align        4, 0x90
Ldot_reg_basic_step:
        prefetcht0      512(%rcx)               ## touch the vector stream 512 bytes ahead

        vmovaps         (%rcx), %xmm4           ## v0
        vmovaps         16(%rcx), %xmm5         ## v1

        ## scalars 0 and 1
        vbroadcastss    (%rdx), %xmm6
        vbroadcastss    4(%rdx), %xmm7
        vfmadd231ps     %xmm4, %xmm6, %xmm3     ## sum[0]  += v0 * s0
        vfmadd231ps     %xmm5, %xmm6, %xmm2     ## sum[1]  += v1 * s0
        vfmadd231ps     %xmm4, %xmm7, %xmm1     ## sum[2]  += v0 * s1
        vfmadd231ps     %xmm5, %xmm7, %xmm0     ## sum[3]  += v1 * s1

        ## scalars 2 and 3
        vbroadcastss    8(%rdx), %xmm6
        vbroadcastss    12(%rdx), %xmm7
        vfmadd231ps     %xmm4, %xmm6, %xmm15    ## sum[4]  += v0 * s2
        vfmadd231ps     %xmm5, %xmm6, %xmm14    ## sum[5]  += v1 * s2
        vfmadd231ps     %xmm4, %xmm7, %xmm13    ## sum[6]  += v0 * s3
        vfmadd231ps     %xmm5, %xmm7, %xmm12    ## sum[7]  += v1 * s3

        ## scalars 4 and 5
        vbroadcastss    16(%rdx), %xmm6
        vbroadcastss    20(%rdx), %xmm7
        vfmadd231ps     %xmm4, %xmm6, %xmm11    ## sum[8]  += v0 * s4
        vfmadd231ps     %xmm5, %xmm6, %xmm10    ## sum[9]  += v1 * s4
        vfmadd231ps     %xmm4, %xmm7, %xmm9     ## sum[10] += v0 * s5
        vfmadd231ps     %xmm5, %xmm7, %xmm8     ## sum[11] += v1 * s5

        addq            $32, %rcx               ## next pair of vectors
        addq            $24, %rdx               ## next six scalars
        decq            %rsi                    ## one step done
        jnz             Ldot_reg_basic_step

        ## Write sum[0..11] to consecutive 16-byte slots of the output block.
        vmovaps         %xmm3,     (%rdi)
        vmovaps         %xmm2,   16(%rdi)
        vmovaps         %xmm1,   32(%rdi)
        vmovaps         %xmm0,   48(%rdi)
        vmovaps         %xmm15,  64(%rdi)
        vmovaps         %xmm14,  80(%rdi)
        vmovaps         %xmm13,  96(%rdi)
        vmovaps         %xmm12, 112(%rdi)
        vmovaps         %xmm11, 128(%rdi)
        vmovaps         %xmm10, 144(%rdi)
        vmovaps         %xmm9,  160(%rdi)
        vmovaps         %xmm8,  176(%rdi)
        retq
        .cfi_endproc
##-----------------------------------------------------------------------
## mir.glas.internal.gemm.prefetch_r (this instantiation)
##
## Issues a single T0 prefetch hint for the address %rsi + 512 and
## returns.  %rdi is not read.
##
## In:    %rsi = base address
## Out:   none
## Clobb: none
##-----------------------------------------------------------------------
        .globl          __D3mir4glas8internal4gemm35__T10prefetch_rVmi32Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv
        .weak_definition __D3mir4glas8internal4gemm35__T10prefetch_rVmi32Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv
        .p2align        4, 0x90
__D3mir4glas8internal4gemm35__T10prefetch_rVmi32Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv:
        .cfi_startproc
        prefetcht0      512(%rsi)               ## hint only; no architectural state changes
        retq
        .cfi_endproc
##-----------------------------------------------------------------------
## mir.glas.internal.copy.load_nano (this instantiation)
##
## Copies 192 bytes, as twelve 16-byte vectors, from the block at %rdi
## to the block at %rsi.  Each vector is loaded and then stored before
## the next one is touched, in ascending offset order.
##
## In:    %rdi = source block
##        %rsi = destination block
## Out:   none
## Clobb: %xmm0
##
## NOTE(review): vmovaps faults on addresses that are not 16-byte
##   aligned -- confirm both blocks are aligned by the callers.
##-----------------------------------------------------------------------
        .globl          __D3mir4glas8internal4copy38__T9load_nanoVmi2Vmi1Vmi6TNhG4fTNhG4fZ9load_nanoFNaNbNiNfKG6G1G2NhG4fKG6G1G2NhG4fZv
        .weak_definition __D3mir4glas8internal4copy38__T9load_nanoVmi2Vmi1Vmi6TNhG4fTNhG4fZ9load_nanoFNaNbNiNfKG6G1G2NhG4fKG6G1G2NhG4fZv
        .p2align        4, 0x90
__D3mir4glas8internal4copy38__T9load_nanoVmi2Vmi1Vmi6TNhG4fTNhG4fZ9load_nanoFNaNbNiNfKG6G1G2NhG4fKG6G1G2NhG4fZv:
        .cfi_startproc
        ## One load/store pair per 16-byte slot; expands to 24 instructions.
        .irp            off, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176
        vmovaps         \off(%rdi), %xmm0
        vmovaps         %xmm0, \off(%rsi)
        .endr
        retq
        .cfi_endproc
##-----------------------------------------------------------------------
## mir.glas.internal.gemm module-info constructor
##
## Links this module's reference record in front of whatever record
## __Dmodule_ref currently points at:
##     moduleRef.first_quad = *__Dmodule_ref
##     *__Dmodule_ref       = &moduleRef
##
## In:    none
## Out:   none (%rax is left holding the address of __Dmodule_ref)
## Clobb: %rax, %rcx
##-----------------------------------------------------------------------
        .p2align        4, 0x90
__D3mir4glas8internal4gemm16__moduleinfoCtorZ:
        movq            __Dmodule_ref@GOTPCREL(%rip), %rax      ## address of the external list head, via the GOT
        movq            (%rax), %rcx                            ## current head
        movq            %rcx, __D3mir4glas8internal4gemm11__moduleRefZ(%rip)    ## our record now links to it
        leaq            __D3mir4glas8internal4gemm11__moduleRefZ(%rip), %rcx
        movq            %rcx, (%rax)                            ## our record becomes the head
        retq
## Static data for module mir.glas.internal.gemm: the module-info record,
## the link record that the module-info constructor chains onto
## __Dmodule_ref, and the initializer-table entry naming that constructor.
.section __DATA,__data
.globl __D3mir4glas8internal4gemm12__ModuleInfoZ
.p2align 4
## Module-info record (exported).
__D3mir4glas8internal4gemm12__ModuleInfoZ:
## 2147484672 == 0x80000400.
## NOTE(review): presumably the D runtime ModuleInfo flag word -- confirm the bit meanings against druntime.
.long 2147484672
## NOTE(review): presumably the ModuleInfo index field -- confirm.
.long 0
## Count of the module-info pointers that follow (two entries).
.quad 2
.quad __D3mir4glas6common12__ModuleInfoZ
.quad __D3mir4glas8internal12__ModuleInfoZ
## NUL-terminated module name, followed by one extra zero byte.
.asciz "mir.glas.internal.gemm"
.space 1
.p2align 3
## Link record: the first quad starts as 0 and is overwritten by
## __D3mir4glas8internal4gemm16__moduleinfoCtorZ with the previous
## contents of __Dmodule_ref; the second quad points at the record above.
__D3mir4glas8internal4gemm11__moduleRefZ:
.quad 0
.quad __D3mir4glas8internal4gemm12__ModuleInfoZ
## Section of type mod_init_funcs: holds the address of the module-info
## constructor so that it is invoked as an image initializer.
.section __DATA,__mod_init_func,mod_init_funcs
.p2align 3
.quad __D3mir4glas8internal4gemm16__moduleinfoCtorZ
## Mach-O marker: sections in this object may be divided at symbol boundaries.
.subsections_via_symbols
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment