Created
September 23, 2016 07:04
-
-
Save 9il/894a35df5bb20a6e9baa100db551e5cd to your computer and use it in GitHub Desktop.
Haswell code for 128-bit vectors (OK)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## NOTE(review): the trailing " | |" on every line looks like table-cell residue
## from a web export rather than assembler input -- confirm and strip it before
## feeding this file to an assembler.
##
## Syntax: AT&T (source operands first, destination last). Mach-O code section.
.section __TEXT,__text,regular,pure_instructions | |
## ---------------------------------------------------------------------------
## dot_reg_basic -- register-blocked multiply-accumulate kernel (128-bit lanes).
##
## Mangled D symbol; decoded by hand it reads roughly as
##   mir.glas.internal.gemm.dot_reg_basic!(1,1,1,2,6, __vector(float[4]), float)
##     nothrow @nogc (const(V[2][1])* , const(float[1][6])* , size_t , ref V[2][1][6])
##     returning const(V[2][1])*
## -- TODO confirm with a demangler.
##
## Register roles, as established by the instructions below:
##   rcx  in   pointer read 32 bytes per iteration (two 16-byte aligned vectors)
##   rdx  in   pointer read 24 bytes per iteration (six 32-bit scalars)
##   rsi  in   iteration count; decremented to zero
##   rdi  in   destination for the 12 result vectors (192 bytes, aligned stores)
##   rax  out  rcx + 32*rsi, computed from the entry values
##   xmm0-xmm3, xmm8-xmm15   accumulators;  xmm4-xmm7  per-iteration temporaries
##
## The accumulators start at zero and the destination is only written, never
## read, so the previous contents at (rdi) do not contribute to the result.
##
## NOTE(review): the loop is bottom-tested and there is no check before it, so
## an entry value of rsi == 0 would wrap around instead of skipping the loop --
## presumably callers guarantee a non-zero count; verify.
## ---------------------------------------------------------------------------
.globl __D3mir4glas8internal4gemm47__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG4fTfZ13dot_reg_basicFNbNiPxG1G2NhG4fPxG6G1fmKG6G1G2NhG4fZPxG1G2NhG4f | |
.weak_definition __D3mir4glas8internal4gemm47__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG4fTfZ13dot_reg_basicFNbNiPxG1G2NhG4fPxG6G1fmKG6G1G2NhG4fZPxG1G2NhG4f | |
## Align the entry point to 16 bytes, padding with 0x90 (NOP).
.p2align 4, 0x90 | |
__D3mir4glas8internal4gemm47__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG4fTfZ13dot_reg_basicFNbNiPxG1G2NhG4fPxG6G1fmKG6G1G2NhG4fZPxG1G2NhG4f: | |
.cfi_startproc | |
## rax = rcx + (rsi << 5): the address rcx reaches after rsi steps of 32 bytes.
## Computed up front because rsi and rcx are both consumed by the loop; rax is
## not touched again before retq.
movq %rsi, %rax | |
shlq $5, %rax | |
addq %rcx, %rax | |
## Zero all 12 accumulators (xor-with-self).
vxorps %xmm8, %xmm8, %xmm8 | |
vxorps %xmm9, %xmm9, %xmm9 | |
vxorps %xmm10, %xmm10, %xmm10 | |
vxorps %xmm11, %xmm11, %xmm11 | |
vxorps %xmm12, %xmm12, %xmm12 | |
vxorps %xmm13, %xmm13, %xmm13 | |
vxorps %xmm14, %xmm14, %xmm14 | |
vxorps %xmm15, %xmm15, %xmm15 | |
vxorps %xmm0, %xmm0, %xmm0 | |
vxorps %xmm1, %xmm1, %xmm1 | |
vxorps %xmm2, %xmm2, %xmm2 | |
vxorps %xmm3, %xmm3, %xmm3 | |
## Align the loop head to 16 bytes.
.p2align 4, 0x90 | |
## Main loop: one pass consumes 32 bytes at rcx and 24 bytes at rdx and issues
## 12 fused multiply-adds (2 vectors x 6 broadcast scalars).
LBB0_1: | |
## Prefetch the line 512 bytes ahead of the current rcx position.
prefetcht0 512(%rcx) | |
## xmm4 / xmm5 = the two vectors at rcx (aligned loads: rcx must be 16-byte
## aligned).
vmovaps (%rcx), %xmm4 | |
vmovaps 16(%rcx), %xmm5 | |
## Scalars 0 and 1 at rdx, each replicated across all four lanes.
vbroadcastss (%rdx), %xmm6 | |
vbroadcastss 4(%rdx), %xmm7 | |
## vfmadd231ps: destination += (first source * second source).
vfmadd231ps %xmm4, %xmm6, %xmm3 | |
vfmadd231ps %xmm5, %xmm6, %xmm2 | |
vfmadd231ps %xmm4, %xmm7, %xmm1 | |
vfmadd231ps %xmm5, %xmm7, %xmm0 | |
## Scalars 2 and 3 (xmm6/xmm7 are reused as broadcast temporaries).
vbroadcastss 8(%rdx), %xmm6 | |
vbroadcastss 12(%rdx), %xmm7 | |
vfmadd231ps %xmm4, %xmm6, %xmm15 | |
vfmadd231ps %xmm5, %xmm6, %xmm14 | |
vfmadd231ps %xmm4, %xmm7, %xmm13 | |
vfmadd231ps %xmm5, %xmm7, %xmm12 | |
## Scalars 4 and 5.
vbroadcastss 16(%rdx), %xmm6 | |
vbroadcastss 20(%rdx), %xmm7 | |
vfmadd231ps %xmm4, %xmm6, %xmm11 | |
vfmadd231ps %xmm5, %xmm6, %xmm10 | |
vfmadd231ps %xmm4, %xmm7, %xmm9 | |
vfmadd231ps %xmm5, %xmm7, %xmm8 | |
## Advance both input pointers and count down; addq sets ZF for the jne.
addq $32, %rcx | |
addq $24, %rdx | |
addq $-1, %rsi | |
jne LBB0_1 | |
## Write the accumulators to (rdi). Layout: the pair for scalar j lands at
## byte offset 32*j, with the xmm4-derived product first and the xmm5-derived
## product 16 bytes after it. Aligned stores: rdi must be 16-byte aligned.
vmovaps %xmm3, (%rdi) | |
vmovaps %xmm2, 16(%rdi) | |
vmovaps %xmm1, 32(%rdi) | |
vmovaps %xmm0, 48(%rdi) | |
vmovaps %xmm15, 64(%rdi) | |
vmovaps %xmm14, 80(%rdi) | |
vmovaps %xmm13, 96(%rdi) | |
vmovaps %xmm12, 112(%rdi) | |
vmovaps %xmm11, 128(%rdi) | |
vmovaps %xmm10, 144(%rdi) | |
vmovaps %xmm9, 160(%rdi) | |
vmovaps %xmm8, 176(%rdi) | |
## rax still holds the end-of-input address computed at entry.
retq | |
.cfi_endproc | |
## ---------------------------------------------------------------------------
## prefetch_r -- issue a single T0 prefetch hint 512 bytes past the address
## in rsi, then return.
##
## Mangled D symbol; decoded by hand it reads roughly as
##   mir.glas.internal.gemm.prefetch_r!(32, 1, 8, 512) nothrow @nogc
##     (void*, long) returning void
## -- TODO confirm with a demangler.
##
## Only rsi is read; rdi is not referenced by this instantiation. No register
## is written.
## NOTE(review): the displacement 512 matches the last template value in the
## symbol name -- presumably the prefetch distance; verify against the D source.
## ---------------------------------------------------------------------------
.globl __D3mir4glas8internal4gemm35__T10prefetch_rVmi32Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv | |
.weak_definition __D3mir4glas8internal4gemm35__T10prefetch_rVmi32Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv | |
## Align the entry point to 16 bytes, padding with 0x90 (NOP).
.p2align 4, 0x90 | |
__D3mir4glas8internal4gemm35__T10prefetch_rVmi32Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv: | |
.cfi_startproc | |
## Hint only: prefetcht0 does not load into a register.
prefetcht0 512(%rsi) | |
retq | |
.cfi_endproc | |
## ---------------------------------------------------------------------------
## load_nano -- copy 192 bytes (twelve 16-byte vectors) from (rdi) to (rsi).
##
## Mangled D symbol; decoded by hand it reads roughly as
##   mir.glas.internal.copy.load_nano!(2, 1, 6, __vector(float[4]),
##     __vector(float[4])) pure nothrow @nogc @safe
##     (ref V[2][1][6], ref V[2][1][6]) returning void
## -- TODO confirm with a demangler.
##
## Register roles, as established by the instructions below:
##   rdi  in   source address      (read at offsets 0..176, step 16)
##   rsi  in   destination address (written at the same offsets)
##   xmm0      the only temporary; overwritten
##
## Every access is vmovaps, so both addresses must be 16-byte aligned.
## Each 16-byte slot is loaded and stored before the next one is loaded.
## ---------------------------------------------------------------------------
.globl __D3mir4glas8internal4copy38__T9load_nanoVmi2Vmi1Vmi6TNhG4fTNhG4fZ9load_nanoFNaNbNiNfKG6G1G2NhG4fKG6G1G2NhG4fZv | |
.weak_definition __D3mir4glas8internal4copy38__T9load_nanoVmi2Vmi1Vmi6TNhG4fTNhG4fZ9load_nanoFNaNbNiNfKG6G1G2NhG4fKG6G1G2NhG4fZv | |
## Align the entry point to 16 bytes, padding with 0x90 (NOP).
.p2align 4, 0x90 | |
__D3mir4glas8internal4copy38__T9load_nanoVmi2Vmi1Vmi6TNhG4fTNhG4fZ9load_nanoFNaNbNiNfKG6G1G2NhG4fKG6G1G2NhG4fZv: | |
.cfi_startproc | |
## Fully unrolled load/store pairs through xmm0: bytes 0..63.
vmovaps (%rdi), %xmm0 | |
vmovaps %xmm0, (%rsi) | |
vmovaps 16(%rdi), %xmm0 | |
vmovaps %xmm0, 16(%rsi) | |
vmovaps 32(%rdi), %xmm0 | |
vmovaps %xmm0, 32(%rsi) | |
vmovaps 48(%rdi), %xmm0 | |
vmovaps %xmm0, 48(%rsi) | |
## Bytes 64..127.
vmovaps 64(%rdi), %xmm0 | |
vmovaps %xmm0, 64(%rsi) | |
vmovaps 80(%rdi), %xmm0 | |
vmovaps %xmm0, 80(%rsi) | |
vmovaps 96(%rdi), %xmm0 | |
vmovaps %xmm0, 96(%rsi) | |
vmovaps 112(%rdi), %xmm0 | |
vmovaps %xmm0, 112(%rsi) | |
## Bytes 128..191.
vmovaps 128(%rdi), %xmm0 | |
vmovaps %xmm0, 128(%rsi) | |
vmovaps 144(%rdi), %xmm0 | |
vmovaps %xmm0, 144(%rsi) | |
vmovaps 160(%rdi), %xmm0 | |
vmovaps %xmm0, 160(%rsi) | |
vmovaps 176(%rdi), %xmm0 | |
vmovaps %xmm0, 176(%rsi) | |
retq | |
.cfi_endproc | |
## ---------------------------------------------------------------------------
## Module-info constructor for mir.glas.internal.gemm.
##
## Prepends this module's __moduleRefZ record to the chain whose head pointer
## is the external variable _Dmodule_ref:
##     moduleRef.first_quad = _Dmodule_ref      (old head)
##     _Dmodule_ref         = &moduleRef        (new head)
##
## Takes no inputs. Overwrites rax and rcx. There is no .globl for this label
## in the visible source; it is referenced from the __mod_init_func section
## later in the file.
## NOTE(review): the update is two plain stores with no locking -- presumably
## it only runs during single-threaded image initialisation; verify.
## ---------------------------------------------------------------------------
.p2align 4, 0x90 | |
__D3mir4glas8internal4gemm16__moduleinfoCtorZ: | |
## rax = address of _Dmodule_ref, fetched through the GOT (external symbol).
movq __Dmodule_ref@GOTPCREL(%rip), %rax | |
## rcx = current head of the chain.
movq (%rax), %rcx | |
## Save the old head in the first quad of this module's record ...
movq %rcx, __D3mir4glas8internal4gemm11__moduleRefZ(%rip) | |
## ... then make this module's record the new head.
leaq __D3mir4glas8internal4gemm11__moduleRefZ(%rip), %rcx | |
movq %rcx, (%rax) | |
retq | |
## ---------------------------------------------------------------------------
## Static data for module mir.glas.internal.gemm: the ModuleInfo record, the
## module-reference record linked in by __moduleinfoCtorZ, and the initialiser
## table entry that points at that constructor.
## ---------------------------------------------------------------------------
.section __DATA,__data | |
.globl __D3mir4glas8internal4gemm12__ModuleInfoZ | |
## 16-byte alignment for the ModuleInfo record.
.p2align 4 | |
__D3mir4glas8internal4gemm12__ModuleInfoZ: | |
## 2147484672 == 0x80000400.
## NOTE(review): presumably the ModuleInfo flags word, followed by a 32-bit
## index field -- confirm against the D runtime's ModuleInfo layout.
.long 2147484672 | |
.long 0 | |
## NOTE(review): presumably the number of imported-module entries; it matches
## the two ModuleInfo pointers that follow -- confirm.
.quad 2 | |
.quad __D3mir4glas6common12__ModuleInfoZ | |
.quad __D3mir4glas8internal12__ModuleInfoZ | |
## NUL-terminated module name, one extra zero byte, then pad to 8 bytes.
.asciz "mir.glas.internal.gemm" | |
.space 1 | |
.p2align 3 | |
## Two-quad record: the first quad starts as 0 and is overwritten by
## __moduleinfoCtorZ with the previous _Dmodule_ref value; the second quad
## points at this module's ModuleInfo.
__D3mir4glas8internal4gemm11__moduleRefZ: | |
.quad 0 | |
.quad __D3mir4glas8internal4gemm12__ModuleInfoZ | |
## Mach-O module-initialiser table: one 8-byte-aligned pointer to the
## constructor defined in the text section.
.section __DATA,__mod_init_func,mod_init_funcs | |
.p2align 3 | |
.quad __D3mir4glas8internal4gemm16__moduleinfoCtorZ | |
## Mach-O directive: the object may be split into per-symbol subsections.
.subsections_via_symbols |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment