Created
September 23, 2016 06:59
-
-
Save 9il/bf680d2a07561097158268f7dda15428 to your computer and use it in GitHub Desktop.
Haswell code for 256-bit vectors (OK)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## ---------------------------------------------------------------------------
## dot_reg_basic  (D-mangled template instance, module mir.glas.internal.gemm)
## Syntax: AT&T, x86-64 Mach-O.  Requires AVX and FMA3 (ymm registers,
## vfmadd231ps).
##
## Registers on entry, as used by the code below:
##   rdi  destination block: twelve 32-byte vectors are written to
##        0..352(%rdi); the previous contents are overwritten, not added to
##   rsi  iteration count; it is decremented and tested only AFTER the loop
##        body, so it must be non-zero on entry
##   rdx  stream of 4-byte scalars, six consumed (24 bytes) per iteration
##   rcx  stream of 32-byte vectors, two consumed (64 bytes) per iteration
## Result:
##   rax  = entry rcx + 64 * entry rsi (where the rcx stream ends)
## Clobbers: rcx, rdx, rsi, ymm0-ymm15, flags.
## Alignment: every vmovaps memory operand must be 32-byte aligned, so both
##   rcx and rdi must be 32-byte aligned.
##
## For scalar k (0..5) and vector half h (0..1), accumulator ymm(11 - 2k - h)
## sums  scalar[k] * vector[h]  over all iterations and is finally stored at
## offset 64*k + 32*h from rdi.
## NOTE(review): how these registers map onto the D-level parameters is not
## visible in this listing; confirm against the D source.
## ---------------------------------------------------------------------------
        .section        __TEXT,__text,regular,pure_instructions
        .globl  __D3mir4glas8internal4gemm47__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG8fTfZ13dot_reg_basicFNbNiPxG1G2NhG8fPxG6G1fmKG6G1G2NhG8fZPxG1G2NhG8f
        .weak_definition        __D3mir4glas8internal4gemm47__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG8fTfZ13dot_reg_basicFNbNiPxG1G2NhG8fPxG6G1fmKG6G1G2NhG8fZPxG1G2NhG8f
        .p2align        4, 0x90
__D3mir4glas8internal4gemm47__T13dot_reg_basicVmi1Vmi1Vmi1Vmi2Vmi6TNhG8fTfZ13dot_reg_basicFNbNiPxG1G2NhG8fPxG6G1fmKG6G1G2NhG8fZPxG1G2NhG8f:
        .cfi_startproc
        ## rax = rcx + rsi * 64: computed up front because rcx and rsi are
        ## both modified by the loop.
        movq    %rsi, %rax
        shlq    $6, %rax
        addq    %rcx, %rax

        ## Zero the twelve accumulators.
        vxorps  %ymm0, %ymm0, %ymm0
        vxorps  %ymm1, %ymm1, %ymm1
        vxorps  %ymm2, %ymm2, %ymm2
        vxorps  %ymm3, %ymm3, %ymm3
        vxorps  %ymm4, %ymm4, %ymm4
        vxorps  %ymm5, %ymm5, %ymm5
        vxorps  %ymm6, %ymm6, %ymm6
        vxorps  %ymm7, %ymm7, %ymm7
        vxorps  %ymm8, %ymm8, %ymm8
        vxorps  %ymm9, %ymm9, %ymm9
        vxorps  %ymm10, %ymm10, %ymm10
        vxorps  %ymm11, %ymm11, %ymm11

        .p2align        4, 0x90
LBB0_1:
        ## Hint the vector stream 512 bytes (8 iterations) ahead.
        prefetcht0      512(%rcx)
        ## ymm12/ymm13 = the two vector halves for this iteration.
        vmovaps (%rcx), %ymm12
        vmovaps 32(%rcx), %ymm13

        ## Scalars 0 and 1, each broadcast to all eight lanes.
        vbroadcastss    (%rdx), %ymm14
        vbroadcastss    4(%rdx), %ymm15
        vfmadd231ps     %ymm12, %ymm14, %ymm11
        vfmadd231ps     %ymm13, %ymm14, %ymm10
        vfmadd231ps     %ymm12, %ymm15, %ymm9
        vfmadd231ps     %ymm13, %ymm15, %ymm8

        ## Scalars 2 and 3.
        vbroadcastss    8(%rdx), %ymm14
        vbroadcastss    12(%rdx), %ymm15
        vfmadd231ps     %ymm12, %ymm14, %ymm7
        vfmadd231ps     %ymm13, %ymm14, %ymm6
        vfmadd231ps     %ymm12, %ymm15, %ymm5
        vfmadd231ps     %ymm13, %ymm15, %ymm4

        ## Scalars 4 and 5.
        vbroadcastss    16(%rdx), %ymm14
        vbroadcastss    20(%rdx), %ymm15
        vfmadd231ps     %ymm12, %ymm14, %ymm3
        vfmadd231ps     %ymm13, %ymm14, %ymm2
        vfmadd231ps     %ymm12, %ymm15, %ymm1
        vfmadd231ps     %ymm13, %ymm15, %ymm0

        ## Advance both streams; loop until the count reaches zero.
        addq    $64, %rcx
        addq    $24, %rdx
        addq    $-1, %rsi
        jne     LBB0_1

        ## Write the accumulators out, ymm11 first, at ascending offsets.
        vmovaps %ymm11, (%rdi)
        vmovaps %ymm10, 32(%rdi)
        vmovaps %ymm9, 64(%rdi)
        vmovaps %ymm8, 96(%rdi)
        vmovaps %ymm7, 128(%rdi)
        vmovaps %ymm6, 160(%rdi)
        vmovaps %ymm5, 192(%rdi)
        vmovaps %ymm4, 224(%rdi)
        vmovaps %ymm3, 256(%rdi)
        vmovaps %ymm2, 288(%rdi)
        vmovaps %ymm1, 320(%rdi)
        vmovaps %ymm0, 352(%rdi)
        ## Clear the upper ymm halves before returning to code that may use
        ## legacy SSE encodings.
        vzeroupper
        retq
        .cfi_endproc
## ---------------------------------------------------------------------------
## prefetch_r  (D-mangled template instance, module mir.glas.internal.gemm)
## Syntax: AT&T, x86-64 Mach-O.
##
## Issues one T0 prefetch hint for the address rsi + 512 and returns.
## In:    rsi = base address (rdi is not read by this code)
## Out:   nothing
## Clobb: none (prefetcht0 changes no registers or flags)
## NOTE(review): the 512-byte distance presumably comes from the template
## arguments encoded in the symbol name; confirm against the D source.
## ---------------------------------------------------------------------------
        .globl  __D3mir4glas8internal4gemm35__T10prefetch_rVmi64Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv
        .weak_definition        __D3mir4glas8internal4gemm35__T10prefetch_rVmi64Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv
        .p2align        4, 0x90
__D3mir4glas8internal4gemm35__T10prefetch_rVmi64Vmi1Vmi8Vmi512Z10prefetch_rFNbNiPvlZv:
        .cfi_startproc
        prefetcht0      512(%rsi)
        retq
        .cfi_endproc
## ---------------------------------------------------------------------------
## load_nano  (D-mangled template instance, module mir.glas.internal.copy)
## Syntax: AT&T, x86-64 Mach-O.  Requires AVX.
##
## Copies 384 bytes (twelve 32-byte vectors) from (%rdi) to (%rsi), one vector
## at a time through ymm0, at ascending offsets 0..352.
## In:    rdi = source address, rsi = destination address
## Out:   nothing
## Clobb: ymm0 (upper halves of all ymm registers are zeroed by vzeroupper)
## Alignment: vmovaps is used for every access, so both rdi and rsi must be
##   32-byte aligned.
## Each load is immediately followed by its store; keep that order, since
## nothing visible here rules out the two regions overlapping.
## ---------------------------------------------------------------------------
        .globl  __D3mir4glas8internal4copy38__T9load_nanoVmi2Vmi1Vmi6TNhG8fTNhG8fZ9load_nanoFNaNbNiNfKG6G1G2NhG8fKG6G1G2NhG8fZv
        .weak_definition        __D3mir4glas8internal4copy38__T9load_nanoVmi2Vmi1Vmi6TNhG8fTNhG8fZ9load_nanoFNaNbNiNfKG6G1G2NhG8fKG6G1G2NhG8fZv
        .p2align        4, 0x90
__D3mir4glas8internal4copy38__T9load_nanoVmi2Vmi1Vmi6TNhG8fTNhG8fZ9load_nanoFNaNbNiNfKG6G1G2NhG8fKG6G1G2NhG8fZv:
        .cfi_startproc
        vmovaps (%rdi), %ymm0
        vmovaps %ymm0, (%rsi)
        vmovaps 32(%rdi), %ymm0
        vmovaps %ymm0, 32(%rsi)
        vmovaps 64(%rdi), %ymm0
        vmovaps %ymm0, 64(%rsi)
        vmovaps 96(%rdi), %ymm0
        vmovaps %ymm0, 96(%rsi)
        vmovaps 128(%rdi), %ymm0
        vmovaps %ymm0, 128(%rsi)
        vmovaps 160(%rdi), %ymm0
        vmovaps %ymm0, 160(%rsi)
        vmovaps 192(%rdi), %ymm0
        vmovaps %ymm0, 192(%rsi)
        vmovaps 224(%rdi), %ymm0
        vmovaps %ymm0, 224(%rsi)
        vmovaps 256(%rdi), %ymm0
        vmovaps %ymm0, 256(%rsi)
        vmovaps 288(%rdi), %ymm0
        vmovaps %ymm0, 288(%rsi)
        vmovaps 320(%rdi), %ymm0
        vmovaps %ymm0, 320(%rsi)
        vmovaps 352(%rdi), %ymm0
        vmovaps %ymm0, 352(%rsi)
        ## Clear the upper ymm halves before returning to code that may use
        ## legacy SSE encodings.
        vzeroupper
        retq
        .cfi_endproc
## ---------------------------------------------------------------------------
## Module-info constructor for mir.glas.internal.gemm (file-local symbol).
## Syntax: AT&T, x86-64 Mach-O.
##
## Pushes this module's reference node onto the front of the singly linked
## list whose head pointer is the external variable _Dmodule_ref:
##     moduleRef.next = *_Dmodule_ref;
##     *_Dmodule_ref  = &moduleRef;
## In:    nothing
## Out:   nothing (rax is left holding the address of _Dmodule_ref)
## Clobb: rax, rcx
## ---------------------------------------------------------------------------
        .p2align        4, 0x90
__D3mir4glas8internal4gemm16__moduleinfoCtorZ:
        ## rax = &_Dmodule_ref, fetched through the GOT because the symbol is
        ## defined outside this object.
        movq    __Dmodule_ref@GOTPCREL(%rip), %rax
        ## Link the old list head behind this module's node.
        movq    (%rax), %rcx
        movq    %rcx, __D3mir4glas8internal4gemm11__moduleRefZ(%rip)
        ## Make this module's node the new list head.
        leaq    __D3mir4glas8internal4gemm11__moduleRefZ(%rip), %rcx
        movq    %rcx, (%rax)
        retq
## ---------------------------------------------------------------------------
## Static data for module mir.glas.internal.gemm.
##
## __ModuleInfoZ : exported module descriptor, 16-byte aligned.
## __moduleRefZ  : file-local two-quad node {next, &ModuleInfo}, 8-byte
##                 aligned; this listing's module-info constructor writes
##                 the first quad.
## NOTE(review): the field meanings given below are inferred from the values
## and are not established by this listing; confirm against the D runtime's
## ModuleInfo layout.
## ---------------------------------------------------------------------------
        .section        __DATA,__data
        .globl  __D3mir4glas8internal4gemm12__ModuleInfoZ
        .p2align        4
__D3mir4glas8internal4gemm12__ModuleInfoZ:
        ## 0x80000400 - presumably the flags word.
        .long   2147484672
        .long   0
        ## Presumably the number of module references that follow (two do).
        .quad   2
        .quad   __D3mir4glas6common12__ModuleInfoZ
        .quad   __D3mir4glas8internal12__ModuleInfoZ
        ## NUL-terminated module name, followed by one extra zero byte.
        .asciz  "mir.glas.internal.gemm"
        .space  1

        .p2align        3
__D3mir4glas8internal4gemm11__moduleRefZ:
        ## Link to the next node; zero in the object file.
        .quad   0
        ## Pointer to this module's descriptor above.
        .quad   __D3mir4glas8internal4gemm12__ModuleInfoZ
## Register the module-info constructor in the Mach-O initializer-pointer
## section (section type mod_init_funcs), so the loader calls it when the
## image is initialized.
        .section        __DATA,__mod_init_func,mod_init_funcs
        .p2align        3
        .quad   __D3mir4glas8internal4gemm16__moduleinfoCtorZ

## Tell the static linker that sections in this object may be split into
## blocks at symbol boundaries.
.subsections_via_symbols
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment