Created
March 31, 2017 05:32
-
-
Save 9il/1fcb0e0f30ed8365014592d98d641cfd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## ldc2 -linkonce-templates -betterC -O -release -mcpu=skylake -mattr=+avx512f -output-s gemm_micro_kernel.d
## export extern(C)
## auto dot_reg_basic_generic(
##     const(__vector(float[16])[3][1])* a,
##     const(float[1][8])* b,
##     size_t length,
##     ref __vector(float[16])[3][1][8] c,
## )
## {
##     return dot_reg_basic(a, b, length, c);
## }
## -----------------------------------------------------------------------------
## _dot_reg_basic_generic -- AVX-512 register-tile multiply-accumulate kernel.
##
## Syntax: AT&T (op src, dst), Mach-O x86-64.  ABI: System V AMD64, leaf function.
##
## In:    rdi = a      : `length` steps of 3 x 64-byte vectors (192 B per step);
##                       read with vmovaps, so it must be 64-byte aligned
##        rsi = b      : `length` steps of 8 floats (32 B per step)
##        rdx = length : number of steps (0 is accepted, see below)
##        rcx = c      : 8 x 3 tile of 64-byte vectors (1536 B); written with
##                       vmovaps, so it must be 64-byte aligned
## Out:   rax = a + 192 * length
##        c[j][i] = sum over steps k of a[k][i] * broadcast(b[k][j]),
##        j = 0..7, i = 0..2.  The tile is overwritten; its previous contents
##        are never read.
## Clobb: rax, rsi, rdi, r8, zmm0-zmm31, flags
## Stack: 56 bytes reserved, plus two 64-byte spill slots in the 128-byte
##        red zone below %rsp (legal only because this function makes no calls).
##
## Accumulator registers (c[j][0], c[j][1], c[j][2]):
##        j=0: zmm16 zmm15 zmm23      j=4: zmm8  zmm7  zmm20
##        j=1: zmm14 zmm13 zmm21      j=5: zmm6  zmm5  zmm17
##        j=2: zmm12 zmm11 zmm22      j=6: zmm4  zmm3  zmm18
##        j=3: zmm10 zmm9  zmm19      j=7: zmm2  zmm1  zmm29
##
## Differences from the raw compiler listing:
##   * length == 0 now skips the loop.  The loop is bottom-tested, so without
##     the guard the counter would wrap and the loop would run ~2^64 times.
##   * vzeroupper is issued before returning so that zmm0-zmm15 do not leave
##     dirty upper halves behind for legacy-SSE code in the caller.
## -----------------------------------------------------------------------------
	.section	__TEXT,__text,regular,pure_instructions
	.globl	_dot_reg_basic_generic
	.p2align	4, 0x90
_dot_reg_basic_generic:
	.cfi_startproc
	subq	$56, %rsp
Lcfi0:
	.cfi_def_cfa_offset 64
	leaq	640(%rdi), %rax                 ## rax = a + 640; loads use -640..-512(%rax),
	                                        ## prefetches use -128..0(%rax)

	## Zero all 24 accumulators.
	vpxord	%zmm29, %zmm29, %zmm29
	vpxord	%zmm1, %zmm1, %zmm1
	vpxord	%zmm2, %zmm2, %zmm2
	vpxord	%zmm18, %zmm18, %zmm18
	vpxord	%zmm3, %zmm3, %zmm3
	vpxord	%zmm4, %zmm4, %zmm4
	vpxord	%zmm17, %zmm17, %zmm17
	vpxord	%zmm5, %zmm5, %zmm5
	vpxord	%zmm6, %zmm6, %zmm6
	vpxord	%zmm20, %zmm20, %zmm20
	vpxord	%zmm7, %zmm7, %zmm7
	vpxord	%zmm8, %zmm8, %zmm8
	vpxord	%zmm19, %zmm19, %zmm19
	vpxord	%zmm9, %zmm9, %zmm9
	vpxord	%zmm10, %zmm10, %zmm10
	vpxord	%zmm22, %zmm22, %zmm22
	vpxord	%zmm11, %zmm11, %zmm11
	vpxord	%zmm12, %zmm12, %zmm12
	vpxord	%zmm21, %zmm21, %zmm21
	vpxord	%zmm13, %zmm13, %zmm13
	vpxord	%zmm14, %zmm14, %zmm14
	vpxord	%zmm23, %zmm23, %zmm23
	vpxord	%zmm15, %zmm15, %zmm15
	vpxord	%zmm16, %zmm16, %zmm16

	movq	%rdx, %r8                       ## r8 = steps remaining; rdx keeps length
	testq	%rdx, %rdx
	je	LBB0_2                          ## nothing to accumulate: store the zero tile

	.p2align	4, 0x90
LBB0_1:                                 ## invariant: r8 > 0 steps remain
	## The c[j][2] accumulators share their register with the broadcast of
	## b[j], so park the running sums first (five in registers, two spilled).
	vmovdqa64	%zmm23, %zmm24
	vmovdqa64	%zmm21, %zmm25
	vmovdqa64	%zmm22, %zmm26
	vmovdqa64	%zmm19, %zmm27
	vmovdqa64	%zmm20, %zmm28
	vmovdqu64	%zmm17, -128(%rsp)      ## red-zone spill slot 0
	vmovdqu64	%zmm18, -64(%rsp)       ## red-zone spill slot 1

	prefetcht0	-128(%rax)              ## a + 512 relative to the current step
	prefetcht0	-64(%rax)
	prefetcht0	(%rax)

	vmovaps	-640(%rax), %zmm31              ## a[k][0]
	vmovaps	-576(%rax), %zmm0               ## a[k][1]
	vmovaps	-512(%rax), %zmm30              ## a[k][2]

	## vfmadd231ps s3, s2, d : d = s2 * s3 + d
	## vfmadd213ps s3, s2, d : d = s2 * d  + s3
	vbroadcastss	(%rsi), %zmm23          ## b[k][0]
	vbroadcastss	4(%rsi), %zmm21         ## b[k][1]
	vfmadd231ps	%zmm31, %zmm23, %zmm16
	vfmadd231ps	%zmm0, %zmm23, %zmm15
	vfmadd213ps	%zmm24, %zmm30, %zmm23  ## zmm23 = a2 * b0 + parked sum
	vfmadd231ps	%zmm31, %zmm21, %zmm14
	vfmadd231ps	%zmm0, %zmm21, %zmm13
	vfmadd213ps	%zmm25, %zmm30, %zmm21

	vbroadcastss	8(%rsi), %zmm22         ## b[k][2]
	vbroadcastss	12(%rsi), %zmm19        ## b[k][3]
	vfmadd231ps	%zmm31, %zmm22, %zmm12
	vfmadd231ps	%zmm0, %zmm22, %zmm11
	vfmadd213ps	%zmm26, %zmm30, %zmm22
	vfmadd231ps	%zmm31, %zmm19, %zmm10
	vfmadd231ps	%zmm0, %zmm19, %zmm9
	vfmadd213ps	%zmm27, %zmm30, %zmm19

	vbroadcastss	16(%rsi), %zmm20        ## b[k][4]
	vbroadcastss	20(%rsi), %zmm17        ## b[k][5]
	vfmadd231ps	%zmm31, %zmm20, %zmm8
	vfmadd231ps	%zmm0, %zmm20, %zmm7
	vfmadd213ps	%zmm28, %zmm30, %zmm20
	vfmadd231ps	%zmm31, %zmm17, %zmm6
	vfmadd231ps	%zmm0, %zmm17, %zmm5
	vfmadd213ps	-128(%rsp), %zmm30, %zmm17      ## parked sum comes from spill slot 0

	vbroadcastss	24(%rsi), %zmm18        ## b[k][6]
	vbroadcastss	28(%rsi), %zmm24        ## b[k][7]; zmm24 is free again here
	vfmadd231ps	%zmm31, %zmm18, %zmm4
	vfmadd231ps	%zmm0, %zmm18, %zmm3
	vfmadd213ps	-64(%rsp), %zmm30, %zmm18       ## parked sum comes from spill slot 1
	vfmadd231ps	%zmm31, %zmm24, %zmm2
	vfmadd231ps	%zmm0, %zmm24, %zmm1
	vfmadd231ps	%zmm30, %zmm24, %zmm29  ## c[7][2] accumulates in place

	addq	$192, %rax                      ## next step of a (3 x 64 B)
	addq	$32, %rsi                       ## next step of b (8 x 4 B)
	addq	$-1, %r8                        ## sets ZF for the branch below
	jne	LBB0_1

LBB0_2:
	## Return value: a + length * 192, computed as (length * 3) << 6.
	leaq	(%rdx,%rdx,2), %rax
	shlq	$6, %rax
	addq	%rax, %rdi

	## Write the 8 x 3 tile to c, row by row.
	vmovaps	%zmm16, (%rcx)
	vmovaps	%zmm15, 64(%rcx)
	vmovaps	%zmm23, 128(%rcx)
	vmovaps	%zmm14, 192(%rcx)
	vmovaps	%zmm13, 256(%rcx)
	vmovaps	%zmm21, 320(%rcx)
	vmovaps	%zmm12, 384(%rcx)
	vmovaps	%zmm11, 448(%rcx)
	vmovaps	%zmm22, 512(%rcx)
	vmovaps	%zmm10, 576(%rcx)
	vmovaps	%zmm9, 640(%rcx)
	vmovaps	%zmm19, 704(%rcx)
	vmovaps	%zmm8, 768(%rcx)
	vmovaps	%zmm7, 832(%rcx)
	vmovaps	%zmm20, 896(%rcx)
	vmovaps	%zmm6, 960(%rcx)
	vmovaps	%zmm5, 1024(%rcx)
	vmovaps	%zmm17, 1088(%rcx)
	vmovaps	%zmm4, 1152(%rcx)
	vmovaps	%zmm3, 1216(%rcx)
	vmovaps	%zmm18, 1280(%rcx)
	vmovaps	%zmm2, 1344(%rcx)
	vmovaps	%zmm1, 1408(%rcx)
	vmovaps	%zmm29, 1472(%rcx)

	movq	%rdi, %rax
	vzeroupper                              ## clear dirty upper halves of zmm0-zmm15
	addq	$56, %rsp
	retq
	.cfi_endproc

.subsections_via_symbols
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment