Instantly share code, notes, and snippets.

Embed
What would you like to do?
// ldc2 -linkonce-templates -betterC -O -release -mcpu=broadwell -output-s gemm_micro_kernel_simple.d
import ldc.attributes;
import ldc.intrinsics;
// Offset, in bytes, added to the current `a` pointer to form the address
// that the kernel loop passes to llvm_prefetch on every iteration.
enum prefetchShift = 512;
@fastmath:
/**
 * Exported C-linkage entry point for the kernel.
 *
 * Explicitly instantiates the template overload of the same name
 * (`dot_reg_basic_simple!()`) and forwards all four arguments unchanged.
 *
 * Params:
 *   a = pointer to the sequence of vector pairs read by the kernel
 *   b = pointer to the sequence of six-scalar groups read by the kernel
 *   length = number of kernel iterations
 *   c = tile that receives the accumulated result (passed by reference)
 *
 * Returns: whatever the template instantiation returns (return type is inferred).
 */
export extern(C) nothrow @nogc @fastmath
auto dot_reg_basic_simple(
const(__vector(float[8])[2][1])* a,
const(float[1][6])* b,
size_t length,
ref __vector(float[8])[2][1][6] c,
)
{
// The `!()` forces selection of the template overload rather than this function.
return dot_reg_basic_simple!()(a, b, length, c);
}
/**
 * Register-blocked kernel: accumulates a 6 x 2 tile of 8-float vectors over
 * `length` steps and stores the finished tile into `c`.
 *
 * On every step one element of `a` (two 8-float vectors) and one element of
 * `b` (six scalars) are read. Each scalar is assigned to a vector (filling all
 * lanes) and multiplied with both `a` vectors; the twelve products are added
 * to the twelve accumulators.
 *
 * Params:
 *   a = pointer to the first of `length` vector pairs
 *   b = pointer to the first of `length` groups of six scalars
 *   length = number of steps; zero is accepted and produces an all-zero tile
 *   c = receives the accumulated tile; all twelve vectors are overwritten
 *
 * Returns: `a` advanced by `length` elements.
 */
const(__vector(float[8])[2][1])* dot_reg_basic_simple()
(
    const(__vector(float[8])[2][1])* a,
    const(float[1][6])* b,
    size_t length,
    ref __vector(float[8])[2][1][6] c,
)
{
    // Accumulators are left uninitialised by `= void` and then zeroed explicitly.
    __vector(float[8])[2][1][6] reg = void;
    reg[0][0][0] = 0;
    reg[0][0][1] = 0;
    reg[1][0][0] = 0;
    reg[1][0][1] = 0;
    reg[2][0][0] = 0;
    reg[2][0][1] = 0;
    reg[3][0][0] = 0;
    reg[3][0][1] = 0;
    reg[4][0][0] = 0;
    reg[4][0][1] = 0;
    reg[5][0][0] = 0;
    reg[5][0][1] = 0;
    // Guard the bottom-tested loop: without it a zero `length` would still
    // read a[0] / b[0] and the unsigned counter would wrap around to
    // size_t.max, running the loop far past the end of both inputs.
    if (length != 0)
    {
        do
        {
            // Read prefetch (rw = 0), highest locality (3), data cache (1),
            // issued prefetchShift bytes ahead of the current `a` element.
            llvm_prefetch(cast(void*) a + prefetchShift, 0, 3, 1);
            __vector(float[8])[2][1] ai = void;
            __vector(float[8])[1][6] bi = void;
            // Both vectors of the current `a` element.
            ai[0][0] = a[0][0][0];
            ai[0][1] = a[0][0][1];
            // Scalars of `b` are loaded two at a time, each pair followed by
            // the four multiply-accumulates that consume it.
            bi[0][0] = b[0][0][0];
            bi[1][0] = b[0][1][0];
            reg[0][0][0] += ai[0][0] * bi[0][0];
            reg[0][0][1] += ai[0][1] * bi[0][0];
            reg[1][0][0] += ai[0][0] * bi[1][0];
            reg[1][0][1] += ai[0][1] * bi[1][0];
            bi[2][0] = b[0][2][0];
            bi[3][0] = b[0][3][0];
            reg[2][0][0] += ai[0][0] * bi[2][0];
            reg[2][0][1] += ai[0][1] * bi[2][0];
            reg[3][0][0] += ai[0][0] * bi[3][0];
            reg[3][0][1] += ai[0][1] * bi[3][0];
            bi[4][0] = b[0][4][0];
            bi[5][0] = b[0][5][0];
            reg[4][0][0] += ai[0][0] * bi[4][0];
            reg[4][0][1] += ai[0][1] * bi[4][0];
            reg[5][0][0] += ai[0][0] * bi[5][0];
            reg[5][0][1] += ai[0][1] * bi[5][0];
            // Advance both inputs by one element and count the step down.
            a++;
            b++;
            length--;
        }
        while (length);
    }
    // Copy the accumulators into the caller's tile.
    load_nano(c, reg);
    return a;
}
/**
 * Copies all twelve vectors of the 6 x 1 x 2 tile `from` into `to`,
 * one vector per assignment.
 *
 * Params:
 *   to = destination tile; every vector is overwritten
 *   from = source tile; only read
 */
auto load_nano()(ref __vector(float[8])[2][1][6] to, ref __vector(float[8])[2][1][6] from)
{
    // Both loops are expanded at compile time, yielding the same twelve
    // assignments in the same row-by-row order as a hand-written list.
    static foreach (row; 0 .. 6)
    {
        static foreach (half; 0 .. 2)
        {
            to[row][0][half] = from[row][0][half];
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment