Skip to content

Instantly share code, notes, and snippets.

@anadon
Created February 24, 2016 23:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anadon/c1ea234ade1e9d076970 to your computer and use it in GitHub Desktop.
Save anadon/c1ea234ade1e9d076970 to your computer and use it in GitHub Desktop.
#pragma OPENCL EXTENSION cl_amd_printf : enable
typedef union GPtr {
__global float *f;
__global float2 *f2v;
__global float4 *f4v;
__global float8 *f8v;
__global float16 *f16v;
} GPtr;
typedef union LPtr {
__local float *f;
__local float2 *f2v;
__local float4 *f4v;
__local float8 *f8v;
__local float16 *f16v;
} LPtr;
typedef union PPtr {
float *f;
float2 *f2v;
float4 *f4v;
float8 *f8v;
float16 *f16v;
} PPtr;
__attribute__((reqd_work_group_size(64, 1, 1)))
void __kernel
strsmCached(
uint M,
uint N,
float alpha,
const __global float *restrict A,
uint lda,
__global float *B,
uint ldb)
{
const int lid = get_local_id(0);
const int gid = get_group_id(0);
GPtr uA, uB;
uint coordA, coordB;
uint m0 = 0, k0, m1;
uint currM = (M - 1) / 32 * 32;
float4 a0, a1, a2, a3;
float4 b0, b1, b2, b3;
float4 c0;
uB.f = B;
for (m0 = 0; m0 < M; m0 += 32) {
c0 = 0;
coordA = currM + (lid / 8 * 4);
k0 = currM + 32;
coordB = gid * 8 + (lid % 8 * 1);
// Stage 1. Multiply and update with large blocks
if (coordB < N) {
for (k0 = currM + 32; k0 < M / 4 * 4; k0 += 4) {
/* -- Tiles multiplier -- */
b0 = uB.f4v[mad24(coordB, (ldb >> 2), (uint)(k0 >> 2))];
a0 = uA.f4v[mad24(k0, (lda >> 2), (uint)(coordA >> 2))];
a1 = uA.f4v[mad24(k0 + 1, (lda >> 2), (uint)(coordA >> 2))];
a2 = uA.f4v[mad24(k0 + 2, (lda >> 2), (uint)(coordA >> 2))];
a3 = uA.f4v[mad24(k0 + 3, (lda >> 2), (uint)(coordA >> 2))];
c0 += a0 * b0.s0;
c0 += a1 * b0.s1;
c0 += a2 * b0.s2;
c0 += a3 * b0.s3;
/* ---------------------- */
}
}
uA.f = A;
if ((currM + 32 < M) && (coordB < N)) {
/* -- Tiles multiplier -- */
b0 = uB.f4v[mad24(coordB % N, (ldb >> 2), (uint)(k0 >> 2))];
b0.s0 = (k0 < M) ? b0.s0 : 0;
b0.s1 = (k0 + 1 < M) ? b0.s1 : 0;
b0.s2 = (k0 + 2 < M) ? b0.s2 : 0;
b0.s3 = (k0 + 3 < M) ? b0.s3 : 0;
a0 = uA.f4v[mad24(k0, (lda >> 2), (uint)(coordA >> 2))];
a1 = uA.f4v[mad24(k0 + 1, (lda >> 2), (uint)(coordA >> 2))];
a2 = uA.f4v[mad24(k0 + 2, (lda >> 2), (uint)(coordA >> 2))];
a3 = uA.f4v[mad24(k0 + 3, (lda >> 2), (uint)(coordA >> 2))];
a0 = (k0 < M) ? a0 : 0;
a1 = (k0 + 1 < M) ? a1 : 0;
a2 = (k0 + 2 < M) ? a2 : 0;
a3 = (k0 + 3 < M) ? a3 : 0;
c0 += a0 * b0.s0;
c0 += a1 * b0.s1;
c0 += a2 * b0.s2;
c0 += a3 * b0.s3;
/* ---------------------- */
}
/*
* Stage 2. A part of work items multiply got result on a respective
* inverted diagonal block, and the remaining ones wait. Then they perform
* one step of further intermediate result evaluation as multiplying tile by tile.
* It continues until the whole panel of the matrix A is processed
*/
for (m1 = 0; m1 < 8; m1++) {
coordA = currM + (lid / 8 * 4);
k0 = currM + 28 - m1 * 4;
coordB = gid * 8 + (lid % 8 * 1);
if (lid / 8 + m1 == 7 && coordA < M && coordB < N) {
{
uint y = min(4u, M - (uint)coordA);
uint x = min(1u, N - (uint)coordB);
if ((y == 4) && (x == 1)) {
float beta = -1. / alpha;
float alpha = beta;
GPtr uC;
uC.f = B + coordB * ldb + coordA;
__global float4 *pC = uC.f;
float4 tempC0;
tempC0 = pC[0];
c0 = c0 * alpha + tempC0;
}
else {
float beta = -1. / alpha;
float alpha = 1.;
GPtr uC;
int i, j;
PPtr res;
uC.f = B + coordB * ldb + coordA;
if (x) {
switch (y) {
case 4:
c0.s3 = c0.s3 * beta + uC.f[3] * alpha;
case 3:
c0.s2 = c0.s2 * beta + uC.f[2] * alpha;
case 2:
c0.s1 = c0.s1 * beta + uC.f[1] * alpha;
case 1:
c0.s0 = c0.s0 * beta + uC.f[0] * alpha;
}
uC.f += ldb;
x--;
}
}
}
// Fetch and invert the square tile located on the diagonal
b0 = uA.f4v[mad24(k0 % M, (lda >> 2), (uint)(coordA >> 2) % (uint)(lda >> 2))];
b1 = uA.f4v[mad24((k0 + 1) % M, (lda >> 2), (uint)(coordA >> 2) % (uint)(lda >> 2))];
b2 = uA.f4v[mad24((k0 + 2) % M, (lda >> 2), (uint)(coordA >> 2) % (uint)(lda >> 2))];
b3 = uA.f4v[mad24((k0 + 3) % M, (lda >> 2), (uint)(coordA >> 2) % (uint)(lda >> 2))];
b0 = (k0 < M) ? b0 : 0;
b1 = (k0 + 1 < M) ? b1 : 0;
b2 = (k0 + 2 < M) ? b2 : 0;
b3 = (k0 + 3 < M) ? b3 : 0;
// post fetch A
{
uint zy = k0;
b0.s0 = zy < coordA ? 0 : b0.s0;
b0.s1 = zy < coordA + 1 ? 0 : b0.s1;
b0.s2 = zy < coordA + 2 ? 0 : b0.s2;
b0.s3 = zy < coordA + 3 ? 0 : b0.s3;
zy++;
b1.s0 = zy < coordA ? 0 : b1.s0;
b1.s1 = zy < coordA + 1 ? 0 : b1.s1;
b1.s2 = zy < coordA + 2 ? 0 : b1.s2;
b1.s3 = zy < coordA + 3 ? 0 : b1.s3;
zy++;
b2.s0 = zy < coordA ? 0 : b2.s0;
b2.s1 = zy < coordA + 1 ? 0 : b2.s1;
b2.s2 = zy < coordA + 2 ? 0 : b2.s2;
b2.s3 = zy < coordA + 3 ? 0 : b2.s3;
zy++;
b3.s0 = zy < coordA ? 0 : b3.s0;
b3.s1 = zy < coordA + 1 ? 0 : b3.s1;
b3.s2 = zy < coordA + 2 ? 0 : b3.s2;
b3.s3 = zy < coordA + 3 ? 0 : b3.s3;
}
const int bound = (coordA + 4 > M) ? (M - coordA) : 4;
b0.s0 = (bound <= 0) ? 0 : b0.s0;
b1.s0 = (bound <= 0) ? 0 : b1.s0;
b2.s0 = (bound <= 0) ? 0 : b2.s0;
b3.s0 = (bound <= 0) ? 0 : b3.s0;
b0.s1 = (bound <= 1) ? 0 : b0.s1;
b1.s1 = (bound <= 1) ? 0 : b1.s1;
b2.s1 = (bound <= 1) ? 0 : b2.s1;
b3.s1 = (bound <= 1) ? 0 : b3.s1;
b0.s2 = (bound <= 2) ? 0 : b0.s2;
b1.s2 = (bound <= 2) ? 0 : b1.s2;
b2.s2 = (bound <= 2) ? 0 : b2.s2;
b3.s2 = (bound <= 2) ? 0 : b3.s2;
b0.s3 = (bound <= 3) ? 0 : b0.s3;
b1.s3 = (bound <= 3) ? 0 : b1.s3;
b2.s3 = (bound <= 3) ? 0 : b2.s3;
b3.s3 = (bound <= 3) ? 0 : b3.s3;
b0.s0 = (bound <= 0) ? 1 : b0.s0;
b1.s1 = (bound <= 1) ? 1 : b1.s1;
b2.s2 = (bound <= 2) ? 1 : b2.s2;
b3.s3 = (bound <= 3) ? 1 : b3.s3;
// Invert tile
a0 = 0;
a1 = 0;
a2 = 0;
a3 = 0;
a0.s0 = 1;
a1.s1 = 1;
a2.s2 = 1;
a3.s3 = 1;
a3.s3 /= b3.s3;
a2.s3 /= b3.s3;
a1.s3 /= b3.s3;
a0.s3 /= b3.s3;
a3.s2 -= a3.s3 * b3.s2;
a3.s2 /= b2.s2;
a2.s2 -= a2.s3 * b3.s2;
a2.s2 /= b2.s2;
a1.s2 -= a1.s3 * b3.s2;
a1.s2 /= b2.s2;
a0.s2 -= a0.s3 * b3.s2;
a0.s2 /= b2.s2;
a3.s1 -= a3.s3 * b3.s1;
a2.s1 -= a2.s3 * b3.s1;
a1.s1 -= a1.s3 * b3.s1;
a0.s1 -= a0.s3 * b3.s1;
a3.s0 -= a3.s3 * b3.s0;
a2.s0 -= a2.s3 * b3.s0;
a1.s0 -= a1.s3 * b3.s0;
a0.s0 -= a0.s3 * b3.s0;
a3.s1 -= a3.s2 * b2.s1;
a3.s1 /= b1.s1;
a2.s1 -= a2.s2 * b2.s1;
a2.s1 /= b1.s1;
a1.s1 -= a1.s2 * b2.s1;
a1.s1 /= b1.s1;
a0.s1 -= a0.s2 * b2.s1;
a0.s1 /= b1.s1;
a3.s0 -= a3.s2 * b2.s0;
a2.s0 -= a2.s2 * b2.s0;
a1.s0 -= a1.s2 * b2.s0;
a0.s0 -= a0.s2 * b2.s0;
a3.s0 -= a3.s1 * b1.s0;
a3.s0 /= b0.s0;
a2.s0 -= a2.s1 * b1.s0;
a2.s0 /= b0.s0;
a1.s0 -= a1.s1 * b1.s0;
a1.s0 /= b0.s0;
a0.s0 -= a0.s1 * b1.s0;
a0.s0 /= b0.s0;
b0.s0 = c0.s0;
b1.s0 = c0.s1;
b2.s0 = c0.s2;
b3.s0 = c0.s3;
c0 = 0;
c0 += a0 * b0.s0;
c0 += a1 * b1.s0;
c0 += a2 * b2.s0;
c0 += a3 * b3.s0;
// Write back the given result
uint y = min(4u, M - (uint)coordA);
uint x = min(1u, N - (uint)coordB);
if ((y == 4) && (x == 1)) {
GPtr uC;
uC.f = B + coordB * ldb + coordA;
__global float4 *pC = uC.f;
float4 tempC0;
tempC0 = c0 * alpha + 0;
pC[0] = tempC0;
}
else {
GPtr uC;
int i, j;
PPtr res;
uC.f = B + coordB * ldb + coordA;
if (x) {
switch (y) {
case 4:
uC.f[3] = c0.s3 * alpha;
case 3:
uC.f[2] = c0.s2 * alpha;
case 2:
uC.f[1] = c0.s1 * alpha;
case 1:
uC.f[0] = c0.s0 * alpha;
}
uC.f += ldb;
x--;
}
}
}
barrier(CLK_GLOBAL_MEM_FENCE);
if (lid / 8 + m1 < 7 && coordA < M && coordB < N) {
/* -- Tiles multiplier -- */
b0 = uB.f4v[mad24(coordB % N, (ldb >> 2), (uint)(k0 >> 2))];
b0.s0 = (k0 < M) ? b0.s0 : 0;
b0.s1 = (k0 + 1 < M) ? b0.s1 : 0;
b0.s2 = (k0 + 2 < M) ? b0.s2 : 0;
b0.s3 = (k0 + 3 < M) ? b0.s3 : 0;
a0 = uA.f4v[mad24(k0, (lda >> 2), (uint)(coordA >> 2))];
a1 = uA.f4v[mad24(k0 + 1, (lda >> 2), (uint)(coordA >> 2))];
a2 = uA.f4v[mad24(k0 + 2, (lda >> 2), (uint)(coordA >> 2))];
a3 = uA.f4v[mad24(k0 + 3, (lda >> 2), (uint)(coordA >> 2))];
a0 = (k0 < M) ? a0 : 0;
a1 = (k0 + 1 < M) ? a1 : 0;
a2 = (k0 + 2 < M) ? a2 : 0;
a3 = (k0 + 3 < M) ? a3 : 0;
c0 += a0 * b0.s0;
c0 += a1 * b0.s1;
c0 += a2 * b0.s2;
c0 += a3 * b0.s3;
/* ---------------------- */
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
currM -= 32;
}
}
Program received signal SIGSEGV, Segmentation fault.
0x00007ffff3ac1928 in llvm::MCRegisterInfo::getDwarfRegNum(unsigned int, bool) const () from /usr/lib/libMesaOpenCL.so.1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment