Created
February 24, 2016 23:12
-
-
Save anadon/c1ea234ade1e9d076970 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma OPENCL EXTENSION cl_amd_printf : enable | |
typedef union GPtr { | |
__global float *f; | |
__global float2 *f2v; | |
__global float4 *f4v; | |
__global float8 *f8v; | |
__global float16 *f16v; | |
} GPtr; | |
typedef union LPtr { | |
__local float *f; | |
__local float2 *f2v; | |
__local float4 *f4v; | |
__local float8 *f8v; | |
__local float16 *f16v; | |
} LPtr; | |
typedef union PPtr { | |
float *f; | |
float2 *f2v; | |
float4 *f4v; | |
float8 *f8v; | |
float16 *f16v; | |
} PPtr; | |
__attribute__((reqd_work_group_size(64, 1, 1))) | |
void __kernel | |
strsmCached( | |
uint M, | |
uint N, | |
float alpha, | |
const __global float *restrict A, | |
uint lda, | |
__global float *B, | |
uint ldb) | |
{ | |
const int lid = get_local_id(0); | |
const int gid = get_group_id(0); | |
GPtr uA, uB; | |
uint coordA, coordB; | |
uint m0 = 0, k0, m1; | |
uint currM = (M - 1) / 32 * 32; | |
float4 a0, a1, a2, a3; | |
float4 b0, b1, b2, b3; | |
float4 c0; | |
uB.f = B; | |
for (m0 = 0; m0 < M; m0 += 32) { | |
c0 = 0; | |
coordA = currM + (lid / 8 * 4); | |
k0 = currM + 32; | |
coordB = gid * 8 + (lid % 8 * 1); | |
// Stage 1. Multiply and update with large blocks | |
if (coordB < N) { | |
for (k0 = currM + 32; k0 < M / 4 * 4; k0 += 4) { | |
/* -- Tiles multiplier -- */ | |
b0 = uB.f4v[mad24(coordB, (ldb >> 2), (uint)(k0 >> 2))]; | |
a0 = uA.f4v[mad24(k0, (lda >> 2), (uint)(coordA >> 2))]; | |
a1 = uA.f4v[mad24(k0 + 1, (lda >> 2), (uint)(coordA >> 2))]; | |
a2 = uA.f4v[mad24(k0 + 2, (lda >> 2), (uint)(coordA >> 2))]; | |
a3 = uA.f4v[mad24(k0 + 3, (lda >> 2), (uint)(coordA >> 2))]; | |
c0 += a0 * b0.s0; | |
c0 += a1 * b0.s1; | |
c0 += a2 * b0.s2; | |
c0 += a3 * b0.s3; | |
/* ---------------------- */ | |
} | |
} | |
uA.f = A; | |
if ((currM + 32 < M) && (coordB < N)) { | |
/* -- Tiles multiplier -- */ | |
b0 = uB.f4v[mad24(coordB % N, (ldb >> 2), (uint)(k0 >> 2))]; | |
b0.s0 = (k0 < M) ? b0.s0 : 0; | |
b0.s1 = (k0 + 1 < M) ? b0.s1 : 0; | |
b0.s2 = (k0 + 2 < M) ? b0.s2 : 0; | |
b0.s3 = (k0 + 3 < M) ? b0.s3 : 0; | |
a0 = uA.f4v[mad24(k0, (lda >> 2), (uint)(coordA >> 2))]; | |
a1 = uA.f4v[mad24(k0 + 1, (lda >> 2), (uint)(coordA >> 2))]; | |
a2 = uA.f4v[mad24(k0 + 2, (lda >> 2), (uint)(coordA >> 2))]; | |
a3 = uA.f4v[mad24(k0 + 3, (lda >> 2), (uint)(coordA >> 2))]; | |
a0 = (k0 < M) ? a0 : 0; | |
a1 = (k0 + 1 < M) ? a1 : 0; | |
a2 = (k0 + 2 < M) ? a2 : 0; | |
a3 = (k0 + 3 < M) ? a3 : 0; | |
c0 += a0 * b0.s0; | |
c0 += a1 * b0.s1; | |
c0 += a2 * b0.s2; | |
c0 += a3 * b0.s3; | |
/* ---------------------- */ | |
} | |
/* | |
* Stage 2. A part of work items multiply got result on a respective | |
* inverted diagonal block, and the remaining ones wait. Then they perform | |
* one step of further intermediate result evaluation as multiplying tile by tile. | |
* It continues until the whole panel of the matrix A is processed | |
*/ | |
for (m1 = 0; m1 < 8; m1++) { | |
coordA = currM + (lid / 8 * 4); | |
k0 = currM + 28 - m1 * 4; | |
coordB = gid * 8 + (lid % 8 * 1); | |
if (lid / 8 + m1 == 7 && coordA < M && coordB < N) { | |
{ | |
uint y = min(4u, M - (uint)coordA); | |
uint x = min(1u, N - (uint)coordB); | |
if ((y == 4) && (x == 1)) { | |
float beta = -1. / alpha; | |
float alpha = beta; | |
GPtr uC; | |
uC.f = B + coordB * ldb + coordA; | |
__global float4 *pC = uC.f; | |
float4 tempC0; | |
tempC0 = pC[0]; | |
c0 = c0 * alpha + tempC0; | |
} | |
else { | |
float beta = -1. / alpha; | |
float alpha = 1.; | |
GPtr uC; | |
int i, j; | |
PPtr res; | |
uC.f = B + coordB * ldb + coordA; | |
if (x) { | |
switch (y) { | |
case 4: | |
c0.s3 = c0.s3 * beta + uC.f[3] * alpha; | |
case 3: | |
c0.s2 = c0.s2 * beta + uC.f[2] * alpha; | |
case 2: | |
c0.s1 = c0.s1 * beta + uC.f[1] * alpha; | |
case 1: | |
c0.s0 = c0.s0 * beta + uC.f[0] * alpha; | |
} | |
uC.f += ldb; | |
x--; | |
} | |
} | |
} | |
// Fetch and invert the square tile located on the diagonal | |
b0 = uA.f4v[mad24(k0 % M, (lda >> 2), (uint)(coordA >> 2) % (uint)(lda >> 2))]; | |
b1 = uA.f4v[mad24((k0 + 1) % M, (lda >> 2), (uint)(coordA >> 2) % (uint)(lda >> 2))]; | |
b2 = uA.f4v[mad24((k0 + 2) % M, (lda >> 2), (uint)(coordA >> 2) % (uint)(lda >> 2))]; | |
b3 = uA.f4v[mad24((k0 + 3) % M, (lda >> 2), (uint)(coordA >> 2) % (uint)(lda >> 2))]; | |
b0 = (k0 < M) ? b0 : 0; | |
b1 = (k0 + 1 < M) ? b1 : 0; | |
b2 = (k0 + 2 < M) ? b2 : 0; | |
b3 = (k0 + 3 < M) ? b3 : 0; | |
// post fetch A | |
{ | |
uint zy = k0; | |
b0.s0 = zy < coordA ? 0 : b0.s0; | |
b0.s1 = zy < coordA + 1 ? 0 : b0.s1; | |
b0.s2 = zy < coordA + 2 ? 0 : b0.s2; | |
b0.s3 = zy < coordA + 3 ? 0 : b0.s3; | |
zy++; | |
b1.s0 = zy < coordA ? 0 : b1.s0; | |
b1.s1 = zy < coordA + 1 ? 0 : b1.s1; | |
b1.s2 = zy < coordA + 2 ? 0 : b1.s2; | |
b1.s3 = zy < coordA + 3 ? 0 : b1.s3; | |
zy++; | |
b2.s0 = zy < coordA ? 0 : b2.s0; | |
b2.s1 = zy < coordA + 1 ? 0 : b2.s1; | |
b2.s2 = zy < coordA + 2 ? 0 : b2.s2; | |
b2.s3 = zy < coordA + 3 ? 0 : b2.s3; | |
zy++; | |
b3.s0 = zy < coordA ? 0 : b3.s0; | |
b3.s1 = zy < coordA + 1 ? 0 : b3.s1; | |
b3.s2 = zy < coordA + 2 ? 0 : b3.s2; | |
b3.s3 = zy < coordA + 3 ? 0 : b3.s3; | |
} | |
const int bound = (coordA + 4 > M) ? (M - coordA) : 4; | |
b0.s0 = (bound <= 0) ? 0 : b0.s0; | |
b1.s0 = (bound <= 0) ? 0 : b1.s0; | |
b2.s0 = (bound <= 0) ? 0 : b2.s0; | |
b3.s0 = (bound <= 0) ? 0 : b3.s0; | |
b0.s1 = (bound <= 1) ? 0 : b0.s1; | |
b1.s1 = (bound <= 1) ? 0 : b1.s1; | |
b2.s1 = (bound <= 1) ? 0 : b2.s1; | |
b3.s1 = (bound <= 1) ? 0 : b3.s1; | |
b0.s2 = (bound <= 2) ? 0 : b0.s2; | |
b1.s2 = (bound <= 2) ? 0 : b1.s2; | |
b2.s2 = (bound <= 2) ? 0 : b2.s2; | |
b3.s2 = (bound <= 2) ? 0 : b3.s2; | |
b0.s3 = (bound <= 3) ? 0 : b0.s3; | |
b1.s3 = (bound <= 3) ? 0 : b1.s3; | |
b2.s3 = (bound <= 3) ? 0 : b2.s3; | |
b3.s3 = (bound <= 3) ? 0 : b3.s3; | |
b0.s0 = (bound <= 0) ? 1 : b0.s0; | |
b1.s1 = (bound <= 1) ? 1 : b1.s1; | |
b2.s2 = (bound <= 2) ? 1 : b2.s2; | |
b3.s3 = (bound <= 3) ? 1 : b3.s3; | |
// Invert tile | |
a0 = 0; | |
a1 = 0; | |
a2 = 0; | |
a3 = 0; | |
a0.s0 = 1; | |
a1.s1 = 1; | |
a2.s2 = 1; | |
a3.s3 = 1; | |
a3.s3 /= b3.s3; | |
a2.s3 /= b3.s3; | |
a1.s3 /= b3.s3; | |
a0.s3 /= b3.s3; | |
a3.s2 -= a3.s3 * b3.s2; | |
a3.s2 /= b2.s2; | |
a2.s2 -= a2.s3 * b3.s2; | |
a2.s2 /= b2.s2; | |
a1.s2 -= a1.s3 * b3.s2; | |
a1.s2 /= b2.s2; | |
a0.s2 -= a0.s3 * b3.s2; | |
a0.s2 /= b2.s2; | |
a3.s1 -= a3.s3 * b3.s1; | |
a2.s1 -= a2.s3 * b3.s1; | |
a1.s1 -= a1.s3 * b3.s1; | |
a0.s1 -= a0.s3 * b3.s1; | |
a3.s0 -= a3.s3 * b3.s0; | |
a2.s0 -= a2.s3 * b3.s0; | |
a1.s0 -= a1.s3 * b3.s0; | |
a0.s0 -= a0.s3 * b3.s0; | |
a3.s1 -= a3.s2 * b2.s1; | |
a3.s1 /= b1.s1; | |
a2.s1 -= a2.s2 * b2.s1; | |
a2.s1 /= b1.s1; | |
a1.s1 -= a1.s2 * b2.s1; | |
a1.s1 /= b1.s1; | |
a0.s1 -= a0.s2 * b2.s1; | |
a0.s1 /= b1.s1; | |
a3.s0 -= a3.s2 * b2.s0; | |
a2.s0 -= a2.s2 * b2.s0; | |
a1.s0 -= a1.s2 * b2.s0; | |
a0.s0 -= a0.s2 * b2.s0; | |
a3.s0 -= a3.s1 * b1.s0; | |
a3.s0 /= b0.s0; | |
a2.s0 -= a2.s1 * b1.s0; | |
a2.s0 /= b0.s0; | |
a1.s0 -= a1.s1 * b1.s0; | |
a1.s0 /= b0.s0; | |
a0.s0 -= a0.s1 * b1.s0; | |
a0.s0 /= b0.s0; | |
b0.s0 = c0.s0; | |
b1.s0 = c0.s1; | |
b2.s0 = c0.s2; | |
b3.s0 = c0.s3; | |
c0 = 0; | |
c0 += a0 * b0.s0; | |
c0 += a1 * b1.s0; | |
c0 += a2 * b2.s0; | |
c0 += a3 * b3.s0; | |
// Write back the given result | |
uint y = min(4u, M - (uint)coordA); | |
uint x = min(1u, N - (uint)coordB); | |
if ((y == 4) && (x == 1)) { | |
GPtr uC; | |
uC.f = B + coordB * ldb + coordA; | |
__global float4 *pC = uC.f; | |
float4 tempC0; | |
tempC0 = c0 * alpha + 0; | |
pC[0] = tempC0; | |
} | |
else { | |
GPtr uC; | |
int i, j; | |
PPtr res; | |
uC.f = B + coordB * ldb + coordA; | |
if (x) { | |
switch (y) { | |
case 4: | |
uC.f[3] = c0.s3 * alpha; | |
case 3: | |
uC.f[2] = c0.s2 * alpha; | |
case 2: | |
uC.f[1] = c0.s1 * alpha; | |
case 1: | |
uC.f[0] = c0.s0 * alpha; | |
} | |
uC.f += ldb; | |
x--; | |
} | |
} | |
} | |
barrier(CLK_GLOBAL_MEM_FENCE); | |
if (lid / 8 + m1 < 7 && coordA < M && coordB < N) { | |
/* -- Tiles multiplier -- */ | |
b0 = uB.f4v[mad24(coordB % N, (ldb >> 2), (uint)(k0 >> 2))]; | |
b0.s0 = (k0 < M) ? b0.s0 : 0; | |
b0.s1 = (k0 + 1 < M) ? b0.s1 : 0; | |
b0.s2 = (k0 + 2 < M) ? b0.s2 : 0; | |
b0.s3 = (k0 + 3 < M) ? b0.s3 : 0; | |
a0 = uA.f4v[mad24(k0, (lda >> 2), (uint)(coordA >> 2))]; | |
a1 = uA.f4v[mad24(k0 + 1, (lda >> 2), (uint)(coordA >> 2))]; | |
a2 = uA.f4v[mad24(k0 + 2, (lda >> 2), (uint)(coordA >> 2))]; | |
a3 = uA.f4v[mad24(k0 + 3, (lda >> 2), (uint)(coordA >> 2))]; | |
a0 = (k0 < M) ? a0 : 0; | |
a1 = (k0 + 1 < M) ? a1 : 0; | |
a2 = (k0 + 2 < M) ? a2 : 0; | |
a3 = (k0 + 3 < M) ? a3 : 0; | |
c0 += a0 * b0.s0; | |
c0 += a1 * b0.s1; | |
c0 += a2 * b0.s2; | |
c0 += a3 * b0.s3; | |
/* ---------------------- */ | |
} | |
barrier(CLK_GLOBAL_MEM_FENCE); | |
} | |
currM -= 32; | |
} | |
} | |
Program received signal SIGSEGV, Segmentation fault. | |
0x00007ffff3ac1928 in llvm::MCRegisterInfo::getDwarfRegNum(unsigned int, bool) const () from /usr/lib/libMesaOpenCL.so.1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment