Skip to content

Instantly share code, notes, and snippets.

@pavanky
Last active August 29, 2015 14:06
Show Gist options
  • Save pavanky/b19b5c7fca58e557510c to your computer and use it in GitHub Desktop.
Save pavanky/b19b5c7fca58e557510c to your computer and use it in GitHub Desktop.
clblas_transpose_failure
#define __CL_ENABLE_EXCEPTIONS
#include "cl.hpp"
#include <clBLAS.h>
#include <vector>
#include <iostream>
#include <iterator>
#include <algorithm>
using namespace cl;
using namespace std;
void call_back(const char *errinfo,
const void *private_info,
::size_t cb, void *user_data)
{
printf("Context Error: %s\n", errinfo); fflush(stdout);
}
int main(int argc, char* argv[])
{
Context(CL_DEVICE_TYPE_DEFAULT, NULL, call_back);
static const unsigned elements = 10 * 10; // Just make them big enough
std::vector<float> data(elements, 1);
Buffer a(begin(data), end(data), true, false);
Buffer b(begin(data), end(data), true, false);
Buffer c(CL_MEM_READ_WRITE, elements * sizeof(float));
CommandQueue queue = CommandQueue::getDefault();
clblasSetup();
clblasStatus ret = clblasSgemm(clblasColumnMajor,
clblasTrans, clblasNoTrans,
10, 10, 10, // M, N, K
1, // Alpha
a(), 0, 10,
b(), 0, 10,
0, // Beta
c(), 0, 10,
1, &queue(), 0, 0, 0);
printf("CLBLAS STATUS: %d\n", ret);
}
========================================================
AN INTERNAL KERNEL BUILD ERROR OCCURRED!
device name = Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz
error = -11
memory pattern = Cached global memory based subgroup gemm, computing kernel generator
Subproblem dimensions: dims[0].itemY = 16, dims[0].itemX = 8, dims[0].y = 16, dims[0].x = 8, dims[0].bwidth = 64; ; dims[1].itemY = 4, dims[1].itemX = 4, dims[1].y = 4, dims[1].x = 4, dims[1].bwidth = 8; ;
Parallelism granularity: pgran->wgDim = 2, pgran->wgSize[0] = 8, pgran->wgSize[1] = 8, pgran->wfSize = 64
Kernel extra flags: 939556625
Source:
#define A_BLK_H 4
#define B_BLK_H 4
#define SUBG_ITEMS 8
#define SUBG_A 4
#define SUBG_B 2
#define SUBG_B 2
#define K_VLEN_A 2
#define K_VLEN_B 2
typedef union GPtr {
__global float *f;
__global float2 *f2v;
__global float4 *f4v;
__global float8 *f8v;
__global float16 *f16v;
} GPtr;
typedef union LPtr {
__local float *f;
__local float2 *f2v;
__local float4 *f4v;
__local float8 *f8v;
__local float16 *f16v;
} LPtr;
typedef union PPtr {
float *f;
float2 *f2v;
float4 *f4v;
float8 *f8v;
float16 *f16v;
} PPtr;
__attribute__((reqd_work_group_size(8, 8, 1)))
void __kernel
sgemmSubgroup(
uint M,
uint N,
uint K,
const float alpha,
const float beta,
const __global float2 *restrict A,
const __global float2 *restrict B,
__global float2 *C,
uint lda,
uint ldb,
uint ldc)
{
uint Ktail = K % 64;
uint Kbase = K - Ktail;
float8 a0, a1, a2, a3;
float8 b0, b1, b2, b3;
float4 c0, c1, c2, c3;
lda /= K_VLEN_A;
ldb /= K_VLEN_B;
int2 itemId;
itemId.x = get_local_id(0);
itemId.y = get_local_id(1);
int coordY = A_BLK_H*( get_group_id(1)*SUBG_A + get_local_id(1)/SUBG_B );
int coordX = B_BLK_H*( get_group_id(0)*SUBG_B + get_local_id(1)%SUBG_B );
uint skipTileMul = 0;
//M block tail
if( coordY >= M ) {
skipTileMul = 1;
}
//N block tail
if( coordX >= N ) {
skipTileMul = 1;
}
//Raising "Lower" M N tails
if (coordY + 4 > M) {
coordY -= 4 - M % 4;
}
if (coordX + 4 > N) {
coordX -= 4 - N % 4;
}
A += lda*coordY;
B += ldb*coordX;
c0 = 0;
c1 = 0;
c2 = 0;
c3 = 0;
const uint vKB = (Kbase >> 1);
if( !skipTileMul ) {
for(int k = 8*get_local_id(0); k < Kbase; k += 8*SUBG_ITEMS) {
/* -- Tiles multiplier -- */
float8 sum;
const uint4 bk = ((uint4)(0, 1, 2, 3) + (( get_group_id(0)*0 + k ) >> 1)) % vKB;
b0.s01 = B[bk.s0];
b0.s23 = B[bk.s1];
b0.s45 = B[bk.s2];
b0.s67 = B[bk.s3];
b1.s01 = B[ldb + bk.s0];
b1.s23 = B[ldb + bk.s1];
b1.s45 = B[ldb + bk.s2];
b1.s67 = B[ldb + bk.s3];
b2.s01 = B[mad24(2u, ldb, bk.s0)];
b2.s23 = B[mad24(2u, ldb, bk.s1)];
b2.s45 = B[mad24(2u, ldb, bk.s2)];
b2.s67 = B[mad24(2u, ldb, bk.s3)];
b3.s01 = B[mad24(3u, ldb, bk.s0)];
b3.s23 = B[mad24(3u, ldb, bk.s1)];
b3.s45 = B[mad24(3u, ldb, bk.s2)];
b3.s67 = B[mad24(3u, ldb, bk.s3)];
a0.s01 = A[bk.s0];
a0.s23 = A[bk.s1];
a0.s45 = A[bk.s2];
a0.s67 = A[bk.s3];
a1.s01 = A[lda + bk.s0];
a1.s23 = A[lda + bk.s1];
a1.s45 = A[lda + bk.s2];
a1.s67 = A[lda + bk.s3];
a2.s01 = A[mad24(2u, lda, bk.s0)];
a2.s23 = A[mad24(2u, lda, bk.s1)];
a2.s45 = A[mad24(2u, lda, bk.s2)];
a2.s67 = A[mad24(2u, lda, bk.s3)];
a3.s01 = A[mad24(3u, lda, bk.s0)];
a3.s23 = A[mad24(3u, lda, bk.s1)];
a3.s45 = A[mad24(3u, lda, bk.s2)];
a3.s67 = A[mad24(3u, lda, bk.s3)];
c0.s0 += a0.s0 * b0.s0;
c0.s0 += a0.s1 * b0.s1;
c0.s0 += a0.s2 * b0.s2;
c0.s0 += a0.s3 * b0.s3;
c0.s0 += a0.s4 * b0.s4;
c0.s0 += a0.s5 * b0.s5;
c0.s0 += a0.s6 * b0.s6;
c0.s0 += a0.s7 * b0.s7;
c1.s0 += a0.s0 * b1.s0;
c1.s0 += a0.s1 * b1.s1;
c1.s0 += a0.s2 * b1.s2;
c1.s0 += a0.s3 * b1.s3;
c1.s0 += a0.s4 * b1.s4;
c1.s0 += a0.s5 * b1.s5;
c1.s0 += a0.s6 * b1.s6;
c1.s0 += a0.s7 * b1.s7;
c2.s0 += a0.s0 * b2.s0;
c2.s0 += a0.s1 * b2.s1;
c2.s0 += a0.s2 * b2.s2;
c2.s0 += a0.s3 * b2.s3;
c2.s0 += a0.s4 * b2.s4;
c2.s0 += a0.s5 * b2.s5;
c2.s0 += a0.s6 * b2.s6;
c2.s0 += a0.s7 * b2.s7;
c3.s0 += a0.s0 * b3.s0;
c3.s0 += a0.s1 * b3.s1;
c3.s0 += a0.s2 * b3.s2;
c3.s0 += a0.s3 * b3.s3;
c3.s0 += a0.s4 * b3.s4;
c3.s0 += a0.s5 * b3.s5;
c3.s0 += a0.s6 * b3.s6;
c3.s0 += a0.s7 * b3.s7;
c0.s1 += a1.s0 * b0.s0;
c0.s1 += a1.s1 * b0.s1;
c0.s1 += a1.s2 * b0.s2;
c0.s1 += a1.s3 * b0.s3;
c0.s1 += a1.s4 * b0.s4;
c0.s1 += a1.s5 * b0.s5;
c0.s1 += a1.s6 * b0.s6;
c0.s1 += a1.s7 * b0.s7;
c1.s1 += a1.s0 * b1.s0;
c1.s1 += a1.s1 * b1.s1;
c1.s1 += a1.s2 * b1.s2;
c1.s1 += a1.s3 * b1.s3;
c1.s1 += a1.s4 * b1.s4;
c1.s1 += a1.s5 * b1.s5;
c1.s1 += a1.s6 * b1.s6;
c1.s1 += a1.s7 * b1.s7;
c2.s1 += a1.s0 * b2.s0;
c2.s1 += a1.s1 * b2.s1;
c2.s1 += a1.s2 * b2.s2;
c2.s1 += a1.s3 * b2.s3;
c2.s1 += a1.s4 * b2.s4;
c2.s1 += a1.s5 * b2.s5;
c2.s1 += a1.s6 * b2.s6;
c2.s1 += a1.s7 * b2.s7;
c3.s1 += a1.s0 * b3.s0;
c3.s1 += a1.s1 * b3.s1;
c3.s1 += a1.s2 * b3.s2;
c3.s1 += a1.s3 * b3.s3;
c3.s1 += a1.s4 * b3.s4;
c3.s1 += a1.s5 * b3.s5;
c3.s1 += a1.s6 * b3.s6;
c3.s1 += a1.s7 * b3.s7;
c0.s2 += a2.s0 * b0.s0;
c0.s2 += a2.s1 * b0.s1;
c0.s2 += a2.s2 * b0.s2;
c0.s2 += a2.s3 * b0.s3;
c0.s2 += a2.s4 * b0.s4;
c0.s2 += a2.s5 * b0.s5;
c0.s2 += a2.s6 * b0.s6;
c0.s2 += a2.s7 * b0.s7;
c1.s2 += a2.s0 * b1.s0;
c1.s2 += a2.s1 * b1.s1;
c1.s2 += a2.s2 * b1.s2;
c1.s2 += a2.s3 * b1.s3;
c1.s2 += a2.s4 * b1.s4;
c1.s2 += a2.s5 * b1.s5;
c1.s2 += a2.s6 * b1.s6;
c1.s2 += a2.s7 * b1.s7;
c2.s2 += a2.s0 * b2.s0;
c2.s2 += a2.s1 * b2.s1;
c2.s2 += a2.s2 * b2.s2;
c2.s2 += a2.s3 * b2.s3;
c2.s2 += a2.s4 * b2.s4;
c2.s2 += a2.s5 * b2.s5;
c2.s2 += a2.s6 * b2.s6;
c2.s2 += a2.s7 * b2.s7;
c3.s2 += a2.s0 * b3.s0;
c3.s2 += a2.s1 * b3.s1;
c3.s2 += a2.s2 * b3.s2;
c3.s2 += a2.s3 * b3.s3;
c3.s2 += a2.s4 * b3.s4;
c3.s2 += a2.s5 * b3.s5;
c3.s2 += a2.s6 * b3.s6;
c3.s2 += a2.s7 * b3.s7;
c0.s3 += a3.s0 * b0.s0;
c0.s3 += a3.s1 * b0.s1;
c0.s3 += a3.s2 * b0.s2;
c0.s3 += a3.s3 * b0.s3;
c0.s3 += a3.s4 * b0.s4;
c0.s3 += a3.s5 * b0.s5;
c0.s3 += a3.s6 * b0.s6;
c0.s3 += a3.s7 * b0.s7;
c1.s3 += a3.s0 * b1.s0;
c1.s3 += a3.s1 * b1.s1;
c1.s3 += a3.s2 * b1.s2;
c1.s3 += a3.s3 * b1.s3;
c1.s3 += a3.s4 * b1.s4;
c1.s3 += a3.s5 * b1.s5;
c1.s3 += a3.s6 * b1.s6;
c1.s3 += a3.s7 * b1.s7;
c2.s3 += a3.s0 * b2.s0;
c2.s3 += a3.s1 * b2.s1;
c2.s3 += a3.s2 * b2.s2;
c2.s3 += a3.s3 * b2.s3;
c2.s3 += a3.s4 * b2.s4;
c2.s3 += a3.s5 * b2.s5;
c2.s3 += a3.s6 * b2.s6;
c2.s3 += a3.s7 * b2.s7;
c3.s3 += a3.s0 * b3.s0;
c3.s3 += a3.s1 * b3.s1;
c3.s3 += a3.s2 * b3.s2;
c3.s3 += a3.s3 * b3.s3;
c3.s3 += a3.s4 * b3.s4;
c3.s3 += a3.s5 * b3.s5;
c3.s3 += a3.s6 * b3.s6;
c3.s3 += a3.s7 * b3.s7;
/* ---------------------- */
}
uint k = Kbase + get_local_id(0)*8;
{
/* -- Tiles multiplier -- */
float8 sum;
const uint vKB = ((K + 1) >> 1);
const uint4 bk = ((uint4)(0, 1, 2, 3) + (k >> 1)) % vKB;
b0.s01 = B[bk.s0];
b0.s23 = B[bk.s1];
b0.s45 = B[bk.s2];
b0.s67 = B[bk.s3];
b1.s01 = B[ldb + bk.s0];
b1.s23 = B[ldb + bk.s1];
b1.s45 = B[ldb + bk.s2];
b1.s67 = B[ldb + bk.s3];
b2.s01 = B[mad24(2u, ldb, bk.s0)];
b2.s23 = B[mad24(2u, ldb, bk.s1)];
b2.s45 = B[mad24(2u, ldb, bk.s2)];
b2.s67 = B[mad24(2u, ldb, bk.s3)];
b3.s01 = B[mad24(3u, ldb, bk.s0)];
b3.s23 = B[mad24(3u, ldb, bk.s1)];
b3.s45 = B[mad24(3u, ldb, bk.s2)];
b3.s67 = B[mad24(3u, ldb, bk.s3)];
b0.s0 = (k < K) ? b0.s0 : 0;
b1.s0 = (k < K) ? b1.s0 : 0;
b2.s0 = (k < K) ? b2.s0 : 0;
b3.s0 = (k < K) ? b3.s0 : 0;
b0.s1 = (k + 1 < K) ? b0.s1 : 0;
b1.s1 = (k + 1 < K) ? b1.s1 : 0;
b2.s1 = (k + 1 < K) ? b2.s1 : 0;
b3.s1 = (k + 1 < K) ? b3.s1 : 0;
b0.s2 = (k + 2 < K) ? b0.s2 : 0;
b1.s2 = (k + 2 < K) ? b1.s2 : 0;
b2.s2 = (k + 2 < K) ? b2.s2 : 0;
b3.s2 = (k + 2 < K) ? b3.s2 : 0;
b0.s3 = (k + 3 < K) ? b0.s3 : 0;
b1.s3 = (k + 3 < K) ? b1.s3 : 0;
b2.s3 = (k + 3 < K) ? b2.s3 : 0;
b3.s3 = (k + 3 < K) ? b3.s3 : 0;
b0.s4 = (k + 4 < K) ? b0.s4 : 0;
b1.s4 = (k + 4 < K) ? b1.s4 : 0;
b2.s4 = (k + 4 < K) ? b2.s4 : 0;
b3.s4 = (k + 4 < K) ? b3.s4 : 0;
b0.s5 = (k + 5 < K) ? b0.s5 : 0;
b1.s5 = (k + 5 < K) ? b1.s5 : 0;
b2.s5 = (k + 5 < K) ? b2.s5 : 0;
b3.s5 = (k + 5 < K) ? b3.s5 : 0;
b0.s6 = (k + 6 < K) ? b0.s6 : 0;
b1.s6 = (k + 6 < K) ? b1.s6 : 0;
b2.s6 = (k + 6 < K) ? b2.s6 : 0;
b3.s6 = (k + 6 < K) ? b3.s6 : 0;
b0.s7 = (k + 7 < K) ? b0.s7 : 0;
b1.s7 = (k + 7 < K) ? b1.s7 : 0;
b2.s7 = (k + 7 < K) ? b2.s7 : 0;
b3.s7 = (k + 7 < K) ? b3.s7 : 0;
a0.s01 = A[bk.s0];
a0.s23 = A[bk.s1];
a0.s45 = A[bk.s2];
a0.s67 = A[bk.s3];
a1.s01 = A[lda + bk.s0];
a1.s23 = A[lda + bk.s1];
a1.s45 = A[lda + bk.s2];
a1.s67 = A[lda + bk.s3];
a2.s01 = A[mad24(2u, lda, bk.s0)];
a2.s23 = A[mad24(2u, lda, bk.s1)];
a2.s45 = A[mad24(2u, lda, bk.s2)];
a2.s67 = A[mad24(2u, lda, bk.s3)];
a3.s01 = A[mad24(3u, lda, bk.s0)];
a3.s23 = A[mad24(3u, lda, bk.s1)];
a3.s45 = A[mad24(3u, lda, bk.s2)];
a3.s67 = A[mad24(3u, lda, bk.s3)];
a0.s0 = (k < K) ? a0.s0 : 0;
a1.s0 = (k < K) ? a1.s0 : 0;
a2.s0 = (k < K) ? a2.s0 : 0;
a3.s0 = (k < K) ? a3.s0 : 0;
a0.s1 = (k + 1 < K) ? a0.s1 : 0;
a1.s1 = (k + 1 < K) ? a1.s1 : 0;
a2.s1 = (k + 1 < K) ? a2.s1 : 0;
a3.s1 = (k + 1 < K) ? a3.s1 : 0;
a0.s2 = (k + 2 < K) ? a0.s2 : 0;
a1.s2 = (k + 2 < K) ? a1.s2 : 0;
a2.s2 = (k + 2 < K) ? a2.s2 : 0;
a3.s2 = (k + 2 < K) ? a3.s2 : 0;
a0.s3 = (k + 3 < K) ? a0.s3 : 0;
a1.s3 = (k + 3 < K) ? a1.s3 : 0;
a2.s3 = (k + 3 < K) ? a2.s3 : 0;
a3.s3 = (k + 3 < K) ? a3.s3 : 0;
a0.s4 = (k + 4 < K) ? a0.s4 : 0;
a1.s4 = (k + 4 < K) ? a1.s4 : 0;
a2.s4 = (k + 4 < K) ? a2.s4 : 0;
a3.s4 = (k + 4 < K) ? a3.s4 : 0;
a0.s5 = (k + 5 < K) ? a0.s5 : 0;
a1.s5 = (k + 5 < K) ? a1.s5 : 0;
a2.s5 = (k + 5 < K) ? a2.s5 : 0;
a3.s5 = (k + 5 < K) ? a3.s5 : 0;
a0.s6 = (k + 6 < K) ? a0.s6 : 0;
a1.s6 = (k + 6 < K) ? a1.s6 : 0;
a2.s6 = (k + 6 < K) ? a2.s6 : 0;
a3.s6 = (k + 6 < K) ? a3.s6 : 0;
a0.s7 = (k + 7 < K) ? a0.s7 : 0;
a1.s7 = (k + 7 < K) ? a1.s7 : 0;
a2.s7 = (k + 7 < K) ? a2.s7 : 0;
a3.s7 = (k + 7 < K) ? a3.s7 : 0;
c0.s0 += a0.s0 * b0.s0;
c0.s0 += a0.s1 * b0.s1;
c0.s0 += a0.s2 * b0.s2;
c0.s0 += a0.s3 * b0.s3;
c0.s0 += a0.s4 * b0.s4;
c0.s0 += a0.s5 * b0.s5;
c0.s0 += a0.s6 * b0.s6;
c0.s0 += a0.s7 * b0.s7;
c1.s0 += a0.s0 * b1.s0;
c1.s0 += a0.s1 * b1.s1;
c1.s0 += a0.s2 * b1.s2;
c1.s0 += a0.s3 * b1.s3;
c1.s0 += a0.s4 * b1.s4;
c1.s0 += a0.s5 * b1.s5;
c1.s0 += a0.s6 * b1.s6;
c1.s0 += a0.s7 * b1.s7;
c2.s0 += a0.s0 * b2.s0;
c2.s0 += a0.s1 * b2.s1;
c2.s0 += a0.s2 * b2.s2;
c2.s0 += a0.s3 * b2.s3;
c2.s0 += a0.s4 * b2.s4;
c2.s0 += a0.s5 * b2.s5;
c2.s0 += a0.s6 * b2.s6;
c2.s0 += a0.s7 * b2.s7;
c3.s0 += a0.s0 * b3.s0;
c3.s0 += a0.s1 * b3.s1;
c3.s0 += a0.s2 * b3.s2;
c3.s0 += a0.s3 * b3.s3;
c3.s0 += a0.s4 * b3.s4;
c3.s0 += a0.s5 * b3.s5;
c3.s0 += a0.s6 * b3.s6;
c3.s0 += a0.s7 * b3.s7;
c0.s1 += a1.s0 * b0.s0;
c0.s1 += a1.s1 * b0.s1;
c0.s1 += a1.s2 * b0.s2;
c0.s1 += a1.s3 * b0.s3;
c0.s1 += a1.s4 * b0.s4;
c0.s1 += a1.s5 * b0.s5;
c0.s1 += a1.s6 * b0.s6;
c0.s1 += a1.s7 * b0.s7;
c1.s1 += a1.s0 * b1.s0;
c1.s1 += a1.s1 * b1.s1;
c1.s1 += a1.s2 * b1.s2;
c1.s1 += a1.s3 * b1.s3;
c1.s1 += a1.s4 * b1.s4;
c1.s1 += a1.s5 * b1.s5;
c1.s1 += a1.s6 * b1.s6;
c1.s1 += a1.s7 * b1.s7;
c2.s1 += a1.s0 * b2.s0;
c2.s1 += a1.s1 * b2.s1;
c2.s1 += a1.s2 * b2.s2;
c2.s1 += a1.s3 * b2.s3;
c2.s1 += a1.s4 * b2.s4;
c2.s1 += a1.s5 * b2.s5;
c2.s1 += a1.s6 * b2.s6;
c2.s1 += a1.s7 * b2.s7;
c3.s1 += a1.s0 * b3.s0;
c3.s1 += a1.s1 * b3.s1;
c3.s1 += a1.s2 * b3.s2;
c3.s1 += a1.s3 * b3.s3;
c3.s1 += a1.s4 * b3.s4;
c3.s1 += a1.s5 * b3.s5;
c3.s1 += a1.s6 * b3.s6;
c3.s1 += a1.s7 * b3.s7;
c0.s2 += a2.s0 * b0.s0;
c0.s2 += a2.s1 * b0.s1;
c0.s2 += a2.s2 * b0.s2;
c0.s2 += a2.s3 * b0.s3;
c0.s2 += a2.s4 * b0.s4;
c0.s2 += a2.s5 * b0.s5;
c0.s2 += a2.s6 * b0.s6;
c0.s2 += a2.s7 * b0.s7;
c1.s2 += a2.s0 * b1.s0;
c1.s2 += a2.s1 * b1.s1;
c1.s2 += a2.s2 * b1.s2;
c1.s2 += a2.s3 * b1.s3;
c1.s2 += a2.s4 * b1.s4;
c1.s2 += a2.s5 * b1.s5;
c1.s2 += a2.s6 * b1.s6;
c1.s2 += a2.s7 * b1.s7;
c2.s2 += a2.s0 * b2.s0;
c2.s2 += a2.s1 * b2.s1;
c2.s2 += a2.s2 * b2.s2;
c2.s2 += a2.s3 * b2.s3;
c2.s2 += a2.s4 * b2.s4;
c2.s2 += a2.s5 * b2.s5;
c2.s2 += a2.s6 * b2.s6;
c2.s2 += a2.s7 * b2.s7;
c3.s2 += a2.s0 * b3.s0;
c3.s2 += a2.s1 * b3.s1;
c3.s2 += a2.s2 * b3.s2;
c3.s2 += a2.s3 * b3.s3;
c3.s2 += a2.s4 * b3.s4;
c3.s2 += a2.s5 * b3.s5;
c3.s2 += a2.s6 * b3.s6;
c3.s2 += a2.s7 * b3.s7;
c0.s3 += a3.s0 * b0.s0;
c0.s3 += a3.s1 * b0.s1;
c0.s3 += a3.s2 * b0.s2;
c0.s3 += a3.s3 * b0.s3;
c0.s3 += a3.s4 * b0.s4;
c0.s3 += a3.s5 * b0.s5;
c0.s3 += a3.s6 * b0.s6;
c0.s3 += a3.s7 * b0.s7;
c1.s3 += a3.s0 * b1.s0;
c1.s3 += a3.s1 * b1.s1;
c1.s3 += a3.s2 * b1.s2;
c1.s3 += a3.s3 * b1.s3;
c1.s3 += a3.s4 * b1.s4;
c1.s3 += a3.s5 * b1.s5;
c1.s3 += a3.s6 * b1.s6;
c1.s3 += a3.s7 * b1.s7;
c2.s3 += a3.s0 * b2.s0;
c2.s3 += a3.s1 * b2.s1;
c2.s3 += a3.s2 * b2.s2;
c2.s3 += a3.s3 * b2.s3;
c2.s3 += a3.s4 * b2.s4;
c2.s3 += a3.s5 * b2.s5;
c2.s3 += a3.s6 * b2.s6;
c2.s3 += a3.s7 * b2.s7;
c3.s3 += a3.s0 * b3.s0;
c3.s3 += a3.s1 * b3.s1;
c3.s3 += a3.s2 * b3.s2;
c3.s3 += a3.s3 * b3.s3;
c3.s3 += a3.s4 * b3.s4;
c3.s3 += a3.s5 * b3.s5;
c3.s3 += a3.s6 * b3.s6;
c3.s3 += a3.s7 * b3.s7;
/* ---------------------- */
}
}
if ((coordY + 4 == M) && (M % 4)) {
coordY += 4 - M % 4;
}
if ((coordX + 4 == N) && (N % 4)) {
coordX += 4 - N % 4;
}
//-----MergeUpdateResult
// veclenC scratch[SUBG_ITEMS*MSTEP_SUBG*vecNumC]
__local float4 ascratch[8*8*4];
__local float4 *scratch = ascratch;
//LDS block has the same vectorization as C matrix block
//VNUM_C*((get_local_id(1)%MSTEP_SUBG)*SUBG_ITEMS +get_local_id(0) );
scratch += 4*((itemId.y%8)*8 +itemId.x );
for( uint mstep = 0; mstep < 8; mstep += 8 ) {
if( (itemId.y >= mstep)&&(itemId.y < (mstep+8)) ) {
scratch[0] = c0;
scratch[1] = c1;
scratch[2] = c2;
scratch[3] = c3;
c0 = 0;
c1 = 0;
c2 = 0;
c3 = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
if( (itemId.y >= mstep)&&(itemId.y < (mstep+8)) ) {
if ( 0 == itemId.x ) {
for(uint k = 0; k < 8 * 4; k += 4) {
c0 += scratch[0];
c1 += scratch[1];
c2 += scratch[2];
c3 += scratch[3];
//Adding the LDS block size in vectors
scratch += 4;
}
if ((coordY < M) && (coordX < N)) {
uint y = min(4u, M - (uint)coordY);
uint x = min(4u, N - (uint)coordX);
if ((y == 4) && (x == 4)) {
GPtr uC;
uC.f = C + (coordX * ldc + coordY)/2;
__global float2 *pC = uC.f2v;
float4 tempC0, tempC1, tempC2, tempC3;
tempC0 = c0 * alpha + 0;
tempC1 = c1 * alpha + 0;
tempC2 = c2 * alpha + 0;
tempC3 = c3 * alpha + 0;
pC[0] = tempC0.s01;
pC[1] = tempC0.s23;
pC[(ldc >> 1)] = tempC1.s01;
pC[(ldc >> 1) + 1] = tempC1.s23;
pC[ldc] = tempC2.s01;
pC[ldc + 1] = tempC2.s23;
pC[mad24(3u, (ldc >> 1), 0u)] = tempC3.s01;
pC[mad24(3u, (ldc >> 1), 1u)] = tempC3.s23;
}
else {
GPtr uC;
int i, j;
PPtr res;
uC.f = C + (coordX * ldc + coordY)/2;
uC.f += (x-1) * ldc;
if (x) {
switch (y) {
case 4:
uC.f[(y+0) % 4] = c3.s0 * alpha;
case 3:
uC.f[(y+1) % 4] = c3.s1 * alpha;
case 2:
uC.f[(y+2) % 4] = c3.s2 * alpha;
case 1:
uC.f[(y+3) % 4] = c3.s3 * alpha;
}
uC.f -= ldc;
x--;
}
if (x) {
switch (y) {
case 4:
uC.f[(y+0) % 4] = c2.s0 * alpha;
case 3:
uC.f[(y+1) % 4] = c2.s1 * alpha;
case 2:
uC.f[(y+2) % 4] = c2.s2 * alpha;
case 1:
uC.f[(y+3) % 4] = c2.s3 * alpha;
}
uC.f -= ldc;
x--;
}
if (x) {
switch (y) {
case 4:
uC.f[(y+0) % 4] = c1.s0 * alpha;
case 3:
uC.f[(y+1) % 4] = c1.s1 * alpha;
case 2:
uC.f[(y+2) % 4] = c1.s2 * alpha;
case 1:
uC.f[(y+3) % 4] = c1.s3 * alpha;
}
uC.f -= ldc;
x--;
}
if (x) {
switch (y) {
case 4:
uC.f[(y+0) % 4] = c0.s0 * alpha;
case 3:
uC.f[(y+1) % 4] = c0.s1 * alpha;
case 2:
uC.f[(y+2) % 4] = c0.s2 * alpha;
case 1:
uC.f[(y+3) % 4] = c0.s3 * alpha;
}
uC.f -= ldc;
x--;
}
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
--------------------------------------------------------
Build log:
Compilation started
1:100:51: error: can't convert between vector values of different size ('uint4' and 'unsigned long')
1:569:30: warning: incompatible pointer types assigning to '__global float *' from '__global float2 *'
1:593:30: warning: incompatible pointer types assigning to '__global float *' from '__global float2 *'
Compilation failed
========================================================
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment