Last active
August 29, 2015 14:06
-
-
Save pavanky/b19b5c7fca58e557510c to your computer and use it in GitHub Desktop.
clblas_transpose_failure
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define __CL_ENABLE_EXCEPTIONS | |
#include "cl.hpp" | |
#include <clBLAS.h> | |
#include <vector> | |
#include <iostream> | |
#include <iterator> | |
#include <algorithm> | |
using namespace cl; | |
using namespace std; | |
void call_back(const char *errinfo, | |
const void *private_info, | |
::size_t cb, void *user_data) | |
{ | |
printf("Context Error: %s\n", errinfo); fflush(stdout); | |
} | |
int main(int argc, char* argv[]) | |
{ | |
Context(CL_DEVICE_TYPE_DEFAULT, NULL, call_back); | |
static const unsigned elements = 10 * 10; // Just make them big enough | |
std::vector<float> data(elements, 1); | |
Buffer a(begin(data), end(data), true, false); | |
Buffer b(begin(data), end(data), true, false); | |
Buffer c(CL_MEM_READ_WRITE, elements * sizeof(float)); | |
CommandQueue queue = CommandQueue::getDefault(); | |
clblasSetup(); | |
clblasStatus ret = clblasSgemm(clblasColumnMajor, | |
clblasTrans, clblasNoTrans, | |
10, 10, 10, // M, N, K | |
1, // Alpha | |
a(), 0, 10, | |
b(), 0, 10, | |
0, // Beta | |
c(), 0, 10, | |
1, &queue(), 0, 0, 0); | |
printf("CLBLAS STATUS: %d\n", ret); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
======================================================== | |
AN INTERNAL KERNEL BUILD ERROR OCCURRED! | |
device name = Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz | |
error = -11 | |
memory pattern = Cached global memory based subgroup gemm, computing kernel generator | |
Subproblem dimensions: dims[0].itemY = 16, dims[0].itemX = 8, dims[0].y = 16, dims[0].x = 8, dims[0].bwidth = 64; ; dims[1].itemY = 4, dims[1].itemX = 4, dims[1].y = 4, dims[1].x = 4, dims[1].bwidth = 8; ; | |
Parallelism granularity: pgran->wgDim = 2, pgran->wgSize[0] = 8, pgran->wgSize[1] = 8, pgran->wfSize = 64 | |
Kernel extra flags: 939556625 | |
Source: | |
#define A_BLK_H 4 | |
#define B_BLK_H 4 | |
#define SUBG_ITEMS 8 | |
#define SUBG_A 4 | |
#define SUBG_B 2 | |
#define SUBG_B 2 | |
#define K_VLEN_A 2 | |
#define K_VLEN_B 2 | |
typedef union GPtr { | |
__global float *f; | |
__global float2 *f2v; | |
__global float4 *f4v; | |
__global float8 *f8v; | |
__global float16 *f16v; | |
} GPtr; | |
typedef union LPtr { | |
__local float *f; | |
__local float2 *f2v; | |
__local float4 *f4v; | |
__local float8 *f8v; | |
__local float16 *f16v; | |
} LPtr; | |
typedef union PPtr { | |
float *f; | |
float2 *f2v; | |
float4 *f4v; | |
float8 *f8v; | |
float16 *f16v; | |
} PPtr; | |
__attribute__((reqd_work_group_size(8, 8, 1))) | |
void __kernel | |
sgemmSubgroup( | |
uint M, | |
uint N, | |
uint K, | |
const float alpha, | |
const float beta, | |
const __global float2 *restrict A, | |
const __global float2 *restrict B, | |
__global float2 *C, | |
uint lda, | |
uint ldb, | |
uint ldc) | |
{ | |
uint Ktail = K % 64; | |
uint Kbase = K - Ktail; | |
float8 a0, a1, a2, a3; | |
float8 b0, b1, b2, b3; | |
float4 c0, c1, c2, c3; | |
lda /= K_VLEN_A; | |
ldb /= K_VLEN_B; | |
int2 itemId; | |
itemId.x = get_local_id(0); | |
itemId.y = get_local_id(1); | |
int coordY = A_BLK_H*( get_group_id(1)*SUBG_A + get_local_id(1)/SUBG_B ); | |
int coordX = B_BLK_H*( get_group_id(0)*SUBG_B + get_local_id(1)%SUBG_B ); | |
uint skipTileMul = 0; | |
//M block tail | |
if( coordY >= M ) { | |
skipTileMul = 1; | |
} | |
//N block tail | |
if( coordX >= N ) { | |
skipTileMul = 1; | |
} | |
//Raising "Lower" M N tails | |
if (coordY + 4 > M) { | |
coordY -= 4 - M % 4; | |
} | |
if (coordX + 4 > N) { | |
coordX -= 4 - N % 4; | |
} | |
A += lda*coordY; | |
B += ldb*coordX; | |
c0 = 0; | |
c1 = 0; | |
c2 = 0; | |
c3 = 0; | |
const uint vKB = (Kbase >> 1); | |
if( !skipTileMul ) { | |
for(int k = 8*get_local_id(0); k < Kbase; k += 8*SUBG_ITEMS) { | |
/* -- Tiles multiplier -- */ | |
float8 sum; | |
const uint4 bk = ((uint4)(0, 1, 2, 3) + (( get_group_id(0)*0 + k ) >> 1)) % vKB; | |
b0.s01 = B[bk.s0]; | |
b0.s23 = B[bk.s1]; | |
b0.s45 = B[bk.s2]; | |
b0.s67 = B[bk.s3]; | |
b1.s01 = B[ldb + bk.s0]; | |
b1.s23 = B[ldb + bk.s1]; | |
b1.s45 = B[ldb + bk.s2]; | |
b1.s67 = B[ldb + bk.s3]; | |
b2.s01 = B[mad24(2u, ldb, bk.s0)]; | |
b2.s23 = B[mad24(2u, ldb, bk.s1)]; | |
b2.s45 = B[mad24(2u, ldb, bk.s2)]; | |
b2.s67 = B[mad24(2u, ldb, bk.s3)]; | |
b3.s01 = B[mad24(3u, ldb, bk.s0)]; | |
b3.s23 = B[mad24(3u, ldb, bk.s1)]; | |
b3.s45 = B[mad24(3u, ldb, bk.s2)]; | |
b3.s67 = B[mad24(3u, ldb, bk.s3)]; | |
a0.s01 = A[bk.s0]; | |
a0.s23 = A[bk.s1]; | |
a0.s45 = A[bk.s2]; | |
a0.s67 = A[bk.s3]; | |
a1.s01 = A[lda + bk.s0]; | |
a1.s23 = A[lda + bk.s1]; | |
a1.s45 = A[lda + bk.s2]; | |
a1.s67 = A[lda + bk.s3]; | |
a2.s01 = A[mad24(2u, lda, bk.s0)]; | |
a2.s23 = A[mad24(2u, lda, bk.s1)]; | |
a2.s45 = A[mad24(2u, lda, bk.s2)]; | |
a2.s67 = A[mad24(2u, lda, bk.s3)]; | |
a3.s01 = A[mad24(3u, lda, bk.s0)]; | |
a3.s23 = A[mad24(3u, lda, bk.s1)]; | |
a3.s45 = A[mad24(3u, lda, bk.s2)]; | |
a3.s67 = A[mad24(3u, lda, bk.s3)]; | |
c0.s0 += a0.s0 * b0.s0; | |
c0.s0 += a0.s1 * b0.s1; | |
c0.s0 += a0.s2 * b0.s2; | |
c0.s0 += a0.s3 * b0.s3; | |
c0.s0 += a0.s4 * b0.s4; | |
c0.s0 += a0.s5 * b0.s5; | |
c0.s0 += a0.s6 * b0.s6; | |
c0.s0 += a0.s7 * b0.s7; | |
c1.s0 += a0.s0 * b1.s0; | |
c1.s0 += a0.s1 * b1.s1; | |
c1.s0 += a0.s2 * b1.s2; | |
c1.s0 += a0.s3 * b1.s3; | |
c1.s0 += a0.s4 * b1.s4; | |
c1.s0 += a0.s5 * b1.s5; | |
c1.s0 += a0.s6 * b1.s6; | |
c1.s0 += a0.s7 * b1.s7; | |
c2.s0 += a0.s0 * b2.s0; | |
c2.s0 += a0.s1 * b2.s1; | |
c2.s0 += a0.s2 * b2.s2; | |
c2.s0 += a0.s3 * b2.s3; | |
c2.s0 += a0.s4 * b2.s4; | |
c2.s0 += a0.s5 * b2.s5; | |
c2.s0 += a0.s6 * b2.s6; | |
c2.s0 += a0.s7 * b2.s7; | |
c3.s0 += a0.s0 * b3.s0; | |
c3.s0 += a0.s1 * b3.s1; | |
c3.s0 += a0.s2 * b3.s2; | |
c3.s0 += a0.s3 * b3.s3; | |
c3.s0 += a0.s4 * b3.s4; | |
c3.s0 += a0.s5 * b3.s5; | |
c3.s0 += a0.s6 * b3.s6; | |
c3.s0 += a0.s7 * b3.s7; | |
c0.s1 += a1.s0 * b0.s0; | |
c0.s1 += a1.s1 * b0.s1; | |
c0.s1 += a1.s2 * b0.s2; | |
c0.s1 += a1.s3 * b0.s3; | |
c0.s1 += a1.s4 * b0.s4; | |
c0.s1 += a1.s5 * b0.s5; | |
c0.s1 += a1.s6 * b0.s6; | |
c0.s1 += a1.s7 * b0.s7; | |
c1.s1 += a1.s0 * b1.s0; | |
c1.s1 += a1.s1 * b1.s1; | |
c1.s1 += a1.s2 * b1.s2; | |
c1.s1 += a1.s3 * b1.s3; | |
c1.s1 += a1.s4 * b1.s4; | |
c1.s1 += a1.s5 * b1.s5; | |
c1.s1 += a1.s6 * b1.s6; | |
c1.s1 += a1.s7 * b1.s7; | |
c2.s1 += a1.s0 * b2.s0; | |
c2.s1 += a1.s1 * b2.s1; | |
c2.s1 += a1.s2 * b2.s2; | |
c2.s1 += a1.s3 * b2.s3; | |
c2.s1 += a1.s4 * b2.s4; | |
c2.s1 += a1.s5 * b2.s5; | |
c2.s1 += a1.s6 * b2.s6; | |
c2.s1 += a1.s7 * b2.s7; | |
c3.s1 += a1.s0 * b3.s0; | |
c3.s1 += a1.s1 * b3.s1; | |
c3.s1 += a1.s2 * b3.s2; | |
c3.s1 += a1.s3 * b3.s3; | |
c3.s1 += a1.s4 * b3.s4; | |
c3.s1 += a1.s5 * b3.s5; | |
c3.s1 += a1.s6 * b3.s6; | |
c3.s1 += a1.s7 * b3.s7; | |
c0.s2 += a2.s0 * b0.s0; | |
c0.s2 += a2.s1 * b0.s1; | |
c0.s2 += a2.s2 * b0.s2; | |
c0.s2 += a2.s3 * b0.s3; | |
c0.s2 += a2.s4 * b0.s4; | |
c0.s2 += a2.s5 * b0.s5; | |
c0.s2 += a2.s6 * b0.s6; | |
c0.s2 += a2.s7 * b0.s7; | |
c1.s2 += a2.s0 * b1.s0; | |
c1.s2 += a2.s1 * b1.s1; | |
c1.s2 += a2.s2 * b1.s2; | |
c1.s2 += a2.s3 * b1.s3; | |
c1.s2 += a2.s4 * b1.s4; | |
c1.s2 += a2.s5 * b1.s5; | |
c1.s2 += a2.s6 * b1.s6; | |
c1.s2 += a2.s7 * b1.s7; | |
c2.s2 += a2.s0 * b2.s0; | |
c2.s2 += a2.s1 * b2.s1; | |
c2.s2 += a2.s2 * b2.s2; | |
c2.s2 += a2.s3 * b2.s3; | |
c2.s2 += a2.s4 * b2.s4; | |
c2.s2 += a2.s5 * b2.s5; | |
c2.s2 += a2.s6 * b2.s6; | |
c2.s2 += a2.s7 * b2.s7; | |
c3.s2 += a2.s0 * b3.s0; | |
c3.s2 += a2.s1 * b3.s1; | |
c3.s2 += a2.s2 * b3.s2; | |
c3.s2 += a2.s3 * b3.s3; | |
c3.s2 += a2.s4 * b3.s4; | |
c3.s2 += a2.s5 * b3.s5; | |
c3.s2 += a2.s6 * b3.s6; | |
c3.s2 += a2.s7 * b3.s7; | |
c0.s3 += a3.s0 * b0.s0; | |
c0.s3 += a3.s1 * b0.s1; | |
c0.s3 += a3.s2 * b0.s2; | |
c0.s3 += a3.s3 * b0.s3; | |
c0.s3 += a3.s4 * b0.s4; | |
c0.s3 += a3.s5 * b0.s5; | |
c0.s3 += a3.s6 * b0.s6; | |
c0.s3 += a3.s7 * b0.s7; | |
c1.s3 += a3.s0 * b1.s0; | |
c1.s3 += a3.s1 * b1.s1; | |
c1.s3 += a3.s2 * b1.s2; | |
c1.s3 += a3.s3 * b1.s3; | |
c1.s3 += a3.s4 * b1.s4; | |
c1.s3 += a3.s5 * b1.s5; | |
c1.s3 += a3.s6 * b1.s6; | |
c1.s3 += a3.s7 * b1.s7; | |
c2.s3 += a3.s0 * b2.s0; | |
c2.s3 += a3.s1 * b2.s1; | |
c2.s3 += a3.s2 * b2.s2; | |
c2.s3 += a3.s3 * b2.s3; | |
c2.s3 += a3.s4 * b2.s4; | |
c2.s3 += a3.s5 * b2.s5; | |
c2.s3 += a3.s6 * b2.s6; | |
c2.s3 += a3.s7 * b2.s7; | |
c3.s3 += a3.s0 * b3.s0; | |
c3.s3 += a3.s1 * b3.s1; | |
c3.s3 += a3.s2 * b3.s2; | |
c3.s3 += a3.s3 * b3.s3; | |
c3.s3 += a3.s4 * b3.s4; | |
c3.s3 += a3.s5 * b3.s5; | |
c3.s3 += a3.s6 * b3.s6; | |
c3.s3 += a3.s7 * b3.s7; | |
/* ---------------------- */ | |
} | |
uint k = Kbase + get_local_id(0)*8; | |
{ | |
/* -- Tiles multiplier -- */ | |
float8 sum; | |
const uint vKB = ((K + 1) >> 1); | |
const uint4 bk = ((uint4)(0, 1, 2, 3) + (k >> 1)) % vKB; | |
b0.s01 = B[bk.s0]; | |
b0.s23 = B[bk.s1]; | |
b0.s45 = B[bk.s2]; | |
b0.s67 = B[bk.s3]; | |
b1.s01 = B[ldb + bk.s0]; | |
b1.s23 = B[ldb + bk.s1]; | |
b1.s45 = B[ldb + bk.s2]; | |
b1.s67 = B[ldb + bk.s3]; | |
b2.s01 = B[mad24(2u, ldb, bk.s0)]; | |
b2.s23 = B[mad24(2u, ldb, bk.s1)]; | |
b2.s45 = B[mad24(2u, ldb, bk.s2)]; | |
b2.s67 = B[mad24(2u, ldb, bk.s3)]; | |
b3.s01 = B[mad24(3u, ldb, bk.s0)]; | |
b3.s23 = B[mad24(3u, ldb, bk.s1)]; | |
b3.s45 = B[mad24(3u, ldb, bk.s2)]; | |
b3.s67 = B[mad24(3u, ldb, bk.s3)]; | |
b0.s0 = (k < K) ? b0.s0 : 0; | |
b1.s0 = (k < K) ? b1.s0 : 0; | |
b2.s0 = (k < K) ? b2.s0 : 0; | |
b3.s0 = (k < K) ? b3.s0 : 0; | |
b0.s1 = (k + 1 < K) ? b0.s1 : 0; | |
b1.s1 = (k + 1 < K) ? b1.s1 : 0; | |
b2.s1 = (k + 1 < K) ? b2.s1 : 0; | |
b3.s1 = (k + 1 < K) ? b3.s1 : 0; | |
b0.s2 = (k + 2 < K) ? b0.s2 : 0; | |
b1.s2 = (k + 2 < K) ? b1.s2 : 0; | |
b2.s2 = (k + 2 < K) ? b2.s2 : 0; | |
b3.s2 = (k + 2 < K) ? b3.s2 : 0; | |
b0.s3 = (k + 3 < K) ? b0.s3 : 0; | |
b1.s3 = (k + 3 < K) ? b1.s3 : 0; | |
b2.s3 = (k + 3 < K) ? b2.s3 : 0; | |
b3.s3 = (k + 3 < K) ? b3.s3 : 0; | |
b0.s4 = (k + 4 < K) ? b0.s4 : 0; | |
b1.s4 = (k + 4 < K) ? b1.s4 : 0; | |
b2.s4 = (k + 4 < K) ? b2.s4 : 0; | |
b3.s4 = (k + 4 < K) ? b3.s4 : 0; | |
b0.s5 = (k + 5 < K) ? b0.s5 : 0; | |
b1.s5 = (k + 5 < K) ? b1.s5 : 0; | |
b2.s5 = (k + 5 < K) ? b2.s5 : 0; | |
b3.s5 = (k + 5 < K) ? b3.s5 : 0; | |
b0.s6 = (k + 6 < K) ? b0.s6 : 0; | |
b1.s6 = (k + 6 < K) ? b1.s6 : 0; | |
b2.s6 = (k + 6 < K) ? b2.s6 : 0; | |
b3.s6 = (k + 6 < K) ? b3.s6 : 0; | |
b0.s7 = (k + 7 < K) ? b0.s7 : 0; | |
b1.s7 = (k + 7 < K) ? b1.s7 : 0; | |
b2.s7 = (k + 7 < K) ? b2.s7 : 0; | |
b3.s7 = (k + 7 < K) ? b3.s7 : 0; | |
a0.s01 = A[bk.s0]; | |
a0.s23 = A[bk.s1]; | |
a0.s45 = A[bk.s2]; | |
a0.s67 = A[bk.s3]; | |
a1.s01 = A[lda + bk.s0]; | |
a1.s23 = A[lda + bk.s1]; | |
a1.s45 = A[lda + bk.s2]; | |
a1.s67 = A[lda + bk.s3]; | |
a2.s01 = A[mad24(2u, lda, bk.s0)]; | |
a2.s23 = A[mad24(2u, lda, bk.s1)]; | |
a2.s45 = A[mad24(2u, lda, bk.s2)]; | |
a2.s67 = A[mad24(2u, lda, bk.s3)]; | |
a3.s01 = A[mad24(3u, lda, bk.s0)]; | |
a3.s23 = A[mad24(3u, lda, bk.s1)]; | |
a3.s45 = A[mad24(3u, lda, bk.s2)]; | |
a3.s67 = A[mad24(3u, lda, bk.s3)]; | |
a0.s0 = (k < K) ? a0.s0 : 0; | |
a1.s0 = (k < K) ? a1.s0 : 0; | |
a2.s0 = (k < K) ? a2.s0 : 0; | |
a3.s0 = (k < K) ? a3.s0 : 0; | |
a0.s1 = (k + 1 < K) ? a0.s1 : 0; | |
a1.s1 = (k + 1 < K) ? a1.s1 : 0; | |
a2.s1 = (k + 1 < K) ? a2.s1 : 0; | |
a3.s1 = (k + 1 < K) ? a3.s1 : 0; | |
a0.s2 = (k + 2 < K) ? a0.s2 : 0; | |
a1.s2 = (k + 2 < K) ? a1.s2 : 0; | |
a2.s2 = (k + 2 < K) ? a2.s2 : 0; | |
a3.s2 = (k + 2 < K) ? a3.s2 : 0; | |
a0.s3 = (k + 3 < K) ? a0.s3 : 0; | |
a1.s3 = (k + 3 < K) ? a1.s3 : 0; | |
a2.s3 = (k + 3 < K) ? a2.s3 : 0; | |
a3.s3 = (k + 3 < K) ? a3.s3 : 0; | |
a0.s4 = (k + 4 < K) ? a0.s4 : 0; | |
a1.s4 = (k + 4 < K) ? a1.s4 : 0; | |
a2.s4 = (k + 4 < K) ? a2.s4 : 0; | |
a3.s4 = (k + 4 < K) ? a3.s4 : 0; | |
a0.s5 = (k + 5 < K) ? a0.s5 : 0; | |
a1.s5 = (k + 5 < K) ? a1.s5 : 0; | |
a2.s5 = (k + 5 < K) ? a2.s5 : 0; | |
a3.s5 = (k + 5 < K) ? a3.s5 : 0; | |
a0.s6 = (k + 6 < K) ? a0.s6 : 0; | |
a1.s6 = (k + 6 < K) ? a1.s6 : 0; | |
a2.s6 = (k + 6 < K) ? a2.s6 : 0; | |
a3.s6 = (k + 6 < K) ? a3.s6 : 0; | |
a0.s7 = (k + 7 < K) ? a0.s7 : 0; | |
a1.s7 = (k + 7 < K) ? a1.s7 : 0; | |
a2.s7 = (k + 7 < K) ? a2.s7 : 0; | |
a3.s7 = (k + 7 < K) ? a3.s7 : 0; | |
c0.s0 += a0.s0 * b0.s0; | |
c0.s0 += a0.s1 * b0.s1; | |
c0.s0 += a0.s2 * b0.s2; | |
c0.s0 += a0.s3 * b0.s3; | |
c0.s0 += a0.s4 * b0.s4; | |
c0.s0 += a0.s5 * b0.s5; | |
c0.s0 += a0.s6 * b0.s6; | |
c0.s0 += a0.s7 * b0.s7; | |
c1.s0 += a0.s0 * b1.s0; | |
c1.s0 += a0.s1 * b1.s1; | |
c1.s0 += a0.s2 * b1.s2; | |
c1.s0 += a0.s3 * b1.s3; | |
c1.s0 += a0.s4 * b1.s4; | |
c1.s0 += a0.s5 * b1.s5; | |
c1.s0 += a0.s6 * b1.s6; | |
c1.s0 += a0.s7 * b1.s7; | |
c2.s0 += a0.s0 * b2.s0; | |
c2.s0 += a0.s1 * b2.s1; | |
c2.s0 += a0.s2 * b2.s2; | |
c2.s0 += a0.s3 * b2.s3; | |
c2.s0 += a0.s4 * b2.s4; | |
c2.s0 += a0.s5 * b2.s5; | |
c2.s0 += a0.s6 * b2.s6; | |
c2.s0 += a0.s7 * b2.s7; | |
c3.s0 += a0.s0 * b3.s0; | |
c3.s0 += a0.s1 * b3.s1; | |
c3.s0 += a0.s2 * b3.s2; | |
c3.s0 += a0.s3 * b3.s3; | |
c3.s0 += a0.s4 * b3.s4; | |
c3.s0 += a0.s5 * b3.s5; | |
c3.s0 += a0.s6 * b3.s6; | |
c3.s0 += a0.s7 * b3.s7; | |
c0.s1 += a1.s0 * b0.s0; | |
c0.s1 += a1.s1 * b0.s1; | |
c0.s1 += a1.s2 * b0.s2; | |
c0.s1 += a1.s3 * b0.s3; | |
c0.s1 += a1.s4 * b0.s4; | |
c0.s1 += a1.s5 * b0.s5; | |
c0.s1 += a1.s6 * b0.s6; | |
c0.s1 += a1.s7 * b0.s7; | |
c1.s1 += a1.s0 * b1.s0; | |
c1.s1 += a1.s1 * b1.s1; | |
c1.s1 += a1.s2 * b1.s2; | |
c1.s1 += a1.s3 * b1.s3; | |
c1.s1 += a1.s4 * b1.s4; | |
c1.s1 += a1.s5 * b1.s5; | |
c1.s1 += a1.s6 * b1.s6; | |
c1.s1 += a1.s7 * b1.s7; | |
c2.s1 += a1.s0 * b2.s0; | |
c2.s1 += a1.s1 * b2.s1; | |
c2.s1 += a1.s2 * b2.s2; | |
c2.s1 += a1.s3 * b2.s3; | |
c2.s1 += a1.s4 * b2.s4; | |
c2.s1 += a1.s5 * b2.s5; | |
c2.s1 += a1.s6 * b2.s6; | |
c2.s1 += a1.s7 * b2.s7; | |
c3.s1 += a1.s0 * b3.s0; | |
c3.s1 += a1.s1 * b3.s1; | |
c3.s1 += a1.s2 * b3.s2; | |
c3.s1 += a1.s3 * b3.s3; | |
c3.s1 += a1.s4 * b3.s4; | |
c3.s1 += a1.s5 * b3.s5; | |
c3.s1 += a1.s6 * b3.s6; | |
c3.s1 += a1.s7 * b3.s7; | |
c0.s2 += a2.s0 * b0.s0; | |
c0.s2 += a2.s1 * b0.s1; | |
c0.s2 += a2.s2 * b0.s2; | |
c0.s2 += a2.s3 * b0.s3; | |
c0.s2 += a2.s4 * b0.s4; | |
c0.s2 += a2.s5 * b0.s5; | |
c0.s2 += a2.s6 * b0.s6; | |
c0.s2 += a2.s7 * b0.s7; | |
c1.s2 += a2.s0 * b1.s0; | |
c1.s2 += a2.s1 * b1.s1; | |
c1.s2 += a2.s2 * b1.s2; | |
c1.s2 += a2.s3 * b1.s3; | |
c1.s2 += a2.s4 * b1.s4; | |
c1.s2 += a2.s5 * b1.s5; | |
c1.s2 += a2.s6 * b1.s6; | |
c1.s2 += a2.s7 * b1.s7; | |
c2.s2 += a2.s0 * b2.s0; | |
c2.s2 += a2.s1 * b2.s1; | |
c2.s2 += a2.s2 * b2.s2; | |
c2.s2 += a2.s3 * b2.s3; | |
c2.s2 += a2.s4 * b2.s4; | |
c2.s2 += a2.s5 * b2.s5; | |
c2.s2 += a2.s6 * b2.s6; | |
c2.s2 += a2.s7 * b2.s7; | |
c3.s2 += a2.s0 * b3.s0; | |
c3.s2 += a2.s1 * b3.s1; | |
c3.s2 += a2.s2 * b3.s2; | |
c3.s2 += a2.s3 * b3.s3; | |
c3.s2 += a2.s4 * b3.s4; | |
c3.s2 += a2.s5 * b3.s5; | |
c3.s2 += a2.s6 * b3.s6; | |
c3.s2 += a2.s7 * b3.s7; | |
c0.s3 += a3.s0 * b0.s0; | |
c0.s3 += a3.s1 * b0.s1; | |
c0.s3 += a3.s2 * b0.s2; | |
c0.s3 += a3.s3 * b0.s3; | |
c0.s3 += a3.s4 * b0.s4; | |
c0.s3 += a3.s5 * b0.s5; | |
c0.s3 += a3.s6 * b0.s6; | |
c0.s3 += a3.s7 * b0.s7; | |
c1.s3 += a3.s0 * b1.s0; | |
c1.s3 += a3.s1 * b1.s1; | |
c1.s3 += a3.s2 * b1.s2; | |
c1.s3 += a3.s3 * b1.s3; | |
c1.s3 += a3.s4 * b1.s4; | |
c1.s3 += a3.s5 * b1.s5; | |
c1.s3 += a3.s6 * b1.s6; | |
c1.s3 += a3.s7 * b1.s7; | |
c2.s3 += a3.s0 * b2.s0; | |
c2.s3 += a3.s1 * b2.s1; | |
c2.s3 += a3.s2 * b2.s2; | |
c2.s3 += a3.s3 * b2.s3; | |
c2.s3 += a3.s4 * b2.s4; | |
c2.s3 += a3.s5 * b2.s5; | |
c2.s3 += a3.s6 * b2.s6; | |
c2.s3 += a3.s7 * b2.s7; | |
c3.s3 += a3.s0 * b3.s0; | |
c3.s3 += a3.s1 * b3.s1; | |
c3.s3 += a3.s2 * b3.s2; | |
c3.s3 += a3.s3 * b3.s3; | |
c3.s3 += a3.s4 * b3.s4; | |
c3.s3 += a3.s5 * b3.s5; | |
c3.s3 += a3.s6 * b3.s6; | |
c3.s3 += a3.s7 * b3.s7; | |
/* ---------------------- */ | |
} | |
} | |
if ((coordY + 4 == M) && (M % 4)) { | |
coordY += 4 - M % 4; | |
} | |
if ((coordX + 4 == N) && (N % 4)) { | |
coordX += 4 - N % 4; | |
} | |
//-----MergeUpdateResult | |
// veclenC scratch[SUBG_ITEMS*MSTEP_SUBG*vecNumC] | |
__local float4 ascratch[8*8*4]; | |
__local float4 *scratch = ascratch; | |
//LDS block has the same vectorization as C matrix block | |
//VNUM_C*((get_local_id(1)%MSTEP_SUBG)*SUBG_ITEMS +get_local_id(0) ); | |
scratch += 4*((itemId.y%8)*8 +itemId.x ); | |
for( uint mstep = 0; mstep < 8; mstep += 8 ) { | |
if( (itemId.y >= mstep)&&(itemId.y < (mstep+8)) ) { | |
scratch[0] = c0; | |
scratch[1] = c1; | |
scratch[2] = c2; | |
scratch[3] = c3; | |
c0 = 0; | |
c1 = 0; | |
c2 = 0; | |
c3 = 0; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if( (itemId.y >= mstep)&&(itemId.y < (mstep+8)) ) { | |
if ( 0 == itemId.x ) { | |
for(uint k = 0; k < 8 * 4; k += 4) { | |
c0 += scratch[0]; | |
c1 += scratch[1]; | |
c2 += scratch[2]; | |
c3 += scratch[3]; | |
//Adding the LDS block size in vectors | |
scratch += 4; | |
} | |
if ((coordY < M) && (coordX < N)) { | |
uint y = min(4u, M - (uint)coordY); | |
uint x = min(4u, N - (uint)coordX); | |
if ((y == 4) && (x == 4)) { | |
GPtr uC; | |
uC.f = C + (coordX * ldc + coordY)/2; | |
__global float2 *pC = uC.f2v; | |
float4 tempC0, tempC1, tempC2, tempC3; | |
tempC0 = c0 * alpha + 0; | |
tempC1 = c1 * alpha + 0; | |
tempC2 = c2 * alpha + 0; | |
tempC3 = c3 * alpha + 0; | |
pC[0] = tempC0.s01; | |
pC[1] = tempC0.s23; | |
pC[(ldc >> 1)] = tempC1.s01; | |
pC[(ldc >> 1) + 1] = tempC1.s23; | |
pC[ldc] = tempC2.s01; | |
pC[ldc + 1] = tempC2.s23; | |
pC[mad24(3u, (ldc >> 1), 0u)] = tempC3.s01; | |
pC[mad24(3u, (ldc >> 1), 1u)] = tempC3.s23; | |
} | |
else { | |
GPtr uC; | |
int i, j; | |
PPtr res; | |
uC.f = C + (coordX * ldc + coordY)/2; | |
uC.f += (x-1) * ldc; | |
if (x) { | |
switch (y) { | |
case 4: | |
uC.f[(y+0) % 4] = c3.s0 * alpha; | |
case 3: | |
uC.f[(y+1) % 4] = c3.s1 * alpha; | |
case 2: | |
uC.f[(y+2) % 4] = c3.s2 * alpha; | |
case 1: | |
uC.f[(y+3) % 4] = c3.s3 * alpha; | |
} | |
uC.f -= ldc; | |
x--; | |
} | |
if (x) { | |
switch (y) { | |
case 4: | |
uC.f[(y+0) % 4] = c2.s0 * alpha; | |
case 3: | |
uC.f[(y+1) % 4] = c2.s1 * alpha; | |
case 2: | |
uC.f[(y+2) % 4] = c2.s2 * alpha; | |
case 1: | |
uC.f[(y+3) % 4] = c2.s3 * alpha; | |
} | |
uC.f -= ldc; | |
x--; | |
} | |
if (x) { | |
switch (y) { | |
case 4: | |
uC.f[(y+0) % 4] = c1.s0 * alpha; | |
case 3: | |
uC.f[(y+1) % 4] = c1.s1 * alpha; | |
case 2: | |
uC.f[(y+2) % 4] = c1.s2 * alpha; | |
case 1: | |
uC.f[(y+3) % 4] = c1.s3 * alpha; | |
} | |
uC.f -= ldc; | |
x--; | |
} | |
if (x) { | |
switch (y) { | |
case 4: | |
uC.f[(y+0) % 4] = c0.s0 * alpha; | |
case 3: | |
uC.f[(y+1) % 4] = c0.s1 * alpha; | |
case 2: | |
uC.f[(y+2) % 4] = c0.s2 * alpha; | |
case 1: | |
uC.f[(y+3) % 4] = c0.s3 * alpha; | |
} | |
uC.f -= ldc; | |
x--; | |
} | |
} | |
} | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
} | |
-------------------------------------------------------- | |
Build log: | |
Compilation started | |
1:100:51: error: can't convert between vector values of different size ('uint4' and 'unsigned long') | |
1:569:30: warning: incompatible pointer types assigning to '__global float *' from '__global float2 *' | |
1:593:30: warning: incompatible pointer types assigning to '__global float *' from '__global float2 *' | |
Compilation failed | |
======================================================== |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment