Skip to content

Instantly share code, notes, and snippets.

@matejaputic
Created November 20, 2015 19:13
Show Gist options
  • Save matejaputic/a62a27d2fc6ed4a28db8 to your computer and use it in GitHub Desktop.
Save matejaputic/a62a27d2fc6ed4a28db8 to your computer and use it in GitHub Desktop.
OpenCL sgemm returns -nan
typedef union GPtr {
__global float *f;
__global float2 *f2v;
__global float4 *f4v;
__global float8 *f8v;
__global float16 *f16v;
} GPtr;
typedef union LPtr {
__local float *f;
__local float2 *f2v;
__local float4 *f4v;
__local float8 *f8v;
__local float16 *f16v;
} LPtr;
typedef union PPtr {
float *f;
float2 *f2v;
float4 *f4v;
float8 *f8v;
float16 *f16v;
} PPtr;
__attribute__((reqd_work_group_size(8, 8, 1)))
void __kernel
sgemmBlock(
uint M,
uint N,
uint K,
const float alpha,
const float beta,
const __global float *restrict A,
const __global float *restrict B,
__global float *C,
uint lda,
uint ldb,
uint ldc)
{
float8 a0;
float b0, b1, b2, b3;
float8 c0, c1, c2, c3;
uint4 coord = 0u; /* contains coordB, coordA, k */
B += 4u * (uint)get_global_id(1) * ldb;
coord.y = 8u * (uint)get_global_id(0);
coord.x = 4u * (uint)(uint)get_global_id(1);
if (coord.y >= M) {
return;
}
c0 = 0;
c1 = 0;
c2 = 0;
c3 = 0;
const uint8 ay = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + coord.y) % lda;
for (uint k1 = 0; k1 < K; k1 += 1) {
/* -- Tiles multiplier -- */
b0 = B[0];
b1 = B[ldb];
b2 = B[(ldb << 1)];
b3 = B[mad24(3u, ldb, 0u)];
a0.s0 = A[ay.s0];
a0.s1 = A[ay.s1];
a0.s2 = A[ay.s2];
a0.s3 = A[ay.s3];
a0.s4 = A[ay.s4];
a0.s5 = A[ay.s5];
a0.s6 = A[ay.s6];
a0.s7 = A[ay.s7];
c0 = mad(a0, b0, c0);
c1 = mad(a0, b1, c1);
c2 = mad(a0, b2, c2);
c3 = mad(a0, b3, c3);
A += lda;
B += 1;
/* ---------------------- */
}
uint y = min(8u, M - (uint)coord.y);
uint x = min(4u, N - (uint)coord.x);
if ((y == 8) && (x == 4)) {
GPtr uC;
uC.f = C + coord.x * ldc + coord.y;
__global float *pC = uC.f;
float8 tempC0;
tempC0.s0 = pC[0];
tempC0.s1 = pC[1];
tempC0.s2 = pC[2];
tempC0.s3 = pC[3];
tempC0.s4 = pC[4];
tempC0.s5 = pC[5];
tempC0.s6 = pC[6];
tempC0.s7 = pC[7];
tempC0 = mad(tempC0, beta, 0);
tempC0 = mad(c0, alpha, tempC0);
pC[0] = tempC0.s0;
pC[1] = tempC0.s1;
pC[2] = tempC0.s2;
pC[3] = tempC0.s3;
pC[4] = tempC0.s4;
pC[5] = tempC0.s5;
pC[6] = tempC0.s6;
pC[7] = tempC0.s7;
tempC0.s0 = pC[ldc];
tempC0.s1 = pC[ldc + 1];
tempC0.s2 = pC[ldc + 2];
tempC0.s3 = pC[ldc + 3];
tempC0.s4 = pC[ldc + 4];
tempC0.s5 = pC[ldc + 5];
tempC0.s6 = pC[ldc + 6];
tempC0.s7 = pC[ldc + 7];
tempC0 = mad(tempC0, beta, 0);
tempC0 = mad(c1, alpha, tempC0);
pC[ldc] = tempC0.s0;
pC[ldc + 1] = tempC0.s1;
pC[ldc + 2] = tempC0.s2;
pC[ldc + 3] = tempC0.s3;
pC[ldc + 4] = tempC0.s4;
pC[ldc + 5] = tempC0.s5;
pC[ldc + 6] = tempC0.s6;
pC[ldc + 7] = tempC0.s7;
tempC0.s0 = pC[(ldc << 1)];
tempC0.s1 = pC[mad24(2u, ldc, 1u)];
tempC0.s2 = pC[mad24(2u, ldc, 2u)];
tempC0.s3 = pC[mad24(2u, ldc, 3u)];
tempC0.s4 = pC[mad24(2u, ldc, 4u)];
tempC0.s5 = pC[mad24(2u, ldc, 5u)];
tempC0.s6 = pC[mad24(2u, ldc, 6u)];
tempC0.s7 = pC[mad24(2u, ldc, 7u)];
tempC0 = mad(tempC0, beta, 0);
tempC0 = mad(c2, alpha, tempC0);
pC[(ldc << 1)] = tempC0.s0;
pC[mad24(2u, ldc, 1u)] = tempC0.s1;
pC[mad24(2u, ldc, 2u)] = tempC0.s2;
pC[mad24(2u, ldc, 3u)] = tempC0.s3;
pC[mad24(2u, ldc, 4u)] = tempC0.s4;
pC[mad24(2u, ldc, 5u)] = tempC0.s5;
pC[mad24(2u, ldc, 6u)] = tempC0.s6;
pC[mad24(2u, ldc, 7u)] = tempC0.s7;
tempC0.s0 = pC[mad24(3u, ldc, 0u)];
tempC0.s1 = pC[mad24(3u, ldc, 1u)];
tempC0.s2 = pC[mad24(3u, ldc, 2u)];
tempC0.s3 = pC[mad24(3u, ldc, 3u)];
tempC0.s4 = pC[mad24(3u, ldc, 4u)];
tempC0.s5 = pC[mad24(3u, ldc, 5u)];
tempC0.s6 = pC[mad24(3u, ldc, 6u)];
tempC0.s7 = pC[mad24(3u, ldc, 7u)];
tempC0 = mad(tempC0, beta, 0);
tempC0 = mad(c3, alpha, tempC0);
pC[mad24(3u, ldc, 0u)] = tempC0.s0;
pC[mad24(3u, ldc, 1u)] = tempC0.s1;
pC[mad24(3u, ldc, 2u)] = tempC0.s2;
pC[mad24(3u, ldc, 3u)] = tempC0.s3;
pC[mad24(3u, ldc, 4u)] = tempC0.s4;
pC[mad24(3u, ldc, 5u)] = tempC0.s5;
pC[mad24(3u, ldc, 6u)] = tempC0.s6;
pC[mad24(3u, ldc, 7u)] = tempC0.s7;
}
else {
GPtr uC;
int i, j;
PPtr res;
uC.f = C + coord.x * ldc + coord.y;
if (x) {
switch (y) {
case 8:
uC.f[7] = uC.f[7] * beta + c0.s7 * alpha;
case 7:
uC.f[6] = uC.f[6] * beta + c0.s6 * alpha;
case 6:
uC.f[5] = uC.f[5] * beta + c0.s5 * alpha;
case 5:
uC.f[4] = uC.f[4] * beta + c0.s4 * alpha;
case 4:
uC.f[3] = uC.f[3] * beta + c0.s3 * alpha;
case 3:
uC.f[2] = uC.f[2] * beta + c0.s2 * alpha;
case 2:
uC.f[1] = uC.f[1] * beta + c0.s1 * alpha;
case 1:
uC.f[0] = uC.f[0] * beta + c0.s0 * alpha;
}
uC.f += ldc;
x--;
}
if (x) {
switch (y) {
case 8:
uC.f[7] = uC.f[7] * beta + c1.s7 * alpha;
case 7:
uC.f[6] = uC.f[6] * beta + c1.s6 * alpha;
case 6:
uC.f[5] = uC.f[5] * beta + c1.s5 * alpha;
case 5:
uC.f[4] = uC.f[4] * beta + c1.s4 * alpha;
case 4:
uC.f[3] = uC.f[3] * beta + c1.s3 * alpha;
case 3:
uC.f[2] = uC.f[2] * beta + c1.s2 * alpha;
case 2:
uC.f[1] = uC.f[1] * beta + c1.s1 * alpha;
case 1:
uC.f[0] = uC.f[0] * beta + c1.s0 * alpha;
}
uC.f += ldc;
x--;
}
if (x) {
switch (y) {
case 8:
uC.f[7] = uC.f[7] * beta + c2.s7 * alpha;
case 7:
uC.f[6] = uC.f[6] * beta + c2.s6 * alpha;
case 6:
uC.f[5] = uC.f[5] * beta + c2.s5 * alpha;
case 5:
uC.f[4] = uC.f[4] * beta + c2.s4 * alpha;
case 4:
uC.f[3] = uC.f[3] * beta + c2.s3 * alpha;
case 3:
uC.f[2] = uC.f[2] * beta + c2.s2 * alpha;
case 2:
uC.f[1] = uC.f[1] * beta + c2.s1 * alpha;
case 1:
uC.f[0] = uC.f[0] * beta + c2.s0 * alpha;
}
uC.f += ldc;
x--;
}
if (x) {
switch (y) {
case 8:
uC.f[7] = uC.f[7] * beta + c3.s7 * alpha;
case 7:
uC.f[6] = uC.f[6] * beta + c3.s6 * alpha;
case 6:
uC.f[5] = uC.f[5] * beta + c3.s5 * alpha;
case 5:
uC.f[4] = uC.f[4] * beta + c3.s4 * alpha;
case 4:
uC.f[3] = uC.f[3] * beta + c3.s3 * alpha;
case 3:
uC.f[2] = uC.f[2] * beta + c3.s2 * alpha;
case 2:
uC.f[1] = uC.f[1] * beta + c3.s1 * alpha;
case 1:
uC.f[0] = uC.f[0] * beta + c3.s0 * alpha;
}
uC.f += ldc;
x--;
}
}
}
/* Parallel Prefix Sum ported to run on PIM CPUs */
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <pthread.h>
#include <CL/cl.h>
#include <hlsim.h>
#include <pim.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include "clutils.h"
#define DEBUG
static const size_t matrix_a_width = 1024;
static const size_t matrix_a_height = 1024;
static const size_t matrix_b_width = 1024;
static const size_t matrix_b_height = matrix_a_width;
static const size_t matrix_c_width = matrix_b_width;
static const size_t matrix_c_height = matrix_a_height;
static const size_t matrix_a_size = (matrix_a_width * matrix_a_height);
static const size_t matrix_b_size = (matrix_b_width * matrix_b_height);
static const size_t matrix_c_size = (matrix_c_width * matrix_c_height);
int main(int argc, char **argv) {
/* Seed RNG */
srand((unsigned int)time(NULL));
/* Allocate local space for input data */
#ifdef DEBUG
printf("Size of Matrix A: %'lu B\n", sizeof(float) * matrix_a_size);
#endif
float *matrix_a = (float *)malloc(sizeof(float) * matrix_a_size);
if (matrix_a == NULL) printf("ERROR in allocating matrix_a on the CPU\n");
#ifdef DEBUG
printf("Size of Matrix B: %'lu B\n", sizeof(float) * matrix_b_size);
#endif
float *matrix_b = (float *)malloc(sizeof(float) * matrix_b_size);
if (matrix_b == NULL) printf("ERROR in allocating matrix_b on the CPU\n");
/* Allocate local space for output data */
#ifdef DEBUG
printf("Size of Matrix C: %'lu B\n", sizeof(float) * matrix_c_size);
#endif
float *matrix_c = (float *)calloc(matrix_c_size, sizeof(float));
if (matrix_c == NULL) printf("ERROR in allocating matrix_c on the CPU\n");
/* Fill matrices A and B with random floats */
for (size_t i = 0; i < matrix_a_size; ++i) {
matrix_a[i] = rand() / (float)RAND_MAX;
}
for (size_t i = 0; i < matrix_b_size; ++i) {
matrix_b[i] = rand() / (float)RAND_MAX;
}
/* Print data initialization */
#ifdef DEBUG
// printf("Matrix A\n");
// for (size_t i = 0; i < matrix_a_size; ++i)
// {
// printf("%f ", matrix_a[i]);
// if ((i + 1) % matrix_a_width == 0)
// {
// printf("\n");
// }
// }
// printf("Matrix B\n");
// for (size_t i = 0; i < matrix_b_size; ++i)
// {
// printf("%f ", matrix_b[i]);
// if ((i + 1) % matrix_b_width == 0)
// {
// printf("\n");
// }
// }
printf("Matrix C\n");
for (size_t i = 0; i < matrix_c_size; ++i)
{
printf("%f ", matrix_c[i]);
if ((i + 1) % matrix_c_width == 0)
{
printf("\n");
}
}
#endif
cl_device_id device_id; /* compute device id */
cl_context context; /* compute context */
cl_command_queue commands; /* compute command queue */
cl_program program; /* compute program */
cl_kernel kernel; /* compute kernel */
cl_event *kernel_event_list;
cl_double kernelTime; /* kernel profiling timer */
// size_t localWorkSize[2] = {4, 4};
size_t globalWorkSize[2] = {matrix_c_height, matrix_c_width};
size_t localWorkSize[2] = {8, 8};
size_t dataBytes;
size_t kernelLength;
cl_int errcode;
context = cl_init_context(0, 0, 0);
program = cl_compileProgram((char *)"_temp_4_Hawaii.cl", NULL);
kernel = clCreateKernel(program, "sgemmBlock", &errcode);
cl_errChk(errcode, "ERROR in creating kernel sgemmBlock", true);
cl_mem cl_matrix_a, cl_matrix_b, cl_matrix_c;
cl_matrix_c = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * matrix_c_size, NULL, &errcode);
cl_errChk(errcode, "ERROR in creating buffer cl_matrix_c", true);
cl_matrix_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, matrix_a_size, matrix_a, &errcode);
cl_errChk(errcode, "ERROR in creating image cl_matrix_a", true);
cl_matrix_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, matrix_b_size, matrix_b, &errcode);
cl_errChk(errcode, "ERROR in creating image cl_matrix_b", true);
/**
* sgemmBlock args:
*
* uint M,
* uint N,
* uint K,
* const float alpha,
* const float beta,
* const __global float *restrict A,
* const __global float *restrict B,
* __global float *C,
* uint lda,
* uint ldb,
* uint ldc)
*/
unsigned int N = 1024;
unsigned int M = 1024;
unsigned int K = 1024;
const float alpha = 1.0f;
const float beta = 1.0f;
unsigned int lda = 0.0f;
unsigned int ldb = 0.0f;
unsigned int ldc = 0.0f;
commands = cl_getCommandQueue();
cl_event writeEvent, kernelEvent, readEvent;
cl_int argchk = 0x0;
argchk |= clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&N);
argchk |= clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&M);
argchk |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&K);
argchk |= clSetKernelArg(kernel, 3, sizeof(cl_float), (void *)&alpha);
argchk |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void *)&beta);
argchk |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&cl_matrix_a);
argchk |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&cl_matrix_b);
argchk |= clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *)&cl_matrix_c);
argchk |= clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&lda);
argchk |= clSetKernelArg(kernel, 9, sizeof(cl_float), (void *)&ldb);
argchk |= clSetKernelArg(kernel, 10, sizeof(cl_float), (void *)&ldc);
cl_errChk(argchk, "ERROR in Setting NaiveGEMM kernel args", true);
errcode = clEnqueueNDRangeKernel(
commands, kernel, 2, NULL,
globalWorkSize, localWorkSize,
0, NULL, &kernelEvent);
cl_errChk(errcode, "ERROR in Executing Kernel NaiveGEMM", true);
errcode = clEnqueueReadBuffer(commands,
cl_matrix_c,
1, // change to 0 for nonblocking write
0, // offset
sizeof(float) * matrix_c_size,
matrix_c,
0, // num_events_in_wait_list
NULL, // event_wait_list
&readEvent);
cl_errChk(errcode, "ERROR with clEnqueueReadBuffer", true);
#ifdef DEBUG
printf("Matrix C\n");
for (size_t i = 0; i < matrix_c_size; ++i)
{
printf("%f ", matrix_c[i]);
if ((i + 1) % matrix_c_width == 0)
{
printf("\n");
}
}
#endif
clReleaseMemObject(cl_matrix_a);
clReleaseMemObject(cl_matrix_b);
clReleaseMemObject(cl_matrix_c);
/* Free up matrix memory */
free(matrix_a);
free(matrix_b);
free(matrix_c);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment