Created
November 20, 2015 19:13
-
-
Save matejaputic/a62a27d2fc6ed4a28db8 to your computer and use it in GitHub Desktop.
OpenCL sgemm returns -nan
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
typedef union GPtr { | |
__global float *f; | |
__global float2 *f2v; | |
__global float4 *f4v; | |
__global float8 *f8v; | |
__global float16 *f16v; | |
} GPtr; | |
typedef union LPtr { | |
__local float *f; | |
__local float2 *f2v; | |
__local float4 *f4v; | |
__local float8 *f8v; | |
__local float16 *f16v; | |
} LPtr; | |
typedef union PPtr { | |
float *f; | |
float2 *f2v; | |
float4 *f4v; | |
float8 *f8v; | |
float16 *f16v; | |
} PPtr; | |
__attribute__((reqd_work_group_size(8, 8, 1))) | |
void __kernel | |
sgemmBlock( | |
uint M, | |
uint N, | |
uint K, | |
const float alpha, | |
const float beta, | |
const __global float *restrict A, | |
const __global float *restrict B, | |
__global float *C, | |
uint lda, | |
uint ldb, | |
uint ldc) | |
{ | |
float8 a0; | |
float b0, b1, b2, b3; | |
float8 c0, c1, c2, c3; | |
uint4 coord = 0u; /* contains coordB, coordA, k */ | |
B += 4u * (uint)get_global_id(1) * ldb; | |
coord.y = 8u * (uint)get_global_id(0); | |
coord.x = 4u * (uint)(uint)get_global_id(1); | |
if (coord.y >= M) { | |
return; | |
} | |
c0 = 0; | |
c1 = 0; | |
c2 = 0; | |
c3 = 0; | |
const uint8 ay = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + coord.y) % lda; | |
for (uint k1 = 0; k1 < K; k1 += 1) { | |
/* -- Tiles multiplier -- */ | |
b0 = B[0]; | |
b1 = B[ldb]; | |
b2 = B[(ldb << 1)]; | |
b3 = B[mad24(3u, ldb, 0u)]; | |
a0.s0 = A[ay.s0]; | |
a0.s1 = A[ay.s1]; | |
a0.s2 = A[ay.s2]; | |
a0.s3 = A[ay.s3]; | |
a0.s4 = A[ay.s4]; | |
a0.s5 = A[ay.s5]; | |
a0.s6 = A[ay.s6]; | |
a0.s7 = A[ay.s7]; | |
c0 = mad(a0, b0, c0); | |
c1 = mad(a0, b1, c1); | |
c2 = mad(a0, b2, c2); | |
c3 = mad(a0, b3, c3); | |
A += lda; | |
B += 1; | |
/* ---------------------- */ | |
} | |
uint y = min(8u, M - (uint)coord.y); | |
uint x = min(4u, N - (uint)coord.x); | |
if ((y == 8) && (x == 4)) { | |
GPtr uC; | |
uC.f = C + coord.x * ldc + coord.y; | |
__global float *pC = uC.f; | |
float8 tempC0; | |
tempC0.s0 = pC[0]; | |
tempC0.s1 = pC[1]; | |
tempC0.s2 = pC[2]; | |
tempC0.s3 = pC[3]; | |
tempC0.s4 = pC[4]; | |
tempC0.s5 = pC[5]; | |
tempC0.s6 = pC[6]; | |
tempC0.s7 = pC[7]; | |
tempC0 = mad(tempC0, beta, 0); | |
tempC0 = mad(c0, alpha, tempC0); | |
pC[0] = tempC0.s0; | |
pC[1] = tempC0.s1; | |
pC[2] = tempC0.s2; | |
pC[3] = tempC0.s3; | |
pC[4] = tempC0.s4; | |
pC[5] = tempC0.s5; | |
pC[6] = tempC0.s6; | |
pC[7] = tempC0.s7; | |
tempC0.s0 = pC[ldc]; | |
tempC0.s1 = pC[ldc + 1]; | |
tempC0.s2 = pC[ldc + 2]; | |
tempC0.s3 = pC[ldc + 3]; | |
tempC0.s4 = pC[ldc + 4]; | |
tempC0.s5 = pC[ldc + 5]; | |
tempC0.s6 = pC[ldc + 6]; | |
tempC0.s7 = pC[ldc + 7]; | |
tempC0 = mad(tempC0, beta, 0); | |
tempC0 = mad(c1, alpha, tempC0); | |
pC[ldc] = tempC0.s0; | |
pC[ldc + 1] = tempC0.s1; | |
pC[ldc + 2] = tempC0.s2; | |
pC[ldc + 3] = tempC0.s3; | |
pC[ldc + 4] = tempC0.s4; | |
pC[ldc + 5] = tempC0.s5; | |
pC[ldc + 6] = tempC0.s6; | |
pC[ldc + 7] = tempC0.s7; | |
tempC0.s0 = pC[(ldc << 1)]; | |
tempC0.s1 = pC[mad24(2u, ldc, 1u)]; | |
tempC0.s2 = pC[mad24(2u, ldc, 2u)]; | |
tempC0.s3 = pC[mad24(2u, ldc, 3u)]; | |
tempC0.s4 = pC[mad24(2u, ldc, 4u)]; | |
tempC0.s5 = pC[mad24(2u, ldc, 5u)]; | |
tempC0.s6 = pC[mad24(2u, ldc, 6u)]; | |
tempC0.s7 = pC[mad24(2u, ldc, 7u)]; | |
tempC0 = mad(tempC0, beta, 0); | |
tempC0 = mad(c2, alpha, tempC0); | |
pC[(ldc << 1)] = tempC0.s0; | |
pC[mad24(2u, ldc, 1u)] = tempC0.s1; | |
pC[mad24(2u, ldc, 2u)] = tempC0.s2; | |
pC[mad24(2u, ldc, 3u)] = tempC0.s3; | |
pC[mad24(2u, ldc, 4u)] = tempC0.s4; | |
pC[mad24(2u, ldc, 5u)] = tempC0.s5; | |
pC[mad24(2u, ldc, 6u)] = tempC0.s6; | |
pC[mad24(2u, ldc, 7u)] = tempC0.s7; | |
tempC0.s0 = pC[mad24(3u, ldc, 0u)]; | |
tempC0.s1 = pC[mad24(3u, ldc, 1u)]; | |
tempC0.s2 = pC[mad24(3u, ldc, 2u)]; | |
tempC0.s3 = pC[mad24(3u, ldc, 3u)]; | |
tempC0.s4 = pC[mad24(3u, ldc, 4u)]; | |
tempC0.s5 = pC[mad24(3u, ldc, 5u)]; | |
tempC0.s6 = pC[mad24(3u, ldc, 6u)]; | |
tempC0.s7 = pC[mad24(3u, ldc, 7u)]; | |
tempC0 = mad(tempC0, beta, 0); | |
tempC0 = mad(c3, alpha, tempC0); | |
pC[mad24(3u, ldc, 0u)] = tempC0.s0; | |
pC[mad24(3u, ldc, 1u)] = tempC0.s1; | |
pC[mad24(3u, ldc, 2u)] = tempC0.s2; | |
pC[mad24(3u, ldc, 3u)] = tempC0.s3; | |
pC[mad24(3u, ldc, 4u)] = tempC0.s4; | |
pC[mad24(3u, ldc, 5u)] = tempC0.s5; | |
pC[mad24(3u, ldc, 6u)] = tempC0.s6; | |
pC[mad24(3u, ldc, 7u)] = tempC0.s7; | |
} | |
else { | |
GPtr uC; | |
int i, j; | |
PPtr res; | |
uC.f = C + coord.x * ldc + coord.y; | |
if (x) { | |
switch (y) { | |
case 8: | |
uC.f[7] = uC.f[7] * beta + c0.s7 * alpha; | |
case 7: | |
uC.f[6] = uC.f[6] * beta + c0.s6 * alpha; | |
case 6: | |
uC.f[5] = uC.f[5] * beta + c0.s5 * alpha; | |
case 5: | |
uC.f[4] = uC.f[4] * beta + c0.s4 * alpha; | |
case 4: | |
uC.f[3] = uC.f[3] * beta + c0.s3 * alpha; | |
case 3: | |
uC.f[2] = uC.f[2] * beta + c0.s2 * alpha; | |
case 2: | |
uC.f[1] = uC.f[1] * beta + c0.s1 * alpha; | |
case 1: | |
uC.f[0] = uC.f[0] * beta + c0.s0 * alpha; | |
} | |
uC.f += ldc; | |
x--; | |
} | |
if (x) { | |
switch (y) { | |
case 8: | |
uC.f[7] = uC.f[7] * beta + c1.s7 * alpha; | |
case 7: | |
uC.f[6] = uC.f[6] * beta + c1.s6 * alpha; | |
case 6: | |
uC.f[5] = uC.f[5] * beta + c1.s5 * alpha; | |
case 5: | |
uC.f[4] = uC.f[4] * beta + c1.s4 * alpha; | |
case 4: | |
uC.f[3] = uC.f[3] * beta + c1.s3 * alpha; | |
case 3: | |
uC.f[2] = uC.f[2] * beta + c1.s2 * alpha; | |
case 2: | |
uC.f[1] = uC.f[1] * beta + c1.s1 * alpha; | |
case 1: | |
uC.f[0] = uC.f[0] * beta + c1.s0 * alpha; | |
} | |
uC.f += ldc; | |
x--; | |
} | |
if (x) { | |
switch (y) { | |
case 8: | |
uC.f[7] = uC.f[7] * beta + c2.s7 * alpha; | |
case 7: | |
uC.f[6] = uC.f[6] * beta + c2.s6 * alpha; | |
case 6: | |
uC.f[5] = uC.f[5] * beta + c2.s5 * alpha; | |
case 5: | |
uC.f[4] = uC.f[4] * beta + c2.s4 * alpha; | |
case 4: | |
uC.f[3] = uC.f[3] * beta + c2.s3 * alpha; | |
case 3: | |
uC.f[2] = uC.f[2] * beta + c2.s2 * alpha; | |
case 2: | |
uC.f[1] = uC.f[1] * beta + c2.s1 * alpha; | |
case 1: | |
uC.f[0] = uC.f[0] * beta + c2.s0 * alpha; | |
} | |
uC.f += ldc; | |
x--; | |
} | |
if (x) { | |
switch (y) { | |
case 8: | |
uC.f[7] = uC.f[7] * beta + c3.s7 * alpha; | |
case 7: | |
uC.f[6] = uC.f[6] * beta + c3.s6 * alpha; | |
case 6: | |
uC.f[5] = uC.f[5] * beta + c3.s5 * alpha; | |
case 5: | |
uC.f[4] = uC.f[4] * beta + c3.s4 * alpha; | |
case 4: | |
uC.f[3] = uC.f[3] * beta + c3.s3 * alpha; | |
case 3: | |
uC.f[2] = uC.f[2] * beta + c3.s2 * alpha; | |
case 2: | |
uC.f[1] = uC.f[1] * beta + c3.s1 * alpha; | |
case 1: | |
uC.f[0] = uC.f[0] * beta + c3.s0 * alpha; | |
} | |
uC.f += ldc; | |
x--; | |
} | |
} | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Parallel Prefix Sum ported to run on PIM CPUs */ | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <assert.h> | |
#include <pthread.h> | |
#include <CL/cl.h> | |
#include <hlsim.h> | |
#include <pim.h> | |
#include <string.h> | |
#include <math.h> | |
#include <time.h> | |
#include "clutils.h" | |
#define DEBUG | |
static const size_t matrix_a_width = 1024; | |
static const size_t matrix_a_height = 1024; | |
static const size_t matrix_b_width = 1024; | |
static const size_t matrix_b_height = matrix_a_width; | |
static const size_t matrix_c_width = matrix_b_width; | |
static const size_t matrix_c_height = matrix_a_height; | |
static const size_t matrix_a_size = (matrix_a_width * matrix_a_height); | |
static const size_t matrix_b_size = (matrix_b_width * matrix_b_height); | |
static const size_t matrix_c_size = (matrix_c_width * matrix_c_height); | |
int main(int argc, char **argv) { | |
/* Seed RNG */ | |
srand((unsigned int)time(NULL)); | |
/* Allocate local space for input data */ | |
#ifdef DEBUG | |
printf("Size of Matrix A: %'lu B\n", sizeof(float) * matrix_a_size); | |
#endif | |
float *matrix_a = (float *)malloc(sizeof(float) * matrix_a_size); | |
if (matrix_a == NULL) printf("ERROR in allocating matrix_a on the CPU\n"); | |
#ifdef DEBUG | |
printf("Size of Matrix B: %'lu B\n", sizeof(float) * matrix_b_size); | |
#endif | |
float *matrix_b = (float *)malloc(sizeof(float) * matrix_b_size); | |
if (matrix_b == NULL) printf("ERROR in allocating matrix_b on the CPU\n"); | |
/* Allocate local space for output data */ | |
#ifdef DEBUG | |
printf("Size of Matrix C: %'lu B\n", sizeof(float) * matrix_c_size); | |
#endif | |
float *matrix_c = (float *)calloc(matrix_c_size, sizeof(float)); | |
if (matrix_c == NULL) printf("ERROR in allocating matrix_c on the CPU\n"); | |
/* Fill matrices A and B with random floats */ | |
for (size_t i = 0; i < matrix_a_size; ++i) { | |
matrix_a[i] = rand() / (float)RAND_MAX; | |
} | |
for (size_t i = 0; i < matrix_b_size; ++i) { | |
matrix_b[i] = rand() / (float)RAND_MAX; | |
} | |
/* Print data initialization */ | |
#ifdef DEBUG | |
// printf("Matrix A\n"); | |
// for (size_t i = 0; i < matrix_a_size; ++i) | |
// { | |
// printf("%f ", matrix_a[i]); | |
// if ((i + 1) % matrix_a_width == 0) | |
// { | |
// printf("\n"); | |
// } | |
// } | |
// printf("Matrix B\n"); | |
// for (size_t i = 0; i < matrix_b_size; ++i) | |
// { | |
// printf("%f ", matrix_b[i]); | |
// if ((i + 1) % matrix_b_width == 0) | |
// { | |
// printf("\n"); | |
// } | |
// } | |
printf("Matrix C\n"); | |
for (size_t i = 0; i < matrix_c_size; ++i) | |
{ | |
printf("%f ", matrix_c[i]); | |
if ((i + 1) % matrix_c_width == 0) | |
{ | |
printf("\n"); | |
} | |
} | |
#endif | |
cl_device_id device_id; /* compute device id */ | |
cl_context context; /* compute context */ | |
cl_command_queue commands; /* compute command queue */ | |
cl_program program; /* compute program */ | |
cl_kernel kernel; /* compute kernel */ | |
cl_event *kernel_event_list; | |
cl_double kernelTime; /* kernel profiling timer */ | |
// size_t localWorkSize[2] = {4, 4}; | |
size_t globalWorkSize[2] = {matrix_c_height, matrix_c_width}; | |
size_t localWorkSize[2] = {8, 8}; | |
size_t dataBytes; | |
size_t kernelLength; | |
cl_int errcode; | |
context = cl_init_context(0, 0, 0); | |
program = cl_compileProgram((char *)"_temp_4_Hawaii.cl", NULL); | |
kernel = clCreateKernel(program, "sgemmBlock", &errcode); | |
cl_errChk(errcode, "ERROR in creating kernel sgemmBlock", true); | |
cl_mem cl_matrix_a, cl_matrix_b, cl_matrix_c; | |
cl_matrix_c = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * matrix_c_size, NULL, &errcode); | |
cl_errChk(errcode, "ERROR in creating buffer cl_matrix_c", true); | |
cl_matrix_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, matrix_a_size, matrix_a, &errcode); | |
cl_errChk(errcode, "ERROR in creating image cl_matrix_a", true); | |
cl_matrix_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, matrix_b_size, matrix_b, &errcode); | |
cl_errChk(errcode, "ERROR in creating image cl_matrix_b", true); | |
/** | |
* sgemmBlock args: | |
* | |
* uint M, | |
* uint N, | |
* uint K, | |
* const float alpha, | |
* const float beta, | |
* const __global float *restrict A, | |
* const __global float *restrict B, | |
* __global float *C, | |
* uint lda, | |
* uint ldb, | |
* uint ldc) | |
*/ | |
unsigned int N = 1024; | |
unsigned int M = 1024; | |
unsigned int K = 1024; | |
const float alpha = 1.0f; | |
const float beta = 1.0f; | |
unsigned int lda = 0.0f; | |
unsigned int ldb = 0.0f; | |
unsigned int ldc = 0.0f; | |
commands = cl_getCommandQueue(); | |
cl_event writeEvent, kernelEvent, readEvent; | |
cl_int argchk = 0x0; | |
argchk |= clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&N); | |
argchk |= clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&M); | |
argchk |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&K); | |
argchk |= clSetKernelArg(kernel, 3, sizeof(cl_float), (void *)&alpha); | |
argchk |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void *)&beta); | |
argchk |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&cl_matrix_a); | |
argchk |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&cl_matrix_b); | |
argchk |= clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *)&cl_matrix_c); | |
argchk |= clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&lda); | |
argchk |= clSetKernelArg(kernel, 9, sizeof(cl_float), (void *)&ldb); | |
argchk |= clSetKernelArg(kernel, 10, sizeof(cl_float), (void *)&ldc); | |
cl_errChk(argchk, "ERROR in Setting NaiveGEMM kernel args", true); | |
errcode = clEnqueueNDRangeKernel( | |
commands, kernel, 2, NULL, | |
globalWorkSize, localWorkSize, | |
0, NULL, &kernelEvent); | |
cl_errChk(errcode, "ERROR in Executing Kernel NaiveGEMM", true); | |
errcode = clEnqueueReadBuffer(commands, | |
cl_matrix_c, | |
1, // change to 0 for nonblocking write | |
0, // offset | |
sizeof(float) * matrix_c_size, | |
matrix_c, | |
0, // num_events_in_wait_list | |
NULL, // event_wait_list | |
&readEvent); | |
cl_errChk(errcode, "ERROR with clEnqueueReadBuffer", true); | |
#ifdef DEBUG | |
printf("Matrix C\n"); | |
for (size_t i = 0; i < matrix_c_size; ++i) | |
{ | |
printf("%f ", matrix_c[i]); | |
if ((i + 1) % matrix_c_width == 0) | |
{ | |
printf("\n"); | |
} | |
} | |
#endif | |
clReleaseMemObject(cl_matrix_a); | |
clReleaseMemObject(cl_matrix_b); | |
clReleaseMemObject(cl_matrix_c); | |
/* Free up matrix memory */ | |
free(matrix_a); | |
free(matrix_b); | |
free(matrix_c); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment