Skip to content

Instantly share code, notes, and snippets.

@pratikone
Last active July 26, 2016 17:07
Show Gist options
  • Save pratikone/993a50000d852fda085256075de4389c to your computer and use it in GitHub Desktop.
Save pratikone/993a50000d852fda085256075de4389c to your computer and use it in GitHub Desktop.
OpenCL knapsack implementation - host and kernel files
// The code is based on needle.c for Needleman-Wunsch algorithm at
// https://github.com/vtsynergy/OpenDwarfs/blob/master/dynamic-programming/nw/needle.c
#define LIMIT -999
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "needle.h"
#include <sys/time.h>
#include "../../include/rdtsc.h"
#include "../../include/common_args.h"
#include <malloc.h>
#define AOCL_ALIGNMENT 64
//#define TRACE
void runTest( int argc, char** argv);
double gettime() {
struct timeval t;
gettimeofday(&t,NULL);
return t.tv_sec+t.tv_usec*1e-6;
}
int
maximum( int a,
int b,
int c){
int k;
if( a <= b )
k = b;
else
k = a;
if( k <=c )
return(c);
else
return(k);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
ocd_init(&argc, &argv, NULL);
ocd_initCL();
runTest( argc, argv);
ocd_finalize();
return EXIT_SUCCESS;
}
void usage(int argc, char **argv)
{
fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]);
fprintf(stderr, "\t<dimension> - x and y dimensions\n");
fprintf(stderr, "\t<penalty> - penalty(positive integer)\n");
exit(1);
}
void runTest( int argc, char** argv)
{
int max_rows, max_cols, penalty;
int *input_itemsets, *weigths, *values;
cl_mem matrix_cuda,weigths_cuda, values_cuda;
int size;
int i, j;
cl_int errcode;
// the lengths of the two sequences should be able to divided by 16.
// And at current stage max_rows needs to equal max_cols
if (argc == 3)
{
max_rows = atoi(argv[1]); //no of items
max_cols = atoi(argv[2]); // maximum capacity , column range will be W
}
else{
usage(argc, argv);
}
if(atoi(argv[1])%16!=0){
fprintf(stderr,"The dimension values must be a multiple of 16\n");
exit(1);
}
max_rows = max_rows + 1;
max_cols = max_cols + 1;
if(_deviceType == 3) {
weigths = (int *)memalign(AOCL_ALIGNMENT, max_rows * sizeof(int) );
values = (int *)memalign(AOCL_ALIGNMENT, max_rows * sizeof(int) );
input_itemsets = (int *)memalign(AOCL_ALIGNMENT, max_rows * max_cols * sizeof(int) );
} else {
weigths = (int *)malloc( max_rows * sizeof(int) );
values = (int *)malloc( max_rows * sizeof(int) );
input_itemsets = (int *)malloc(max_rows * max_cols * sizeof(int) );
}
if (!input_itemsets)
fprintf(stderr, "error: can not allocate memory");
srand ( 7 ); //TODO
//Initialize to zero
for (i = 0 ; i < max_cols; i++){
for (j = 0 ; j < max_rows; j++){
input_itemsets[i*max_cols+j] = 0;
}
}
//Used for input itemsets, then after the weigths matrix is constructed it is used/rewritten to store scores.
for(i=0; i< max_rows ; i++){ //please define your own sequence.
weigths[i] = rand() % 10;
values[i] = rand() % 10;
}
//cl_device_id device_id;
//cl_context context;
//cl_command_queue commands;
cl_program clProgram;
cl_kernel clKernel_nw1;
cl_kernel clKernel_nw2;
FILE *kernelFile;
char *kernelSource;
size_t kernelLength;
char* kernel_files;
int num_kernels = 1;
kernel_files = (char*) malloc(sizeof(char*)*num_kernels);
strcpy(kernel_files,"knapsack_kernel");
clProgram = ocdBuildProgramFromFile(context,device_id, kernel_files, NULL);
clKernel_nw1 = clCreateKernel(clProgram, "knapsack_opencl_shared_1", &errcode);
CHKERR(errcode, "Failed to create kernel!");
size = max_cols * max_rows;
weigths_cuda = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*max_rows, NULL, &errcode);
CHKERR(errcode, "Failed to create buffer!");
values_cuda = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*max_rows, NULL, &errcode);
CHKERR(errcode, "Failed to create buffer!");
matrix_cuda = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*size, NULL, &errcode);
CHKERR(errcode, "Failed to create buffer!");
errcode = clEnqueueWriteBuffer(commands, weigths_cuda, CL_TRUE, 0, sizeof(int)*max_rows, (void *) weigths, 0, NULL, &ocdTempEvent);
clFinish(commands);
START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "NW Reference Copy", ocdTempTimer)
END_TIMER(ocdTempTimer)
CHKERR(errcode, "Failed to enqueue write buffer!");
errcode = clEnqueueWriteBuffer(commands, values_cuda, CL_TRUE, 0, sizeof(int)*max_rows, (void *) values, 0, NULL, &ocdTempEvent);
clFinish(commands);
START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "NW Reference Copy", ocdTempTimer)
END_TIMER(ocdTempTimer)
CHKERR(errcode, "Failed to enqueue write buffer!");
errcode = clEnqueueWriteBuffer(commands, matrix_cuda, CL_TRUE, 0, sizeof(int)*size, (void *) input_itemsets, 0, NULL, &ocdTempEvent);
clFinish(commands);
START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Knapsack table Set Copy", ocdTempTimer)
END_TIMER(ocdTempTimer)
CHKERR(errcode, "Failed to enqueue write buffer!");
size_t localWorkSize[2] = {BLOCK_SIZE, 1}; //BLOCK_SIZE work items per work-group in 1D only.
size_t globalWorkSize[2];
int block_width = ( max_cols - 1 )/BLOCK_SIZE;
// Processing the whole Knapsack table row by row here
for(i = 1 ; i < max_rows ; i++){
globalWorkSize[0] = max_cols - 1;
globalWorkSize[1] = 1*localWorkSize[1];
//printf("%s %d %s %d\n", "running as wave front, work-group nos ", globalWorkSize[0]/localWorkSize[0] , "with globalWorkSize", globalWorkSize[0]);
errcode = clSetKernelArg(clKernel_nw1, 0, sizeof(cl_mem), (void *) &weigths_cuda);
errcode = clSetKernelArg(clKernel_nw1, 1, sizeof(cl_mem), (void *) &values_cuda);
errcode |= clSetKernelArg(clKernel_nw1, 2, sizeof(cl_mem), (void *) &matrix_cuda);
errcode |= clSetKernelArg(clKernel_nw1, 3, sizeof(int), (void *) &max_cols);
errcode |= clSetKernelArg(clKernel_nw1, 4, sizeof(int), (void *) &i);
CHKERR(errcode, "Failed to set kernel arguments!");
errcode = clEnqueueNDRangeKernel(commands, clKernel_nw1, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
clFinish(commands);
START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "NW Kernel nw1", ocdTempTimer)
END_TIMER(ocdTempTimer)
CHKERR(errcode, "Failed to enqueue kernel!");
}
errcode = clEnqueueReadBuffer(commands, matrix_cuda, CL_TRUE, 0, sizeof(float)*size, (void *) input_itemsets, 0, NULL, &ocdTempEvent);
clFinish(commands);
START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "knapsack table Set Copy", ocdTempTimer)
END_TIMER(ocdTempTimer)
CHKERR(errcode, "Failed to enqueue read buffer!");
for (i = 0; i < size; ++i)
{
printf("%d ", input_itemsets[ i ] );
}
clReleaseMemObject(weigths_cuda);
clReleaseMemObject(matrix_cuda);
clReleaseKernel(clKernel_nw1);
clReleaseProgram(clProgram);
clReleaseCommandQueue(commands);
clReleaseContext(context);
}
#define BLOCK_SIZE 16
int
maximum( int a,
int b){
if( a <= b )
return (b);
else
return (a);
}
__kernel void
knapsack_opencl_shared_1( __global int* weights,
__global int* values,
__global int* matrix_opencl,
int cols,
int i)
{
int bx = get_group_id(0);
int tx = get_local_id(0);
//printf("@@@ kernel group-id %d local-id %d\n", bx, tx);
int b_index_row = i;
int b_index_col = BLOCK_SIZE * bx + tx; // j index in a W.G.
int index = b_index_row * cols + b_index_col;
int universal_row = ref_x/cols;
int universal_col = ref_x % cols;
if (weights[ b_index_row - 1 ] > universal_col ) //copy previous value if weight of item is more than current capacity
matrix_opencl[index] = matrix_opencl[index - cols] ;
else{
matrix_opencl[index] = maximum( matrix_opencl[index - (cols + weights[b_index_row] )] + values[b_index_row],
matrix_opencl[index - cols]);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment