pratikone/knapsack.c

## knapsack.c
// The code is based on needle.c for Needleman-Wunsch algorithm at
// https://github.com/vtsynergy/OpenDwarfs/blob/master/dynamic-programming/nw/needle.c

#define LIMIT -999
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "needle.h"
#include <sys/time.h>
#include "../../include/rdtsc.h"
#include "../../include/common_args.h"
#include <malloc.h>
#define AOCL_ALIGNMENT 64


//#define TRACE

void runTest( int argc, char** argv);


double gettime() {
	struct timeval t;
	gettimeofday(&t,NULL);
	return t.tv_sec+t.tv_usec*1e-6;
}

int
maximum( int a,
		int b,
		int c){

	int k;
	if( a <= b )
		k = b;
	else
		k = a;

	if( k <=c )
		return(c);
	else
		return(k);

}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
	int
main( int argc, char** argv)
{
	ocd_init(&argc, &argv, NULL);
	ocd_initCL();
	runTest( argc, argv);
	ocd_finalize();
	return EXIT_SUCCESS;
}

void usage(int argc, char **argv)
{
	fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]);
	fprintf(stderr, "\t<dimension>  - x and y dimensions\n");
	fprintf(stderr, "\t<penalty> - penalty(positive integer)\n");
	exit(1);
}

void runTest( int argc, char** argv)
{
	int max_rows, max_cols, penalty;
	int *input_itemsets, *weigths, *values;
	cl_mem matrix_cuda,weigths_cuda, values_cuda;
	int size;
	int i, j;

	cl_int errcode;

	// the lengths of the two sequences should be able to divided by 16.
	// And at current stage  max_rows needs to equal max_cols
	if (argc == 3)
	{
		max_rows = atoi(argv[1]);    //no of items
		max_cols = atoi(argv[2]);    // maximum capacity , column range will be W
	}
	else{
		usage(argc, argv);
	}

	if(atoi(argv[1])%16!=0){
		fprintf(stderr,"The dimension values must be a multiple of 16\n");
		exit(1);
	}

	max_rows = max_rows + 1;
	max_cols = max_cols + 1;
    if(_deviceType == 3) {
	    weigths      = (int *)memalign(AOCL_ALIGNMENT, max_rows * sizeof(int) );
	    values      = (int *)memalign(AOCL_ALIGNMENT, max_rows * sizeof(int) );
	    input_itemsets  = (int *)memalign(AOCL_ALIGNMENT, max_rows * max_cols * sizeof(int) );

    } else {
        weigths      = (int *)malloc( max_rows * sizeof(int) );
        values      = (int *)malloc( max_rows * sizeof(int) );
	    input_itemsets  = (int *)malloc(max_rows * max_cols * sizeof(int) );
    }

	if (!input_itemsets)
		fprintf(stderr, "error: can not allocate memory");

	srand ( 7 );   //TODO
	//Initialize to zero
	for (i = 0 ; i < max_cols; i++){
		for (j = 0 ; j < max_rows; j++){
			input_itemsets[i*max_cols+j] = 0;
		}
	}

	//Used for input itemsets, then after the weigths matrix is constructed it is used/rewritten to store scores.
	for(i=0; i< max_rows ; i++){    //please define your own sequence.
		weigths[i] = rand() % 10;
		values[i] = rand() % 10;
	}


	//cl_device_id device_id;
	//cl_context context;
	//cl_command_queue commands;
	cl_program clProgram;
	cl_kernel clKernel_nw1;
	cl_kernel clKernel_nw2;


	FILE *kernelFile;
	char *kernelSource;
	size_t kernelLength;
    char* kernel_files;
    int num_kernels = 1;
    kernel_files = (char*) malloc(sizeof(char*)*num_kernels);
	strcpy(kernel_files,"knapsack_kernel");
    clProgram = ocdBuildProgramFromFile(context,device_id, kernel_files, NULL);

	clKernel_nw1 = clCreateKernel(clProgram, "knapsack_opencl_shared_1", &errcode);
	CHKERR(errcode, "Failed to create kernel!");

	size = max_cols * max_rows;
	weigths_cuda = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*max_rows, NULL, &errcode);
	CHKERR(errcode, "Failed to create buffer!");

	values_cuda = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*max_rows, NULL, &errcode);
	CHKERR(errcode, "Failed to create buffer!");

	matrix_cuda = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*size, NULL, &errcode);
	CHKERR(errcode, "Failed to create buffer!");

	errcode = clEnqueueWriteBuffer(commands, weigths_cuda, CL_TRUE, 0, sizeof(int)*max_rows, (void *) weigths, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "NW Reference Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(errcode, "Failed to enqueue write buffer!");

	errcode = clEnqueueWriteBuffer(commands, values_cuda, CL_TRUE, 0, sizeof(int)*max_rows, (void *) values, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "NW Reference Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(errcode, "Failed to enqueue write buffer!");


	errcode = clEnqueueWriteBuffer(commands, matrix_cuda, CL_TRUE, 0, sizeof(int)*size, (void *) input_itemsets, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Knapsack table Set Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(errcode, "Failed to enqueue write buffer!");

	size_t localWorkSize[2] = {BLOCK_SIZE, 1}; //BLOCK_SIZE work items per work-group in 1D only.
	size_t globalWorkSize[2];
	int block_width = ( max_cols - 1 )/BLOCK_SIZE;


	// Processing the whole Knapsack table row by row here
	for(i = 1 ; i < max_rows ; i++){
		globalWorkSize[0] = max_cols - 1;

		globalWorkSize[1] = 1*localWorkSize[1];

		//printf("%s %d %s %d\n", "running as wave front, work-group nos ", globalWorkSize[0]/localWorkSize[0] , "with globalWorkSize", globalWorkSize[0]);
		errcode = clSetKernelArg(clKernel_nw1,  0, sizeof(cl_mem), (void *) &weigths_cuda);
		errcode = clSetKernelArg(clKernel_nw1,  1, sizeof(cl_mem), (void *) &values_cuda);
		errcode |= clSetKernelArg(clKernel_nw1, 2, sizeof(cl_mem), (void *) &matrix_cuda);
		errcode |= clSetKernelArg(clKernel_nw1, 3, sizeof(int), (void *) &max_cols);
		errcode |= clSetKernelArg(clKernel_nw1, 4, sizeof(int), (void *) &i);
		CHKERR(errcode, "Failed to set kernel arguments!");
		errcode = clEnqueueNDRangeKernel(commands, clKernel_nw1, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
		clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "NW Kernel nw1", ocdTempTimer)
	 	END_TIMER(ocdTempTimer)
		CHKERR(errcode, "Failed to enqueue kernel!");

	}


	errcode = clEnqueueReadBuffer(commands, matrix_cuda, CL_TRUE, 0, sizeof(float)*size, (void *) input_itemsets, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "knapsack table Set Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(errcode, "Failed to enqueue read buffer!");


	for (i = 0; i < size; ++i)
	{
		printf("%d   ",  input_itemsets[ i ] );
	}


	clReleaseMemObject(weigths_cuda);
	clReleaseMemObject(matrix_cuda);
	clReleaseKernel(clKernel_nw1);
	clReleaseProgram(clProgram);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

}


## knapsack_kernel.cl
#define BLOCK_SIZE 16

int
maximum( int a,
		int b){

	if( a <= b )
		return (b);
	else
		return (a);
}

	__kernel void
knapsack_opencl_shared_1(  __global int* weights,
		__global int* values,
		__global int* matrix_opencl,
		int cols,
		int i)
{
	int bx = get_group_id(0);
	int tx = get_local_id(0);


	//printf("@@@ kernel group-id %d local-id %d\n", bx, tx);

	int b_index_row = i;
	int b_index_col = BLOCK_SIZE * bx + tx;  // j index in a W.G.
	int index   = b_index_row * cols + b_index_col;


	int universal_row = ref_x/cols;
	int universal_col = ref_x % cols;

	if (weights[ b_index_row - 1 ] > universal_col )  //copy previous value if weight of item is more than current capacity
		matrix_opencl[index] = matrix_opencl[index - cols] ;

	else{

		matrix_opencl[index] = maximum( matrix_opencl[index - (cols + weights[b_index_row]  )] + values[b_index_row],
											matrix_opencl[index - cols]);
	}


}
	// The code is based on needle.c for Needleman-Wunsch algorithm at
	// https://github.com/vtsynergy/OpenDwarfs/blob/master/dynamic-programming/nw/needle.c

	#define LIMIT -999
	#include <stdlib.h>
	#include <stdio.h>
	#include <string.h>
	#include <math.h>
	#include "needle.h"
	#include <sys/time.h>
	#include "../../include/rdtsc.h"
	#include "../../include/common_args.h"
	#include <malloc.h>
	#define AOCL_ALIGNMENT 64


	//#define TRACE

	void runTest( int argc, char** argv);


	double gettime() {
	struct timeval t;
	gettimeofday(&t,NULL);
	return t.tv_sec+t.tv_usec*1e-6;
	}

	int
	maximum( int a,
	int b,
	int c){

	int k;
	if( a <= b )
	k = b;
	else
	k = a;

	if( k <=c )
	return(c);
	else
	return(k);

	}

	////////////////////////////////////////////////////////////////////////////////
	// Program main
	////////////////////////////////////////////////////////////////////////////////
	int
	main( int argc, char** argv)
	{
	ocd_init(&argc, &argv, NULL);
	ocd_initCL();
	runTest( argc, argv);
	ocd_finalize();
	return EXIT_SUCCESS;
	}

	void usage(int argc, char **argv)
	{
	fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]);
	fprintf(stderr, "\t<dimension> - x and y dimensions\n");
	fprintf(stderr, "\t<penalty> - penalty(positive integer)\n");
	exit(1);
	}

	void runTest( int argc, char** argv)
	{
	int max_rows, max_cols, penalty;
	int input_itemsets, weigths, *values;
	cl_mem matrix_cuda,weigths_cuda, values_cuda;
	int size;
	int i, j;

	cl_int errcode;

	// the lengths of the two sequences should be able to divided by 16.
	// And at current stage max_rows needs to equal max_cols
	if (argc == 3)
	{
	max_rows = atoi(argv[1]); //no of items
	max_cols = atoi(argv[2]); // maximum capacity , column range will be W
	}
	else{
	usage(argc, argv);
	}

	if(atoi(argv[1])%16!=0){
	fprintf(stderr,"The dimension values must be a multiple of 16\n");
	exit(1);
	}

	max_rows = max_rows + 1;
	max_cols = max_cols + 1;
	if(_deviceType == 3) {
	weigths = (int )memalign(AOCL_ALIGNMENT, max_rows sizeof(int) );
	values = (int )memalign(AOCL_ALIGNMENT, max_rows sizeof(int) );
	input_itemsets = (int )memalign(AOCL_ALIGNMENT, max_rows max_cols * sizeof(int) );

	} else {
	weigths = (int )malloc( max_rows sizeof(int) );
	values = (int )malloc( max_rows sizeof(int) );
	input_itemsets = (int )malloc(max_rows max_cols * sizeof(int) );
	}

	if (!input_itemsets)
	fprintf(stderr, "error: can not allocate memory");

	srand ( 7 ); //TODO
	//Initialize to zero
	for (i = 0 ; i < max_cols; i++){
	for (j = 0 ; j < max_rows; j++){
	input_itemsets[i*max_cols+j] = 0;
	}
	}

	//Used for input itemsets, then after the weigths matrix is constructed it is used/rewritten to store scores.
	for(i=0; i< max_rows ; i++){ //please define your own sequence.
	weigths[i] = rand() % 10;
	values[i] = rand() % 10;
	}


	//cl_device_id device_id;
	//cl_context context;
	//cl_command_queue commands;
	cl_program clProgram;
	cl_kernel clKernel_nw1;
	cl_kernel clKernel_nw2;


	FILE *kernelFile;
	char *kernelSource;
	size_t kernelLength;
	char* kernel_files;
	int num_kernels = 1;
	kernel_files = (char) malloc(sizeof(char)*num_kernels);
	strcpy(kernel_files,"knapsack_kernel");
	clProgram = ocdBuildProgramFromFile(context,device_id, kernel_files, NULL);

	clKernel_nw1 = clCreateKernel(clProgram, "knapsack_opencl_shared_1", &errcode);
	CHKERR(errcode, "Failed to create kernel!");

	size = max_cols * max_rows;
	weigths_cuda = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*max_rows, NULL, &errcode);
	CHKERR(errcode, "Failed to create buffer!");

	values_cuda = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*max_rows, NULL, &errcode);
	CHKERR(errcode, "Failed to create buffer!");

	matrix_cuda = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*size, NULL, &errcode);
	CHKERR(errcode, "Failed to create buffer!");

	errcode = clEnqueueWriteBuffer(commands, weigths_cuda, CL_TRUE, 0, sizeof(int)max_rows, (void ) weigths, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "NW Reference Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(errcode, "Failed to enqueue write buffer!");

	errcode = clEnqueueWriteBuffer(commands, values_cuda, CL_TRUE, 0, sizeof(int)max_rows, (void ) values, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "NW Reference Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(errcode, "Failed to enqueue write buffer!");




	errcode = clEnqueueWriteBuffer(commands, matrix_cuda, CL_TRUE, 0, sizeof(int)size, (void ) input_itemsets, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Knapsack table Set Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(errcode, "Failed to enqueue write buffer!");

	size_t localWorkSize[2] = {BLOCK_SIZE, 1}; //BLOCK_SIZE work items per work-group in 1D only.
	size_t globalWorkSize[2];
	int block_width = ( max_cols - 1 )/BLOCK_SIZE;



	// Processing the whole Knapsack table row by row here
	for(i = 1 ; i < max_rows ; i++){
	globalWorkSize[0] = max_cols - 1;

	globalWorkSize[1] = 1*localWorkSize[1];

	//printf("%s %d %s %d\n", "running as wave front, work-group nos ", globalWorkSize[0]/localWorkSize[0] , "with globalWorkSize", globalWorkSize[0]);
	errcode = clSetKernelArg(clKernel_nw1, 0, sizeof(cl_mem), (void *) &weigths_cuda);
	errcode = clSetKernelArg(clKernel_nw1, 1, sizeof(cl_mem), (void *) &values_cuda);
	errcode \|= clSetKernelArg(clKernel_nw1, 2, sizeof(cl_mem), (void *) &matrix_cuda);
	errcode \|= clSetKernelArg(clKernel_nw1, 3, sizeof(int), (void *) &max_cols);
	errcode \|= clSetKernelArg(clKernel_nw1, 4, sizeof(int), (void *) &i);
	CHKERR(errcode, "Failed to set kernel arguments!");
	errcode = clEnqueueNDRangeKernel(commands, clKernel_nw1, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "NW Kernel nw1", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(errcode, "Failed to enqueue kernel!");

	}


	errcode = clEnqueueReadBuffer(commands, matrix_cuda, CL_TRUE, 0, sizeof(float)size, (void ) input_itemsets, 0, NULL, &ocdTempEvent);
	clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "knapsack table Set Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(errcode, "Failed to enqueue read buffer!");


	for (i = 0; i < size; ++i)
	{
	printf("%d ", input_itemsets[ i ] );
	}


	clReleaseMemObject(weigths_cuda);
	clReleaseMemObject(matrix_cuda);
	clReleaseKernel(clKernel_nw1);
	clReleaseProgram(clProgram);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	}
	#define BLOCK_SIZE 16

	int
	maximum( int a,
	int b){

	if( a <= b )
	return (b);
	else
	return (a);
	}

	__kernel void
	knapsack_opencl_shared_1( __global int* weights,
	__global int* values,
	__global int* matrix_opencl,
	int cols,
	int i)
	{
	int bx = get_group_id(0);
	int tx = get_local_id(0);


	//printf("@@@ kernel group-id %d local-id %d\n", bx, tx);

	int b_index_row = i;
	int b_index_col = BLOCK_SIZE * bx + tx; // j index in a W.G.
	int index = b_index_row * cols + b_index_col;


	int universal_row = ref_x/cols;
	int universal_col = ref_x % cols;

	if (weights[ b_index_row - 1 ] > universal_col ) //copy previous value if weight of item is more than current capacity
	matrix_opencl[index] = matrix_opencl[index - cols] ;

	else{

	matrix_opencl[index] = maximum( matrix_opencl[index - (cols + weights[b_index_row] )] + values[b_index_row],
	matrix_opencl[index - cols]);
	}



	}