yousefhamza/kernel.cu

## kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

#define BLOCKSIZE 2
#define THREADSIZE 3

void cpu_convolve(float *image, float *filter, float *result, unsigned int N) {
	float sum;
	for (unsigned int i = 1; i < N - 1; i++)
		for (unsigned int j = 1; j < N - 1; j++) {
			sum = 0;
			sum += image[(i - 1) * N + j - 1] * filter[0] + image[(i - 1) * N + j] * filter[1] + image[(i - 1) * N + j + 1] * filter[2]
				+ image[i * N + j - 1] * filter[3] + image[i * N + j] * filter[4] + image[i * N + j + 1] * filter[5]
				+ image[(i + 1) * N + j - 1] * filter[6] + image[(i + 1) * N + j] * filter[7] + image[(i + 1) * N + j + 1] * filter[8];
			result[i * N + j] = sum;
		}
}

__global__ void gpu_convolve(float *image, float *filter, float *result, unsigned int N) {
	__shared__ float s_filter[9];
	for (unsigned char i = 0; i < 9; ++i) s_filter[i] = filter[i];
	__syncthreads();
	float sum = 0;
	int j = threadIdx.x + blockDim.x * blockIdx.x + 1;
	int i = threadIdx.y + blockDim.y * blockIdx.y + 1;

	/* Add image to shared memory */
	__shared__ float s_image[BLOCKSIZE + 2][BLOCKSIZE + 2];
	int ty = threadIdx.x + 1; int tx = threadIdx.y + 1;
	s_image[tx][ty] = image[i * N + j];
	// Handling corners
	if (threadIdx.x == 0 &&
		threadIdx.y == 0) s_image[tx - 1][ty - 1] = image[(i - 1) * N + j - 1];
	else if (threadIdx.x == THREADSIZE - 1 &&
		threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty + 1] = image[(i + 1) * N + j + 1];
	else if (threadIdx.x == 0 &&
		threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty - 1] = image[(i + 1) * N + j - 1];
	else if (threadIdx.y == 0 &&
		threadIdx.x == THREADSIZE - 1) s_image[tx - 1][ty + 1] = image[(i - 1) * N + j + 1];

	// Handling edges
	if (threadIdx.x == 0) s_image[tx][ty - 1] = image[i * N + j - 1];
	if (threadIdx.y == 0) s_image[tx - 1][ty] = image[(i - 1) * N + j];
	if (threadIdx.x == THREADSIZE - 1) s_image[tx][ty + 1] = image[i * N + j + 1];
	if (threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty] = image[(i + 1) * N + j];
	__syncthreads();
	sum = s_image[i - 1][j - 1] * s_filter[0] + s_image[i - 1][j] * s_filter[1] + s_image[i - 1][j + 1] * s_filter[2]
		+ s_image[i][j - 1] * s_filter[3] + s_image[i][j] * s_filter[4] + s_image[i][j + 1] * s_filter[5]
		+ s_image[i + 1][j - 1] * s_filter[6] + s_image[i + 1][j] * s_filter[7] + s_image[i + 1][j + 1] * s_filter[8];
	result[i * N + j] = sum;
}

// decearing the cuda Constant filter of size 9
__constant__ float dev_filter[9];

__global__ void gpu_convolve_constant_filter(float *image, float *result, unsigned int N) {
	float sum = 0;
	int j = threadIdx.x + blockDim.x * blockIdx.x + 1;
	int i = threadIdx.y + blockDim.y * blockIdx.y + 1;

	/* Add image to shared memory */
	__shared__ float s_image[BLOCKSIZE + 2][BLOCKSIZE + 2];
	int ty = threadIdx.x + 1; int tx = threadIdx.y + 1;
	s_image[tx][ty] = image[i * N + j];
	// Handling corners
	if (threadIdx.x == 0 &&
		threadIdx.y == 0) s_image[tx - 1][ty - 1] = image[(i - 1) * N + j - 1];
	else if (threadIdx.x == THREADSIZE - 1 &&
		threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty + 1] = image[(i + 1) * N + j + 1];
	else if (threadIdx.x == 0 &&
		threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty - 1] = image[(i + 1) * N + j - 1];
	else if (threadIdx.y == 0 &&
		threadIdx.x == THREADSIZE - 1) s_image[tx - 1][ty + 1] = image[(i - 1) * N + j + 1];

	// Handling edges
	if (threadIdx.x == 0) s_image[tx][ty - 1] = image[i * N + j - 1];
	if (threadIdx.y == 0) s_image[tx - 1][ty] = image[(i - 1) * N + j];
	if (threadIdx.x == THREADSIZE - 1) s_image[tx][ty + 1] = image[i * N + j + 1];
	if (threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty] = image[(i + 1) * N + j];
	__syncthreads();
	sum = s_image[i - 1][j - 1] * dev_filter[0] + s_image[i - 1][j] * dev_filter[1] + s_image[i - 1][j + 1] * dev_filter[2]
		+ s_image[i][j - 1] * dev_filter[3] + s_image[i][j] * dev_filter[4] + s_image[i][j + 1] * dev_filter[5]
		+ s_image[i + 1][j - 1] * dev_filter[6] + s_image[i + 1][j] * dev_filter[7] + s_image[i + 1][j + 1] * dev_filter[8];
	result[i * N + j] = sum;
}


void print_matrix(float arr[], unsigned int N) {
	for (unsigned int i = 1; i < N - 1; ++i) {
		for (unsigned int j = 1; j < N - 1; ++j) {
			printf("%10f", arr[i * N + j]);
		}
		printf("\n");
	}
}


int main(int argc, char *argv[]) {
	int Grid_Dim_x = BLOCKSIZE, Grid_Dim_y = BLOCKSIZE;
	int Block_Dim_x = THREADSIZE, Block_Dim_y = THREADSIZE;
	const unsigned int N = 4;
	float image[] = { 0, 0, 0, 0, 0, 0,
		0, 1, 1, 1, 1, 0,
		0, 1, 1, 1, 1, 0,
		0, 1, 1, 1, 1, 0,
		0, 1, 1, 1, 1, 0,
		0, 0, 0, 0, 0, 0 };

	// host copy for the filter
	float filter[] = { 1, 1, 1,
		1, 1, 1,
		1, 1, 1 };

	float result[(N + 2) * (N + 2)];
	float d_result_cpu[(N + 2) * (N + 2)];

	dim3 Grid(Grid_Dim_x, Grid_Dim_y);
	dim3 Block(Block_Dim_x, Block_Dim_y);

	float *d_image, *d_filter, *d_result;

	cudaMalloc((void**)&d_image, sizeof(image));
	cudaMalloc((void**)&d_filter, sizeof(filter));
	cudaMalloc((void**)&d_result, sizeof(result));
	cudaMemcpy(d_image, image, sizeof(image), cudaMemcpyHostToDevice);
	cudaMemcpy(d_filter, filter, sizeof(filter), cudaMemcpyHostToDevice);

	cpu_convolve(image, filter, result, N + 2);
	// normal gpu_convolve function
	gpu_convolve << <Grid, Block >> >(d_image, d_filter, d_result, N + 2);
	cudaMemcpy(d_result_cpu, d_result, sizeof(d_result_cpu), cudaMemcpyDeviceToHost);
	print_matrix(result, N + 2);
	printf("\n");
	print_matrix(d_result_cpu, N + 2);


	// prepare filter to send to constant memory
	cudaMemcpyToSymbol(dev_filter, filter, sizeof(filter));
	// gpu_convolve_constant_filter function with constant filter @author Moataz_Farid
	// gpu_convolve_constant_filter << <Grid, Block >> >(d_image, d_result, N + 2);
	//cuda copy result
	cudaMemcpy(d_result_cpu, d_result, sizeof(d_result_cpu), cudaMemcpyDeviceToHost);
	//print result
	print_matrix(d_result_cpu, N + 2);


	cudaFree(d_image);
	cudaFree(d_filter);
	cudaFree(d_result);
	return 0;
}
	#include "cuda_runtime.h"
	#include "device_launch_parameters.h"
	#include <stdio.h>

	#define BLOCKSIZE 2
	#define THREADSIZE 3

	void cpu_convolve(float image, float filter, float *result, unsigned int N) {
	float sum;
	for (unsigned int i = 1; i < N - 1; i++)
	for (unsigned int j = 1; j < N - 1; j++) {
	sum = 0;
	sum += image[(i - 1) * N + j - 1] * filter[0] + image[(i - 1) * N + j] * filter[1] + image[(i - 1) * N + j + 1] * filter[2]
	+ image[i * N + j - 1] * filter[3] + image[i * N + j] * filter[4] + image[i * N + j + 1] * filter[5]
	+ image[(i + 1) * N + j - 1] * filter[6] + image[(i + 1) * N + j] * filter[7] + image[(i + 1) * N + j + 1] * filter[8];
	result[i * N + j] = sum;
	}
	}

	__global__ void gpu_convolve(float image, float filter, float *result, unsigned int N) {
	__shared__ float s_filter[9];
	for (unsigned char i = 0; i < 9; ++i) s_filter[i] = filter[i];
	__syncthreads();
	float sum = 0;
	int j = threadIdx.x + blockDim.x * blockIdx.x + 1;
	int i = threadIdx.y + blockDim.y * blockIdx.y + 1;

	/* Add image to shared memory */
	__shared__ float s_image[BLOCKSIZE + 2][BLOCKSIZE + 2];
	int ty = threadIdx.x + 1; int tx = threadIdx.y + 1;
	s_image[tx][ty] = image[i * N + j];
	// Handling corners
	if (threadIdx.x == 0 &&
	threadIdx.y == 0) s_image[tx - 1][ty - 1] = image[(i - 1) * N + j - 1];
	else if (threadIdx.x == THREADSIZE - 1 &&
	threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty + 1] = image[(i + 1) * N + j + 1];
	else if (threadIdx.x == 0 &&
	threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty - 1] = image[(i + 1) * N + j - 1];
	else if (threadIdx.y == 0 &&
	threadIdx.x == THREADSIZE - 1) s_image[tx - 1][ty + 1] = image[(i - 1) * N + j + 1];

	// Handling edges
	if (threadIdx.x == 0) s_image[tx][ty - 1] = image[i * N + j - 1];
	if (threadIdx.y == 0) s_image[tx - 1][ty] = image[(i - 1) * N + j];
	if (threadIdx.x == THREADSIZE - 1) s_image[tx][ty + 1] = image[i * N + j + 1];
	if (threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty] = image[(i + 1) * N + j];
	__syncthreads();
	sum = s_image[i - 1][j - 1] * s_filter[0] + s_image[i - 1][j] * s_filter[1] + s_image[i - 1][j + 1] * s_filter[2]
	+ s_image[i][j - 1] * s_filter[3] + s_image[i][j] * s_filter[4] + s_image[i][j + 1] * s_filter[5]
	+ s_image[i + 1][j - 1] * s_filter[6] + s_image[i + 1][j] * s_filter[7] + s_image[i + 1][j + 1] * s_filter[8];
	result[i * N + j] = sum;
	}

	// decearing the cuda Constant filter of size 9
	__constant__ float dev_filter[9];

	__global__ void gpu_convolve_constant_filter(float image, float result, unsigned int N) {
	float sum = 0;
	int j = threadIdx.x + blockDim.x * blockIdx.x + 1;
	int i = threadIdx.y + blockDim.y * blockIdx.y + 1;

	/* Add image to shared memory */
	__shared__ float s_image[BLOCKSIZE + 2][BLOCKSIZE + 2];
	int ty = threadIdx.x + 1; int tx = threadIdx.y + 1;
	s_image[tx][ty] = image[i * N + j];
	// Handling corners
	if (threadIdx.x == 0 &&
	threadIdx.y == 0) s_image[tx - 1][ty - 1] = image[(i - 1) * N + j - 1];
	else if (threadIdx.x == THREADSIZE - 1 &&
	threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty + 1] = image[(i + 1) * N + j + 1];
	else if (threadIdx.x == 0 &&
	threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty - 1] = image[(i + 1) * N + j - 1];
	else if (threadIdx.y == 0 &&
	threadIdx.x == THREADSIZE - 1) s_image[tx - 1][ty + 1] = image[(i - 1) * N + j + 1];

	// Handling edges
	if (threadIdx.x == 0) s_image[tx][ty - 1] = image[i * N + j - 1];
	if (threadIdx.y == 0) s_image[tx - 1][ty] = image[(i - 1) * N + j];
	if (threadIdx.x == THREADSIZE - 1) s_image[tx][ty + 1] = image[i * N + j + 1];
	if (threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty] = image[(i + 1) * N + j];
	__syncthreads();
	sum = s_image[i - 1][j - 1] * dev_filter[0] + s_image[i - 1][j] * dev_filter[1] + s_image[i - 1][j + 1] * dev_filter[2]
	+ s_image[i][j - 1] * dev_filter[3] + s_image[i][j] * dev_filter[4] + s_image[i][j + 1] * dev_filter[5]
	+ s_image[i + 1][j - 1] * dev_filter[6] + s_image[i + 1][j] * dev_filter[7] + s_image[i + 1][j + 1] * dev_filter[8];
	result[i * N + j] = sum;
	}


	void print_matrix(float arr[], unsigned int N) {
	for (unsigned int i = 1; i < N - 1; ++i) {
	for (unsigned int j = 1; j < N - 1; ++j) {
	printf("%10f", arr[i * N + j]);
	}
	printf("\n");
	}
	}


	int main(int argc, char *argv[]) {
	int Grid_Dim_x = BLOCKSIZE, Grid_Dim_y = BLOCKSIZE;
	int Block_Dim_x = THREADSIZE, Block_Dim_y = THREADSIZE;
	const unsigned int N = 4;
	float image[] = { 0, 0, 0, 0, 0, 0,
	0, 1, 1, 1, 1, 0,
	0, 1, 1, 1, 1, 0,
	0, 1, 1, 1, 1, 0,
	0, 1, 1, 1, 1, 0,
	0, 0, 0, 0, 0, 0 };

	// host copy for the filter
	float filter[] = { 1, 1, 1,
	1, 1, 1,
	1, 1, 1 };

	float result[(N + 2) * (N + 2)];
	float d_result_cpu[(N + 2) * (N + 2)];

	dim3 Grid(Grid_Dim_x, Grid_Dim_y);
	dim3 Block(Block_Dim_x, Block_Dim_y);

	float d_image, d_filter, *d_result;

	cudaMalloc((void**)&d_image, sizeof(image));
	cudaMalloc((void**)&d_filter, sizeof(filter));
	cudaMalloc((void**)&d_result, sizeof(result));
	cudaMemcpy(d_image, image, sizeof(image), cudaMemcpyHostToDevice);
	cudaMemcpy(d_filter, filter, sizeof(filter), cudaMemcpyHostToDevice);

	cpu_convolve(image, filter, result, N + 2);
	// normal gpu_convolve function
	gpu_convolve << <Grid, Block >> >(d_image, d_filter, d_result, N + 2);
	cudaMemcpy(d_result_cpu, d_result, sizeof(d_result_cpu), cudaMemcpyDeviceToHost);
	print_matrix(result, N + 2);
	printf("\n");
	print_matrix(d_result_cpu, N + 2);


	// prepare filter to send to constant memory
	cudaMemcpyToSymbol(dev_filter, filter, sizeof(filter));
	// gpu_convolve_constant_filter function with constant filter @author Moataz_Farid
	// gpu_convolve_constant_filter << <Grid, Block >> >(d_image, d_result, N + 2);
	//cuda copy result
	cudaMemcpy(d_result_cpu, d_result, sizeof(d_result_cpu), cudaMemcpyDeviceToHost);
	//print result
	print_matrix(d_result_cpu, N + 2);


	cudaFree(d_image);
	cudaFree(d_filter);
	cudaFree(d_result);
	return 0;
	}