Last active
December 3, 2015 11:52
-
-
Save yousefhamza/eae794e47bd941b7b932 to your computer and use it in GitHub Desktop.
Soble algorithm in Cuda
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "cuda_runtime.h" | |
#include "device_launch_parameters.h" | |
#include <stdio.h> | |
#define BLOCKSIZE 2 | |
#define THREADSIZE 3 | |
void cpu_convolve(float *image, float *filter, float *result, unsigned int N) { | |
float sum; | |
for (unsigned int i = 1; i < N - 1; i++) | |
for (unsigned int j = 1; j < N - 1; j++) { | |
sum = 0; | |
sum += image[(i - 1) * N + j - 1] * filter[0] + image[(i - 1) * N + j] * filter[1] + image[(i - 1) * N + j + 1] * filter[2] | |
+ image[i * N + j - 1] * filter[3] + image[i * N + j] * filter[4] + image[i * N + j + 1] * filter[5] | |
+ image[(i + 1) * N + j - 1] * filter[6] + image[(i + 1) * N + j] * filter[7] + image[(i + 1) * N + j + 1] * filter[8]; | |
result[i * N + j] = sum; | |
} | |
} | |
__global__ void gpu_convolve(float *image, float *filter, float *result, unsigned int N) { | |
__shared__ float s_filter[9]; | |
for (unsigned char i = 0; i < 9; ++i) s_filter[i] = filter[i]; | |
__syncthreads(); | |
float sum = 0; | |
int j = threadIdx.x + blockDim.x * blockIdx.x + 1; | |
int i = threadIdx.y + blockDim.y * blockIdx.y + 1; | |
/* Add image to shared memory */ | |
__shared__ float s_image[BLOCKSIZE + 2][BLOCKSIZE + 2]; | |
int ty = threadIdx.x + 1; int tx = threadIdx.y + 1; | |
s_image[tx][ty] = image[i * N + j]; | |
// Handling corners | |
if (threadIdx.x == 0 && | |
threadIdx.y == 0) s_image[tx - 1][ty - 1] = image[(i - 1) * N + j - 1]; | |
else if (threadIdx.x == THREADSIZE - 1 && | |
threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty + 1] = image[(i + 1) * N + j + 1]; | |
else if (threadIdx.x == 0 && | |
threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty - 1] = image[(i + 1) * N + j - 1]; | |
else if (threadIdx.y == 0 && | |
threadIdx.x == THREADSIZE - 1) s_image[tx - 1][ty + 1] = image[(i - 1) * N + j + 1]; | |
// Handling edges | |
if (threadIdx.x == 0) s_image[tx][ty - 1] = image[i * N + j - 1]; | |
if (threadIdx.y == 0) s_image[tx - 1][ty] = image[(i - 1) * N + j]; | |
if (threadIdx.x == THREADSIZE - 1) s_image[tx][ty + 1] = image[i * N + j + 1]; | |
if (threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty] = image[(i + 1) * N + j]; | |
__syncthreads(); | |
sum = s_image[i - 1][j - 1] * s_filter[0] + s_image[i - 1][j] * s_filter[1] + s_image[i - 1][j + 1] * s_filter[2] | |
+ s_image[i][j - 1] * s_filter[3] + s_image[i][j] * s_filter[4] + s_image[i][j + 1] * s_filter[5] | |
+ s_image[i + 1][j - 1] * s_filter[6] + s_image[i + 1][j] * s_filter[7] + s_image[i + 1][j + 1] * s_filter[8]; | |
result[i * N + j] = sum; | |
} | |
// decearing the cuda Constant filter of size 9 | |
__constant__ float dev_filter[9]; | |
__global__ void gpu_convolve_constant_filter(float *image, float *result, unsigned int N) { | |
float sum = 0; | |
int j = threadIdx.x + blockDim.x * blockIdx.x + 1; | |
int i = threadIdx.y + blockDim.y * blockIdx.y + 1; | |
/* Add image to shared memory */ | |
__shared__ float s_image[BLOCKSIZE + 2][BLOCKSIZE + 2]; | |
int ty = threadIdx.x + 1; int tx = threadIdx.y + 1; | |
s_image[tx][ty] = image[i * N + j]; | |
// Handling corners | |
if (threadIdx.x == 0 && | |
threadIdx.y == 0) s_image[tx - 1][ty - 1] = image[(i - 1) * N + j - 1]; | |
else if (threadIdx.x == THREADSIZE - 1 && | |
threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty + 1] = image[(i + 1) * N + j + 1]; | |
else if (threadIdx.x == 0 && | |
threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty - 1] = image[(i + 1) * N + j - 1]; | |
else if (threadIdx.y == 0 && | |
threadIdx.x == THREADSIZE - 1) s_image[tx - 1][ty + 1] = image[(i - 1) * N + j + 1]; | |
// Handling edges | |
if (threadIdx.x == 0) s_image[tx][ty - 1] = image[i * N + j - 1]; | |
if (threadIdx.y == 0) s_image[tx - 1][ty] = image[(i - 1) * N + j]; | |
if (threadIdx.x == THREADSIZE - 1) s_image[tx][ty + 1] = image[i * N + j + 1]; | |
if (threadIdx.y == THREADSIZE - 1) s_image[tx + 1][ty] = image[(i + 1) * N + j]; | |
__syncthreads(); | |
sum = s_image[i - 1][j - 1] * dev_filter[0] + s_image[i - 1][j] * dev_filter[1] + s_image[i - 1][j + 1] * dev_filter[2] | |
+ s_image[i][j - 1] * dev_filter[3] + s_image[i][j] * dev_filter[4] + s_image[i][j + 1] * dev_filter[5] | |
+ s_image[i + 1][j - 1] * dev_filter[6] + s_image[i + 1][j] * dev_filter[7] + s_image[i + 1][j + 1] * dev_filter[8]; | |
result[i * N + j] = sum; | |
} | |
void print_matrix(float arr[], unsigned int N) { | |
for (unsigned int i = 1; i < N - 1; ++i) { | |
for (unsigned int j = 1; j < N - 1; ++j) { | |
printf("%10f", arr[i * N + j]); | |
} | |
printf("\n"); | |
} | |
} | |
int main(int argc, char *argv[]) { | |
int Grid_Dim_x = BLOCKSIZE, Grid_Dim_y = BLOCKSIZE; | |
int Block_Dim_x = THREADSIZE, Block_Dim_y = THREADSIZE; | |
const unsigned int N = 4; | |
float image[] = { 0, 0, 0, 0, 0, 0, | |
0, 1, 1, 1, 1, 0, | |
0, 1, 1, 1, 1, 0, | |
0, 1, 1, 1, 1, 0, | |
0, 1, 1, 1, 1, 0, | |
0, 0, 0, 0, 0, 0 }; | |
// host copy for the filter | |
float filter[] = { 1, 1, 1, | |
1, 1, 1, | |
1, 1, 1 }; | |
float result[(N + 2) * (N + 2)]; | |
float d_result_cpu[(N + 2) * (N + 2)]; | |
dim3 Grid(Grid_Dim_x, Grid_Dim_y); | |
dim3 Block(Block_Dim_x, Block_Dim_y); | |
float *d_image, *d_filter, *d_result; | |
cudaMalloc((void**)&d_image, sizeof(image)); | |
cudaMalloc((void**)&d_filter, sizeof(filter)); | |
cudaMalloc((void**)&d_result, sizeof(result)); | |
cudaMemcpy(d_image, image, sizeof(image), cudaMemcpyHostToDevice); | |
cudaMemcpy(d_filter, filter, sizeof(filter), cudaMemcpyHostToDevice); | |
cpu_convolve(image, filter, result, N + 2); | |
// normal gpu_convolve function | |
gpu_convolve << <Grid, Block >> >(d_image, d_filter, d_result, N + 2); | |
cudaMemcpy(d_result_cpu, d_result, sizeof(d_result_cpu), cudaMemcpyDeviceToHost); | |
print_matrix(result, N + 2); | |
printf("\n"); | |
print_matrix(d_result_cpu, N + 2); | |
// prepare filter to send to constant memory | |
cudaMemcpyToSymbol(dev_filter, filter, sizeof(filter)); | |
// gpu_convolve_constant_filter function with constant filter @author Moataz_Farid | |
// gpu_convolve_constant_filter << <Grid, Block >> >(d_image, d_result, N + 2); | |
//cuda copy result | |
cudaMemcpy(d_result_cpu, d_result, sizeof(d_result_cpu), cudaMemcpyDeviceToHost); | |
//print result | |
print_matrix(d_result_cpu, N + 2); | |
cudaFree(d_image); | |
cudaFree(d_filter); | |
cudaFree(d_result); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment