@yogesh-desai, forked from wh5a/mp6.c (last active January 3, 2019)
/*
* Title: 2D Image Convolution on the GPU using Shared Memory and Constant Memory.
*
* Image Size: 2048 X 2048
* Mask Size: 64 X 64
* TILE_WIDTH: 32
*
* */
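/*
 * Build (a sketch, an assumption not stated in the original: the file keeps its
 * .c extension, so the -x cu flag forces nvcc to compile it as CUDA):
 *   nvcc -x cu -O2 -o convolution mp6.c
 */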
#include<stdio.h>
#include<cuda_runtime_api.h>
#include<cuda.h>
#include<stdlib.h>
#include<assert.h>
// Check ec2-174-129-21-232.compute-1.amazonaws.com:8080/mp/6 for more information
#define wbCheck(stmt) do { \
    cudaError_t err = stmt; \
    if (err != cudaSuccess) { \
        printf("Failed to run stmt at line %d\n", __LINE__); \
        printf("Got CUDA error ... %s\n", cudaGetErrorString(err)); \
        return -1; \
    } \
} while(0)
#define Mask_width 64
#define Mask_height 64
#define Mask_radius_x (Mask_width / 2)
#define Mask_radius_y (Mask_height / 2)
#define TILE_WIDTH 32 // 32 X 32 tile
#define w_x (TILE_WIDTH + Mask_width - 1) // shared-memory tile width needed for this mask size
#define w_y (TILE_WIDTH + Mask_height - 1) // shared-memory tile height
#define clamp(x) (min(max((x), 0.0), 1.0))
//@@ INSERT CODE HERE
__global__ void convolution(float *I, const float* __restrict__ M, float *P,int channels, int width, int height)
{
__shared__ float N_ds[w_y][w_x];
int k;
for (k = 0; k < channels; k++)
{
// Phase 1: load data into shared memory. Each thread loads multiple elements, one per batch.
//   dest:         row-major linear index into the shared-memory tile
//   destY, destX: row and column within the shared-memory tile
//   srcY, srcX:   row and column to fetch from the input image
//   src:          row-major linear index into the input image
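// Worked example (for the sizes above): with TILE_WIDTH = 32 and a 64 X 64 mask,
// w_x = w_y = 32 + 64 - 1 = 95, so the shared tile holds 95 * 95 = 9025 floats
// while a block has only 32 * 32 = 1024 threads. Each thread therefore loads
// ceil(9025 / 1024) = 9 elements: one in the first batch below, and up to 8 more
// in the batched loop that follows (the <= bound compensates for the integer
// division truncating 9025 / 1024 down to 8).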
// First batch load
int dest = threadIdx.y * TILE_WIDTH + threadIdx.x,
destY = dest / w_x, destX = dest % w_x,
srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius_y,
srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius_x,
src = (srcY * width + srcX) * channels + k;
if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
N_ds[destY][destX] = I[src];
else
N_ds[destY][destX] = 0.0;
for (int iter=1; iter <= (w_x * w_y) / (TILE_WIDTH*TILE_WIDTH); iter++)
{
// Subsequent batch loads
dest = threadIdx.y * TILE_WIDTH + threadIdx.x + iter * (TILE_WIDTH * TILE_WIDTH);
destY = dest / w_x, destX = dest % w_x;
srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius_y;
srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius_x;
src = (srcY * width + srcX) * channels + k;
if (destY < w_y && destX < w_x)
{
if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
N_ds[destY][destX] = I[src];
else
N_ds[destY][destX] = 0.0;
}
}
__syncthreads();
float accum = 0;
int y, x;
for (y = 0; y < Mask_height; y++)
for (x = 0; x < Mask_width; x++)
accum += N_ds[threadIdx.y + y][threadIdx.x + x] * M[y * Mask_width + x];
y = blockIdx.y * TILE_WIDTH + threadIdx.y;
x = blockIdx.x * TILE_WIDTH + threadIdx.x;
if (y < height && x < width)
//P[(y * width + x) * channels + k] = clamp(accum);
P[(y * width + x) * channels + k] = accum;
__syncthreads();
}
}
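/*
 * Note: despite the title, the mask above lives in global memory and is read
 * through the read-only data cache (via const float* __restrict__). A minimal
 * sketch of a true __constant__-memory variant follows; c_M is a hypothetical
 * name, not part of the original code. A 64 X 64 float mask occupies 16 KB,
 * well within the 64 KB __constant__-memory limit.
 *
 *   __constant__ float c_M[Mask_height * Mask_width];
 *
 *   // Host side: copy the mask once before the kernel launch.
 *   cudaMemcpyToSymbol(c_M, hostMaskData, Mask_height * Mask_width * sizeof(float));
 *
 *   // Kernel side: replace M[y * Mask_width + x] with c_M[y * Mask_width + x]
 *   // and drop the M parameter from the kernel signature and the launch.
 */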
float convolution_2D_OnHost(float * N,float * M,int width, int height,int i,int j);
int main() {
int maskRows=Mask_height; // Mask dimensions follow the 64 X 64 defines above
int maskColumns=Mask_width;
int imageChannels=1;
int imageWidth=2048;
int imageHeight=2048;
float * hostInputImageData;
float * hostOutputImageData;
float * hostMaskData;
float * deviceInputImageData;
float * deviceOutputImageData;
float * deviceMaskData;
float * outputImageOnHost;
hostMaskData = (float *) malloc(sizeof(float)*maskRows*maskColumns);
for(int i=0;i<maskRows*maskColumns;i++)// Initialize the 64 X 64 mask with all values set to 1.0
{
hostMaskData[i]=1.0;
}
// These asserts came from the original MP, which fixed the mask at 5 X 5;
// they are disabled here because the mask is 64 X 64.
//assert(maskRows == 5); /* mask height is fixed to 5 in this mp */
//assert(maskColumns == 5); /* mask width is fixed to 5 in this mp */
// Allocate host memory
hostInputImageData = (float *) malloc(sizeof(float)*imageWidth*imageHeight);
hostOutputImageData = (float *) malloc(sizeof(float)*imageWidth*imageHeight);
outputImageOnHost = (float *) malloc(sizeof(float)*imageWidth*imageHeight);
for(int i=0;i<imageWidth*imageHeight;i++)// Initialize every pixel to 1.0
{
hostInputImageData[i]=1.0;
}
wbCheck(cudaMalloc((void **) &deviceInputImageData, imageWidth * imageHeight * imageChannels * sizeof(float)));
wbCheck(cudaMalloc((void **) &deviceOutputImageData, imageWidth * imageHeight * imageChannels * sizeof(float)));
wbCheck(cudaMalloc((void **) &deviceMaskData, maskRows * maskColumns * sizeof(float)));
wbCheck(cudaMemcpy(deviceInputImageData,
hostInputImageData,
imageWidth * imageHeight * imageChannels * sizeof(float),
cudaMemcpyHostToDevice));
wbCheck(cudaMemcpy(deviceMaskData,
hostMaskData,
maskRows * maskColumns * sizeof(float),
cudaMemcpyHostToDevice));
dim3 dimGrid(((imageWidth-1)/TILE_WIDTH)+1, ((imageHeight-1)/TILE_WIDTH)+1,1);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
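// With a 2048 X 2048 image and TILE_WIDTH = 32, this launches a 64 X 64 grid
// (4096 blocks) of 32 X 32 threads (1024 threads per block).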
convolution<<<dimGrid,dimBlock>>>(deviceInputImageData, deviceMaskData, deviceOutputImageData,
imageChannels, imageWidth, imageHeight);
cudaError_t err1 = cudaPeekAtLastError();
cudaDeviceSynchronize();
printf("Kernel launch status ... %s \n", cudaGetErrorString(err1));
cudaMemcpy(hostOutputImageData,
deviceOutputImageData,
imageWidth * imageHeight * imageChannels * sizeof(float),
cudaMemcpyDeviceToHost);
//printf("\n Output from Device:\n");
#if 0
for(int i=0;i<imageWidth*imageHeight;i++)
{
if(i>0 && (i%imageWidth==0))
printf("\n");
printf("%0.2f \t",*(hostOutputImageData+i));
}
#endif
//Convolution on Host
for(int i=0;i<imageWidth;i++)
{
for(int j=0;j<imageHeight;j++)
{
outputImageOnHost[(i*imageWidth)+j]=convolution_2D_OnHost(hostInputImageData,hostMaskData,imageWidth,imageHeight,i,j);
}
}
//printf("\n Output from Host:\n");
#if 0
for(int i=0;i<imageWidth*imageHeight;i++)
{
if(i>0 && (i%imageWidth==0))
printf("\n");
printf("%0.2f \t",*(outputImageOnHost+i));
}
#endif
for(int i=0;i<imageWidth*imageHeight;i++)
{
if(outputImageOnHost[i]!=hostOutputImageData[i])
{
printf("\nMismatch at (Row,Col) = [%d][%d], hostComputed[]: %0.0f And device[]: %0.0f", i / imageWidth, i % imageHeight, outputImageOnHost[i], hostOutputImageData[i]);
}
}
cudaFree(deviceInputImageData);
cudaFree(deviceOutputImageData);
cudaFree(deviceMaskData);
free(hostMaskData);
free(hostInputImageData);
free(hostOutputImageData);
free(outputImageOnHost);
printf("\n Number of Threads Per Block created in code: %d",TILE_WIDTH*TILE_WIDTH);
printf("\n Number of Blocks Created : %d",(((imageWidth-1)/TILE_WIDTH)+1)*(((imageHeight-1)/TILE_WIDTH)+1));
printf("\n No Error\n");
return 0;
}
float convolution_2D_OnHost(float * N,float * M,int width, int height,int i,int j)
{
float Pvalue=0.0;
int N_start_point_i = i - (Mask_height/2); // i indexes rows, so use the vertical radius
int N_start_point_j = j - (Mask_width/2);  // j indexes columns, so use the horizontal radius
for(int k=0;k<Mask_height;k++)
{
for(int l=0;l<Mask_width;l++)
{
if(((N_start_point_i+k)>=0) && ((N_start_point_i+k)<height)&&((N_start_point_j+l)>=0)&&((N_start_point_j+l)<width))
{
Pvalue+=N[(N_start_point_i+k)*width+(N_start_point_j+l)]*M[(k*Mask_width)+l];
}
}
}
}
// return(clamp(Pvalue));
return((Pvalue));
}
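/*
 * Note: the verification loop in main() compares floats with exact equality.
 * That happens to be safe here because every pixel and mask value is 1.0f, so
 * each sum is a small integer (at most 64 * 64 = 4096) that float represents
 * exactly. For general data, a tolerance-based comparison is the safer pattern;
 * a minimal sketch follows (nearlyEqual is a hypothetical helper, not part of
 * the original code).
 */
#include <math.h> /* fabsf, fmaxf */
static int nearlyEqual(float a, float b, float relTol, float absTol)
{
    float diff = fabsf(a - b);
    if (diff <= absTol) return 1;               /* absolute check near zero */
    float larger = fmaxf(fabsf(a), fabsf(b));
    return diff <= larger * relTol;             /* relative check otherwise */
}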
@vijaytida

Hi, if I increase the number of channels, what changes should I make to this code?
