@wh5a
Last active April 5, 2024 08:02
CUDA 2D Convolution.
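For reference (my summary, not text from the original gist): for each channel $c$, the kernel computes a clamped 5×5 convolution applied without flipping the mask (i.e., a correlation), with zero padding at the image borders,

$$P(y, x, c) = \operatorname{clamp}\!\left(\sum_{i=0}^{4}\sum_{j=0}^{4} M(i, j)\, I(y + i - r,\; x + j - r,\; c)\right), \qquad r = \lfloor 5/2 \rfloor = 2,$$

where out-of-range samples of $I$ are taken to be 0 and clamp restricts the result to $[0, 1]$.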
#include <wb.h>
#include <assert.h> /* assert() is used in main() below */
// Check ec2-174-129-21-232.compute-1.amazonaws.com:8080/mp/6 for more information
#define wbCheck(stmt) do {                              \
        cudaError_t err = stmt;                         \
        if (err != cudaSuccess) {                       \
            wbLog(ERROR, "Failed to run stmt ", #stmt); \
            return -1;                                  \
        }                                               \
    } while (0)

#define Mask_width  5
#define Mask_radius (Mask_width / 2)
#define TILE_WIDTH  16
#define w           (TILE_WIDTH + Mask_width - 1)
#define clamp(x)    (min(max((x), 0.0), 1.0))
//@@ INSERT CODE HERE
__global__ void convolution(float *I, const float * __restrict__ M, float *P,
                            int channels, int width, int height) {
    __shared__ float N_ds[w][w];
    int k;
    for (k = 0; k < channels; k++) {
        // First batch loading: the TILE_WIDTH^2 threads cooperatively fill the
        // first TILE_WIDTH^2 entries of the (w x w) shared-memory tile, which
        // includes a halo of Mask_radius pixels around the output tile.
        int dest  = threadIdx.y * TILE_WIDTH + threadIdx.x,
            destY = dest / w, destX = dest % w,
            srcY  = blockIdx.y * TILE_WIDTH + destY - Mask_radius,
            srcX  = blockIdx.x * TILE_WIDTH + destX - Mask_radius,
            src   = (srcY * width + srcX) * channels + k;
        if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
            N_ds[destY][destX] = I[src];
        else
            N_ds[destY][destX] = 0; // zero-pad out-of-bounds pixels

        // Second batch loading: cover the remaining w*w - TILE_WIDTH^2 tile
        // entries; threads that would land past the tile do nothing.
        dest  = threadIdx.y * TILE_WIDTH + threadIdx.x + TILE_WIDTH * TILE_WIDTH;
        destY = dest / w;
        destX = dest % w;
        srcY  = blockIdx.y * TILE_WIDTH + destY - Mask_radius;
        srcX  = blockIdx.x * TILE_WIDTH + destX - Mask_radius;
        src   = (srcY * width + srcX) * channels + k;
        if (destY < w) {
            if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
                N_ds[destY][destX] = I[src];
            else
                N_ds[destY][destX] = 0;
        }
        __syncthreads();

        // Each thread computes one output pixel from the shared tile.
        float accum = 0;
        int y, x;
        for (y = 0; y < Mask_width; y++)
            for (x = 0; x < Mask_width; x++)
                accum += N_ds[threadIdx.y + y][threadIdx.x + x] * M[y * Mask_width + x];
        y = blockIdx.y * TILE_WIDTH + threadIdx.y;
        x = blockIdx.x * TILE_WIDTH + threadIdx.x;
        if (y < height && x < width)
            P[(y * width + x) * channels + k] = clamp(accum);
        __syncthreads(); // don't overwrite the tile before every thread is done
    }
}
int main(int argc, char *argv[]) {
    wbArg_t arg;
    int maskRows;
    int maskColumns;
    int imageChannels;
    int imageWidth;
    int imageHeight;
    char *inputImageFile;
    char *inputMaskFile;
    wbImage_t inputImage;
    wbImage_t outputImage;
    float *hostInputImageData;
    float *hostOutputImageData;
    float *hostMaskData;
    float *deviceInputImageData;
    float *deviceOutputImageData;
    float *deviceMaskData;

    arg = wbArg_read(argc, argv); /* parse the input arguments */

    inputImageFile = wbArg_getInputFile(arg, 0);
    inputMaskFile  = wbArg_getInputFile(arg, 1);

    inputImage   = wbImport(inputImageFile);
    hostMaskData = (float *)wbImport(inputMaskFile, &maskRows, &maskColumns);
    assert(maskRows == 5);    /* mask height is fixed to 5 in this mp */
    assert(maskColumns == 5); /* mask width is fixed to 5 in this mp */

    imageWidth    = wbImage_getWidth(inputImage);
    imageHeight   = wbImage_getHeight(inputImage);
    imageChannels = wbImage_getChannels(inputImage);

    outputImage = wbImage_new(imageWidth, imageHeight, imageChannels);

    hostInputImageData  = wbImage_getData(inputImage);
    hostOutputImageData = wbImage_getData(outputImage);

    wbTime_start(GPU, "Doing GPU Computation (memory + compute)");

    wbTime_start(GPU, "Doing GPU memory allocation");
    wbCheck(cudaMalloc((void **)&deviceInputImageData,
                       imageWidth * imageHeight * imageChannels * sizeof(float)));
    wbCheck(cudaMalloc((void **)&deviceOutputImageData,
                       imageWidth * imageHeight * imageChannels * sizeof(float)));
    wbCheck(cudaMalloc((void **)&deviceMaskData,
                       maskRows * maskColumns * sizeof(float)));
    wbTime_stop(GPU, "Doing GPU memory allocation");

    wbTime_start(Copy, "Copying data to the GPU");
    wbCheck(cudaMemcpy(deviceInputImageData, hostInputImageData,
                       imageWidth * imageHeight * imageChannels * sizeof(float),
                       cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceMaskData, hostMaskData,
                       maskRows * maskColumns * sizeof(float),
                       cudaMemcpyHostToDevice));
    wbTime_stop(Copy, "Copying data to the GPU");

    wbTime_start(Compute, "Doing the computation on the GPU");
    //@@ INSERT CODE HERE
    // One TILE_WIDTH x TILE_WIDTH block per output tile; the grid covers the image.
    dim3 dimGrid(ceil((float)imageWidth / TILE_WIDTH),
                 ceil((float)imageHeight / TILE_WIDTH));
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    convolution<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData,
                                       deviceOutputImageData,
                                       imageChannels, imageWidth, imageHeight);
    wbCheck(cudaGetLastError());      /* catch launch failures */
    wbCheck(cudaDeviceSynchronize()); /* kernel launches are asynchronous; wait
                                         so the Compute timer measures the kernel */
    wbTime_stop(Compute, "Doing the computation on the GPU");

    wbTime_start(Copy, "Copying data from the GPU");
    wbCheck(cudaMemcpy(hostOutputImageData, deviceOutputImageData,
                       imageWidth * imageHeight * imageChannels * sizeof(float),
                       cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying data from the GPU");

    wbTime_stop(GPU, "Doing GPU Computation (memory + compute)");

    wbSolution(arg, outputImage);

    cudaFree(deviceInputImageData);
    cudaFree(deviceOutputImageData);
    cudaFree(deviceMaskData);

    free(hostMaskData);

    wbImage_delete(outputImage);
    wbImage_delete(inputImage);

    return 0;
}
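As a usage note (my sketch, not part of the gist): the code above depends on the course's wb.h harness. If you want to exercise the kernel locally without it, a minimal standalone smoke test could look like the following, assuming the convolution kernel and the Mask_width/TILE_WIDTH defines above are visible (e.g., moved into a shared header) and that this main() replaces the wb-based one. All names here other than the kernel's are hypothetical.

// Hypothetical standalone smoke test for the convolution kernel above.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main() {
    const int width = 512, height = 512, channels = 1;
    const size_t imgBytes  = (size_t)width * height * channels * sizeof(float);
    const size_t maskBytes = Mask_width * Mask_width * sizeof(float);

    float *hIn  = (float *)malloc(imgBytes);
    float *hOut = (float *)malloc(imgBytes);
    float hMask[Mask_width * Mask_width];
    for (int i = 0; i < width * height * channels; i++)
        hIn[i] = rand() / (float)RAND_MAX;            // random test image
    for (int i = 0; i < Mask_width * Mask_width; i++)
        hMask[i] = 1.0f / (Mask_width * Mask_width);  // 5x5 box blur

    float *dIn, *dOut, *dMask;
    cudaMalloc((void **)&dIn, imgBytes);
    cudaMalloc((void **)&dOut, imgBytes);
    cudaMalloc((void **)&dMask, maskBytes);
    cudaMemcpy(dIn, hIn, imgBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dMask, hMask, maskBytes, cudaMemcpyHostToDevice);

    dim3 grid((width + TILE_WIDTH - 1) / TILE_WIDTH,
              (height + TILE_WIDTH - 1) / TILE_WIDTH);
    dim3 block(TILE_WIDTH, TILE_WIDTH, 1);
    convolution<<<grid, block>>>(dIn, dMask, dOut, channels, width, height);
    cudaDeviceSynchronize();

    cudaMemcpy(hOut, dOut, imgBytes, cudaMemcpyDeviceToHost);
    // A box blur of uniform random data should give roughly 0.5 away from edges.
    printf("center pixel = %f\n", hOut[(height / 2 * width + width / 2) * channels]);

    cudaFree(dIn); cudaFree(dOut); cudaFree(dMask);
    free(hIn); free(hOut);
    return 0;
}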
ghost commented Jan 30, 2013

Hi,

I wanted to know if I can use your code to debug my own version of the program.

Regards.


eddyrene commented Jul 2, 2017

I tested it and it's very slow: about 10 seconds :( with a 512 x 512 image
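(Editor's note, not a reply from the gist thread: a 5x5 convolution over a 512 x 512 image is a tiny workload for any GPU, so 10 seconds of wall time most likely reflects one-time costs such as CUDA context creation and the wb.h image import/copies rather than the kernel itself. A minimal sketch for timing just the kernel with CUDA events follows, assuming the kernel and defines above are in scope; timeKernelMs is a hypothetical helper name.)

#include <cuda_runtime.h>

// Returns the kernel's execution time in milliseconds.
float timeKernelMs(float *dIn, float *dMask, float *dOut,
                   int channels, int width, int height) {
    dim3 grid((width + TILE_WIDTH - 1) / TILE_WIDTH,
              (height + TILE_WIDTH - 1) / TILE_WIDTH);
    dim3 block(TILE_WIDTH, TILE_WIDTH, 1);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    convolution<<<grid, block>>>(dIn, dMask, dOut, channels, width, height);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop); // wait for the kernel to finish

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}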
