Skip to content

Instantly share code, notes, and snippets.

@jhjin
Created February 20, 2016 02:19
Show Gist options
  • Save jhjin/462e1fca87fb9c3367db to your computer and use it in GitHub Desktop.
Save jhjin/462e1fca87fb9c3367db to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <stdlib.h>
// CUDA: grid-stride loop over the half-open range [0, n).
// Declares an int counter `i` that starts at the calling thread's global 1D
// index and advances by the total number of threads in the grid, so the range
// is covered for any 1D launch configuration. Device code only: it reads
// threadIdx, blockIdx, blockDim and gridDim.
#define CUDA_KERNEL_LOOP(i, n)                          \
  for (int i = threadIdx.x + blockIdx.x * blockDim.x;   \
       i < (n);                                         \
       i += gridDim.x * blockDim.x)
// Use 1024 threads per block, which requires cuda sm_2x or above
const int CUDA_NUM_THREADS = 1024;

// CUDA: number of blocks for threads.
// Returns ceil(N / CUDA_NUM_THREADS), i.e. enough blocks of CUDA_NUM_THREADS
// threads to give each of the N elements its own thread.
//   - N <= 0 returns 1 rather than 0: a grid dimension of 0 is an invalid
//     launch configuration, while a single block whose threads find no work
//     is harmless.
//   - The division is written as (N - 1) / T + 1 so that it cannot overflow
//     int for N close to INT_MAX (the N + T - 1 form can).
inline int GET_BLOCKS(const int N) {
  if (N <= 0) {
    return 1;
  }
  return (N - 1) / CUDA_NUM_THREADS + 1;
}
// Element-wise kernel: y[k] = x[k] + 10.0f for every k in [0, n).
//   n : number of elements to process
//   x : input array, read only
//   y : output array, at least n elements
// Each thread begins at its global 1D index and then jumps ahead by the total
// number of launched threads, so every element is handled regardless of how
// many blocks/threads the launch used.
__global__ void yourkernel(const int n, const float *x, float *y) {
  // Kept unsigned so the increment is computed exactly as blockDim.x * gridDim.x.
  const unsigned int totalThreads = blockDim.x * gridDim.x;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  while (idx < n) {
    y[idx] = 10.0f + x[idx];
    idx += totalThreads;
  }
}
// Terminates the program with a diagnostic on stderr if a CUDA runtime call
// (or a kernel launch, via cudaGetLastError) did not succeed.
//   status : value returned by the CUDA runtime
//   what   : short description of the operation, used in the message
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Prints `title` followed by a channel x height x width array stored
// contiguously with width as the fastest-varying dimension: one text line per
// row, and a blank line after each channel.
static void printTensor(const char *title, const float *data,
                        int channel, int height, int width) {
  printf("%s\n", title);
  for (int c = 0; c < channel; c++) {
    for (int h = 0; h < height; h++) {
      const float *row = data + (c * height + h) * width;
      for (int w = 0; w < width; w++) {
        printf("%5.2f\t", row[w]);
      }
      printf("\n");
    }
    printf("\n");
  }
}

// Demo driver: fills a 2x3x4 host array with 0..23, runs yourkernel on it on
// the GPU, and prints the input and the result. Returns 0 on success; any
// CUDA failure aborts with a non-zero exit status.
int main(int argc, char *argv[]) {
  (void)argc;
  (void)argv;

  const int channel = 2;
  const int height = 3;
  const int width = 4;
  // int on purpose: yourkernel and GET_BLOCKS both take the count as int.
  const int length = channel * height * width;
  const size_t bytes = (size_t)length * sizeof(float);

  // assign initial values
  float *input = new float[length];
  float *output = new float[length];
  for (int i = 0; i < length; i++)
    input[i] = (float)i;

  // memcpy host to device
  // Only the input is uploaded: the kernel overwrites every element of the
  // output buffer, so there is nothing meaningful to send for it.
  float *input_cuda = NULL;
  float *output_cuda = NULL;
  checkCuda(cudaMalloc((void**)&input_cuda, bytes), "cudaMalloc(input)");
  checkCuda(cudaMalloc((void**)&output_cuda, bytes), "cudaMalloc(output)");
  checkCuda(cudaMemcpy(input_cuda, input, bytes, cudaMemcpyHostToDevice),
            "cudaMemcpy(host to device)");

  // launch cuda kernel
  yourkernel<<<GET_BLOCKS(length), CUDA_NUM_THREADS>>>(length, input_cuda, output_cuda);
  // A launch returns no status itself; configuration errors are reported here.
  checkCuda(cudaGetLastError(), "kernel launch");

  // memcpy device to host
  // This blocking copy also waits for the kernel, so a failure during kernel
  // execution is reported by its return code.
  checkCuda(cudaMemcpy(output, output_cuda, bytes, cudaMemcpyDeviceToHost),
            "cudaMemcpy(device to host)");

  // print input and output
  printTensor("==> input", input, channel, height, width);
  printTensor("==> output", output, channel, height, width);

  // free all
  checkCuda(cudaFree(input_cuda), "cudaFree(input)");
  checkCuda(cudaFree(output_cuda), "cudaFree(output)");
  delete[] input;
  delete[] output;
  return 0;
}
@Atcold
Copy link

Atcold commented Aug 29, 2016

Why are you using CUDA_KERNEL_LOOP(i, n) when n == length?
I'm confused.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment