Created
February 20, 2016 02:19
-
-
Save jhjin/462e1fca87fb9c3367db to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
// CUDA: grid-stride loop helper.
// Expands to a `for` header that declares `int i`, starts it at the calling
// thread's global x index (blockIdx.x * blockDim.x + threadIdx.x) and advances
// it by the number of threads launched along x (blockDim.x * gridDim.x) for as
// long as `i < n`. Put the loop body directly after the macro invocation.
#define CUDA_KERNEL_LOOP(i, n)                                 \
  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < (n); \
       i += gridDim.x * blockDim.x)
// Threads per block used for every launch in this file.
// 1024 threads per block requires cuda sm_2x or above.
const int CUDA_NUM_THREADS = 1024;

// CUDA: number of blocks needed to give each of N elements its own thread,
// i.e. ceil(N / CUDA_NUM_THREADS).
//   N: number of elements to cover.
// Returns 0 when N <= 0 (nothing to launch), otherwise a positive block count.
inline int GET_BLOCKS(const int N) {
  if (N <= 0) {
    return 0;
  }
  // (N - 1) / T + 1 equals ceil(N / T) for N > 0 and, unlike
  // (N + T - 1) / T, cannot overflow int when N is close to INT_MAX.
  return (N - 1) / CUDA_NUM_THREADS + 1;
}
// Element-wise example kernel: stores x[i] + 10.0f into y[i] for every index
// i that CUDA_KERNEL_LOOP hands to the calling thread (all i with 0 <= i < n
// across the whole launch).
//   n: number of elements to process
//   x: input array, only read by the kernel
//   y: output array, element i is overwritten with the shifted value
__global__ void yourkernel(const int n, const float *x, float *y) {
  CUDA_KERNEL_LOOP(i, n) {
    const float shifted = x[i] + 10.0f;
    y[i] = shifted;
  }
}
// Terminates the program with a diagnostic when a CUDA runtime call fails.
//   status: return code of the CUDA call (or of cudaGetLastError())
//   what:   short description of the operation, used in the message
static void check_cuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what,
            cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Prints a channel x height x width volume stored contiguously with width as
// the fastest-varying dimension: one text row per image row and a blank line
// after each channel.
static void print_volume(const char *title, const float *data,
                         int channel, int height, int width) {
  printf("==> %s\n", title);
  for (int c = 0; c < channel; c++) {
    for (int h = 0; h < height; h++) {
      for (int w = 0; w < width; w++) {
        printf("%5.2f\t", data[(c * height + h) * width + w]);
      }
      printf("\n");
    }
    printf("\n");
  }
}

// Demo driver: fills a 2x3x4 volume with 0..23, runs yourkernel on the GPU and
// prints the input and the result. Exits with a non-zero status as soon as any
// CUDA call fails; returns 0 on success.
int main(int argc, char *argv[]) {
  (void)argc;
  (void)argv;

  // Dimensions are int because GET_BLOCKS and the kernel take int counts.
  const int channel = 2;
  const int height = 3;
  const int width = 4;
  const int length = channel * height * width;
  const size_t bytes = length * sizeof(float);

  // assign initial values; output is zero-initialized so it never holds
  // indeterminate data
  float *input = new float[length];
  float *output = new float[length]();
  for (int i = 0; i < length; i++)
    input[i] = (float)i;

  // allocate device buffers and upload the input
  float *input_cuda = NULL;
  float *output_cuda = NULL;
  check_cuda(cudaMalloc((void **)&input_cuda, bytes), "cudaMalloc(input)");
  check_cuda(cudaMalloc((void **)&output_cuda, bytes), "cudaMalloc(output)");
  check_cuda(cudaMemcpy(input_cuda, input, bytes, cudaMemcpyHostToDevice),
             "cudaMemcpy(input, host to device)");
  // The kernel overwrites every element of output_cuda, so the host output
  // buffer does not need to be uploaded.

  // launch cuda kernel; launch-configuration errors are only reported through
  // cudaGetLastError(), so query it immediately
  yourkernel<<<GET_BLOCKS(length), CUDA_NUM_THREADS>>>(length, input_cuda,
                                                       output_cuda);
  check_cuda(cudaGetLastError(), "yourkernel launch");

  // memcpy device to host; this blocking copy also reports errors raised
  // while the kernel was executing
  check_cuda(cudaMemcpy(output, output_cuda, bytes, cudaMemcpyDeviceToHost),
             "cudaMemcpy(output, device to host)");

  // The kernel only reads its input, so the host copy is printed directly.
  print_volume("input", input, channel, height, width);
  print_volume("output", output, channel, height, width);

  // free all
  check_cuda(cudaFree(input_cuda), "cudaFree(input)");
  check_cuda(cudaFree(output_cuda), "cudaFree(output)");
  delete[] input;
  delete[] output;
  return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Why are you using `CUDA_KERNEL_LOOP(i, n)` when `n == length`? I'm confused.