Created
May 27, 2014 14:38
-
-
Save starrify/eb32bd6e75255f794dc3 to your computer and use it in GitHub Desktop.
a simple sample of a CUDA C program for Duan Dinglong
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * This is a simple sample of a CUDA C program for Duan Dinglong,
 * By Pengyu CHEN (pengyu[at]libstarrify.so)
 * COPYLEFT, ALL WRONGS RESERVED.
 *
 * usage: nvcc vec_add_test.cu && time ./a.out GPU 1000000
 */
#include <cuda.h> | |
#include <vector> | |
#include <assert.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
/*
 * Check the cudaError_t returned by a CUDA runtime call; on failure print a
 * diagnostic (numeric code, message, file, line) to stderr and abort.
 * Wrapped in do { } while (0) so the macro expands to a single statement and
 * is safe inside unbraced if/else; (call) is parenthesized so complex
 * expressions expand correctly; %d matches the int-typed error enum (the
 * original %x printed it in unprefixed hex, which reads misleadingly).
 */
#define CUDA_SAFE_CALL(call) \
    do { \
        cudaError_t err = (call); \
        if (cudaSuccess != err) \
        { \
            fprintf(stderr, "Error %d: %s in file '%s' in line %i.\n", \
                err, cudaGetErrorString(err), __FILE__, __LINE__); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Threads per block; adjust this value according to the graphics card you use.
#define THREAD_SIZE 1024
// Compile-time switches: fill inputs with random data / verify GPU vs CPU sums.
#define TEST_RANDOMIZE 0
#define TEST_VERIFY 0
/*
 * Reference CPU implementation of element-wise vector addition:
 * ptr_dst_C[i] = ptr_src_A[i] + ptr_src_B[i] for every i in [0, n).
 */
static void vec_add_test_CPU(
    long n, const int *ptr_src_A, const int *ptr_src_B, int *ptr_dst_C)
{
    const int *a = ptr_src_A;
    const int *b = ptr_src_B;
    int *c = ptr_dst_C;
    for (long remaining = n; remaining > 0; remaining--)
        *c++ = *a++ + *b++;
}
/*
 * GPU kernel: element-wise vector addition C = A + B.
 * Expects a 1-D launch where each thread handles exactly one element;
 * the grid is sized by the host wrapper as ceil(n / blockDim.x) blocks.
 */
static __global__ void vec_add_test_GPU_kernel(
    long n, const int *ptr_src_A, const int *ptr_src_B, int *ptr_dst_C)
{
    // Widen BEFORE the multiply: blockIdx.x and blockDim.x are unsigned int,
    // so blockIdx.x * blockDim.x wraps at 2^32 for very large n if the
    // product is formed in unsigned int and only then assigned to long.
    long idx = (long)blockIdx.x * blockDim.x + threadIdx.x;
    // The bounds check is important here: the grid is rounded up, so the
    // last block's trailing threads fall past the end of the vectors.
    if (idx < n)
        ptr_dst_C[idx] = ptr_src_A[idx] + ptr_src_B[idx];
}
/*
 * Host wrapper for the GPU vector add: allocates device buffers, copies the
 * two source vectors to the device, launches the kernel with a 1-D grid of
 * THREAD_SIZE-wide blocks, and copies the result back into ptr_dst_C.
 * All device memory is freed before returning.
 */
static void vec_add_test_GPU(
    long n, const int *ptr_src_A, const int *ptr_src_B, int *ptr_dst_C)
{
    long byte_size = n * sizeof(*ptr_src_A);
    int *dptr_src_A;
    int *dptr_src_B;
    int *dptr_dst_C;
    CUDA_SAFE_CALL(cudaMalloc(&dptr_src_A, byte_size));
    CUDA_SAFE_CALL(cudaMalloc(&dptr_src_B, byte_size));
    CUDA_SAFE_CALL(cudaMalloc(&dptr_dst_C, byte_size));
    CUDA_SAFE_CALL(cudaMemcpy(dptr_src_A, ptr_src_A, byte_size,
        cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(dptr_src_B, ptr_src_B, byte_size,
        cudaMemcpyHostToDevice));
    // Round the grid size up so every element gets a thread.
    long block_size = (n + THREAD_SIZE - 1) / THREAD_SIZE;
    vec_add_test_GPU_kernel<<<block_size, THREAD_SIZE>>>(
        n, dptr_src_A, dptr_src_B, dptr_dst_C);
    // A kernel launch returns no status directly: launch-configuration
    // errors (e.g. a grid dimension past the device limit) only surface
    // through cudaGetLastError(), so check it explicitly.
    CUDA_SAFE_CALL(cudaGetLastError());
    // You can safely comment out this following line: the blocking
    // cudaMemcpy below already synchronizes with the kernel. See:
    // http://stackoverflow.com/questions/11888772/when-to-call-cudadevicesynchronize
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
    CUDA_SAFE_CALL(cudaMemcpy(ptr_dst_C, dptr_dst_C, byte_size,
        cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaFree(dptr_src_A));
    CUDA_SAFE_CALL(cudaFree(dptr_src_B));
    CUDA_SAFE_CALL(cudaFree(dptr_dst_C));
}
/*
 * Print a usage line to stderr and terminate the process.
 * Both call sites reach this only on invalid command-line arguments, so
 * exit with a failure status (the original exited with 0, which would make
 * scripts treat the error as success); also terminate the message with a
 * newline so it does not run into the next shell prompt.
 */
static void show_usage_and_exit(char **argv)
{
    fprintf(stderr, "Usage: %s CPU/GPU vector_size\n", argv[0]);
    exit(EXIT_FAILURE);
    // this line shall not be reached
    return;
}
/*
 * Entry point.  argv[1] selects the implementation ("CPU" or "GPU"),
 * argv[2] is the vector length.  Optionally randomizes the inputs and
 * verifies the result, governed by TEST_RANDOMIZE / TEST_VERIFY.
 */
int main(int argc, char **argv)
{
    if (argc != 3)
        show_usage_and_exit(argv);
    const long vector_size = atol(argv[2]);
    // atol() returns 0 on unparseable input and happily accepts negative
    // numbers; either would break the vector allocations below, so reject
    // anything non-positive up front.
    if (vector_size <= 0)
        show_usage_and_exit(argv);
    std::vector<int> src_A(vector_size);
    std::vector<int> src_B(vector_size);
    std::vector<int> dst_C(vector_size);
#if TEST_RANDOMIZE
    // Fixed seed so randomized runs stay reproducible.
    srandom(0xdeadbeef);
    for (long i = 0; i < vector_size; i++)
    {
        src_A[i] = random() % 256;
        src_B[i] = random() % 256;
    }
#endif // TEST_RANDOMIZE
    if (!strcmp(argv[1], "CPU"))
        vec_add_test_CPU(vector_size, src_A.data(), src_B.data(), dst_C.data());
    else if (!strcmp(argv[1], "GPU"))
        vec_add_test_GPU(vector_size, src_A.data(), src_B.data(), dst_C.data());
    else
        show_usage_and_exit(argv);
#if TEST_VERIFY
    for (long i = 0; i < vector_size; i++)
        assert(src_A[i] + src_B[i] == dst_C[i]);
#endif // TEST_VERIFY
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment