Skip to content

Instantly share code, notes, and snippets.

@starrify
Created May 27, 2014 14:38
Show Gist options
  • Save starrify/eb32bd6e75255f794dc3 to your computer and use it in GitHub Desktop.
Save starrify/eb32bd6e75255f794dc3 to your computer and use it in GitHub Desktop.
a simple sample of a CUDA C program for Duan Dinglong
/*
* This is a simple sample of a CUDA C program for Duan Dinglong,
* By Pengyu CHEN (pengyu[at]libstarrify.so)
* COPYLEFT, ALL WRONGS RESERVED.
*
* usage: nvcc vec_add_test.cu && time ./a.out GPU 1000000
*/
#include <cuda.h>
#include <vector>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Check the cudaError_t returned by a CUDA runtime call: on failure, print
 * the numeric code, its message, and the call site, then abort the process.
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * and composes safely with unbraced if/else (the original bare { } block
 * breaks `if (x) CUDA_SAFE_CALL(...); else ...`).
 */
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if( cudaSuccess != err) \
{ \
fprintf(stderr, "Error %x: %s in file '%s' in line %i.\n", \
err, cudaGetErrorString(err), __FILE__, __LINE__ ); \
exit(EXIT_FAILURE); \
} \
} while (0)
// adjust this value according to the graphics card you use
#define THREAD_SIZE 1024
#define TEST_RANDOMIZE 0
#define TEST_VERIFY 0
/*
 * Host (CPU) reference implementation of element-wise vector addition:
 * writes ptr_dst_C[i] = ptr_src_A[i] + ptr_src_B[i] for every i in [0, n).
 * A zero or negative n is a no-op.
 */
static void vec_add_test_CPU(
long n, const int *ptr_src_A, const int *ptr_src_B, int *ptr_dst_C)
{
long i;
for (i = 0; i < n; ++i)
{
ptr_dst_C[i] = ptr_src_B[i] + ptr_src_A[i];
}
}
/*
 * Device kernel: one thread per element, C[idx] = A[idx] + B[idx].
 * Expects a 1-D launch with at least n total threads; the bounds check
 * discards the tail threads of the last block when n is not a multiple
 * of the block size.
 */
static __global__ void vec_add_test_GPU_kernel(
long n, const int *ptr_src_A, const int *ptr_src_B, int *ptr_dst_C)
{
// promote to long BEFORE multiplying: blockIdx.x * blockDim.x is otherwise
// evaluated in unsigned int and can silently wrap for very large grids
long idx = (long)blockIdx.x * blockDim.x + threadIdx.x;
// the comparison is important here: without it the last block would
// read/write past the end of the arrays
if (idx < n)
ptr_dst_C[idx] = ptr_src_A[idx] + ptr_src_B[idx];
return;
}
/*
 * Host wrapper for the GPU vector addition.
 * Allocates three device buffers of n ints, copies A and B to the device,
 * launches the kernel with ceil(n / THREAD_SIZE) blocks of THREAD_SIZE
 * threads, copies the result back into ptr_dst_C, and frees the buffers.
 * Any CUDA runtime failure aborts the process via CUDA_SAFE_CALL.
 */
static void vec_add_test_GPU(
long n, const int *ptr_src_A, const int *ptr_src_B, int *ptr_dst_C)
{
long byte_size = n * sizeof(*ptr_src_A);
int *dptr_src_A;
int *dptr_src_B;
int *dptr_dst_C;
CUDA_SAFE_CALL(cudaMalloc(&dptr_src_A, byte_size));
CUDA_SAFE_CALL(cudaMalloc(&dptr_src_B, byte_size));
CUDA_SAFE_CALL(cudaMalloc(&dptr_dst_C, byte_size));
CUDA_SAFE_CALL(cudaMemcpy(dptr_src_A, ptr_src_A, byte_size,
cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(dptr_src_B, ptr_src_B, byte_size,
cudaMemcpyHostToDevice));
// ceiling division so the grid covers all n elements
long block_size = (n + THREAD_SIZE - 1) / THREAD_SIZE;
vec_add_test_GPU_kernel<<<block_size, THREAD_SIZE>>>(
n, dptr_src_A, dptr_src_B, dptr_dst_C);
// a kernel launch returns no status itself: query it explicitly so a bad
// launch configuration (e.g. grid too large) is reported here, not later
CUDA_SAFE_CALL(cudaGetLastError());
// you can safely comment this following line here. see:
// http://stackoverflow.com/questions/11888772/when-to-call-cudadevicesynchronize
CUDA_SAFE_CALL(cudaDeviceSynchronize());
CUDA_SAFE_CALL(cudaMemcpy(ptr_dst_C, dptr_dst_C, byte_size,
cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(dptr_src_A));
CUDA_SAFE_CALL(cudaFree(dptr_src_B));
CUDA_SAFE_CALL(cudaFree(dptr_dst_C));
return;
}
/*
 * Print the command-line usage string to stderr and terminate.
 * Does not return.  Exits with EXIT_FAILURE (the original exited with 0,
 * which wrongly signalled success on a usage error) and terminates the
 * message with a newline so it is not glued to the next shell prompt.
 */
static void show_usage_and_exit(char **argv)
{
fprintf(stderr, "Usage: %s CPU/GPU vector_size\n", argv[0]);
exit(EXIT_FAILURE);
// this line shall not be reached
return;
}
/*
 * Entry point.  usage: ./a.out CPU|GPU vector_size
 * Builds two input vectors (randomized when TEST_RANDOMIZE is set), runs
 * the requested implementation, and (when TEST_VERIFY is set) checks the
 * result element-wise on the host.
 */
int main(int argc, char **argv)
{
if (argc != 3)
show_usage_and_exit(argv);
const long vector_size = atol(argv[2]);
// atol returns 0 on non-numeric input, and a negative size would make the
// std::vector constructor throw — reject both up front
if (vector_size <= 0)
show_usage_and_exit(argv);
std::vector<int> src_A(vector_size);
std::vector<int> src_B(vector_size);
std::vector<int> dst_C(vector_size);
#if TEST_RANDOMIZE
srandom(0xdeadbeef);
// index is long so it cannot overflow when vector_size > INT_MAX
for (long i = 0; i < vector_size; i++)
{
src_A[i] = random() % 256;
src_B[i] = random() % 256;
}
#endif // TEST_RANDOMIZE
if (!strcmp(argv[1], "CPU"))
vec_add_test_CPU(vector_size, src_A.data(), src_B.data(), dst_C.data());
else if (!strcmp(argv[1], "GPU"))
vec_add_test_GPU(vector_size, src_A.data(), src_B.data(), dst_C.data());
else
show_usage_and_exit(argv);
#if TEST_VERIFY
for (long i = 0; i < vector_size; i++)
assert(src_A[i] + src_B[i] == dst_C[i]);
#endif // TEST_VERIFY
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment