Created
May 27, 2014 14:38
-
-
Save starrify/eb32bd6e75255f794dc3 to your computer and use it in GitHub Desktop.
a simple sample of a CUDA C program for Duan Dinglong
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * This is a simple sample of a CUDA C program for Duan Dinglong,
 * By Pengyu CHEN (pengyu[at]libstarrify.so)
 * COPYLEFT, ALL WRONGS RESERVED.
 *
 * usage: nvcc vec_add_test.cu && time ./a.out GPU 1000000
 */
#include <cuda.h> | |
#include <vector> | |
#include <assert.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
/*
 * Check the cudaError_t returned by a CUDA runtime call; on failure print a
 * diagnostic (numeric code, message, file, line) to stderr and abort.
 * Wrapped in do { } while (0) so the macro expands to a single statement and
 * is safe inside unbraced if/else; (call) is parenthesized so complex
 * expressions expand correctly; %d matches the int-typed error enum (the
 * original %x printed it in unprefixed hex, which reads misleadingly).
 */
#define CUDA_SAFE_CALL(call) \
    do { \
        cudaError_t err = (call); \
        if (cudaSuccess != err) \
        { \
            fprintf(stderr, "Error %d: %s in file '%s' in line %i.\n", \
                err, cudaGetErrorString(err), __FILE__, __LINE__); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Threads per block; adjust this value according to the graphics card you use.
#define THREAD_SIZE 1024
// Compile-time switches: fill inputs with random data / verify GPU vs CPU sums.
#define TEST_RANDOMIZE 0
#define TEST_VERIFY 0
/*
 * Reference CPU implementation of element-wise vector addition:
 * ptr_dst_C[i] = ptr_src_A[i] + ptr_src_B[i] for every i in [0, n).
 */
static void vec_add_test_CPU(
    long n, const int *ptr_src_A, const int *ptr_src_B, int *ptr_dst_C)
{
    const int *a = ptr_src_A;
    const int *b = ptr_src_B;
    int *c = ptr_dst_C;
    for (long remaining = n; remaining > 0; remaining--)
        *c++ = *a++ + *b++;
}
/*
 * GPU kernel: element-wise vector addition C = A + B.
 * Expects a 1-D launch where each thread handles exactly one element;
 * the grid is sized by the host wrapper as ceil(n / blockDim.x) blocks.
 */
static __global__ void vec_add_test_GPU_kernel(
    long n, const int *ptr_src_A, const int *ptr_src_B, int *ptr_dst_C)
{
    // Widen BEFORE the multiply: blockIdx.x and blockDim.x are unsigned int,
    // so blockIdx.x * blockDim.x wraps at 2^32 for very large n if the
    // product is formed in unsigned int and only then assigned to long.
    long idx = (long)blockIdx.x * blockDim.x + threadIdx.x;
    // The bounds check is important here: the grid is rounded up, so the
    // last block's trailing threads fall past the end of the vectors.
    if (idx < n)
        ptr_dst_C[idx] = ptr_src_A[idx] + ptr_src_B[idx];
}
/*
 * Host wrapper for the GPU vector add: allocates device buffers, copies the
 * two source vectors to the device, launches the kernel with a 1-D grid of
 * THREAD_SIZE-wide blocks, and copies the result back into ptr_dst_C.
 * All device memory is freed before returning.
 */
static void vec_add_test_GPU(
    long n, const int *ptr_src_A, const int *ptr_src_B, int *ptr_dst_C)
{
    long byte_size = n * sizeof(*ptr_src_A);
    int *dptr_src_A;
    int *dptr_src_B;
    int *dptr_dst_C;
    CUDA_SAFE_CALL(cudaMalloc(&dptr_src_A, byte_size));
    CUDA_SAFE_CALL(cudaMalloc(&dptr_src_B, byte_size));
    CUDA_SAFE_CALL(cudaMalloc(&dptr_dst_C, byte_size));
    CUDA_SAFE_CALL(cudaMemcpy(dptr_src_A, ptr_src_A, byte_size,
        cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(dptr_src_B, ptr_src_B, byte_size,
        cudaMemcpyHostToDevice));
    // Round the grid size up so every element gets a thread.
    long block_size = (n + THREAD_SIZE - 1) / THREAD_SIZE;
    vec_add_test_GPU_kernel<<<block_size, THREAD_SIZE>>>(
        n, dptr_src_A, dptr_src_B, dptr_dst_C);
    // A kernel launch returns no status directly: launch-configuration
    // errors (e.g. a grid dimension past the device limit) only surface
    // through cudaGetLastError(), so check it explicitly.
    CUDA_SAFE_CALL(cudaGetLastError());
    // You can safely comment out this following line: the blocking
    // cudaMemcpy below already synchronizes with the kernel. See:
    // http://stackoverflow.com/questions/11888772/when-to-call-cudadevicesynchronize
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
    CUDA_SAFE_CALL(cudaMemcpy(ptr_dst_C, dptr_dst_C, byte_size,
        cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaFree(dptr_src_A));
    CUDA_SAFE_CALL(cudaFree(dptr_src_B));
    CUDA_SAFE_CALL(cudaFree(dptr_dst_C));
}
/*
 * Print a usage line to stderr and terminate the process.
 * Both call sites reach this only on invalid command-line arguments, so
 * exit with a failure status (the original exited with 0, which would make
 * scripts treat the error as success); also terminate the message with a
 * newline so it does not run into the next shell prompt.
 */
static void show_usage_and_exit(char **argv)
{
    fprintf(stderr, "Usage: %s CPU/GPU vector_size\n", argv[0]);
    exit(EXIT_FAILURE);
    // this line shall not be reached
    return;
}
/*
 * Entry point.  argv[1] selects the implementation ("CPU" or "GPU"),
 * argv[2] is the vector length.  Optionally randomizes the inputs and
 * verifies the result, governed by TEST_RANDOMIZE / TEST_VERIFY.
 */
int main(int argc, char **argv)
{
    if (argc != 3)
        show_usage_and_exit(argv);
    const long vector_size = atol(argv[2]);
    // atol() returns 0 on unparseable input and happily accepts negative
    // numbers; either would break the vector allocations below, so reject
    // anything non-positive up front.
    if (vector_size <= 0)
        show_usage_and_exit(argv);
    std::vector<int> src_A(vector_size);
    std::vector<int> src_B(vector_size);
    std::vector<int> dst_C(vector_size);
#if TEST_RANDOMIZE
    // Fixed seed so randomized runs stay reproducible.
    srandom(0xdeadbeef);
    for (long i = 0; i < vector_size; i++)
    {
        src_A[i] = random() % 256;
        src_B[i] = random() % 256;
    }
#endif // TEST_RANDOMIZE
    if (!strcmp(argv[1], "CPU"))
        vec_add_test_CPU(vector_size, src_A.data(), src_B.data(), dst_C.data());
    else if (!strcmp(argv[1], "GPU"))
        vec_add_test_GPU(vector_size, src_A.data(), src_B.data(), dst_C.data());
    else
        show_usage_and_exit(argv);
#if TEST_VERIFY
    for (long i = 0; i < vector_size; i++)
        assert(src_A[i] + src_B[i] == dst_C[i]);
#endif // TEST_VERIFY
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment