Skip to content

Instantly share code, notes, and snippets.

Last active Oct 12, 2017
What would you like to do?
A minimal CUDA program that naively performs vector addition, component-wise!
// Compile and run with:
// nvcc -o add1 add1.cu && ./add1   (assuming this file is saved as add1.cu)
// This minimal CUDA program performs vector addition, component-wise!
#include <stdio.h>
#include <stdlib.h>
#define N_ELEMENTS 8
// ------------------------------------------------------------------------------------------------
// Element-wise vector addition kernel: c[i] = a[i] + b[i].
//
// A __global__ function executes on the device, so its pointer arguments must
// refer to device memory (main() passes the buffers it obtained from cudaMalloc()).
//
// Each thread handles the single element selected by its flat 1D index
// (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index falls outside
// the N_ELEMENTS-long vectors do nothing, so a launch with more threads than
// elements cannot read or write out of bounds.
__global__ void cuda_vec_add(uint* a, uint* b, uint* c){
    uint i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < N_ELEMENTS)  // Bounds guard: the grid may not divide the data evenly.
        c[i] = a[i] + b[i];
}
// ------------------------------------------------------------------------------------------------
// Abort with a readable message when a CUDA runtime call does not return cudaSuccess.
// Without this, an early failure is silent and later calls fail mysteriously.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cuda_check_err_ = (call);                               \
        if (cuda_check_err_ != cudaSuccess) {                               \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(cuda_check_err_));                   \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
#endif

// Host entry point: allocates three N_ELEMENTS-long vectors on both host and
// device, fills a and b on the host, copies them to the device, launches
// cuda_vec_add() with one block of N_ELEMENTS threads, copies the sum back,
// prints all three vectors and releases every allocation.
// Returns 0 on success; exits with EXIT_FAILURE if an allocation or any CUDA
// call fails.
__host__ int main(){
    puts("MSG  This minimal CUDA program performs vector addition, component-wise!");
    const size_t n_bytes = N_ELEMENTS * sizeof(uint);

    // ----------------------------
    uint *cpu_a, *cpu_b, *cpu_c;  // Host copies of a, b, c!
    uint *gpu_a, *gpu_b, *gpu_c;  // Device copies of a, b, c!

    // ----------------------------
    puts("MALLOC  Data on host!");
    cpu_a = (uint*)malloc(n_bytes);
    cpu_b = (uint*)malloc(n_bytes);
    cpu_c = (uint*)malloc(n_bytes);
    if(cpu_a == NULL || cpu_b == NULL || cpu_c == NULL){
        fputs("Host allocation failed!\n", stderr);
        free(cpu_a);  // free(NULL) is a no-op, so this is safe for whichever ones failed.
        free(cpu_b);
        free(cpu_c);
        return EXIT_FAILURE;
    }

    puts("RUN  cudaMalloc()  Allocate memory for device copies of a, b, c!");
    // cudaMalloc() reports its status through the return value, so it hands back
    // the device address through an out-parameter: we pass the ADDRESS of our
    // pointer (`&gpu_a`) so that cudaMalloc() can store the new address in it.
    CUDA_CHECK(cudaMalloc((void**)&gpu_a, n_bytes));
    CUDA_CHECK(cudaMalloc((void**)&gpu_b, n_bytes));
    CUDA_CHECK(cudaMalloc((void**)&gpu_c, n_bytes));

    // ----------------------------
    puts("INIT  Data on host!");
    for(uint i=0; i<N_ELEMENTS; ++i){
        cpu_a[i] = i;
        cpu_b[i] = 2 * i;
    }

    puts("RUN  cudaMemcpy()  Copy data from host to device!");
    CUDA_CHECK(cudaMemcpy(gpu_a, cpu_a, n_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(gpu_b, cpu_b, n_bytes, cudaMemcpyHostToDevice));

    // ----------------------------
    puts("RUN  cuda_vec_add()  Launch CUDA kernel on the device!");
    cuda_vec_add<<<1, N_ELEMENTS>>>(gpu_a, gpu_b, gpu_c);  // 1 block, one thread per element
    CUDA_CHECK(cudaGetLastError());  // A kernel launch returns nothing; launch errors surface here.

    puts("RUN  cudaMemcpy()  Copy results back to the host");
    // Store the resulting vector `gpu_c` (GPU-side) in `cpu_c` (CPU-side).
    // The status of this blocking copy is checked as well.
    CUDA_CHECK(cudaMemcpy(cpu_c, gpu_c, n_bytes, cudaMemcpyDeviceToHost));

    // ----------------------------
    puts("\nSHOW  Data on host (regardless of where it was computed)!");
    printf("cpu_a\n  ");
    for(uint i=0; i<N_ELEMENTS; ++i)
        printf("%u ", cpu_a[i]);
    putchar('\n');  // End the row so the next label starts on its own line.

    printf("cpu_b\n  ");
    for(uint i=0; i<N_ELEMENTS; ++i)
        printf("%u ", cpu_b[i]);
    putchar('\n');

    printf("cpu_c (from gpu_c, computed on GPU)\n  ");
    for(uint i=0; i<N_ELEMENTS; ++i)
        printf("%u ", cpu_c[i]);
    // The leading "\n" of the next message ends this last row.

    // ----------------------------
    puts("\nRUN  cudaFree()  Free device memory");
    CUDA_CHECK(cudaFree(gpu_a));
    CUDA_CHECK(cudaFree(gpu_b));
    CUDA_CHECK(cudaFree(gpu_c));

    puts("RUN  free()  Free host memory");
    free(cpu_a);
    free(cpu_b);
    free(cpu_c);

    // ----------------------------
    puts("\nExit success!");
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment