Skip to content

Instantly share code, notes, and snippets.

@DWarez
Created May 20, 2024 09:55
Show Gist options
  • Save DWarez/90515ace919f5dca6e65a4d35f09a8b5 to your computer and use it in GitHub Desktop.
Save DWarez/90515ace919f5dca6e65a4d35f09a8b5 to your computer and use it in GitHub Desktop.
A simple CUDA Hello World
#include <stdio.h>
// Macro for checking CUDA errors.
// Evaluates `err` exactly once; if the result is not cudaSuccess, prints the
// CUDA error string and the line number of the macro invocation to stderr and
// terminates the process with EXIT_FAILURE.
// The do { ... } while (0) wrapper makes the expansion a single statement, so
// the macro is safe to use inside an unbraced if/else.
#define CUDA_CHECK_ERROR(err) \
    do { \
        cudaError_t cuda_check_status_ = (err); \
        if (cuda_check_status_ != cudaSuccess) { \
            fprintf(stderr, "CUDA err: %s at line %d\n", \
                    cudaGetErrorString(cuda_check_status_), __LINE__); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Kernel definition
// __global__ marks a kernel: the function executes on the device and is
// launched from host code with the <<<grid, block>>> syntax.
// Computes C[i] = A[i] + B[i] for every i in [0, n), one element per thread.
// Launch layout: 1D grid of 1D blocks providing at least n threads in total.
// A, B and C must be device pointers to at least n floats each.
__global__ void vecAddKernel(const float* A, const float* B, float* C, int n) {
    // Global index = offset of this block + index of the thread inside the block.
    // Computed in 64 bits so that the tail block cannot wrap around to a
    // negative int when n is close to INT_MAX.
    long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    // The guard is needed because the grid may contain more threads than elements.
    if (i < n) {
        C[i] = A[i] + B[i];
    }
}
// Host function: computes C_h[i] = A_h[i] + B_h[i] for i in [0, n) on the GPU.
// A_h, B_h: input vectors in host memory, at least n elements each.
// C_h: output vector in host memory, room for at least n elements.
// n: number of elements; the function returns immediately when n <= 0.
// Every CUDA call is checked with CUDA_CHECK_ERROR, which prints the error and
// terminates the process on failure.
void vecAdd(float* A_h, float* B_h, float* C_h, int n) {
    // Nothing to compute, and a grid with zero blocks is an invalid launch
    // configuration, so return before touching the device.
    if (n <= 0) {
        return;
    }

    // Byte count as size_t so that n * sizeof(float) cannot overflow an int.
    size_t size = (size_t)n * sizeof(float);

    // A_d, B_d and C_d are the input and output vectors in the device
    float *A_d = NULL, *B_d = NULL, *C_d = NULL;
    cudaError_t error;

    // Allocate device memory for the three vectors.
    error = cudaMalloc((void**)&A_d, size);
    CUDA_CHECK_ERROR(error);
    error = cudaMalloc((void**)&B_d, size);
    CUDA_CHECK_ERROR(error);
    error = cudaMalloc((void**)&C_d, size);
    CUDA_CHECK_ERROR(error);

    // Copy the input vectors from the host to the device.
    error = cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice);
    CUDA_CHECK_ERROR(error);
    error = cudaMemcpy(B_d, B_h, size, cudaMemcpyHostToDevice);
    CUDA_CHECK_ERROR(error);

    // Launch configuration: 256 threads per block and enough blocks to cover
    // all n elements. The block count is an integer ceiling division; dividing
    // first and then calling ceil() would truncate and drop the tail elements
    // whenever n is not a multiple of the block size. Because n > 0 here,
    // (n - 1) / threads + 1 cannot overflow.
    int number_of_threads = 256;
    unsigned int number_of_blocks =
        (unsigned int)((n - 1) / number_of_threads + 1);
    dim3 dimGrid(number_of_blocks, 1, 1);
    dim3 dimBlock(number_of_threads, 1, 1);
    vecAddKernel<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, n);
    // A kernel launch returns no status itself; launch-configuration errors
    // are reported through cudaGetLastError().
    error = cudaGetLastError();
    CUDA_CHECK_ERROR(error);

    // Copy the result from the device to the host. This blocking copy is also
    // where an error raised while the kernel was running would be reported.
    error = cudaMemcpy(C_h, C_d, size, cudaMemcpyDeviceToHost);
    CUDA_CHECK_ERROR(error);

    // Free the memory allocated in the device.
    error = cudaFree(A_d);
    CUDA_CHECK_ERROR(error);
    error = cudaFree(B_d);
    CUDA_CHECK_ERROR(error);
    error = cudaFree(C_d);
    CUDA_CHECK_ERROR(error);
}
// Entry point: fills two 512-element vectors with the values 0..511, adds them
// on the GPU through vecAdd, and prints every element of the result on its own
// line.
int main() {
    // const makes n a constant expression, so the arrays below are ordinary
    // fixed-size arrays instead of variable-length arrays (which are only a
    // compiler extension in C++).
    const int n = 512;
    float A_h[n], B_h[n], C_h[n];

    for (int i = 0; i < n; i++) {
        A_h[i] = (float)i;
        B_h[i] = (float)i;
    }

    vecAdd(A_h, B_h, C_h, n);

    for (int i = 0; i < n; i++) {
        printf("%g\n", C_h[i]);
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment