Created
May 20, 2024 09:55
-
-
Save DWarez/90515ace919f5dca6e65a4d35f09a8b5 to your computer and use it in GitHub Desktop.
A simple CUDA Hello World
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>
#include <stdlib.h>   // EXIT_FAILURE, exit — the original relied on stdio.h pulling these in

// Macro for checking CUDA errors.
// Wrapped in do { } while (0) so it expands to a single statement and is
// safe inside an unbraced if/else; the argument is evaluated exactly once
// (the original `if` form evaluated `err` twice and broke under `if (x) CUDA_CHECK_ERROR(e); else ...`).
// On failure, reports the error string and source line to stderr and aborts.
#define CUDA_CHECK_ERROR(err)                                                  \
    do {                                                                       \
        cudaError_t err_ = (err);                                              \
        if (err_ != cudaSuccess) {                                             \
            fprintf(stderr, "CUDA err: %s at line %d\n",                       \
                    cudaGetErrorString(err_), __LINE__);                       \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)
// Kernel: element-wise vector addition, C[i] = A[i] + B[i] for i in [0, n).
// __global__ marks a kernel entry point: it is launched from host code and
// executes on the device (the original comment mis-stated this).
// Expects a 1D grid of 1D blocks providing at least n threads in total.
__global__ void vecAddKernel(float* A, float* B, float* C, int n) {
    // Flat global index: one thread is responsible for one element.
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    // Bounds guard: the grid is rounded up to whole blocks, so there may be
    // more threads than elements; excess threads do nothing.
    if (idx < n) {
        C[idx] = A[idx] + B[idx];
    }
}
// Host wrapper for the vector-addition kernel.
// A_h and B_h are the input vectors, C_h the output vector, all of length n
// and resident in host memory. Allocates device buffers, copies the inputs,
// launches the kernel, copies the result back, and releases device memory.
void vecAdd(float* A_h, float* B_h, float* C_h, int n) {
    // size_t avoids overflow for large n (int n * sizeof(float) wraps at ~512M elements).
    size_t size = (size_t)n * sizeof(float);
    // A_d, B_d and C_d are the device-side copies of the vectors.
    float *A_d, *B_d, *C_d;
    cudaError_t error;

    // Allocate device memory — every CUDA API call's status is checked
    // (the original left the cudaMalloc results unchecked).
    error = cudaMalloc((void**)&A_d, size);
    CUDA_CHECK_ERROR(error);
    error = cudaMalloc((void**)&B_d, size);
    CUDA_CHECK_ERROR(error);
    error = cudaMalloc((void**)&C_d, size);
    CUDA_CHECK_ERROR(error);

    // Copy the inputs host -> device.
    error = cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice);
    CUDA_CHECK_ERROR(error);
    error = cudaMemcpy(B_d, B_h, size, cudaMemcpyHostToDevice);
    CUDA_CHECK_ERROR(error);

    // Launch configuration: 256 threads per block, enough blocks to cover n.
    // Integer ceil-division replaces the original ceil(n/number_of_threads),
    // where n/number_of_threads truncated BEFORE ceil() ran — under-launching
    // the grid and skipping the tail whenever n was not a multiple of 256.
    int number_of_threads = 256;
    dim3 dimGrid((n + number_of_threads - 1) / number_of_threads, 1, 1);
    dim3 dimBlock(number_of_threads, 1, 1);
    vecAddKernel<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, n);
    // Kernel launches return no status; query launch errors explicitly.
    error = cudaGetLastError();
    CUDA_CHECK_ERROR(error);

    // Copy the result device -> host. This blocking cudaMemcpy also
    // synchronizes with the kernel and surfaces async execution errors.
    error = cudaMemcpy(C_h, C_d, size, cudaMemcpyDeviceToHost);
    CUDA_CHECK_ERROR(error);

    // Free device memory.
    error = cudaFree(A_d);
    CUDA_CHECK_ERROR(error);
    error = cudaFree(B_d);
    CUDA_CHECK_ERROR(error);
    error = cudaFree(C_d);
    CUDA_CHECK_ERROR(error);
}
// Demo driver: adds two 512-element vectors (A[i] = B[i] = i) on the GPU
// and prints each element of the result (expected: C[i] = 2*i).
int main() {
    // const makes the array sizes compile-time constants; the original
    // `int n` produced variable-length arrays, a GCC extension that is
    // not standard C++ (nvcc compiles host code as C++).
    const int n = 512;
    float A_h[n], B_h[n], C_h[n];
    for (int i = 0; i < n; i++) {
        A_h[i] = (float)i;
        B_h[i] = (float)i;
    }
    vecAdd(A_h, B_h, C_h, n);
    for (int i = 0; i < n; i++) {
        printf("%g\n", C_h[i]);
    }
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment