Created
May 20, 2024 09:55
-
-
Save DWarez/90515ace919f5dca6e65a4d35f09a8b5 to your computer and use it in GitHub Desktop.
A simple CUDA Hello World
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>
#include <stdlib.h>   // EXIT_FAILURE, exit — the original relied on stdio.h pulling these in

// Macro for checking CUDA errors.
// Wrapped in do { } while (0) so it expands to a single statement and is
// safe inside an unbraced if/else; the argument is evaluated exactly once
// (the original `if` form evaluated `err` twice and broke under `if (x) CUDA_CHECK_ERROR(e); else ...`).
// On failure, reports the error string and source line to stderr and aborts.
#define CUDA_CHECK_ERROR(err)                                                  \
    do {                                                                       \
        cudaError_t err_ = (err);                                              \
        if (err_ != cudaSuccess) {                                             \
            fprintf(stderr, "CUDA err: %s at line %d\n",                       \
                    cudaGetErrorString(err_), __LINE__);                       \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)
// Kernel: element-wise vector addition, C[i] = A[i] + B[i] for i in [0, n).
// __global__ marks a kernel entry point: it is launched from host code and
// executes on the device (the original comment mis-stated this).
// Expects a 1D grid of 1D blocks providing at least n threads in total.
__global__ void vecAddKernel(float* A, float* B, float* C, int n) {
    // Flat global index: one thread is responsible for one element.
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    // Bounds guard: the grid is rounded up to whole blocks, so there may be
    // more threads than elements; excess threads do nothing.
    if (idx < n) {
        C[idx] = A[idx] + B[idx];
    }
}
// Host wrapper for the vector-addition kernel.
// A_h and B_h are the input vectors, C_h the output vector, all of length n
// and resident in host memory. Allocates device buffers, copies the inputs,
// launches the kernel, copies the result back, and releases device memory.
void vecAdd(float* A_h, float* B_h, float* C_h, int n) {
    // size_t avoids overflow for large n (int n * sizeof(float) wraps at ~512M elements).
    size_t size = (size_t)n * sizeof(float);
    // A_d, B_d and C_d are the device-side copies of the vectors.
    float *A_d, *B_d, *C_d;
    cudaError_t error;

    // Allocate device memory — every CUDA API call's status is checked
    // (the original left the cudaMalloc results unchecked).
    error = cudaMalloc((void**)&A_d, size);
    CUDA_CHECK_ERROR(error);
    error = cudaMalloc((void**)&B_d, size);
    CUDA_CHECK_ERROR(error);
    error = cudaMalloc((void**)&C_d, size);
    CUDA_CHECK_ERROR(error);

    // Copy the inputs host -> device.
    error = cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice);
    CUDA_CHECK_ERROR(error);
    error = cudaMemcpy(B_d, B_h, size, cudaMemcpyHostToDevice);
    CUDA_CHECK_ERROR(error);

    // Launch configuration: 256 threads per block, enough blocks to cover n.
    // Integer ceil-division replaces the original ceil(n/number_of_threads),
    // where n/number_of_threads truncated BEFORE ceil() ran — under-launching
    // the grid and skipping the tail whenever n was not a multiple of 256.
    int number_of_threads = 256;
    dim3 dimGrid((n + number_of_threads - 1) / number_of_threads, 1, 1);
    dim3 dimBlock(number_of_threads, 1, 1);
    vecAddKernel<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, n);
    // Kernel launches return no status; query launch errors explicitly.
    error = cudaGetLastError();
    CUDA_CHECK_ERROR(error);

    // Copy the result device -> host. This blocking cudaMemcpy also
    // synchronizes with the kernel and surfaces async execution errors.
    error = cudaMemcpy(C_h, C_d, size, cudaMemcpyDeviceToHost);
    CUDA_CHECK_ERROR(error);

    // Free device memory.
    error = cudaFree(A_d);
    CUDA_CHECK_ERROR(error);
    error = cudaFree(B_d);
    CUDA_CHECK_ERROR(error);
    error = cudaFree(C_d);
    CUDA_CHECK_ERROR(error);
}
// Demo driver: adds two 512-element vectors (A[i] = B[i] = i) on the GPU
// and prints each element of the result (expected: C[i] = 2*i).
int main() {
    // const makes the array sizes compile-time constants; the original
    // `int n` produced variable-length arrays, a GCC extension that is
    // not standard C++ (nvcc compiles host code as C++).
    const int n = 512;
    float A_h[n], B_h[n], C_h[n];
    for (int i = 0; i < n; i++) {
        A_h[i] = (float)i;
        B_h[i] = (float)i;
    }
    vecAdd(A_h, B_h, C_h, n);
    for (int i = 0; i < n; i++) {
        printf("%g\n", C_h[i]);
    }
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment