@Ediolot
Created January 27, 2023 11:49
CUDA Vector add example
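
The snippet below adds two large float vectors on the GPU. The kernel uses a grid-stride loop, so a fixed launch configuration (here 256 blocks of 256 threads) covers all N elements no matter how large N is. Assuming the file is saved as vector_add.cu and the CUDA toolkit is installed, it can be built with something like nvcc -O2 -o vector_add vector_add.cu.
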
#include <cstdint>
#include <iostream>

#include <cuda_runtime.h>
// Grid-stride loop kernel: each thread handles elements idx, idx + total_threads, ...
// so a fixed launch configuration can cover any N.
__global__ void vectorAdd(const float *A, const float *B, float *C, uint32_t N) {
    const uint32_t threads_per_block = blockDim.x;
    const uint32_t total_blocks = gridDim.x;
    const uint32_t block_id = blockIdx.x;
    const uint32_t thread_id = threadIdx.x;
    const uint32_t total_threads = total_blocks * threads_per_block;
    const uint32_t idx = block_id * threads_per_block + thread_id;

    // Warp bookkeeping, kept for reference; not used by the addition below.
    const uint32_t warp_id = thread_id / 32;
    const uint32_t thread_id_inside_warp = thread_id % 32;
    (void)warp_id;
    (void)thread_id_inside_warp;

    for (uint32_t i = idx; i < N; i += total_threads) {
        C[i] = A[i] + B[i];
    }
}
int main() {
    const uint32_t N = 1000000000;  // 1e9 floats, roughly 4 GB per buffer

    // Create host vectors
    auto *h_A = new float[N];
    auto *h_B = new float[N];
    auto *h_C = new float[N];

    // Initialize host vectors
    for (uint32_t i = 0; i < N; i++) {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
        h_C[i] = 0.0f;
    }

    // Create device vectors
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, N * sizeof(float));
    cudaMalloc(&d_B, N * sizeof(float));
    cudaMalloc(&d_C, N * sizeof(float));

    // Copy host vectors to device
    cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, N * sizeof(float), cudaMemcpyHostToDevice);

    // Launch kernel: 256 blocks of 256 threads; the grid-stride loop covers all N elements
    vectorAdd<<<256, 256>>>(d_A, d_B, d_C, N);

    // Copy device vector back to host (this cudaMemcpy also synchronizes with the kernel)
    cudaMemcpy(h_C, d_C, N * sizeof(float), cudaMemcpyDeviceToHost);

    // Check result
    for (uint32_t i = 0; i < N; i++) {
        if (h_C[i] != 3.0f) {
            std::cout << "Error: " << h_C[i] << " != 3.0f" << std::endl;
            break;
        }
    }

    // Free memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;
    return 0;
}
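
To measure how long the kernel itself takes, CUDA events can be placed around the launch. A minimal sketch, assuming the same d_A, d_B, d_C buffers, N, and launch configuration as in main() above:

// Minimal timing sketch (assumes d_A, d_B, d_C and N are set up as in main() above)
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
vectorAdd<<<256, 256>>>(d_A, d_B, d_C, N);
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop);
std::cout << "Kernel time: " << elapsed_ms << " ms" << std::endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);

cudaEventElapsedTime reports milliseconds. Checking the return codes of the CUDA calls, or calling cudaGetLastError() after the launch, would also catch allocation or launch failures, which this example skips for brevity.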