Skip to content

Instantly share code, notes, and snippets.

@Ediolot
Created January 27, 2023 12:15
Show Gist options
  • Save Ediolot/bee2e3941f16c7af6ef68d49732559de to your computer and use it in GitHub Desktop.
Save Ediolot/bee2e3941f16c7af6ef68d49732559de to your computer and use it in GitHub Desktop.
GPU and CPU performance test
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
/**
 * Accumulates a 100-tap circular correlation of A with B into C:
 *
 *     C[i] += sum_{j = 0..99} A[(i + j) mod N] * B[j]      for every i in [0, N)
 *
 * Launch layout: 1-D grid of 1-D blocks of any size. The kernel walks the
 * output with a grid-stride loop, so every element is processed exactly once
 * regardless of how many threads are launched.
 *
 * Preconditions:
 *   - A and C each hold at least N floats; B holds at least 100 floats.
 *   - C must already contain valid values (the kernel adds to it; it does not
 *     overwrite it).
 *   - C must not overlap A or B: each output is kept in a register while its
 *     100 products are accumulated and is written back once at the end.
 *
 * @param A  input signal, length N (read-only)
 * @param B  tap weights, first 100 entries are used (read-only)
 * @param C  accumulator, length N (read and written)
 * @param N  number of elements in A and C; N == 0 is a no-op
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, uint32_t N) {
    // Number of entries of B combined into every output element.
    constexpr uint32_t kTaps = 100;

    // 64-bit arithmetic so that neither the global thread index nor the
    // `i += stride` step can wrap around when N approaches 2^32.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = first; i < N; i += stride) {
        // Keep the running sum in a register: one global load and one global
        // store per output instead of one of each per tap. The additions are
        // performed in the same order as `C[i] += ...` would perform them.
        float acc = C[i];

        // a_idx tracks (i + j) mod N. i < N holds here, so starting at i and
        // wrapping to 0 on reaching N reproduces the modulo without a
        // per-iteration division and without overflowing i + j.
        uint32_t a_idx = static_cast<uint32_t>(i);
        for (uint32_t j = 0; j < kTaps; ++j) {
            acc += A[a_idx] * B[j];
            if (++a_idx == N) {
                a_idx = 0;
            }
        }

        C[i] = acc;
    }
}
// Evaluates a CUDA runtime call and aborts with file/line and the runtime's
// error string if it did not return cudaSuccess. Without this, an early
// failure silently poisons every later call and the reported timings.
#define CUDA_CHECK(call)                                                       \
    do {                                                                       \
        const cudaError_t cuda_check_err_ = (call);                            \
        if (cuda_check_err_ != cudaSuccess) {                                  \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                         cudaGetErrorString(cuda_check_err_));                 \
            std::abort();                                                      \
        }                                                                      \
    } while (0)

/**
 * Benchmarks the vectorAdd kernel against the equivalent CPU loop.
 *
 * Prints one CSV row to stdout:
 *     N, host-to-device copy ms, kernel ms, device-to-host copy ms, CPU ms
 *
 * GPU phases are timed with CUDA events, the CPU loop with std::chrono.
 * After timing, the GPU output is compared with the CPU reference; a
 * mismatch is reported on stderr and turns the exit status non-zero.
 *
 * @return 0 when the GPU and CPU results agree, 1 otherwise.
 */
int main() {
    float ms = 0.0f;
    // M is the length of B. The kernel always reads 100 entries of B, so M
    // must not be smaller than 100.
    const uint32_t M = 100;
    const uint32_t N = 100;
    printf("%u,", N);

    // Host buffers. h_B is sized by M (it is filled and copied with M
    // elements). The CPU reference gets its own buffer so it is not
    // accumulated on top of the GPU output.
    auto *h_A = new float[N];
    auto *h_B = new float[M];
    auto *h_C = new float[N];    // receives the GPU result
    auto *h_ref = new float[N];  // CPU reference result

    for (uint32_t i = 0; i < N; ++i) {
        h_A[i] = 1.0f;
        h_C[i] = 0.0f;
        h_ref[i] = 0.0f;
    }
    for (uint32_t i = 0; i < M; ++i) {
        h_B[i] = 2.0f;
    }

    // Device buffers.
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    CUDA_CHECK(cudaMalloc(&d_A, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_B, M * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_C, N * sizeof(float)));
    // The kernel accumulates into C, so the accumulator must start at zero;
    // cudaMalloc does not initialise memory. Done before the first timed
    // section so the reported copy time still covers only A and B.
    CUDA_CHECK(cudaMemset(d_C, 0, N * sizeof(float)));

    // One pair of events, reused for all three GPU measurements and
    // destroyed at the end.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Phase 1: host -> device copies.
    CUDA_CHECK(cudaEventRecord(start));
    CUDA_CHECK(cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, M * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    printf("%f,", ms);

    // Phase 2: kernel execution.
    CUDA_CHECK(cudaEventRecord(start));
    vectorAdd<<<256, 256>>>(d_A, d_B, d_C, N);
    // Launch-configuration errors are only visible through cudaGetLastError.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    // Errors raised while the kernel runs surface at this synchronisation.
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    printf("%f,", ms);

    // Phase 3: device -> host copy of the result.
    CUDA_CHECK(cudaEventRecord(start));
    CUDA_CHECK(cudaMemcpy(h_C, d_C, N * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    printf("%f,", ms);

    // Phase 4: the same computation on the CPU, timed with std::chrono and
    // reported in milliseconds.
    auto start_cpu = std::chrono::high_resolution_clock::now();
    for (uint32_t i = 0; i < N; ++i) {
        for (uint32_t j = 0; j < M; ++j) {
            h_ref[i] += h_A[(i + j) % N] * h_B[j];
        }
    }
    auto end_cpu = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_cpu - start_cpu);
    ms = float(duration.count()) * 1e-3f;
    printf("%f\n", ms);

    // Verify the GPU output against the CPU reference. A tolerance is used
    // because host and device may round differently; the negated `<=` also
    // counts NaN as a mismatch.
    uint32_t mismatches = 0;
    for (uint32_t i = 0; i < N; ++i) {
        const float tol = 1e-6f + 1e-5f * std::fabs(h_ref[i]);
        if (!(std::fabs(h_C[i] - h_ref[i]) <= tol)) {
            ++mismatches;
        }
    }
    if (mismatches != 0) {
        std::fprintf(stderr, "GPU/CPU mismatch in %u of %u elements\n", mismatches, N);
    }

    // Release everything.
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;
    delete[] h_ref;
    return mismatches == 0 ? 0 : 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment