Ediolot/gpu_cpu_performance_test.cu

## gpu_cpu_performance_test.cu
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>

/**
 * Accumulates a fixed 100-tap circular correlation of A with B into C:
 *
 *     C[i] += sum_{j = 0}^{99} A[(i + j) % N] * B[j]      for every i in [0, N)
 *
 * Launch layout: 1-D grid of 1-D blocks. The outer loop is a grid-stride loop
 * (it advances by gridDim.x * blockDim.x), so any launch size covers all N
 * elements. No shared memory is used.
 *
 * @param A  device pointer to N input floats (read-only)
 * @param B  device pointer to at least 100 weight floats (read-only)
 * @param C  device pointer to N floats; the kernel ADDS to the existing
 *           contents, so the caller must initialise C before the launch.
 *           C must not overlap A or B, because each output is accumulated
 *           in a register and written back once.
 * @param N  number of elements in A and C
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, uint32_t N) {
    // Number of B weights applied to every output element.
    constexpr size_t kTaps = 100;

    // Index math is done in size_t so that neither the block offset nor the
    // stride increment can wrap around for large grids or large N.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = first; i < N; i += stride) {
        // Accumulate in a register: one global load and one global store of
        // C[i] instead of a read-modify-write on every tap.
        float acc = C[i];

        // k walks (i + j) % N without a per-tap modulo; i < N holds here, so
        // wrapping to 0 whenever k reaches N yields the same sequence.
        size_t k = i;
        for (size_t j = 0; j < kTaps; ++j) {
            acc += A[k] * B[j];
            if (++k == N) {
                k = 0;
            }
        }
        C[i] = acc;
    }
}

/**
 * Benchmark driver comparing the vectorAdd kernel with the equivalent CPU loop.
 *
 * Prints exactly one CSV line to stdout:
 *
 *     N,host_to_device_ms,kernel_ms,device_to_host_ms,cpu_ms
 *
 * GPU phases are timed with CUDA events, the CPU loop with std::chrono.
 * Every CUDA runtime call is checked; a failure is reported on stderr and the
 * process exits with EXIT_FAILURE. After timing, the GPU result is compared
 * against the CPU reference and any mismatch is reported on stderr.
 *
 * @return EXIT_SUCCESS when GPU and CPU results agree, EXIT_FAILURE otherwise.
 */
int main() {
    // Stop at the first failing CUDA call; otherwise a sticky error makes
    // every later call fail and the printed timings become meaningless.
    const auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
            std::exit(EXIT_FAILURE);
        }
    };

    float ms = 0.0f;
    // vectorAdd reads a fixed 100 elements of B, so M must be at least 100.
    const uint32_t M = 100;
    const uint32_t N = 100;
    printf("%u,", N);

    // Create host vectors. h_B holds M weights (it was previously sized with
    // N, which overflows as soon as M > N).
    auto *h_A = new float[N];
    auto *h_B = new float[M];
    auto *h_C = new float[N];    // receives the GPU result
    auto *h_ref = new float[N];  // CPU reference, kept separate for comparison

    // Initialize host vectors
    for (uint32_t i = 0; i < N; i++) {
        h_A[i] = 1.0f;
        h_C[i] = 0.0f;
        h_ref[i] = 0.0f;
    }
    for (uint32_t i = 0; i < M; i++) {
        h_B[i] = 2.0f;
    }

    // Create device vectors
    float *d_A = nullptr;
    float *d_B = nullptr;
    float *d_C = nullptr;
    check(cudaMalloc(&d_A, N * sizeof(float)), "cudaMalloc d_A");
    check(cudaMalloc(&d_B, M * sizeof(float)), "cudaMalloc d_B");
    check(cudaMalloc(&d_C, N * sizeof(float)), "cudaMalloc d_C");

    // The kernel accumulates into C, so the output buffer must start at zero.
    // An all-zero byte pattern is 0.0f. Done before the timed region so the
    // host-to-device timing still covers only the two input copies.
    check(cudaMemset(d_C, 0, N * sizeof(float)), "cudaMemset d_C");

    // One pair of events is created once and reused for all three timings.
    cudaEvent_t start, stop;
    check(cudaEventCreate(&start), "cudaEventCreate start");
    check(cudaEventCreate(&stop), "cudaEventCreate stop");

    // Copy host vectors to device and measure time taken
    check(cudaEventRecord(start), "cudaEventRecord start (H2D)");
    check(cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy h_A -> d_A");
    check(cudaMemcpy(d_B, h_B, M * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy h_B -> d_B");
    check(cudaEventRecord(stop), "cudaEventRecord stop (H2D)");
    check(cudaEventSynchronize(stop), "cudaEventSynchronize (H2D)");
    check(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime (H2D)");
    printf("%f,", ms);

    // Launch kernel and measure time taken in CUDA events
    check(cudaEventRecord(start), "cudaEventRecord start (kernel)");
    vectorAdd<<<256, 256>>>(d_A, d_B, d_C, N);
    // A launch returns no status; configuration errors surface only here.
    check(cudaGetLastError(), "vectorAdd launch");
    check(cudaEventRecord(stop), "cudaEventRecord stop (kernel)");
    check(cudaEventSynchronize(stop), "cudaEventSynchronize (kernel)");
    check(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime (kernel)");
    printf("%f,", ms);

    // Copy device vector to host and measure time taken
    check(cudaEventRecord(start), "cudaEventRecord start (D2H)");
    check(cudaMemcpy(h_C, d_C, N * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy d_C -> h_C");
    check(cudaEventRecord(stop), "cudaEventRecord stop (D2H)");
    check(cudaEventSynchronize(stop), "cudaEventSynchronize (D2H)");
    check(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime (D2H)");
    printf("%f,", ms);

    // Run the same operation on CPU while measuring time with std::chrono as milliseconds
    const auto start_cpu = std::chrono::high_resolution_clock::now();
    for (uint32_t i = 0; i < N; i++) {
        for (uint32_t j = 0; j < M; ++j) {
            h_ref[i] += h_A[(i + j) % N] * h_B[j];
        }
    }
    const auto end_cpu = std::chrono::high_resolution_clock::now();
    const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_cpu - start_cpu);
    ms = float(duration.count()) * 1e-3f;
    printf("%f\n", ms);

    // Compare GPU output with the CPU reference. A tolerance is used because
    // host and device may round the accumulation differently; the negated
    // comparison also counts NaN as a mismatch.
    uint32_t mismatches = 0;
    for (uint32_t i = 0; i < N; i++) {
        const float tolerance = 1e-4f * std::fabs(h_ref[i]) + 1e-5f;
        if (!(std::fabs(h_C[i] - h_ref[i]) <= tolerance)) {
            ++mismatches;
        }
    }
    if (mismatches != 0) {
        fprintf(stderr, "GPU/CPU mismatch in %u of %u elements\n", mismatches, N);
    }

    // Free memory
    check(cudaEventDestroy(start), "cudaEventDestroy start");
    check(cudaEventDestroy(stop), "cudaEventDestroy stop");
    check(cudaFree(d_A), "cudaFree d_A");
    check(cudaFree(d_B), "cudaFree d_B");
    check(cudaFree(d_C), "cudaFree d_C");

    delete[] h_A;
    delete[] h_B;
    delete[] h_C;
    delete[] h_ref;

    return mismatches == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}
	#include <iostream>
	#include <chrono>

	/**
	 * Accumulates a fixed 100-tap circular correlation of A with B into C:
	 *
	 *     C[i] += sum_{j = 0}^{99} A[(i + j) % N] * B[j]      for every i in [0, N)
	 *
	 * Launch layout: 1-D grid of 1-D blocks. The outer loop is a grid-stride loop
	 * (it advances by gridDim.x * blockDim.x), so any launch size covers all N
	 * elements. No shared memory is used.
	 *
	 * @param A  device pointer to N input floats (read-only)
	 * @param B  device pointer to at least 100 weight floats (read-only)
	 * @param C  device pointer to N floats; the kernel ADDS to the existing
	 *           contents, so the caller must initialise C before the launch.
	 *           C must not overlap A or B, because each output is accumulated
	 *           in a register and written back once.
	 * @param N  number of elements in A and C
	 */
	__global__ void vectorAdd(const float *A, const float *B, float *C, uint32_t N) {
	    // A and B are subscripted below, so they must be pointers; the by-value
	    // `const float` parameters in this copy could not compile.

	    // Number of B weights applied to every output element.
	    constexpr size_t kTaps = 100;

	    // Index math is done in size_t so that neither the block offset nor the
	    // stride increment can wrap around for large grids or large N.
	    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
	    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

	    for (size_t i = first; i < N; i += stride) {
	        // Accumulate in a register: one global load and one global store of
	        // C[i] instead of a read-modify-write on every tap.
	        float acc = C[i];

	        // k walks (i + j) % N without a per-tap modulo; i < N holds here, so
	        // wrapping to 0 whenever k reaches N yields the same sequence.
	        size_t k = i;
	        for (size_t j = 0; j < kTaps; ++j) {
	            acc += A[k] * B[j];
	            if (++k == N) {
	                k = 0;
	            }
	        }
	        C[i] = acc;
	    }
	}

	/**
	 * Benchmark driver comparing the vectorAdd kernel with the equivalent CPU loop.
	 *
	 * Prints exactly one CSV line to stdout:
	 *
	 *     N,host_to_device_ms,kernel_ms,device_to_host_ms,cpu_ms
	 *
	 * GPU phases are timed with CUDA events, the CPU loop with std::chrono.
	 * Every CUDA runtime call is checked; a failure is reported on stderr and the
	 * process exits with EXIT_FAILURE. After timing, the GPU result is compared
	 * against the CPU reference and any mismatch is reported on stderr.
	 *
	 * @return EXIT_SUCCESS when GPU and CPU results agree, EXIT_FAILURE otherwise.
	 */
	int main() {
	    // Stop at the first failing CUDA call; otherwise a sticky error makes
	    // every later call fail and the printed timings become meaningless.
	    const auto check = [](cudaError_t status, const char *what) {
	        if (status != cudaSuccess) {
	            fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
	            std::exit(EXIT_FAILURE);
	        }
	    };

	    float ms = 0.0f;
	    // vectorAdd reads a fixed 100 elements of B, so M must be at least 100.
	    const uint32_t M = 100;
	    const uint32_t N = 100;
	    printf("%u,", N);

	    // Create host vectors. h_B holds M weights (it was previously sized with
	    // N, which overflows as soon as M > N).
	    auto *h_A = new float[N];
	    auto *h_B = new float[M];
	    auto *h_C = new float[N];    // receives the GPU result
	    auto *h_ref = new float[N];  // CPU reference, kept separate for comparison

	    // Initialize host vectors
	    for (uint32_t i = 0; i < N; i++) {
	        h_A[i] = 1.0f;
	        h_C[i] = 0.0f;
	        h_ref[i] = 0.0f;
	    }
	    for (uint32_t i = 0; i < M; i++) {
	        h_B[i] = 2.0f;
	    }

	    // Create device vectors. All three must be pointers: they are handed to
	    // cudaMalloc/cudaMemcpy/cudaFree and to the kernel as device addresses.
	    float *d_A = nullptr;
	    float *d_B = nullptr;
	    float *d_C = nullptr;
	    check(cudaMalloc(&d_A, N * sizeof(float)), "cudaMalloc d_A");
	    check(cudaMalloc(&d_B, M * sizeof(float)), "cudaMalloc d_B");
	    check(cudaMalloc(&d_C, N * sizeof(float)), "cudaMalloc d_C");

	    // The kernel accumulates into C, so the output buffer must start at zero.
	    // An all-zero byte pattern is 0.0f. Done before the timed region so the
	    // host-to-device timing still covers only the two input copies.
	    check(cudaMemset(d_C, 0, N * sizeof(float)), "cudaMemset d_C");

	    // One pair of events is created once and reused for all three timings.
	    cudaEvent_t start, stop;
	    check(cudaEventCreate(&start), "cudaEventCreate start");
	    check(cudaEventCreate(&stop), "cudaEventCreate stop");

	    // Copy host vectors to device and measure time taken
	    check(cudaEventRecord(start), "cudaEventRecord start (H2D)");
	    check(cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy h_A -> d_A");
	    check(cudaMemcpy(d_B, h_B, M * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy h_B -> d_B");
	    check(cudaEventRecord(stop), "cudaEventRecord stop (H2D)");
	    check(cudaEventSynchronize(stop), "cudaEventSynchronize (H2D)");
	    check(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime (H2D)");
	    printf("%f,", ms);

	    // Launch kernel and measure time taken in CUDA events
	    check(cudaEventRecord(start), "cudaEventRecord start (kernel)");
	    vectorAdd<<<256, 256>>>(d_A, d_B, d_C, N);
	    // A launch returns no status; configuration errors surface only here.
	    check(cudaGetLastError(), "vectorAdd launch");
	    check(cudaEventRecord(stop), "cudaEventRecord stop (kernel)");
	    check(cudaEventSynchronize(stop), "cudaEventSynchronize (kernel)");
	    check(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime (kernel)");
	    printf("%f,", ms);

	    // Copy device vector to host and measure time taken
	    check(cudaEventRecord(start), "cudaEventRecord start (D2H)");
	    check(cudaMemcpy(h_C, d_C, N * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy d_C -> h_C");
	    check(cudaEventRecord(stop), "cudaEventRecord stop (D2H)");
	    check(cudaEventSynchronize(stop), "cudaEventSynchronize (D2H)");
	    check(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime (D2H)");
	    printf("%f,", ms);

	    // Run the same operation on CPU while measuring time with std::chrono as milliseconds
	    const auto start_cpu = std::chrono::high_resolution_clock::now();
	    for (uint32_t i = 0; i < N; i++) {
	        for (uint32_t j = 0; j < M; ++j) {
	            h_ref[i] += h_A[(i + j) % N] * h_B[j];
	        }
	    }
	    const auto end_cpu = std::chrono::high_resolution_clock::now();
	    const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_cpu - start_cpu);
	    ms = float(duration.count()) * 1e-3f;
	    printf("%f\n", ms);

	    // Compare GPU output with the CPU reference. A tolerance is used because
	    // host and device may round the accumulation differently; the negated
	    // comparison also counts NaN as a mismatch.
	    uint32_t mismatches = 0;
	    for (uint32_t i = 0; i < N; i++) {
	        const float tolerance = 1e-4f * std::fabs(h_ref[i]) + 1e-5f;
	        if (!(std::fabs(h_C[i] - h_ref[i]) <= tolerance)) {
	            ++mismatches;
	        }
	    }
	    if (mismatches != 0) {
	        fprintf(stderr, "GPU/CPU mismatch in %u of %u elements\n", mismatches, N);
	    }

	    // Free memory
	    check(cudaEventDestroy(start), "cudaEventDestroy start");
	    check(cudaEventDestroy(stop), "cudaEventDestroy stop");
	    check(cudaFree(d_A), "cudaFree d_A");
	    check(cudaFree(d_B), "cudaFree d_B");
	    check(cudaFree(d_C), "cudaFree d_C");

	    delete[] h_A;
	    delete[] h_B;
	    delete[] h_C;
	    delete[] h_ref;

	    return mismatches == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
	}