cudaMemcpy bandwidth test
#include <iostream>
#include <ctime>
#include <cuda_runtime.h>

#define MEGA_BYTE (1'000'000)

const int N = 100 * MEGA_BYTE;  // number of ints transferred (400 MB with 4-byte ints)
const int numIterations = 100;  // number of timed copies per direction

// CUDA error check macro
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " (" << err << ") at " << __FILE__ << ":" << __LINE__ << std::endl; \
            exit(err); \
        } \
    } while (0)

int main() {
    // allocate pageable host memory and device memory
    int* h_data = new int[N];
    int* d_data;
    CUDA_CHECK(cudaMalloc((void**)&d_data, N * sizeof(int)));

    // initialize host data
    for (int i = 0; i < N; i++) {
        h_data[i] = i;
    }

    // measure device-to-host copy time
    clock_t start, end;
    double devToHostTime = 0.0;
    for (int iter = 0; iter < numIterations; iter++) {
        start = clock();
        CUDA_CHECK(cudaMemcpy(h_data, d_data, N * sizeof(int), cudaMemcpyDeviceToHost));
        end = clock();
        devToHostTime += (double)(end - start) / CLOCKS_PER_SEC;
    }

    // measure host-to-device copy time
    double hostToDevTime = 0.0;
    for (int iter = 0; iter < numIterations; iter++) {
        start = clock();
        CUDA_CHECK(cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice));
        end = clock();
        hostToDevTime += (double)(end - start) / CLOCKS_PER_SEC;
    }

    // convert totals to per-copy averages
    devToHostTime /= numIterations;
    hostToDevTime /= numIterations;

    // report bandwidth in Mbytes / s
    std::cout << "Average bandwidth for Device to Host memory copy: " << N * sizeof(int) / devToHostTime / MEGA_BYTE << " Mbytes / s" << std::endl;
    std::cout << "Average bandwidth for Host to Device memory copy: " << N * sizeof(int) / hostToDevTime / MEGA_BYTE << " Mbytes / s" << std::endl;

    // free memory
    delete[] h_data;
    CUDA_CHECK(cudaFree(d_data));
    return 0;
}
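
For reference, the snippet builds with plain nvcc; the source file name below is assumed, not taken from the gist:

$ nvcc -O2 memcpy_bandwidth.cu -o memcpy_bandwidth   # file name assumed
$ ./memcpy_bandwidth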
so298 commented Sep 17, 2023

MDX (A100 with vmware virtualization)

$ nvidia-smi
Sun Sep 17 20:11:06 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA A100-SXM4-40GB          On  | 00000000:0C:00.0 Off |                    0 |
| N/A   26C    P0              45W / 400W |      4MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|  No running processes found                                                           |
+---------------------------------------------------------------------------------------+

Output

Average bandwidth for Device to Host memory copy: 9817.57 Mbytes / s
Average bandwidth for Host to Device memory copy: 10713.9 Mbytes / s
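
The figures around 10 GB/s are in the range one would expect for pageable host memory. As a point of comparison, here is a minimal sketch (not part of the original gist) that times the same 400 MB transfers with CUDA events and a pinned host buffer allocated via cudaMallocHost; pinned memory lets the driver DMA directly from the host buffer and usually measures closer to the PCIe link limit.

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>

#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = (call); \
        if (err != cudaSuccess) { \
            std::fprintf(stderr, "CUDA Error: %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
            std::exit(1); \
        } \
    } while (0)

int main() {
    const size_t bytes = 400ull * 1000 * 1000;  // same 400 MB payload as the gist
    const int iters = 100;

    int* h_data;  // pinned (page-locked) host buffer
    int* d_data;
    CUDA_CHECK(cudaMallocHost((void**)&h_data, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_data, bytes));
    std::memset(h_data, 0, bytes);

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Host -> Device, timed with CUDA events around the whole loop
    CUDA_CHECK(cudaEventRecord(start));
    for (int i = 0; i < iters; i++) {
        CUDA_CHECK(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice));
    }
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float h2dMs = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&h2dMs, start, stop));

    // Device -> Host
    CUDA_CHECK(cudaEventRecord(start));
    for (int i = 0; i < iters; i++) {
        CUDA_CHECK(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost));
    }
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float d2hMs = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&d2hMs, start, stop));

    // report bandwidth in Mbytes / s (total bytes over total elapsed time)
    std::printf("Pinned Host to Device: %.1f Mbytes / s\n", bytes * iters / (h2dMs / 1000.0) / 1e6);
    std::printf("Pinned Device to Host: %.1f Mbytes / s\n", bytes * iters / (d2hMs / 1000.0) / 1e6);

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFreeHost(h_data));
    CUDA_CHECK(cudaFree(d_data));
    return 0;
}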
