Skip to content

Instantly share code, notes, and snippets.

@so298
Last active September 17, 2023 11:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save so298/9b6575d5368e10cc18edab70ef9916cc to your computer and use it in GitHub Desktop.
Save so298/9b6575d5368e10cc18edab70ef9916cc to your computer and use it in GitHub Desktop.
cudaMemcpy bandwidth test
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <iostream>

#include <cuda_runtime.h>
// Decimal megabyte (10^6). Used both to size the buffer and to scale the
// reported bandwidth. The digit separators require C++14 or newer.
#define MEGA_BYTE (1'000'000)
const int N = 100 * MEGA_BYTE; // Number of int elements per transfer; the buffer is N * sizeof(int) bytes
const int numIterations = 100; // Number of timed copies performed in each direction
// CUDA error check macro.
//
// Evaluates `call` exactly once. If the result is not cudaSuccess, prints the
// error string, the numeric error code and the source location to std::cerr,
// then terminates the process with EXIT_FAILURE.
//
// - The argument is parenthesized so any expression can be passed safely.
// - The local uses a reserved-looking name so it cannot capture an identifier
//   (e.g. a caller's own `err`) that appears inside `call`.
// - The exit status is EXIT_FAILURE rather than the raw CUDA error code,
//   because exit statuses are truncated to 8 bits on POSIX systems; the real
//   code is still printed in the message.
#define CUDA_CHECK(call) \
    do { \
        const cudaError_t cuda_check_err_ = (call); \
        if (cuda_check_err_ != cudaSuccess) { \
            std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_check_err_) << " (" << cuda_check_err_ << ") at " << __FILE__ << ":" << __LINE__ << std::endl; \
            std::exit(EXIT_FAILURE); \
        } \
    } while (0)
int main() {
// allocate memory on host and device
int* h_data = new int[N];
int* d_data;
CUDA_CHECK(cudaMalloc((void**)&d_data, N * sizeof(int)));
// data initialization
for (int i = 0; i < N; i++) {
h_data[i] = i;
}
// measure time for dev to host
clock_t start, end;
double devToHostTime = 0.0;
for (int iter = 0; iter < numIterations; iter++) {
start = clock();
CUDA_CHECK(cudaMemcpy(h_data, d_data, N * sizeof(int), cudaMemcpyDeviceToHost));
end = clock();
devToHostTime += (double)(end - start) / CLOCKS_PER_SEC;
}
// measure time for host to dev
double hostToDevTime = 0.0;
for (int iter = 0; iter < numIterations; iter++) {
start = clock();
CUDA_CHECK(cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice));
end = clock();
hostToDevTime += (double)(end - start) / CLOCKS_PER_SEC;
}
// convert to average
devToHostTime /= numIterations;
hostToDevTime /= numIterations;
// show result
std::cout << "Average bandwidth for Device to Host memory copy: " << N * sizeof(int) / devToHostTime / MEGA_BYTE << " Mbytes / s" << std::endl;
std::cout << "Average bandwidth for Host to Device memory copy: " << N * sizeof(int) / hostToDevTime / MEGA_BYTE << " Mbytes / s" << std::endl;
// free memory
delete[] h_data;
CUDA_CHECK(cudaFree(d_data));
return 0;
}
@so298
Copy link
Author

so298 commented Sep 11, 2023

Example output (RTX 3060Ti, PCIe 4.0)

$ nvidia-smi                 
Sun Sep 17 20:14:40 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.125.06   Driver Version: 525.125.06   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
|  0%   40C    P8    15W / 170W |    974MiB / 12288MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
              
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A      2471      G   /usr/lib/xorg/Xorg                666MiB |
|    0   N/A  N/A      2603      G   /usr/bin/gnome-shell               41MiB |
|    0   N/A  N/A      4311      G   ...548866862053813773,262144       69MiB |
|    0   N/A  N/A      5587      G   ...veSuggestionsOnlyOnDemand       50MiB |
|    0   N/A  N/A     35238      G   ...RendererForSitePerProcess       26MiB |
|    0   N/A  N/A    129829      G   ...features=BackForwardCache        8MiB |
|    0   N/A  N/A    132541      G   ...RendererForSitePerProcess       58MiB |
+-----------------------------------------------------------------------------+

Average bandwidth for Device to Host memory copy: 8880.28 Mbytes / s
Average bandwidth for Host to Device memory copy: 7798.98 Mbytes / s

@so298
Copy link
Author

so298 commented Sep 17, 2023

Wisteria/BDEC-1 aquarius

$ nvidia-smi
Sun Sep 17 19:52:00 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.60.13    Driver Version: 525.60.13    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA A100-SXM...  On   | 00000000:27:00.0 Off |                    0 |
| N/A   26C    P0    53W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM...  On   | 00000000:2A:00.0 Off |                    0 |
| N/A   24C    P0    54W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   2  NVIDIA A100-SXM...  On   | 00000000:51:00.0 Off |                    0 |
| N/A   24C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   3  NVIDIA A100-SXM...  On   | 00000000:57:00.0 Off |                    0 |
| N/A   24C    P0    53W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   4  NVIDIA A100-SXM...  On   | 00000000:9E:00.0 Off |                    0 |
| N/A   24C    P0    54W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   5  NVIDIA A100-SXM...  On   | 00000000:A4:00.0 Off |                    0 |
| N/A   24C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   6  NVIDIA A100-SXM...  On   | 00000000:C7:00.0 Off |                    0 |
| N/A   24C    P0    54W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   7  NVIDIA A100-SXM...  On   | 00000000:CA:00.0 Off |                    0 |
| N/A   25C    P0    53W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

Output

Average bandwidth for Device to Host memory copy: 10242.4 Mbytes / s
Average bandwidth for Host to Device memory copy: 11211.3 Mbytes / s

@so298
Copy link
Author

so298 commented Sep 17, 2023

MDX (A100 with VMware virtualization)

$ nvidia-smi
Sun Sep 17 20:11:06 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA A100-SXM4-40GB          On  | 00000000:0C:00.0 Off |                    0 |
| N/A   26C    P0              45W / 400W |      4MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|  No running processes found                                                           |
+---------------------------------------------------------------------------------------+

Output

Average bandwidth for Device to Host memory copy: 9817.57 Mbytes / s
Average bandwidth for Host to Device memory copy: 10713.9 Mbytes / s

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment