Created
October 7, 2020 17:13
-
-
Save zarzen/d1e076cbab802414725705cb04ed7204 to your computer and use it in GitHub Desktop.
PCIe bandwidth test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// CUDA runtime | |
#include <cuda_runtime.h> | |
#include <cuda.h> | |
#include <cassert> | |
#include <iostream> | |
#include <memory> | |
#include <thread> | |
#include <atomic> | |
using std::atomic_bool; | |
using std::atomic_int; | |
void HtoDBandwidth(atomic_bool& startCpy, atomic_int& readyCnt, int memSize, int repeat, int warmup){ | |
cudaSetDevice(0); | |
unsigned char *h_odata; | |
unsigned char *d_idata; | |
float totalBandwidth = 0; | |
cudaEvent_t start, stop; | |
cudaEventCreate(&start); | |
cudaEventCreate(&stop); | |
h_odata = (unsigned char *) malloc(memSize); | |
cudaMalloc((void **)&d_idata, memSize); | |
readyCnt++; | |
while (!startCpy){} | |
for (int i = 0; i < repeat; i++) { | |
cudaEventRecord(start); | |
cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice); | |
cudaEventRecord(stop); | |
cudaEventSynchronize(stop); | |
if (i < warmup) { | |
continue; | |
} | |
float milliseconds = 0; | |
cudaEventElapsedTime(&milliseconds, start, stop); | |
float bw = memSize*8/milliseconds/1e6; | |
totalBandwidth += bw; | |
} | |
printf("H2D avg Bandwidth Gbps %f \n", totalBandwidth / (repeat - warmup)); | |
} | |
void DtoHBandwidth(atomic_bool& startCpy, atomic_int& readyCnt, int memSize, int repeat, int warmup) { | |
cudaSetDevice(0); | |
unsigned char *h_idata; | |
unsigned char *d_odata; | |
float totalBandwidth = 0; | |
cudaEvent_t start, stop; | |
cudaEventCreate(&start); | |
cudaEventCreate(&stop); | |
h_idata = (unsigned char *) malloc(memSize); | |
cudaMalloc((void **)&d_odata, memSize); | |
readyCnt++; | |
while (!startCpy){} | |
for (int i=0; i<repeat; i++) { | |
cudaEventRecord(start); | |
cudaMemcpy(h_idata, d_odata, memSize, cudaMemcpyDeviceToHost); | |
cudaEventRecord(stop); | |
cudaEventSynchronize(stop); | |
if (i < warmup) { | |
continue; | |
} | |
float milliseconds = 0; | |
cudaEventElapsedTime(&milliseconds, start, stop); | |
float bw = memSize*8/milliseconds/1e6; | |
totalBandwidth += bw; | |
} | |
printf("D2H avg Bandwidth Gbps %f \n", totalBandwidth / (repeat - warmup)); | |
} | |
int main(){ | |
atomic_bool startFlag{false}; | |
atomic_int readyCnt{0}; | |
int memSize = 128 * 1024 * 1024; | |
int repeat = 20; | |
int warmup = 5; | |
std::thread t1(HtoDBandwidth, std::ref(startFlag), std::ref(readyCnt), memSize, repeat, warmup); | |
std::thread t2(DtoHBandwidth, std::ref(startFlag), std::ref(readyCnt),memSize, repeat, warmup); | |
while(readyCnt < 2) {} | |
printf("ready to start bandwdith tests\n"); | |
startFlag = true; | |
t1.join(); | |
t2.join(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment