Created
May 24, 2019 16:17
-
-
Save enfiskutensykkel/d8eb36843e294a57a320533a326ea2db to your computer and use it in GitHub Desktop.
Simple program for measuring GPU bandwidth
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda.h> | |
#include <stdexcept> | |
#include <string> | |
#include <vector> | |
#include <memory> | |
#include <cstdint> | |
#include <cstdio> | |
#include <unistd.h> | |
using error = std::runtime_error; | |
using std::string; | |
using gpulist = std::vector<int>; | |
using gpuptr = std::shared_ptr<void>; | |
struct event | |
{ | |
cudaEvent_t evt; | |
event() | |
{ | |
auto err = cudaEventCreate(&evt); | |
if (err != cudaSuccess) | |
{ | |
throw error("Could not create event: " + string(cudaGetErrorString(err))); | |
} | |
err = cudaEventRecord(evt); | |
if (err != cudaSuccess) | |
{ | |
throw error("Could not record event: " + string(cudaGetErrorString(err))); | |
} | |
} | |
~event() | |
{ | |
cudaEventDestroy(evt); | |
} | |
double operator-(const event& other) const | |
{ | |
float ms = 0; | |
auto err = cudaEventElapsedTime(&ms, other.evt, evt); | |
if (err != cudaSuccess) | |
{ | |
throw error("Could not get elapsed time: " + string(cudaGetErrorString(err))); | |
} | |
return ((double) ms) * 1000.0; | |
} | |
}; | |
__host__ static void setDevice(int device) | |
{ | |
auto err = cudaSetDevice(device); | |
if (err != cudaSuccess) | |
{ | |
throw error("Could not set device: " + string(cudaGetErrorString(err))); | |
} | |
} | |
__host__ static gpuptr createBuffer(int device, size_t size, bool gpu) | |
{ | |
cudaError_t err; | |
void* buffer = nullptr; | |
if (gpu) | |
{ | |
setDevice(device); | |
err = cudaMalloc(&buffer, size); | |
if (err != cudaSuccess) | |
{ | |
throw error("Could not allocate device memory: " + string(cudaGetErrorString(err))); | |
} | |
return gpuptr(buffer, cudaFree); | |
} | |
else | |
{ | |
err = cudaHostAlloc(&buffer, size, cudaHostAllocMapped); | |
if (err != cudaSuccess) | |
{ | |
throw error("Could not allocate mapped host memory: " + string(cudaGetErrorString(err))); | |
} | |
return gpuptr(buffer, cudaFreeHost); | |
} | |
} | |
__host__ static void runHost(int dev, gpuptr gpubuf, gpuptr hostbuf, size_t size, size_t count, size_t repeat, cudaMemcpyKind kind) | |
{ | |
const size_t start = 1024; | |
const size_t end = size; | |
for (size_t size = start; size <= end; size <<= 1) | |
{ | |
for (size_t i = 0; i < repeat; ++i) | |
{ | |
setDevice(dev); | |
auto src = hostbuf; | |
auto dst = gpubuf; | |
if (kind == cudaMemcpyHostToDevice) | |
{ | |
src = gpubuf; | |
dst = hostbuf; | |
} | |
event before; | |
for (size_t j = 0; j < count; ++j) | |
{ | |
auto err = cudaMemcpy(src.get(), dst.get(), size, kind); | |
if (err != cudaSuccess) | |
{ | |
throw error("Could not copy memory: " + string(cudaGetErrorString(err))); | |
} | |
} | |
event after; | |
auto err = cudaDeviceSynchronize(); | |
if (err != cudaSuccess) | |
{ | |
throw error(cudaGetErrorString(err)); | |
} | |
auto time = (after - before) / count; | |
auto bw = ((double) size) / time; | |
if (kind == cudaMemcpyHostToDevice) | |
{ | |
fprintf(stdout, "host %4d HtoD %10zu %12.5f %12.3f\n", dev, size, time, bw); | |
} | |
else | |
{ | |
fprintf(stdout, "%4d host DtoH %10zu %12.5f %12.3f\n", dev, size, time, bw); | |
} | |
} | |
} | |
} | |
__global__ static void dostuff(int* ptr) | |
{ | |
float j = threadIdx.x; | |
for (int i = 1; i < 10000; ++i) | |
{ | |
j = (j + 1) / j; | |
} | |
if (threadIdx.x == j) | |
{ | |
ptr[0] = j; | |
} | |
} | |
__host__ static void runP2P(bool p2p, int srcdev, gpuptr srcbuf, int dstdev, gpuptr dstbuf, size_t size, size_t count, size_t repeat) | |
{ | |
cudaError_t err; | |
const size_t start = 1024; | |
const size_t end = size; | |
for (size_t size = start; size <= end; size <<= 1) | |
{ | |
for (size_t i = 0; i < repeat; ++i) | |
{ | |
setDevice(srcdev); | |
dostuff<<<1, 1>>>((int*) srcbuf.get()); | |
event before; | |
for (size_t j = 0; j < count; ++j) | |
{ | |
if (p2p) | |
{ | |
err = cudaMemcpyPeerAsync(srcbuf.get(), srcdev, dstbuf.get(), dstdev, size); | |
} | |
else | |
{ | |
err = cudaMemcpyAsync(srcbuf.get(), dstbuf.get(), size, cudaMemcpyDeviceToDevice); | |
} | |
if (err != cudaSuccess) | |
{ | |
throw error("Could not copy memory: " + string(cudaGetErrorString(err))); | |
} | |
} | |
event after; | |
err = cudaDeviceSynchronize(); | |
if (err != cudaSuccess) | |
{ | |
throw error(cudaGetErrorString(err)); | |
} | |
auto time = (after - before) / count; | |
auto bw = ((double) size) / time; | |
fprintf(stdout, "%4d %4d DtoD %10zu %12.5f %12.3f\n", srcdev, dstdev, size, time, bw); | |
} | |
} | |
} | |
__host__ static void bandwidthTest(bool p2p, bool dtoh, bool htod, bool dtod, int ping, int pong, size_t count, size_t repeat) | |
{ | |
const size_t size = 128ULL << 20; | |
gpuptr src = createBuffer(ping, size, true); | |
gpuptr dst = createBuffer(pong, size, true); | |
gpuptr host = createBuffer(-1, size, false); | |
fprintf(stdout, "%4s %4s %4s %10s %12s %12s\n", "src", "dst", "mode", "size", "usecs", "bandwidth"); | |
if (dtoh) | |
{ | |
runHost(ping, src, host, size, count, repeat, cudaMemcpyDeviceToHost); | |
runHost(pong, dst, host, size, count, repeat, cudaMemcpyDeviceToHost); | |
} | |
if (htod) | |
{ | |
runHost(ping, src, host, size, count, repeat, cudaMemcpyHostToDevice); | |
runHost(pong, dst, host, size, count, repeat, cudaMemcpyHostToDevice); | |
} | |
if (dtod) | |
{ | |
runP2P(p2p, ping, src, pong, dst, size, count, repeat); | |
runP2P(p2p, pong, dst, ping, src, size, count, repeat); | |
} | |
} | |
static void enableP2P(int src, int dst) | |
{ | |
cudaError_t err; | |
err = cudaSetDevice(src); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not set GPU: ") + cudaGetErrorString(err)); | |
} | |
err = cudaDeviceEnablePeerAccess(dst, 0); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Failed to enable P2P for GPU: ") + cudaGetErrorString(err)); | |
} | |
} | |
static string formatBdf(int domain, int bus, int device) | |
{ | |
char buffer[64]; | |
sprintf(buffer, "%04x:%02x:%02x.0", domain, bus, device); | |
return string(buffer); | |
} | |
static string getGpuName(int device) | |
{ | |
auto err = cudaSetDevice(device); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not set GPU: ") + cudaGetErrorString(err)); | |
} | |
cudaDeviceProp prop; | |
err = cudaGetDeviceProperties(&prop, device); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not get GPU properties: ") + cudaGetErrorString(err)); | |
} | |
return string(prop.name) + " " + formatBdf(prop.pciDomainID, prop.pciBusID, prop.pciDeviceID); | |
} | |
static bool checkP2P(int i, int j, bool use_p2p) | |
{ | |
if (i == j) | |
{ | |
return false; | |
} | |
if (!use_p2p) | |
{ | |
return true; | |
} | |
int p2p = 0; | |
auto err = cudaDeviceCanAccessPeer(&p2p, i, j); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Failed to check for P2P access: ") + cudaGetErrorString(err)); | |
} | |
if (p2p != 1) | |
{ | |
return false; | |
} | |
err = cudaDeviceCanAccessPeer(&p2p, j, i); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Failed to check for P2P access: ") + cudaGetErrorString(err)); | |
} | |
if (p2p != 1) | |
{ | |
return false; | |
} | |
return true; | |
} | |
static void findGpus(gpulist& gpus, bool p2p, int& ping, int& pong) | |
{ | |
cudaError_t err; | |
int numGpus = 0; | |
err = cudaGetDeviceCount(&numGpus); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Failed to get GPU count: ") + cudaGetErrorString(err)); | |
} | |
if (gpus.empty()) | |
{ | |
for (int i = 0; i < numGpus; ++i) | |
{ | |
gpus.push_back(i); | |
} | |
} | |
for (int i: gpus) | |
{ | |
if (gpus.size() > 1) | |
{ | |
for (int j: gpus) | |
{ | |
if (checkP2P(i, j, p2p)) | |
{ | |
ping = i; | |
pong = j; | |
return; | |
} | |
} | |
} | |
else | |
{ | |
for (int j = 0; j < numGpus; ++j) | |
{ | |
if (checkP2P(i, j, p2p)) | |
{ | |
ping = i; | |
pong = j; | |
return; | |
} | |
} | |
} | |
} | |
throw error("Could not find suitable GPUs, try -m option for host memory"); | |
} | |
static string usageString(const char* name) | |
{ | |
return string("Usage: ") + name + string(" [-c <count>] [-r <repeat>] [-m] [-g <gpu>]..."); | |
} | |
static void parseArguments(int argc, char** argv, uint32_t& repeat, uint32_t& count, bool& p2p, bool& dtod, bool& all, gpulist& gpus) | |
{ | |
char* str; | |
int opt; | |
int gpu = -1; | |
int numGpus = 0; | |
auto err = cudaGetDeviceCount(&numGpus); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Failed to get GPU count: ") + cudaGetErrorString(err)); | |
} | |
while ((opt = getopt(argc, argv, ":c:r:n:hg:mda")) != -1) | |
{ | |
switch (opt) | |
{ | |
case 'h': | |
throw error(usageString(argv[0])); | |
case ':': | |
throw error(string("Missing value for option: ") + argv[optind-1]); | |
case '?': | |
throw error(string("Unknown option: ") + argv[optind-1]); | |
case 'c': | |
str = nullptr; | |
count = strtoul(optarg, &str, 0); | |
if (str == NULL || *str != '\0' || count == 0) | |
{ | |
throw error("You're doing it wrong!"); | |
} | |
break; | |
case 'r': | |
case 'n': | |
str = nullptr; | |
repeat = strtoul(optarg, &str, 0); | |
if (str == NULL || *str != '\0' || repeat == 0) | |
{ | |
throw error("You're doing it wrong!"); | |
} | |
break; | |
case 'm': | |
p2p = false; | |
break; | |
case 'a': | |
all = true; | |
break; | |
case 'd': | |
dtod = true; | |
break; | |
case 'g': | |
str = nullptr; | |
gpu = strtol(optarg, &str, 10); | |
if (str == NULL || *str != '\0' || gpu < 0 || gpu >= numGpus) | |
{ | |
throw error("Invalid GPU"); | |
} | |
gpus.push_back(gpu); | |
break; | |
} | |
} | |
} | |
int main(int argc, char** argv) | |
{ | |
uint32_t repeat = 10; | |
uint32_t count = 1; | |
bool p2p = true; | |
int ping = -1; | |
int pong = -1; | |
bool all = false; | |
bool dtod = false; | |
try | |
{ | |
gpulist gpus; | |
parseArguments(argc, argv, repeat, count, p2p, dtod, all, gpus); | |
findGpus(gpus, p2p, ping, pong); | |
fprintf(stderr, "PING %d %s\n", ping, getGpuName(ping).c_str()); | |
if (p2p) | |
{ | |
enableP2P(ping, pong); | |
} | |
fprintf(stderr, "PONG %d %s\n", pong, getGpuName(pong).c_str()); | |
if (p2p) | |
{ | |
enableP2P(pong, ping); | |
} | |
fprintf(stdout, "count=%u\nrepeat=%u\n", count, repeat); | |
fprintf(stdout, "uva=%s\n", p2p ? "false" : "true"); | |
fprintf(stdout, "p2p=%s\n", p2p ? "true" : "false"); | |
fprintf(stdout, "ping=%d %s\n", | |
ping, getGpuName(ping).c_str()); | |
fprintf(stdout, "pong=%d %s\n", | |
pong, getGpuName(pong).c_str()); | |
cudaDeviceProp props; | |
cudaGetDeviceProperties(&props, ping); | |
fprintf(stdout, "memory_rate=%d kHz\n", props.memoryClockRate); | |
fprintf(stdout, "clock_rate=%d kHz\n", props.clockRate); | |
fprintf(stdout, "\n"); | |
bandwidthTest(p2p, all, all, dtod, ping, pong, count, repeat); | |
fprintf(stdout, "\n"); | |
} | |
catch (const error& err) | |
{ | |
fprintf(stderr, "%s\n", err.what()); | |
return 1; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment