@enfiskutensykkel
Created May 24, 2019 16:17
Simple program for measuring GPU bandwidth
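//
// Build with nvcc (file and binary names below are assumptions, not part of the gist):
//
//   nvcc -O2 -o gpu-bandwidth gpu-bandwidth.cu
//
// Options (see parseArguments below):
//   -c <count>   copies per timed batch
//   -r <repeat>  repetitions per transfer size
//   -a           run host-to-device and device-to-host tests
//   -d           run device-to-device tests
//   -m           disable P2P (device-to-device copies go through UVA/host memory)
//   -g <gpu>     restrict the test to the given GPU (may be repeated)
//   -h           print usage
//
// At least one of -a or -d must be given for any transfers to be measured.
//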
#include <cuda.h>
#include <stdexcept>
#include <string>
#include <vector>
#include <memory>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <unistd.h>

using error = std::runtime_error;
using std::string;
using gpulist = std::vector<int>;
using gpuptr = std::shared_ptr<void>;

// RAII wrapper around a CUDA event: the constructor creates and records the
// event, and subtracting two events yields the elapsed time in microseconds.
struct event
{
    cudaEvent_t evt;

    event()
    {
        auto err = cudaEventCreate(&evt);
        if (err != cudaSuccess)
        {
            throw error("Could not create event: " + string(cudaGetErrorString(err)));
        }

        err = cudaEventRecord(evt);
        if (err != cudaSuccess)
        {
            throw error("Could not record event: " + string(cudaGetErrorString(err)));
        }
    }

    ~event()
    {
        cudaEventDestroy(evt);
    }

    // Elapsed time from the other (earlier) event to this one, in microseconds.
    double operator-(const event& other) const
    {
        float ms = 0;
        auto err = cudaEventElapsedTime(&ms, other.evt, evt);
        if (err != cudaSuccess)
        {
            throw error("Could not get elapsed time: " + string(cudaGetErrorString(err)));
        }

        return ((double) ms) * 1000.0;
    }
};
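
// Timing pattern used below: construct one event right before a batch of copies
// and another right after, synchronize the device, and divide the elapsed
// microseconds by the batch size to get the average time per copy.
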
__host__ static void setDevice(int device)
{
    auto err = cudaSetDevice(device);
    if (err != cudaSuccess)
    {
        throw error("Could not set device: " + string(cudaGetErrorString(err)));
    }
}

// Allocate either device memory on the given GPU or mapped (pinned) host memory,
// wrapped in a shared_ptr that calls the matching deallocator.
__host__ static gpuptr createBuffer(int device, size_t size, bool gpu)
{
    cudaError_t err;
    void* buffer = nullptr;

    if (gpu)
    {
        setDevice(device);

        err = cudaMalloc(&buffer, size);
        if (err != cudaSuccess)
        {
            throw error("Could not allocate device memory: " + string(cudaGetErrorString(err)));
        }

        return gpuptr(buffer, cudaFree);
    }
    else
    {
        err = cudaHostAlloc(&buffer, size, cudaHostAllocMapped);
        if (err != cudaSuccess)
        {
            throw error("Could not allocate mapped host memory: " + string(cudaGetErrorString(err)));
        }

        return gpuptr(buffer, cudaFreeHost);
    }
}

// Time host<->device copies for transfer sizes from 1 KiB up to the full buffer,
// doubling each step. Each measurement copies 'count' times and is repeated
// 'repeat' times; results are printed as microseconds per copy and bytes per
// microsecond (equivalently MB/s).
__host__ static void runHost(int dev, gpuptr gpubuf, gpuptr hostbuf, size_t size, size_t count, size_t repeat, cudaMemcpyKind kind)
{
    const size_t start = 1024;
    const size_t end = size;

    for (size_t size = start; size <= end; size <<= 1)
    {
        for (size_t i = 0; i < repeat; ++i)
        {
            setDevice(dev);

            // cudaMemcpy takes the destination pointer first; pick direction from kind
            auto dst = hostbuf;
            auto src = gpubuf;
            if (kind == cudaMemcpyHostToDevice)
            {
                dst = gpubuf;
                src = hostbuf;
            }

            event before;
            for (size_t j = 0; j < count; ++j)
            {
                auto err = cudaMemcpy(dst.get(), src.get(), size, kind);
                if (err != cudaSuccess)
                {
                    throw error("Could not copy memory: " + string(cudaGetErrorString(err)));
                }
            }
            event after;

            auto err = cudaDeviceSynchronize();
            if (err != cudaSuccess)
            {
                throw error(cudaGetErrorString(err));
            }

            auto time = (after - before) / count;
            auto bw = ((double) size) / time;

            if (kind == cudaMemcpyHostToDevice)
            {
                fprintf(stdout, "host %4d HtoD %10zu %12.5f %12.3f\n", dev, size, time, bw);
            }
            else
            {
                fprintf(stdout, "%4d host DtoH %10zu %12.5f %12.3f\n", dev, size, time, bw);
            }
        }
    }
}

// Dummy kernel that keeps the GPU briefly busy so the host can enqueue the timed
// copies before the GPU starts executing them, keeping launch overhead out of the
// measurement.
__global__ static void dostuff(int* ptr)
{
    float j = threadIdx.x;

    for (int i = 1; i < 10000; ++i)
    {
        j = (j + 1) / j;
    }

    // The comparison is effectively never true; the conditional store only exists
    // so the compiler cannot discard the loop.
    if (threadIdx.x == j)
    {
        ptr[0] = j;
    }
}

// Time device-to-device copies from srcbuf (on srcdev) to dstbuf (on dstdev),
// either as explicit peer-to-peer copies or as plain UVA device-to-device copies.
__host__ static void runP2P(bool p2p, int srcdev, gpuptr srcbuf, int dstdev, gpuptr dstbuf, size_t size, size_t count, size_t repeat)
{
    cudaError_t err;
    const size_t start = 1024;
    const size_t end = size;

    for (size_t size = start; size <= end; size <<= 1)
    {
        for (size_t i = 0; i < repeat; ++i)
        {
            setDevice(srcdev);

            // Keep the source GPU busy while the copies are enqueued
            dostuff<<<1, 1>>>((int*) srcbuf.get());

            event before;
            for (size_t j = 0; j < count; ++j)
            {
                // Both copy functions take the destination first
                if (p2p)
                {
                    err = cudaMemcpyPeerAsync(dstbuf.get(), dstdev, srcbuf.get(), srcdev, size);
                }
                else
                {
                    err = cudaMemcpyAsync(dstbuf.get(), srcbuf.get(), size, cudaMemcpyDeviceToDevice);
                }

                if (err != cudaSuccess)
                {
                    throw error("Could not copy memory: " + string(cudaGetErrorString(err)));
                }
            }
            event after;

            err = cudaDeviceSynchronize();
            if (err != cudaSuccess)
            {
                throw error(cudaGetErrorString(err));
            }

            auto time = (after - before) / count;
            auto bw = ((double) size) / time;

            fprintf(stdout, "%4d %4d DtoD %10zu %12.5f %12.3f\n", srcdev, dstdev, size, time, bw);
        }
    }
}

// Allocate 128 MiB buffers on both GPUs plus a mapped host buffer, then run the
// requested transfer tests.
__host__ static void bandwidthTest(bool p2p, bool dtoh, bool htod, bool dtod, int ping, int pong, size_t count, size_t repeat)
{
    const size_t size = 128ULL << 20;

    gpuptr src = createBuffer(ping, size, true);
    gpuptr dst = createBuffer(pong, size, true);
    gpuptr host = createBuffer(-1, size, false);

    fprintf(stdout, "%4s %4s %4s %10s %12s %12s\n", "src", "dst", "mode", "size", "usecs", "bandwidth");

    if (dtoh)
    {
        runHost(ping, src, host, size, count, repeat, cudaMemcpyDeviceToHost);
        runHost(pong, dst, host, size, count, repeat, cudaMemcpyDeviceToHost);
    }

    if (htod)
    {
        runHost(ping, src, host, size, count, repeat, cudaMemcpyHostToDevice);
        runHost(pong, dst, host, size, count, repeat, cudaMemcpyHostToDevice);
    }

    if (dtod)
    {
        runP2P(p2p, ping, src, pong, dst, size, count, repeat);
        runP2P(p2p, pong, dst, ping, src, size, count, repeat);
    }
}

static void enableP2P(int src, int dst)
{
    cudaError_t err;

    err = cudaSetDevice(src);
    if (err != cudaSuccess)
    {
        throw error(string("Could not set GPU: ") + cudaGetErrorString(err));
    }

    err = cudaDeviceEnablePeerAccess(dst, 0);
    if (err != cudaSuccess)
    {
        throw error(string("Failed to enable P2P for GPU: ") + cudaGetErrorString(err));
    }
}

// Format a PCI BDF string (domain:bus:device.function) for a GPU.
static string formatBdf(int domain, int bus, int device)
{
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "%04x:%02x:%02x.0", domain, bus, device);
    return string(buffer);
}

static string getGpuName(int device)
{
    auto err = cudaSetDevice(device);
    if (err != cudaSuccess)
    {
        throw error(string("Could not set GPU: ") + cudaGetErrorString(err));
    }

    cudaDeviceProp prop;
    err = cudaGetDeviceProperties(&prop, device);
    if (err != cudaSuccess)
    {
        throw error(string("Could not get GPU properties: ") + cudaGetErrorString(err));
    }

    return string(prop.name) + " " + formatBdf(prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
}

// A GPU pair is usable if the devices are distinct and, when P2P is requested,
// peer access is possible in both directions.
static bool checkP2P(int i, int j, bool use_p2p)
{
    if (i == j)
    {
        return false;
    }

    if (!use_p2p)
    {
        return true;
    }

    int p2p = 0;

    auto err = cudaDeviceCanAccessPeer(&p2p, i, j);
    if (err != cudaSuccess)
    {
        throw error(string("Failed to check for P2P access: ") + cudaGetErrorString(err));
    }

    if (p2p != 1)
    {
        return false;
    }

    err = cudaDeviceCanAccessPeer(&p2p, j, i);
    if (err != cudaSuccess)
    {
        throw error(string("Failed to check for P2P access: ") + cudaGetErrorString(err));
    }

    if (p2p != 1)
    {
        return false;
    }

    return true;
}

// Pick a pair of GPUs (ping, pong) to test. If GPUs were specified with -g,
// search among those (or pair a single specified GPU with any other GPU in the
// system); otherwise consider all GPUs.
static void findGpus(gpulist& gpus, bool p2p, int& ping, int& pong)
{
    cudaError_t err;
    int numGpus = 0;

    err = cudaGetDeviceCount(&numGpus);
    if (err != cudaSuccess)
    {
        throw error(string("Failed to get GPU count: ") + cudaGetErrorString(err));
    }

    if (gpus.empty())
    {
        for (int i = 0; i < numGpus; ++i)
        {
            gpus.push_back(i);
        }
    }

    for (int i: gpus)
    {
        if (gpus.size() > 1)
        {
            for (int j: gpus)
            {
                if (checkP2P(i, j, p2p))
                {
                    ping = i;
                    pong = j;
                    return;
                }
            }
        }
        else
        {
            for (int j = 0; j < numGpus; ++j)
            {
                if (checkP2P(i, j, p2p))
                {
                    ping = i;
                    pong = j;
                    return;
                }
            }
        }
    }

    throw error("Could not find suitable GPUs, try the -m option to use host memory instead");
}

static string usageString(const char* name)
{
    return string("Usage: ") + name + string(" [-c <count>] [-r <repeat>] [-m] [-d] [-a] [-g <gpu>]...");
}

// Parse command-line options:
//   -c <count>   number of copies per timed batch
//   -r/-n <num>  number of repetitions per transfer size
//   -m           disable P2P; device-to-device copies go through UVA/host memory
//   -d           run device-to-device tests
//   -a           run host-to-device and device-to-host tests
//   -g <gpu>     restrict the test to the given GPU (may be repeated)
static void parseArguments(int argc, char** argv, uint32_t& repeat, uint32_t& count, bool& p2p, bool& dtod, bool& all, gpulist& gpus)
{
    char* str;
    int opt;
    int gpu = -1;
    int numGpus = 0;

    auto err = cudaGetDeviceCount(&numGpus);
    if (err != cudaSuccess)
    {
        throw error(string("Failed to get GPU count: ") + cudaGetErrorString(err));
    }

    while ((opt = getopt(argc, argv, ":c:r:n:hg:mda")) != -1)
    {
        switch (opt)
        {
            case 'h':
                throw error(usageString(argv[0]));

            case ':':
                throw error(string("Missing value for option: ") + argv[optind-1]);

            case '?':
                throw error(string("Unknown option: ") + argv[optind-1]);

            case 'c':
                str = nullptr;
                count = strtoul(optarg, &str, 0);
                if (str == nullptr || *str != '\0' || count == 0)
                {
                    throw error("Invalid count");
                }
                break;

            case 'r':
            case 'n':
                str = nullptr;
                repeat = strtoul(optarg, &str, 0);
                if (str == nullptr || *str != '\0' || repeat == 0)
                {
                    throw error("Invalid repeat count");
                }
                break;

            case 'm':
                p2p = false;
                break;

            case 'a':
                all = true;
                break;

            case 'd':
                dtod = true;
                break;

            case 'g':
                str = nullptr;
                gpu = strtol(optarg, &str, 10);
                if (str == nullptr || *str != '\0' || gpu < 0 || gpu >= numGpus)
                {
                    throw error("Invalid GPU");
                }
                gpus.push_back(gpu);
                break;
        }
    }
}

int main(int argc, char** argv)
{
    uint32_t repeat = 10;
    uint32_t count = 1;
    bool p2p = true;
    int ping = -1;
    int pong = -1;
    bool all = false;
    bool dtod = false;

    try
    {
        gpulist gpus;
        parseArguments(argc, argv, repeat, count, p2p, dtod, all, gpus);
        findGpus(gpus, p2p, ping, pong);

        fprintf(stderr, "PING %d %s\n", ping, getGpuName(ping).c_str());
        if (p2p)
        {
            enableP2P(ping, pong);
        }

        fprintf(stderr, "PONG %d %s\n", pong, getGpuName(pong).c_str());
        if (p2p)
        {
            enableP2P(pong, ping);
        }

        fprintf(stdout, "count=%u\nrepeat=%u\n", count, repeat);
        fprintf(stdout, "uva=%s\n", p2p ? "false" : "true");
        fprintf(stdout, "p2p=%s\n", p2p ? "true" : "false");
        fprintf(stdout, "ping=%d %s\n", ping, getGpuName(ping).c_str());
        fprintf(stdout, "pong=%d %s\n", pong, getGpuName(pong).c_str());

        cudaDeviceProp props;
        cudaGetDeviceProperties(&props, ping);
        fprintf(stdout, "memory_rate=%d kHz\n", props.memoryClockRate);
        fprintf(stdout, "clock_rate=%d kHz\n", props.clockRate);
        fprintf(stdout, "\n");

        bandwidthTest(p2p, all, all, dtod, ping, pong, count, repeat);
        fprintf(stdout, "\n");
    }
    catch (const error& err)
    {
        fprintf(stderr, "%s\n", err.what());
        return 1;
    }

    return 0;
}
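
// Example invocation (binary name is an assumption): measure host<->device and
// device<->device bandwidth between GPU 0 and GPU 1, copying 10 times per
// measurement (assuming the two GPUs can reach each other with P2P; add -m otherwise):
//
//   ./gpu-bandwidth -a -d -c 10 -g 0 -g 1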