Skip to content

Instantly share code, notes, and snippets.

@enfiskutensykkel
Last active April 3, 2023 15:06
Show Gist options
  • Save enfiskutensykkel/2b0f7afcb35d12477165746f062c7453 to your computer and use it in GitHub Desktop.
Save enfiskutensykkel/2b0f7afcb35d12477165746f062c7453 to your computer and use it in GitHub Desktop.
Simple CUDA program for measuring the ping-pong latency between two GPUs
#include <cuda.h>
#include <stdexcept>
#include <string>
#include <vector>
#include <memory>
#include <cstdint>
#include <cstdio>
#include <unistd.h>
using error = std::runtime_error;
using std::string;
using gpulist = std::vector<int>;
//using clockptr = std::shared_ptr<clock_t>;
//typedef clock_t clockt;
//#define get_clock() clock()
using clockptr = std::shared_ptr<uint64_t>;
typedef uint64_t clockt;
#define get_clock() clock64()
struct gputask
{
int device;
bool hostbuffer;
uint32_t* buffer;
cudaStream_t stream;
gputask(bool p2p, int device, cudaStream_t stream);
gputask(bool p2p, int device);
~gputask();
};
static void freeBuffer(gputask& task)
{
if (!task.hostbuffer)
{
cudaFree(task.buffer);
}
else
{
cudaFreeHost(task.buffer);
}
}
static cudaError_t allocBuffer(gputask& task)
{
cudaError_t err;
if (!task.hostbuffer)
{
err = cudaMalloc((void**) &task.buffer, sizeof(uint32_t));
}
else
{
err = cudaHostAlloc((void**) &task.buffer, sizeof(uint32_t), cudaHostAllocMapped);
}
return err;
}
gputask::gputask(bool p2p, int device, cudaStream_t stream)
: device(device)
, hostbuffer(!p2p)
, buffer(nullptr)
, stream(stream)
{
auto err = cudaSetDevice(device);
if (err != cudaSuccess)
{
throw error(string("Could not set device: ") + cudaGetErrorString(err));
}
err = allocBuffer(*this);
if (err != cudaSuccess)
{
throw error(string("Could not allocate device memory: ") + cudaGetErrorString(err));
}
}
gputask::gputask(bool p2p, int device)
: device(device)
, hostbuffer(!p2p)
, buffer(nullptr)
{
auto err = cudaSetDevice(device);
if (err != cudaSuccess)
{
throw error(string("Could not set device: ") + cudaGetErrorString(err));
}
err = allocBuffer(*this);
if (err != cudaSuccess)
{
throw error(string("Could not allocate device memory: ") + cudaGetErrorString(err));
}
err = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
//err = cudaStreamCreate(&stream);
if (err != cudaSuccess)
{
freeBuffer(*this);
throw error(string("Failed to create stream: ") + cudaGetErrorString(err));
}
}
gputask::~gputask()
{
if (stream != 0)
{
cudaStreamDestroy(stream);
}
freeBuffer(*this);
}
__global__ static void pingKernel(volatile uint32_t* remote, volatile uint32_t* local, uint32_t n, clockt* times)
{
if (*local != 0)
{
// Assume something is wrong
return;
}
clockt after;
clockt before;
for (uint32_t i = 1; i <= n; ++i)
{
before = get_clock();
*remote = i;
while (*local < i);
after = get_clock();
times[i-1] = after - before;
}
}
__global__ static void pongKernel(volatile uint32_t* remote, volatile uint32_t* local, uint32_t n)
{
if (*remote != 0)
{
// Assume something is wrong
return;
}
for (uint32_t i = 1; i <= n; ++i)
{
while (*local < i);
*remote = i;
}
}
__host__ static void resetBuffer(const gputask& task)
{
auto err = cudaSetDevice(task.device);
if (err != cudaSuccess)
{
throw error(string("Could not set device: ") + cudaGetErrorString(err));
}
err = cudaMemset(task.buffer, 0, sizeof(uint32_t));
if (err != cudaSuccess)
{
throw error(string("Could not memset: ") + cudaGetErrorString(err));
}
}
__host__ static uint32_t getCount(const gputask& task)
{
uint32_t count = 0;
auto err = cudaSetDevice(task.device);
if (err != cudaSuccess)
{
throw error(string("Could not set device: ") + cudaGetErrorString(err));
}
err = cudaMemcpy(&count, task.buffer, sizeof(uint32_t), cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
throw error("Failed to copy count from GPU memory");
}
return count;
}
__host__ static clockptr createTimes(const gputask& ping, uint32_t count)
{
clockt* times = nullptr;
auto err = cudaSetDevice(ping.device);
if (err != cudaSuccess)
{
throw error(string("Could not set device: ") + cudaGetErrorString(err));
}
err = cudaMalloc(&times, sizeof(clockt) * count);
if (err != cudaSuccess)
{
throw error(string("Could not allocate array: ") + cudaGetErrorString(err));
}
return clockptr(times, [times](void*) { cudaFree(times); });
}
__host__ static void printTimes(const gputask& ping, const clockptr dtimes, uint32_t count)
{
auto err = cudaSetDevice(ping.device);
if (err != cudaSuccess)
{
throw error("Failed to set GPU");
}
cudaDeviceProp props;
err = cudaGetDeviceProperties(&props, ping.device);
if (err != cudaSuccess)
{
throw error("Failed to get device properties: " + string(cudaGetErrorString(err)));
}
clockt* htimes = nullptr;
err = cudaHostAlloc(&htimes, sizeof(clockt) * count, cudaHostAllocDefault);
if (err != cudaSuccess)
{
throw error(string("Failed to allocate host memory: ") + cudaGetErrorString(err));
}
err = cudaMemcpy(htimes, dtimes.get(), sizeof(clockt) * count, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
cudaFreeHost(htimes);
throw error("Failed to copy memory: " + string(cudaGetErrorString(err)));
}
for (uint32_t i = 0; i < count; ++i)
{
uint64_t cycles = htimes[i];
double time = cycles / ((double) props.clockRate);
fprintf(stdout, "%6lu %12.5f\n", cycles, time);
}
cudaFreeHost(htimes);
}
__host__ static void latencyTest(bool p2p, int pingDevice, int pongDevice, uint32_t count, uint32_t repeat)
{
cudaError_t err = cudaSuccess;
gputask ping(p2p, pingDevice, 0);
gputask pong(p2p, pongDevice);
auto times = createTimes(ping, count);
fprintf(stdout, "%-6s %12s\n", "cycles", "ms");
for (uint32_t i = 0; i < repeat; ++i)
{
resetBuffer(ping);
resetBuffer(pong);
err = cudaSetDevice(pong.device);
if (err != cudaSuccess)
{
throw error(string("Could not set device: ") + cudaGetErrorString(err));
}
pongKernel<<<1, 1, 0, pong.stream>>>(ping.buffer, pong.buffer, count);
err = cudaSetDevice(ping.device);
if (err != cudaSuccess)
{
throw error(string("Could not set device: ") + cudaGetErrorString(err));
}
pingKernel<<<1, 1, 0, ping.stream>>>(pong.buffer, ping.buffer, count, times.get());
cudaStreamSynchronize(pong.stream);
cudaStreamSynchronize(ping.stream);
auto pingCount = getCount(ping);
auto pongCount = getCount(pong);
if (pingCount != count || pongCount != count)
{
throw error("Pings and pongs differ from expected count");
}
printTimes(ping, times, count);
}
}
static void enableP2P(int src, int dst)
{
cudaError_t err;
err = cudaSetDevice(src);
if (err != cudaSuccess)
{
throw error(string("Could not set GPU: ") + cudaGetErrorString(err));
}
err = cudaDeviceEnablePeerAccess(dst, 0);
if (err != cudaSuccess)
{
throw error(string("Failed to enable P2P for GPU: ") + cudaGetErrorString(err));
}
}
static string formatBdf(int domain, int bus, int device)
{
char buffer[64];
sprintf(buffer, "%04x:%02x:%02x.0", domain, bus, device);
return string(buffer);
}
static string getGpuName(int device)
{
auto err = cudaSetDevice(device);
if (err != cudaSuccess)
{
throw error(string("Could not set GPU: ") + cudaGetErrorString(err));
}
cudaDeviceProp prop;
err = cudaGetDeviceProperties(&prop, device);
if (err != cudaSuccess)
{
throw error(string("Could not get GPU properties: ") + cudaGetErrorString(err));
}
return string(prop.name) + " " + formatBdf(prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
}
static bool checkP2P(int i, int j, bool use_p2p)
{
if (i == j)
{
return false;
}
if (!use_p2p)
{
return true;
}
int p2p = 0;
auto err = cudaDeviceCanAccessPeer(&p2p, i, j);
if (err != cudaSuccess)
{
throw error(string("Failed to check for P2P access: ") + cudaGetErrorString(err));
}
if (p2p != 1)
{
return false;
}
err = cudaDeviceCanAccessPeer(&p2p, j, i);
if (err != cudaSuccess)
{
throw error(string("Failed to check for P2P access: ") + cudaGetErrorString(err));
}
if (p2p != 1)
{
return false;
}
return true;
}
static void findGpus(gpulist& gpus, bool p2p, int& ping, int& pong)
{
cudaError_t err;
int numGpus = 0;
err = cudaGetDeviceCount(&numGpus);
if (err != cudaSuccess)
{
throw error(string("Failed to get GPU count: ") + cudaGetErrorString(err));
}
if (gpus.empty())
{
for (int i = 0; i < numGpus; ++i)
{
gpus.push_back(i);
}
}
for (int i: gpus)
{
if (gpus.size() > 1)
{
for (int j: gpus)
{
if (checkP2P(i, j, p2p))
{
ping = i;
pong = j;
return;
}
}
}
else
{
for (int j = 0; j < numGpus; ++j)
{
if (checkP2P(i, j, p2p))
{
ping = i;
pong = j;
return;
}
}
}
}
throw error("Could not find suitable GPUs, try -m option for host memory");
}
static string usageString(const char* name)
{
return string("Usage: ") + name + string(" [-c <count>] [-r <repeat>] [-m] [-g <gpu>]...");
}
static void parseArguments(int argc, char** argv, uint32_t& repeat, uint32_t& count, bool& p2p, gpulist& gpus)
{
char* str;
int opt;
int gpu = -1;
int numGpus = 0;
auto err = cudaGetDeviceCount(&numGpus);
if (err != cudaSuccess)
{
throw error(string("Failed to get GPU count: ") + cudaGetErrorString(err));
}
while ((opt = getopt(argc, argv, ":c:r:n:hg:m")) != -1)
{
switch (opt)
{
case 'h':
throw error(usageString(argv[0]));
case ':':
throw error(string("Missing value for option: ") + argv[optind-1]);
case '?':
throw error(string("Unknown option: ") + argv[optind-1]);
case 'c':
str = nullptr;
count = strtoul(optarg, &str, 0);
if (str == NULL || *str != '\0' || count == 0)
{
throw error("You're doing it wrong!");
}
break;
case 'r':
case 'n':
str = nullptr;
repeat = strtoul(optarg, &str, 0);
if (str == NULL || *str != '\0' || repeat == 0)
{
throw error("You're doing it wrong!");
}
break;
case 'm':
p2p = false;
break;
case 'g':
str = nullptr;
gpu = strtol(optarg, &str, 10);
if (str == NULL || *str != '\0' || gpu < 0 || gpu >= numGpus)
{
throw error("Invalid GPU");
}
gpus.push_back(gpu);
break;
}
}
}
int main(int argc, char** argv)
{
uint32_t repeat = 1;
uint32_t count = 100;
bool p2p = true;
int ping = -1;
int pong = -1;
try
{
gpulist gpus;
parseArguments(argc, argv, repeat, count, p2p, gpus);
findGpus(gpus, p2p, ping, pong);
fprintf(stderr, "PING %d %s\n", ping, getGpuName(ping).c_str());
if (p2p)
{
enableP2P(ping, pong);
}
fprintf(stderr, "PONG %d %s\n", pong, getGpuName(pong).c_str());
if (p2p)
{
enableP2P(pong, ping);
}
fprintf(stdout, "count=%u\nrepeat=%u\n", count, repeat);
fprintf(stdout, "uva=%s\n", p2p ? "false" : "true");
fprintf(stdout, "p2p=%s\n", p2p ? "true" : "false");
fprintf(stdout, "ping=%d %s\n",
ping, getGpuName(ping).c_str());
fprintf(stdout, "pong=%d %s\n",
pong, getGpuName(pong).c_str());
cudaDeviceProp props;
cudaGetDeviceProperties(&props, ping);
if (p2p)
{
fprintf(stdout, "memory_rate=%d kHz\n", props.memoryClockRate);
}
else
{
fprintf(stdout, "memory_rate=\n");
}
fprintf(stdout, "clock_rate=%d kHz\n", props.clockRate);
fprintf(stdout, "\n");
latencyTest(p2p, ping, pong, count, repeat);
fprintf(stdout, "\n");
}
catch (const error& err)
{
fprintf(stderr, "%s\n", err.what());
return 1;
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment