Last active
April 3, 2023 15:06
-
-
Save enfiskutensykkel/2b0f7afcb35d12477165746f062c7453 to your computer and use it in GitHub Desktop.
Simple CUDA program for measuring the ping-pong latency between two GPUs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda.h> | |
#include <stdexcept> | |
#include <string> | |
#include <vector> | |
#include <memory> | |
#include <cstdint> | |
#include <cstdio> | |
#include <unistd.h> | |
using error = std::runtime_error; | |
using std::string; | |
using gpulist = std::vector<int>; | |
//using clockptr = std::shared_ptr<clock_t>; | |
//typedef clock_t clockt; | |
//#define get_clock() clock() | |
using clockptr = std::shared_ptr<uint64_t>; | |
typedef uint64_t clockt; | |
#define get_clock() clock64() | |
struct gputask | |
{ | |
int device; | |
bool hostbuffer; | |
uint32_t* buffer; | |
cudaStream_t stream; | |
gputask(bool p2p, int device, cudaStream_t stream); | |
gputask(bool p2p, int device); | |
~gputask(); | |
}; | |
static void freeBuffer(gputask& task) | |
{ | |
if (!task.hostbuffer) | |
{ | |
cudaFree(task.buffer); | |
} | |
else | |
{ | |
cudaFreeHost(task.buffer); | |
} | |
} | |
static cudaError_t allocBuffer(gputask& task) | |
{ | |
cudaError_t err; | |
if (!task.hostbuffer) | |
{ | |
err = cudaMalloc((void**) &task.buffer, sizeof(uint32_t)); | |
} | |
else | |
{ | |
err = cudaHostAlloc((void**) &task.buffer, sizeof(uint32_t), cudaHostAllocMapped); | |
} | |
return err; | |
} | |
gputask::gputask(bool p2p, int device, cudaStream_t stream) | |
: device(device) | |
, hostbuffer(!p2p) | |
, buffer(nullptr) | |
, stream(stream) | |
{ | |
auto err = cudaSetDevice(device); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not set device: ") + cudaGetErrorString(err)); | |
} | |
err = allocBuffer(*this); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not allocate device memory: ") + cudaGetErrorString(err)); | |
} | |
} | |
gputask::gputask(bool p2p, int device) | |
: device(device) | |
, hostbuffer(!p2p) | |
, buffer(nullptr) | |
{ | |
auto err = cudaSetDevice(device); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not set device: ") + cudaGetErrorString(err)); | |
} | |
err = allocBuffer(*this); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not allocate device memory: ") + cudaGetErrorString(err)); | |
} | |
err = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); | |
//err = cudaStreamCreate(&stream); | |
if (err != cudaSuccess) | |
{ | |
freeBuffer(*this); | |
throw error(string("Failed to create stream: ") + cudaGetErrorString(err)); | |
} | |
} | |
gputask::~gputask() | |
{ | |
if (stream != 0) | |
{ | |
cudaStreamDestroy(stream); | |
} | |
freeBuffer(*this); | |
} | |
__global__ static void pingKernel(volatile uint32_t* remote, volatile uint32_t* local, uint32_t n, clockt* times) | |
{ | |
if (*local != 0) | |
{ | |
// Assume something is wrong | |
return; | |
} | |
clockt after; | |
clockt before; | |
for (uint32_t i = 1; i <= n; ++i) | |
{ | |
before = get_clock(); | |
*remote = i; | |
while (*local < i); | |
after = get_clock(); | |
times[i-1] = after - before; | |
} | |
} | |
__global__ static void pongKernel(volatile uint32_t* remote, volatile uint32_t* local, uint32_t n) | |
{ | |
if (*remote != 0) | |
{ | |
// Assume something is wrong | |
return; | |
} | |
for (uint32_t i = 1; i <= n; ++i) | |
{ | |
while (*local < i); | |
*remote = i; | |
} | |
} | |
__host__ static void resetBuffer(const gputask& task) | |
{ | |
auto err = cudaSetDevice(task.device); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not set device: ") + cudaGetErrorString(err)); | |
} | |
err = cudaMemset(task.buffer, 0, sizeof(uint32_t)); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not memset: ") + cudaGetErrorString(err)); | |
} | |
} | |
__host__ static uint32_t getCount(const gputask& task) | |
{ | |
uint32_t count = 0; | |
auto err = cudaSetDevice(task.device); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not set device: ") + cudaGetErrorString(err)); | |
} | |
err = cudaMemcpy(&count, task.buffer, sizeof(uint32_t), cudaMemcpyDeviceToHost); | |
if (err != cudaSuccess) | |
{ | |
throw error("Failed to copy count from GPU memory"); | |
} | |
return count; | |
} | |
__host__ static clockptr createTimes(const gputask& ping, uint32_t count) | |
{ | |
clockt* times = nullptr; | |
auto err = cudaSetDevice(ping.device); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not set device: ") + cudaGetErrorString(err)); | |
} | |
err = cudaMalloc(×, sizeof(clockt) * count); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not allocate array: ") + cudaGetErrorString(err)); | |
} | |
return clockptr(times, [times](void*) { cudaFree(times); }); | |
} | |
__host__ static void printTimes(const gputask& ping, const clockptr dtimes, uint32_t count) | |
{ | |
auto err = cudaSetDevice(ping.device); | |
if (err != cudaSuccess) | |
{ | |
throw error("Failed to set GPU"); | |
} | |
cudaDeviceProp props; | |
err = cudaGetDeviceProperties(&props, ping.device); | |
if (err != cudaSuccess) | |
{ | |
throw error("Failed to get device properties: " + string(cudaGetErrorString(err))); | |
} | |
clockt* htimes = nullptr; | |
err = cudaHostAlloc(&htimes, sizeof(clockt) * count, cudaHostAllocDefault); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Failed to allocate host memory: ") + cudaGetErrorString(err)); | |
} | |
err = cudaMemcpy(htimes, dtimes.get(), sizeof(clockt) * count, cudaMemcpyDeviceToHost); | |
if (err != cudaSuccess) | |
{ | |
cudaFreeHost(htimes); | |
throw error("Failed to copy memory: " + string(cudaGetErrorString(err))); | |
} | |
for (uint32_t i = 0; i < count; ++i) | |
{ | |
uint64_t cycles = htimes[i]; | |
double time = cycles / ((double) props.clockRate); | |
fprintf(stdout, "%6lu %12.5f\n", cycles, time); | |
} | |
cudaFreeHost(htimes); | |
} | |
__host__ static void latencyTest(bool p2p, int pingDevice, int pongDevice, uint32_t count, uint32_t repeat) | |
{ | |
cudaError_t err = cudaSuccess; | |
gputask ping(p2p, pingDevice, 0); | |
gputask pong(p2p, pongDevice); | |
auto times = createTimes(ping, count); | |
fprintf(stdout, "%-6s %12s\n", "cycles", "ms"); | |
for (uint32_t i = 0; i < repeat; ++i) | |
{ | |
resetBuffer(ping); | |
resetBuffer(pong); | |
err = cudaSetDevice(pong.device); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not set device: ") + cudaGetErrorString(err)); | |
} | |
pongKernel<<<1, 1, 0, pong.stream>>>(ping.buffer, pong.buffer, count); | |
err = cudaSetDevice(ping.device); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not set device: ") + cudaGetErrorString(err)); | |
} | |
pingKernel<<<1, 1, 0, ping.stream>>>(pong.buffer, ping.buffer, count, times.get()); | |
cudaStreamSynchronize(pong.stream); | |
cudaStreamSynchronize(ping.stream); | |
auto pingCount = getCount(ping); | |
auto pongCount = getCount(pong); | |
if (pingCount != count || pongCount != count) | |
{ | |
throw error("Pings and pongs differ from expected count"); | |
} | |
printTimes(ping, times, count); | |
} | |
} | |
static void enableP2P(int src, int dst) | |
{ | |
cudaError_t err; | |
err = cudaSetDevice(src); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not set GPU: ") + cudaGetErrorString(err)); | |
} | |
err = cudaDeviceEnablePeerAccess(dst, 0); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Failed to enable P2P for GPU: ") + cudaGetErrorString(err)); | |
} | |
} | |
static string formatBdf(int domain, int bus, int device) | |
{ | |
char buffer[64]; | |
sprintf(buffer, "%04x:%02x:%02x.0", domain, bus, device); | |
return string(buffer); | |
} | |
static string getGpuName(int device) | |
{ | |
auto err = cudaSetDevice(device); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not set GPU: ") + cudaGetErrorString(err)); | |
} | |
cudaDeviceProp prop; | |
err = cudaGetDeviceProperties(&prop, device); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Could not get GPU properties: ") + cudaGetErrorString(err)); | |
} | |
return string(prop.name) + " " + formatBdf(prop.pciDomainID, prop.pciBusID, prop.pciDeviceID); | |
} | |
static bool checkP2P(int i, int j, bool use_p2p) | |
{ | |
if (i == j) | |
{ | |
return false; | |
} | |
if (!use_p2p) | |
{ | |
return true; | |
} | |
int p2p = 0; | |
auto err = cudaDeviceCanAccessPeer(&p2p, i, j); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Failed to check for P2P access: ") + cudaGetErrorString(err)); | |
} | |
if (p2p != 1) | |
{ | |
return false; | |
} | |
err = cudaDeviceCanAccessPeer(&p2p, j, i); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Failed to check for P2P access: ") + cudaGetErrorString(err)); | |
} | |
if (p2p != 1) | |
{ | |
return false; | |
} | |
return true; | |
} | |
static void findGpus(gpulist& gpus, bool p2p, int& ping, int& pong) | |
{ | |
cudaError_t err; | |
int numGpus = 0; | |
err = cudaGetDeviceCount(&numGpus); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Failed to get GPU count: ") + cudaGetErrorString(err)); | |
} | |
if (gpus.empty()) | |
{ | |
for (int i = 0; i < numGpus; ++i) | |
{ | |
gpus.push_back(i); | |
} | |
} | |
for (int i: gpus) | |
{ | |
if (gpus.size() > 1) | |
{ | |
for (int j: gpus) | |
{ | |
if (checkP2P(i, j, p2p)) | |
{ | |
ping = i; | |
pong = j; | |
return; | |
} | |
} | |
} | |
else | |
{ | |
for (int j = 0; j < numGpus; ++j) | |
{ | |
if (checkP2P(i, j, p2p)) | |
{ | |
ping = i; | |
pong = j; | |
return; | |
} | |
} | |
} | |
} | |
throw error("Could not find suitable GPUs, try -m option for host memory"); | |
} | |
static string usageString(const char* name) | |
{ | |
return string("Usage: ") + name + string(" [-c <count>] [-r <repeat>] [-m] [-g <gpu>]..."); | |
} | |
static void parseArguments(int argc, char** argv, uint32_t& repeat, uint32_t& count, bool& p2p, gpulist& gpus) | |
{ | |
char* str; | |
int opt; | |
int gpu = -1; | |
int numGpus = 0; | |
auto err = cudaGetDeviceCount(&numGpus); | |
if (err != cudaSuccess) | |
{ | |
throw error(string("Failed to get GPU count: ") + cudaGetErrorString(err)); | |
} | |
while ((opt = getopt(argc, argv, ":c:r:n:hg:m")) != -1) | |
{ | |
switch (opt) | |
{ | |
case 'h': | |
throw error(usageString(argv[0])); | |
case ':': | |
throw error(string("Missing value for option: ") + argv[optind-1]); | |
case '?': | |
throw error(string("Unknown option: ") + argv[optind-1]); | |
case 'c': | |
str = nullptr; | |
count = strtoul(optarg, &str, 0); | |
if (str == NULL || *str != '\0' || count == 0) | |
{ | |
throw error("You're doing it wrong!"); | |
} | |
break; | |
case 'r': | |
case 'n': | |
str = nullptr; | |
repeat = strtoul(optarg, &str, 0); | |
if (str == NULL || *str != '\0' || repeat == 0) | |
{ | |
throw error("You're doing it wrong!"); | |
} | |
break; | |
case 'm': | |
p2p = false; | |
break; | |
case 'g': | |
str = nullptr; | |
gpu = strtol(optarg, &str, 10); | |
if (str == NULL || *str != '\0' || gpu < 0 || gpu >= numGpus) | |
{ | |
throw error("Invalid GPU"); | |
} | |
gpus.push_back(gpu); | |
break; | |
} | |
} | |
} | |
int main(int argc, char** argv) | |
{ | |
uint32_t repeat = 1; | |
uint32_t count = 100; | |
bool p2p = true; | |
int ping = -1; | |
int pong = -1; | |
try | |
{ | |
gpulist gpus; | |
parseArguments(argc, argv, repeat, count, p2p, gpus); | |
findGpus(gpus, p2p, ping, pong); | |
fprintf(stderr, "PING %d %s\n", ping, getGpuName(ping).c_str()); | |
if (p2p) | |
{ | |
enableP2P(ping, pong); | |
} | |
fprintf(stderr, "PONG %d %s\n", pong, getGpuName(pong).c_str()); | |
if (p2p) | |
{ | |
enableP2P(pong, ping); | |
} | |
fprintf(stdout, "count=%u\nrepeat=%u\n", count, repeat); | |
fprintf(stdout, "uva=%s\n", p2p ? "false" : "true"); | |
fprintf(stdout, "p2p=%s\n", p2p ? "true" : "false"); | |
fprintf(stdout, "ping=%d %s\n", | |
ping, getGpuName(ping).c_str()); | |
fprintf(stdout, "pong=%d %s\n", | |
pong, getGpuName(pong).c_str()); | |
cudaDeviceProp props; | |
cudaGetDeviceProperties(&props, ping); | |
if (p2p) | |
{ | |
fprintf(stdout, "memory_rate=%d kHz\n", props.memoryClockRate); | |
} | |
else | |
{ | |
fprintf(stdout, "memory_rate=\n"); | |
} | |
fprintf(stdout, "clock_rate=%d kHz\n", props.clockRate); | |
fprintf(stdout, "\n"); | |
latencyTest(p2p, ping, pong, count, repeat); | |
fprintf(stdout, "\n"); | |
} | |
catch (const error& err) | |
{ | |
fprintf(stderr, "%s\n", err.what()); | |
return 1; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment