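// Multi-GPU PCIe / peer-to-peer bandwidth stress test built on the CUDA driver API.
// Each test spawns one worker thread per device and times a batch of large async
// copies with CUDA events.
//
// Build sketch (assuming a Linux CUDA toolkit; the file name is arbitrary, and the
// #pragma below handles linking on MSVC instead):
//   nvcc -std=c++11 bandwidth_test.cpp -o bandwidth_test -lcuda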
#include <vector>
#include <thread>
#include <mutex>
#include <future>
#include <tuple>
#include <iostream>
#include <string>
#include <algorithm>
#include <cstdlib>
#include <cuda.h>

#ifdef _MSC_VER
#pragma comment(lib, "cuda.lib")
#endif

// Scale up to enqueue more load until PCIe breaks
#define MEMCOPY_ITERATIONS 500
// Scale up to increase PCIe load without increasing driver overhead
const size_t MEMCOPY_SIZE = (1 << 27); // 128 MiB per buffer

#define CHECK(expression) test::check(expression, #expression, ctx, __FILE__, __LINE__)
namespace test {

// Per-device state: pinned host buffers, device buffers, and the primary context.
class container {
public:
    std::vector<void*> hostAllocations;
    std::vector<CUdeviceptr> gpuAllocations;
    CUdevice dev;
    CUcontext ctx;
    int dev_id;
};

void check(CUresult result, const char* command, const container &ctx, const char* file, int line) {
    if (result != CUDA_SUCCESS)
    {
        static std::mutex cerr_mutex;
        std::lock_guard<std::mutex> lock(cerr_mutex);
        const char* error = nullptr;
        cuGetErrorName(result, &error);
        std::cerr << "CUDA error " << (error ? error : "UNKNOWN") << " (" << (int)result << ") in "
                  << file << ":" << line << " on device " << ctx.dev_id << std::endl;
        std::cerr << command << std::endl;
        abort();
    }
}
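
// Set up one primary context per requested device and allocate its pinned host
// buffers and device buffers, MEMCOPY_SIZE bytes each.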
void init(const std::vector<int> &device_ids, std::vector<container> &ctxs) {
    ctxs.resize(device_ids.size());
    for (size_t i = 0; i < device_ids.size(); i++)
    {
        auto &ctx = ctxs[i];
        ctx.dev_id = device_ids[i];
        CHECK(cuDeviceGet(&ctx.dev, ctx.dev_id));
        CHECK(cuDevicePrimaryCtxSetFlags(ctx.dev, CU_CTX_SCHED_BLOCKING_SYNC));
        CHECK(cuDevicePrimaryCtxRetain(&ctx.ctx, ctx.dev));
        ctx.hostAllocations.resize(4);
        ctx.gpuAllocations.resize(std::max((size_t)8, device_ids.size() * 2));
        CHECK(cuCtxPushCurrent(ctx.ctx));
        for (auto &host : ctx.hostAllocations)
        {
            CHECK(cuMemHostAlloc(&host, MEMCOPY_SIZE, CU_MEMHOSTALLOC_DEVICEMAP));
        }
        for (auto &gpu : ctx.gpuAllocations)
        {
            CHECK(cuMemAlloc(&gpu, MEMCOPY_SIZE));
        }
        CHECK(cuCtxPopCurrent(nullptr));
    }
}
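
// Free all buffers and release the primary contexts.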
void deinit(std::vector<container> &ctxs) {
    for (auto &ctx : ctxs)
    {
        CHECK(cuCtxPushCurrent(ctx.ctx));
        for (auto &host : ctx.hostAllocations)
        {
            CHECK(cuMemFreeHost(host));
        }
        for (auto &gpu : ctx.gpuAllocations)
        {
            CHECK(cuMemFree(gpu));
        }
        CHECK(cuCtxPopCurrent(nullptr));
        CHECK(cuDevicePrimaryCtxRelease(ctx.dev));
    }
    ctxs.clear();
}
// Convert an elapsed time in milliseconds for MEMCOPY_ITERATIONS copies of
// MEMCOPY_SIZE bytes each into GiB/s.
float bandwidth(float time) {
    return (float)MEMCOPY_SIZE * MEMCOPY_ITERATIONS / 1024 / 1024 / 1024 / time * 1000;
}
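
// Host-to-device: every device concurrently enqueues MEMCOPY_ITERATIONS pinned
// host-to-device copies on its own stream; the whole batch is timed with a pair
// of CUDA events so all devices load the PCIe bus at the same time.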
void bandwidth_host_to_device(std::vector<container> &ctxs)
{
    std::cout << "Host to device bandwidth test" << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    for (auto &ctx : ctxs)
    {
        workers.push_back(std::async(std::launch::async, [ctx]() -> std::tuple<int, float> {
            float elapsed;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream_up;
            CUevent start;
            CUevent stop;
            CHECK(cuStreamCreate(&stream_up, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start, 0));
            CHECK(cuEventCreate(&stop, 0));
            CHECK(cuEventRecord(start, stream_up));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                CHECK(cuMemcpyHtoDAsync(
                    ctx.gpuAllocations[i % ctx.gpuAllocations.size()],
                    ctx.hostAllocations[i % ctx.hostAllocations.size()],
                    MEMCOPY_SIZE, stream_up
                ));
            }
            CHECK(cuEventRecord(stop, stream_up));
            CHECK(cuEventSynchronize(stop));
            CHECK(cuEventElapsedTime(&elapsed, start, stop));
            CHECK(cuEventDestroy(start));
            CHECK(cuEventDestroy(stop));
            CHECK(cuStreamDestroy(stream_up));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(ctx.dev_id, elapsed);
        }));
    }
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GiB/s)" << std::endl;
    }
}
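
// Device-to-host: the same measurement with the copy direction reversed.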
void bandwidth_device_to_host(std::vector<container> &ctxs)
{
    std::cout << "Device to host bandwidth test" << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    for (auto &ctx : ctxs)
    {
        workers.push_back(std::async(std::launch::async, [ctx]() -> std::tuple<int, float> {
            float elapsed;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream_down;
            CUevent start;
            CUevent stop;
            CHECK(cuStreamCreate(&stream_down, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start, 0));
            CHECK(cuEventCreate(&stop, 0));
            CHECK(cuEventRecord(start, stream_down));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                CHECK(cuMemcpyDtoHAsync(
                    ctx.hostAllocations[i % ctx.hostAllocations.size()],
                    ctx.gpuAllocations[i % ctx.gpuAllocations.size()],
                    MEMCOPY_SIZE, stream_down
                ));
            }
            CHECK(cuEventRecord(stop, stream_down));
            CHECK(cuEventSynchronize(stop));
            CHECK(cuEventElapsedTime(&elapsed, start, stop));
            CHECK(cuEventDestroy(start));
            CHECK(cuEventDestroy(stop));
            CHECK(cuStreamDestroy(stream_down));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(ctx.dev_id, elapsed);
        }));
    }
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GiB/s)" << std::endl;
    }
}
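
// Bidirectional: uploads and downloads run concurrently on two independent
// streams per device.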
void bandwidth_bidirectional(std::vector<container> &ctxs)
{
    std::cout << "Bidirectional multi-stream bandwidth test" << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    for (auto &ctx : ctxs)
    {
        workers.push_back(std::async(std::launch::async, [ctx]() -> std::tuple<int, float> {
            float elapsed;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream_up;
            CUstream stream_down;
            CUevent start_up;
            CUevent start_down;
            CUevent stop_up;
            CUevent stop_down;
            CHECK(cuStreamCreate(&stream_up, CU_STREAM_NON_BLOCKING));
            CHECK(cuStreamCreate(&stream_down, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start_up, 0));
            CHECK(cuEventCreate(&start_down, 0));
            CHECK(cuEventCreate(&stop_up, 0));
            CHECK(cuEventCreate(&stop_down, 0));
            CHECK(cuEventRecord(start_up, stream_up));
            CHECK(cuEventRecord(start_down, stream_down));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                // Uploads use even buffer indices, downloads odd ones, so the
                // two streams never touch the same allocation in an iteration.
                CHECK(cuMemcpyHtoDAsync(
                    ctx.gpuAllocations[(i * 2) % ctx.gpuAllocations.size()],
                    ctx.hostAllocations[(i * 2) % ctx.hostAllocations.size()],
                    MEMCOPY_SIZE,
                    stream_up
                ));
                CHECK(cuMemcpyDtoHAsync(
                    ctx.hostAllocations[(i * 2 + 1) % ctx.hostAllocations.size()],
                    ctx.gpuAllocations[(i * 2 + 1) % ctx.gpuAllocations.size()],
                    MEMCOPY_SIZE,
                    stream_down
                ));
            }
            CHECK(cuEventRecord(stop_up, stream_up));
            CHECK(cuEventRecord(stop_down, stream_down));
            CHECK(cuEventSynchronize(stop_up));
            CHECK(cuEventSynchronize(stop_down));
            {
                // Elapsed time is the widest span between any start event and
                // any stop event, covering both streams.
                float a, b, c, d;
                CHECK(cuEventElapsedTime(&a, start_up, stop_up));
                CHECK(cuEventElapsedTime(&b, start_up, stop_down));
                CHECK(cuEventElapsedTime(&c, start_down, stop_up));
                CHECK(cuEventElapsedTime(&d, start_down, stop_down));
                elapsed = std::max({ a, b, c, d });
            }
            CHECK(cuEventDestroy(start_up));
            CHECK(cuEventDestroy(start_down));
            CHECK(cuEventDestroy(stop_up));
            CHECK(cuEventDestroy(stop_down));
            CHECK(cuStreamDestroy(stream_up));
            CHECK(cuStreamDestroy(stream_down));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(ctx.dev_id, elapsed);
        }));
    }
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        // Twice the data moved in the measured window, so halve the time to
        // report aggregate bandwidth.
        std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time / 2) << "GiB/s)" << std::endl;
    }
}
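
// Peer-to-peer gather: all other devices simultaneously push data into the
// target device with cuMemcpyPeerAsync.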
void bandwidth_device_to_device_gather(std::vector<container> &ctxs, container &target)
{
    std::cout << "Device to device peer2peer bandwidth test, target GPU " << target.dev_id << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    int index = 0;
    for (auto &ctx : ctxs)
    {
        if (&ctx == &target)
        {
            continue;
        }
        workers.push_back(std::async(std::launch::async, [target, index, ctx]() -> std::tuple<int, float> {
            float elapsed;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream;
            CUevent start;
            CUevent stop;
            CHECK(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start, 0));
            CHECK(cuEventCreate(&stop, 0));
            CHECK(cuEventRecord(start, stream));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                CHECK(cuMemcpyPeerAsync(
                    target.gpuAllocations[(i * index) % target.gpuAllocations.size()],
                    target.ctx,
                    ctx.gpuAllocations[i % ctx.gpuAllocations.size()],
                    ctx.ctx,
                    MEMCOPY_SIZE, stream
                ));
            }
            CHECK(cuEventRecord(stop, stream));
            CHECK(cuEventSynchronize(stop));
            CHECK(cuEventElapsedTime(&elapsed, start, stop));
            CHECK(cuEventDestroy(start));
            CHECK(cuEventDestroy(stop));
            CHECK(cuStreamDestroy(stream));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(ctx.dev_id, elapsed);
        }));
        index++;
    }
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GiB/s)" << std::endl;
    }
}
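
// Peer-to-peer scatter: all other devices simultaneously pull data from the
// source device.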
void bandwidth_device_to_device_scatter(std::vector<container> &ctxs, container &source)
{
    std::cout << "Device to device peer2peer bandwidth test, source GPU " << source.dev_id << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    int index = 0;
    for (auto &ctx : ctxs)
    {
        if (&ctx == &source)
        {
            continue;
        }
        workers.push_back(std::async(std::launch::async, [source, index, ctx]() -> std::tuple<int, float> {
            float elapsed;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream;
            CUevent start;
            CUevent stop;
            CHECK(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start, 0));
            CHECK(cuEventCreate(&stop, 0));
            CHECK(cuEventRecord(start, stream));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                CHECK(cuMemcpyPeerAsync(
                    ctx.gpuAllocations[i % ctx.gpuAllocations.size()],
                    ctx.ctx,
                    source.gpuAllocations[i % source.gpuAllocations.size()],
                    source.ctx,
                    MEMCOPY_SIZE, stream
                ));
            }
            CHECK(cuEventRecord(stop, stream));
            CHECK(cuEventSynchronize(stop));
            CHECK(cuEventElapsedTime(&elapsed, start, stop));
            CHECK(cuEventDestroy(start));
            CHECK(cuEventDestroy(stop));
            CHECK(cuStreamDestroy(stream));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(ctx.dev_id, elapsed);
        }));
        index++;
    }
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GiB/s)" << std::endl;
    }
}
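
// Peer-to-peer ring shift: every device sends to its neighbour (source + 1),
// putting traffic on all links at once.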
void bandwidth_device_to_device_shift(std::vector<container> &ctxs)
{
    std::cout << "Device to device peer2peer bandwidth test, target = source + 1" << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    for (size_t i = 0; i < ctxs.size(); i++)
    {
        auto &source = ctxs[i];
        auto &target = ctxs[(i + 1) % ctxs.size()];
        if (&source == &target)
        {
            continue;
        }
        workers.push_back(std::async(std::launch::async, [source, target]() -> std::tuple<int, float> {
            float elapsed;
            auto &ctx = source;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream;
            CUevent start;
            CUevent stop;
            CHECK(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start, 0));
            CHECK(cuEventCreate(&stop, 0));
            CHECK(cuEventRecord(start, stream));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                // Sources read even-numbered buffers, targets are written at odd
                // ones, so a device's outgoing and incoming copies stay disjoint.
                CHECK(cuMemcpyPeerAsync(
                    target.gpuAllocations[(i * 2 + 1) % target.gpuAllocations.size()],
                    target.ctx,
                    source.gpuAllocations[(i * 2) % source.gpuAllocations.size()],
                    source.ctx,
                    MEMCOPY_SIZE, stream
                ));
            }
            CHECK(cuEventRecord(stop, stream));
            CHECK(cuEventSynchronize(stop));
            CHECK(cuEventElapsedTime(&elapsed, start, stop));
            CHECK(cuEventDestroy(start));
            CHECK(cuEventDestroy(stop));
            CHECK(cuStreamDestroy(stream));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(source.dev_id, elapsed);
        }));
    }
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        std::cout << "GPU " << dev_id << " (source) took " << time << "ms (" << bandwidth(time) << "GiB/s)" << std::endl;
    }
}
} // namespace test
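
// Parse device ordinals from the command line (default: all visible devices),
// run the host<->device tests on every device, then the peer-to-peer tests:
// gather/scatter need more than two devices, the ring shift more than one.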
int main(int argc, char** argv)
{
    std::vector<int> device_ids;
    std::vector<test::container> ctxs;
    if (argc == 1)
    {
        std::cout << "usage: " << argv[0] << " deviceID deviceID...\n";
        std::cout << "defaulting to test all devices\n";
    }
    if (cuInit(0) != CUDA_SUCCESS)
    {
        std::cout << "cuInit failed, aborting...\n";
        exit(1);
    }
    if (argc > 1)
    {
        for (int i = 0; i < argc - 1; i++)
        {
            int dev = atoi(argv[i + 1]);
            CUdevice device;
            if (cuDeviceGet(&device, dev) != CUDA_SUCCESS)
            {
                std::cout << "Could not get device " << dev << ", aborting\n";
                exit(1);
            }
            device_ids.push_back(dev);
        }
    }
    else
    {
        int deviceCount = 0;
        cuDeviceGetCount(&deviceCount);
        for (int dev = 0; dev < deviceCount; dev++)
        {
            CUdevice device;
            if (cuDeviceGet(&device, dev) != CUDA_SUCCESS)
            {
                std::cout << "Could not get device " << dev << ", aborting\n";
                exit(1);
            }
            device_ids.push_back(dev);
        }
    }
    test::init(device_ids, ctxs);
    test::bandwidth_host_to_device(ctxs);
    test::bandwidth_device_to_host(ctxs);
    test::bandwidth_bidirectional(ctxs);
    if (ctxs.size() > 2)
    {
        for (auto &ctx : ctxs)
        {
            test::bandwidth_device_to_device_gather(ctxs, ctx);
        }
        for (auto &ctx : ctxs)
        {
            test::bandwidth_device_to_device_scatter(ctxs, ctx);
        }
    }
    if (ctxs.size() > 1)
    {
        test::bandwidth_device_to_device_shift(ctxs);
    }
    test::deinit(ctxs);
    return 0;
}