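// Multi-GPU PCIe / peer-to-peer bandwidth stress test built on the CUDA driver API.
// Each test spawns one worker thread per device and times a batch of large async
// copies with CUDA events.
//
// Build sketch (assuming a Linux CUDA toolkit; the file name is arbitrary, and the
// #pragma below handles linking on MSVC instead):
//   nvcc -std=c++11 bandwidth_test.cpp -o bandwidth_test -lcuda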
#include <vector>
#include <thread>
#include <mutex>
#include <future>
#include <tuple>
#include <iostream>
#include <string>
#include <algorithm>
#include <cstdlib>
#include <cuda.h>

#ifdef _MSC_VER
#pragma comment(lib, "cuda.lib")
#endif

// Scale up to enqueue more load until PCIe breaks
#define MEMCOPY_ITERATIONS 500
// Scale up to increase PCIe load without increasing driver overhead
const size_t MEMCOPY_SIZE = (1 << 27); // 128 MiB per buffer

#define CHECK(expression) test::check(expression, #expression, ctx, __FILE__, __LINE__)
namespace test {

// Per-device state: pinned host buffers, device buffers, and the primary context.
class container {
public:
    std::vector<void*> hostAllocations;
    std::vector<CUdeviceptr> gpuAllocations;
    CUdevice dev;
    CUcontext ctx;
    int dev_id;
};

void check(CUresult result, const char* command, const container &ctx, const char* file, int line) {
    if (result != CUDA_SUCCESS)
    {
        static std::mutex cerr_mutex;
        std::lock_guard<std::mutex> lock(cerr_mutex);
        const char* error = nullptr;
        cuGetErrorName(result, &error);
        std::cerr << "CUDA error " << (error ? error : "UNKNOWN") << " (" << (int)result << ") in "
                  << file << ":" << line << " on device " << ctx.dev_id << std::endl;
        std::cerr << command << std::endl;
        abort();
    }
}
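
// Set up one primary context per requested device and allocate its pinned host
// buffers and device buffers, MEMCOPY_SIZE bytes each.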
void init(const std::vector<int> &device_ids, std::vector<container> &ctxs) {
    ctxs.resize(device_ids.size());
    for (size_t i = 0; i < device_ids.size(); i++)
    {
        auto &ctx = ctxs[i];
        ctx.dev_id = device_ids[i];
        CHECK(cuDeviceGet(&ctx.dev, ctx.dev_id));
        CHECK(cuDevicePrimaryCtxSetFlags(ctx.dev, CU_CTX_SCHED_BLOCKING_SYNC));
        CHECK(cuDevicePrimaryCtxRetain(&ctx.ctx, ctx.dev));
        ctx.hostAllocations.resize(4);
        ctx.gpuAllocations.resize(std::max((size_t)8, device_ids.size() * 2));
        CHECK(cuCtxPushCurrent(ctx.ctx));
        for (auto &host : ctx.hostAllocations)
        {
            CHECK(cuMemHostAlloc(&host, MEMCOPY_SIZE, CU_MEMHOSTALLOC_DEVICEMAP));
        }
        for (auto &gpu : ctx.gpuAllocations)
        {
            CHECK(cuMemAlloc(&gpu, MEMCOPY_SIZE));
        }
        CHECK(cuCtxPopCurrent(nullptr));
    }
}
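
// Free all buffers and release the primary contexts.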
void deinit(std::vector<container> &ctxs) {
    for (auto &ctx : ctxs)
    {
        CHECK(cuCtxPushCurrent(ctx.ctx));
        for (auto &host : ctx.hostAllocations)
        {
            CHECK(cuMemFreeHost(host));
        }
        for (auto &gpu : ctx.gpuAllocations)
        {
            CHECK(cuMemFree(gpu));
        }
        CHECK(cuCtxPopCurrent(nullptr));
        CHECK(cuDevicePrimaryCtxRelease(ctx.dev));
    }
    ctxs.clear();
}
// Convert an elapsed time in milliseconds for MEMCOPY_ITERATIONS copies of
// MEMCOPY_SIZE bytes each into GiB/s.
float bandwidth(float time) {
    return (float)MEMCOPY_SIZE * MEMCOPY_ITERATIONS / 1024 / 1024 / 1024 / time * 1000;
}
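
// Host-to-device: every device concurrently enqueues MEMCOPY_ITERATIONS pinned
// host-to-device copies on its own stream; the whole batch is timed with a pair
// of CUDA events so all devices load the PCIe bus at the same time.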
void bandwidth_host_to_device(std::vector<container> &ctxs)
{
    std::cout << "Host to device bandwidth test" << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    for (auto &ctx : ctxs)
    {
        workers.push_back(std::async(std::launch::async, [ctx]() -> std::tuple<int, float> {
            float elapsed;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream_up;
            CUevent start;
            CUevent stop;
            CHECK(cuStreamCreate(&stream_up, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start, 0));
            CHECK(cuEventCreate(&stop, 0));
            CHECK(cuEventRecord(start, stream_up));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                CHECK(cuMemcpyHtoDAsync(
                    ctx.gpuAllocations[i % ctx.gpuAllocations.size()],
                    ctx.hostAllocations[i % ctx.hostAllocations.size()],
                    MEMCOPY_SIZE, stream_up
                ));
            }
            CHECK(cuEventRecord(stop, stream_up));
            CHECK(cuEventSynchronize(stop));
            CHECK(cuEventElapsedTime(&elapsed, start, stop));
            CHECK(cuEventDestroy(start));
            CHECK(cuEventDestroy(stop));
            CHECK(cuStreamDestroy(stream_up));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(ctx.dev_id, elapsed);
        }));
    }
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GiB/s)" << std::endl;
    }
}
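
// Device-to-host: the same measurement with the copy direction reversed.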
void bandwidth_device_to_host(std::vector<container> &ctxs)
{
    std::cout << "Device to host bandwidth test" << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    for (auto &ctx : ctxs)
    {
        workers.push_back(std::async(std::launch::async, [ctx]() -> std::tuple<int, float> {
            float elapsed;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream_down;
            CUevent start;
            CUevent stop;
            CHECK(cuStreamCreate(&stream_down, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start, 0));
            CHECK(cuEventCreate(&stop, 0));
            CHECK(cuEventRecord(start, stream_down));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                CHECK(cuMemcpyDtoHAsync(
                    ctx.hostAllocations[i % ctx.hostAllocations.size()],
                    ctx.gpuAllocations[i % ctx.gpuAllocations.size()],
                    MEMCOPY_SIZE, stream_down
                ));
            }
            CHECK(cuEventRecord(stop, stream_down));
            CHECK(cuEventSynchronize(stop));
            CHECK(cuEventElapsedTime(&elapsed, start, stop));
            CHECK(cuEventDestroy(start));
            CHECK(cuEventDestroy(stop));
            CHECK(cuStreamDestroy(stream_down));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(ctx.dev_id, elapsed);
        }));
    }
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GiB/s)" << std::endl;
    }
}
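
// Bidirectional: uploads and downloads run concurrently on two independent
// streams per device.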
void bandwidth_bidirectional(std::vector<container> &ctxs)
{
    std::cout << "Bidirectional multi-stream bandwidth test" << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    for (auto &ctx : ctxs)
    {
        workers.push_back(std::async(std::launch::async, [ctx]() -> std::tuple<int, float> {
            float elapsed;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream_up;
            CUstream stream_down;
            CUevent start_up;
            CUevent start_down;
            CUevent stop_up;
            CUevent stop_down;
            CHECK(cuStreamCreate(&stream_up, CU_STREAM_NON_BLOCKING));
            CHECK(cuStreamCreate(&stream_down, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start_up, 0));
            CHECK(cuEventCreate(&start_down, 0));
            CHECK(cuEventCreate(&stop_up, 0));
            CHECK(cuEventCreate(&stop_down, 0));
            CHECK(cuEventRecord(start_up, stream_up));
            CHECK(cuEventRecord(start_down, stream_down));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                // Uploads use even buffer indices, downloads odd ones, so the
                // two streams never touch the same allocation in an iteration.
                CHECK(cuMemcpyHtoDAsync(
                    ctx.gpuAllocations[(i * 2) % ctx.gpuAllocations.size()],
                    ctx.hostAllocations[(i * 2) % ctx.hostAllocations.size()],
                    MEMCOPY_SIZE,
                    stream_up
                ));
                CHECK(cuMemcpyDtoHAsync(
                    ctx.hostAllocations[(i * 2 + 1) % ctx.hostAllocations.size()],
                    ctx.gpuAllocations[(i * 2 + 1) % ctx.gpuAllocations.size()],
                    MEMCOPY_SIZE,
                    stream_down
                ));
            }
            CHECK(cuEventRecord(stop_up, stream_up));
            CHECK(cuEventRecord(stop_down, stream_down));
            CHECK(cuEventSynchronize(stop_up));
            CHECK(cuEventSynchronize(stop_down));
            {
                // Elapsed time is the widest span between any start event and
                // any stop event, covering both streams.
                float a, b, c, d;
                CHECK(cuEventElapsedTime(&a, start_up, stop_up));
                CHECK(cuEventElapsedTime(&b, start_up, stop_down));
                CHECK(cuEventElapsedTime(&c, start_down, stop_up));
                CHECK(cuEventElapsedTime(&d, start_down, stop_down));
                elapsed = std::max({ a, b, c, d });
            }
            CHECK(cuEventDestroy(start_up));
            CHECK(cuEventDestroy(start_down));
            CHECK(cuEventDestroy(stop_up));
            CHECK(cuEventDestroy(stop_down));
            CHECK(cuStreamDestroy(stream_up));
            CHECK(cuStreamDestroy(stream_down));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(ctx.dev_id, elapsed);
        }));
    }
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        // Twice the data moved in the measured window, so halve the time to
        // report aggregate bandwidth.
        std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time / 2) << "GiB/s)" << std::endl;
    }
}
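
// Peer-to-peer gather: all other devices simultaneously push data into the
// target device with cuMemcpyPeerAsync.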
void bandwidth_device_to_device_gather(std::vector<container> &ctxs, container &target)
{
    std::cout << "Device to device peer2peer bandwidth test, target GPU " << target.dev_id << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    int index = 0;
    for (auto &ctx : ctxs)
    {
        if (&ctx == &target)
        {
            continue;
        }
        workers.push_back(std::async(std::launch::async, [target, index, ctx]() -> std::tuple<int, float> {
            float elapsed;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream;
            CUevent start;
            CUevent stop;
            CHECK(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start, 0));
            CHECK(cuEventCreate(&stop, 0));
            CHECK(cuEventRecord(start, stream));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                CHECK(cuMemcpyPeerAsync(
                    target.gpuAllocations[(i * index) % target.gpuAllocations.size()],
                    target.ctx,
                    ctx.gpuAllocations[i % ctx.gpuAllocations.size()],
                    ctx.ctx,
                    MEMCOPY_SIZE, stream
                ));
            }
            CHECK(cuEventRecord(stop, stream));
            CHECK(cuEventSynchronize(stop));
            CHECK(cuEventElapsedTime(&elapsed, start, stop));
            CHECK(cuEventDestroy(start));
            CHECK(cuEventDestroy(stop));
            CHECK(cuStreamDestroy(stream));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(ctx.dev_id, elapsed);
        }));
        index++;
    }
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GiB/s)" << std::endl;
    }
}
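
// Peer-to-peer scatter: all other devices simultaneously pull data from the
// source device.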
void bandwidth_device_to_device_scatter(std::vector<container> &ctxs, container &source)
{
    std::cout << "Device to device peer2peer bandwidth test, source GPU " << source.dev_id << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    int index = 0;
    for (auto &ctx : ctxs)
    {
        if (&ctx == &source)
        {
            continue;
        }
        workers.push_back(std::async(std::launch::async, [source, index, ctx]() -> std::tuple<int, float> {
            float elapsed;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream;
            CUevent start;
            CUevent stop;
            CHECK(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start, 0));
            CHECK(cuEventCreate(&stop, 0));
            CHECK(cuEventRecord(start, stream));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                CHECK(cuMemcpyPeerAsync(
                    ctx.gpuAllocations[i % ctx.gpuAllocations.size()],
                    ctx.ctx,
                    source.gpuAllocations[i % source.gpuAllocations.size()],
                    source.ctx,
                    MEMCOPY_SIZE, stream
                ));
            }
            CHECK(cuEventRecord(stop, stream));
            CHECK(cuEventSynchronize(stop));
            CHECK(cuEventElapsedTime(&elapsed, start, stop));
            CHECK(cuEventDestroy(start));
            CHECK(cuEventDestroy(stop));
            CHECK(cuStreamDestroy(stream));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(ctx.dev_id, elapsed);
        }));
        index++;
    }
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GiB/s)" << std::endl;
    }
}
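
// Peer-to-peer ring shift: every device sends to its neighbour (source + 1),
// putting traffic on all links at once.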
void bandwidth_device_to_device_shift(std::vector<container> &ctxs)
{
    std::cout << "Device to device peer2peer bandwidth test, target = source + 1" << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    for (size_t i = 0; i < ctxs.size(); i++)
    {
        auto &source = ctxs[i];
        auto &target = ctxs[(i + 1) % ctxs.size()];
        if (&source == &target)
        {
            continue;
        }
        workers.push_back(std::async(std::launch::async, [source, target]() -> std::tuple<int, float> {
            float elapsed;
            auto &ctx = source;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream;
            CUevent start;
            CUevent stop;
            CHECK(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start, 0));
            CHECK(cuEventCreate(&stop, 0));
            CHECK(cuEventRecord(start, stream));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                // Sources read even-numbered buffers, targets are written at odd
                // ones, so a device's outgoing and incoming copies stay disjoint.
                CHECK(cuMemcpyPeerAsync(
                    target.gpuAllocations[(i * 2 + 1) % target.gpuAllocations.size()],
                    target.ctx,
                    source.gpuAllocations[(i * 2) % source.gpuAllocations.size()],
                    source.ctx,
                    MEMCOPY_SIZE, stream
                ));
            }
            CHECK(cuEventRecord(stop, stream));
            CHECK(cuEventSynchronize(stop));
            CHECK(cuEventElapsedTime(&elapsed, start, stop));
            CHECK(cuEventDestroy(start));
            CHECK(cuEventDestroy(stop));
            CHECK(cuStreamDestroy(stream));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(source.dev_id, elapsed);
        }));
    }
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        std::cout << "GPU " << dev_id << " (source) took " << time << "ms (" << bandwidth(time) << "GiB/s)" << std::endl;
    }
}
} // namespace test
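
// Parse device ordinals from the command line (default: all visible devices),
// run the host<->device tests on every device, then the peer-to-peer tests:
// gather/scatter need more than two devices, the ring shift more than one.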
int main(int argc, char** argv)
{
    std::vector<int> device_ids;
    std::vector<test::container> ctxs;
    if (argc == 1)
    {
        std::cout << "usage: " << argv[0] << " deviceID deviceID...\n";
        std::cout << "defaulting to test all devices\n";
    }
    if (cuInit(0) != CUDA_SUCCESS)
    {
        std::cout << "cuInit failed, aborting...\n";
        exit(1);
    }
    if (argc > 1)
    {
        for (int i = 0; i < argc - 1; i++)
        {
            int dev = atoi(argv[i + 1]);
            CUdevice device;
            if (cuDeviceGet(&device, dev) != CUDA_SUCCESS)
            {
                std::cout << "Could not get device " << dev << ", aborting\n";
                exit(1);
            }
            device_ids.push_back(dev);
        }
    }
    else
    {
        int deviceCount = 0;
        cuDeviceGetCount(&deviceCount);
        for (int dev = 0; dev < deviceCount; dev++)
        {
            CUdevice device;
            if (cuDeviceGet(&device, dev) != CUDA_SUCCESS)
            {
                std::cout << "Could not get device " << dev << ", aborting\n";
                exit(1);
            }
            device_ids.push_back(dev);
        }
    }
    test::init(device_ids, ctxs);
    test::bandwidth_host_to_device(ctxs);
    test::bandwidth_device_to_host(ctxs);
    test::bandwidth_bidirectional(ctxs);
    if (ctxs.size() > 2)
    {
        for (auto &ctx : ctxs)
        {
            test::bandwidth_device_to_device_gather(ctxs, ctx);
        }
        for (auto &ctx : ctxs)
        {
            test::bandwidth_device_to_device_scatter(ctxs, ctx);
        }
    }
    if (ctxs.size() > 1)
    {
        test::bandwidth_device_to_device_shift(ctxs);
    }
    test::deinit(ctxs);
    return 0;
}