enfiskutensykkel/ping-pong.cu

## ping-pong.cu
#include <cuda.h>
#include <stdexcept>
#include <string>
#include <vector>
#include <memory>
#include <cstdint>
#include <cstdio>
#include <unistd.h>

using error = std::runtime_error;
using std::string;
using gpulist = std::vector<int>;
//using clockptr = std::shared_ptr<clock_t>;
//typedef clock_t clockt;
//#define get_clock() clock()

using clockptr = std::shared_ptr<uint64_t>;
typedef uint64_t clockt;
#define get_clock() clock64()

struct gputask
{
    int             device;
    bool            hostbuffer;
    uint32_t*       buffer;
    cudaStream_t    stream;

    gputask(bool p2p, int device, cudaStream_t stream);
    gputask(bool p2p, int device);
    ~gputask();
};


static void freeBuffer(gputask& task)
{
    if (!task.hostbuffer)
    {
        cudaFree(task.buffer);
    }
    else
    {
        cudaFreeHost(task.buffer);
    }
}


static cudaError_t allocBuffer(gputask& task)
{
    cudaError_t err;

    if (!task.hostbuffer)
    {
        err = cudaMalloc((void**) &task.buffer, sizeof(uint32_t));
    }
    else
    {
        err = cudaHostAlloc((void**) &task.buffer, sizeof(uint32_t), cudaHostAllocMapped);
    }

    return err;
}


gputask::gputask(bool p2p, int device, cudaStream_t stream)
    : device(device)
    , hostbuffer(!p2p)
    , buffer(nullptr)
    , stream(stream)
{
    auto err = cudaSetDevice(device);
    if (err != cudaSuccess)
    {
        throw error(string("Could not set device: ") + cudaGetErrorString(err));
    }

    err = allocBuffer(*this);
    if (err != cudaSuccess)
    {
        throw error(string("Could not allocate device memory: ") + cudaGetErrorString(err));
    }
}


gputask::gputask(bool p2p, int device)
    : device(device)
    , hostbuffer(!p2p)
    , buffer(nullptr)
{
    auto err = cudaSetDevice(device);
    if (err != cudaSuccess)
    {
        throw error(string("Could not set device: ") + cudaGetErrorString(err));
    }

    err = allocBuffer(*this);
    if (err != cudaSuccess)
    {
        throw error(string("Could not allocate device memory: ") + cudaGetErrorString(err));
    }

    err = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    //err = cudaStreamCreate(&stream);
    if (err != cudaSuccess)
    {
        freeBuffer(*this);
        throw error(string("Failed to create stream: ") + cudaGetErrorString(err));
    }
}


gputask::~gputask()
{
    if (stream != 0)
    {
        cudaStreamDestroy(stream);
    }
    freeBuffer(*this);
}


__global__ static void pingKernel(volatile uint32_t* remote, volatile uint32_t* local, uint32_t n, clockt* times)
{
    if (*local != 0)
    {
        // Assume something is wrong
        return;
    }

    clockt after;
    clockt before;

    for (uint32_t i = 1; i <= n; ++i)
    {
        before = get_clock();

        *remote = i;
        while (*local < i);
        after = get_clock();

        times[i-1] = after - before;
    }
}


__global__ static void pongKernel(volatile uint32_t* remote, volatile uint32_t* local, uint32_t n)
{
    if (*remote != 0)
    {
        // Assume something is wrong
        return;
    }

    for (uint32_t i = 1; i <= n; ++i)
    {
        while (*local < i);
        *remote = i;
    }
}


__host__ static void resetBuffer(const gputask& task)
{
    auto err = cudaSetDevice(task.device);
    if (err != cudaSuccess)
    {
        throw error(string("Could not set device: ") + cudaGetErrorString(err));
    }

    err = cudaMemset(task.buffer, 0, sizeof(uint32_t));
    if (err != cudaSuccess)
    {
        throw error(string("Could not memset: ") + cudaGetErrorString(err));
    }
}


__host__ static uint32_t getCount(const gputask& task)
{
    uint32_t count = 0;

    auto err = cudaSetDevice(task.device);
    if (err != cudaSuccess)
    {
        throw error(string("Could not set device: ") + cudaGetErrorString(err));
    }

    err = cudaMemcpy(&count, task.buffer, sizeof(uint32_t), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        throw error("Failed to copy count from GPU memory");
    }

    return count;
}


__host__ static clockptr createTimes(const gputask& ping, uint32_t count)
{
    clockt* times = nullptr;

    auto err = cudaSetDevice(ping.device);
    if (err != cudaSuccess)
    {
        throw error(string("Could not set device: ") + cudaGetErrorString(err));
    }

    err = cudaMalloc(&times, sizeof(clockt) * count);
    if (err != cudaSuccess)
    {
        throw error(string("Could not allocate array: ") + cudaGetErrorString(err));
    }

    return clockptr(times, [times](void*) { cudaFree(times); });
}


__host__ static void printTimes(const gputask& ping, const clockptr dtimes, uint32_t count)
{
    auto err = cudaSetDevice(ping.device);
    if (err != cudaSuccess)
    {
        throw error("Failed to set GPU");
    }

    cudaDeviceProp props;
    err = cudaGetDeviceProperties(&props, ping.device);
    if (err != cudaSuccess)
    {
        throw error("Failed to get device properties: " + string(cudaGetErrorString(err)));
    }

    clockt* htimes = nullptr;
    err = cudaHostAlloc(&htimes, sizeof(clockt) * count, cudaHostAllocDefault);
    if (err != cudaSuccess)
    {
        throw error(string("Failed to allocate host memory: ") + cudaGetErrorString(err));
    }

    err = cudaMemcpy(htimes, dtimes.get(), sizeof(clockt) * count, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        cudaFreeHost(htimes);
        throw error("Failed to copy memory: " + string(cudaGetErrorString(err)));
    }

    for (uint32_t i = 0; i < count; ++i)
    {
        uint64_t cycles = htimes[i];
        double time = cycles / ((double) props.clockRate);
        fprintf(stdout, "%6lu %12.5f\n", cycles, time);
    }

    cudaFreeHost(htimes);
}


__host__ static void latencyTest(bool p2p, int pingDevice, int pongDevice, uint32_t count, uint32_t repeat)
{
    cudaError_t err = cudaSuccess;
    gputask ping(p2p, pingDevice, 0);
    gputask pong(p2p, pongDevice);

    auto times = createTimes(ping, count);

    fprintf(stdout, "%-6s %12s\n", "cycles", "ms");
    for (uint32_t i = 0; i < repeat; ++i)
    {
        resetBuffer(ping);
        resetBuffer(pong);

        err = cudaSetDevice(pong.device);
        if (err != cudaSuccess)
        {
            throw error(string("Could not set device: ") + cudaGetErrorString(err));
        }

        pongKernel<<<1, 1, 0, pong.stream>>>(ping.buffer, pong.buffer, count);

        err = cudaSetDevice(ping.device);
        if (err != cudaSuccess)
        {
            throw error(string("Could not set device: ") + cudaGetErrorString(err));
        }

        pingKernel<<<1, 1, 0, ping.stream>>>(pong.buffer, ping.buffer, count, times.get());

        cudaStreamSynchronize(pong.stream);
        cudaStreamSynchronize(ping.stream);

        auto pingCount = getCount(ping);
        auto pongCount = getCount(pong);

        if (pingCount != count || pongCount != count)
        {
            throw error("Pings and pongs differ from expected count");
        }

        printTimes(ping, times, count);
    }
}


static void enableP2P(int src, int dst)
{
    cudaError_t err;

    err = cudaSetDevice(src);
    if (err != cudaSuccess)
    {
        throw error(string("Could not set GPU: ") + cudaGetErrorString(err));
    }

    err = cudaDeviceEnablePeerAccess(dst, 0);
    if (err != cudaSuccess)
    {
        throw error(string("Failed to enable P2P for GPU: ") + cudaGetErrorString(err));
    }
}


static string formatBdf(int domain, int bus, int device)
{
    char buffer[64];
    sprintf(buffer, "%04x:%02x:%02x.0", domain, bus, device);
    return string(buffer);
}


static string getGpuName(int device)
{
    auto err = cudaSetDevice(device);
    if (err != cudaSuccess)
    {
        throw error(string("Could not set GPU: ") + cudaGetErrorString(err));
    }

    cudaDeviceProp prop;
    err = cudaGetDeviceProperties(&prop, device);
    if (err != cudaSuccess)
    {
        throw error(string("Could not get GPU properties: ") + cudaGetErrorString(err));
    }

    return string(prop.name) + " " + formatBdf(prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
}


static bool checkP2P(int i, int j, bool use_p2p)
{
    if (i == j)
    {
        return false;
    }

    if (!use_p2p)
    {
        return true;
    }

    int p2p = 0;
    auto err = cudaDeviceCanAccessPeer(&p2p, i, j);
    if (err != cudaSuccess)
    {
        throw error(string("Failed to check for P2P access: ") + cudaGetErrorString(err));
    }

    if (p2p != 1)
    {
        return false;
    }

    err = cudaDeviceCanAccessPeer(&p2p, j, i);
    if (err != cudaSuccess)
    {
        throw error(string("Failed to check for P2P access: ") + cudaGetErrorString(err));
    }

    if (p2p != 1)
    {
        return false;
    }

    return true;
}


static void findGpus(gpulist& gpus, bool p2p, int& ping, int& pong)
{
    cudaError_t err;
    int numGpus = 0;

    err = cudaGetDeviceCount(&numGpus);
    if (err != cudaSuccess)
    {
        throw error(string("Failed to get GPU count: ") + cudaGetErrorString(err));
    }

    if (gpus.empty())
    {
        for (int i = 0; i < numGpus; ++i)
        {
            gpus.push_back(i);
        }
    }

    for (int i: gpus)
    {
        if (gpus.size() > 1)
        {
            for (int j: gpus)
            {
                if (checkP2P(i, j, p2p))
                {
                    ping = i;
                    pong = j;
                    return;
                }
            }
        }
        else
        {
            for (int j = 0; j < numGpus; ++j)
            {
                if (checkP2P(i, j, p2p))
                {
                    ping = i;
                    pong = j;
                    return;
                }
            }
        }
    }

    throw error("Could not find suitable GPUs, try -m option for host memory");
}


static string usageString(const char* name)
{
    return string("Usage: ") + name + string(" [-c <count>] [-r <repeat>] [-m] [-g <gpu>]...");
}


static void parseArguments(int argc, char** argv, uint32_t& repeat, uint32_t& count, bool& p2p, gpulist& gpus)
{
    char* str;
    int opt;
    int gpu = -1;
    int numGpus = 0;

    auto err = cudaGetDeviceCount(&numGpus);
    if (err != cudaSuccess)
    {
        throw error(string("Failed to get GPU count: ") + cudaGetErrorString(err));
    }

    while ((opt = getopt(argc, argv, ":c:r:n:hg:m")) != -1)
    {
        switch (opt)
        {
            case 'h':
                throw error(usageString(argv[0]));

            case ':':
                throw error(string("Missing value for option: ") + argv[optind-1]);

            case '?':
                throw error(string("Unknown option: ") + argv[optind-1]);

            case 'c':
                str = nullptr;
                count = strtoul(optarg, &str, 0);
                if (str == NULL || *str != '\0' || count == 0)
                {
                    throw error("You're doing it wrong!");
                }
                break;

            case 'r':
            case 'n':
                str = nullptr;
                repeat = strtoul(optarg, &str, 0);
                if (str == NULL || *str != '\0' || repeat == 0)
                {
                    throw error("You're doing it wrong!");
                }
                break;

            case 'm':
                p2p = false;
                break;

            case 'g':
                str = nullptr;
                gpu = strtol(optarg, &str, 10);
                if (str == NULL || *str != '\0' || gpu < 0 || gpu >= numGpus)
                {
                    throw error("Invalid GPU");
                }
                gpus.push_back(gpu);
                break;
        }
    }
}


int main(int argc, char** argv)
{
    uint32_t repeat = 1;
    uint32_t count = 100;
    bool p2p = true;
    int ping = -1;
    int pong = -1;

    try
    {
        gpulist gpus;
        parseArguments(argc, argv, repeat, count, p2p, gpus);

        findGpus(gpus, p2p, ping, pong);

        fprintf(stderr, "PING %d %s\n", ping, getGpuName(ping).c_str());
        if (p2p)
        {
            enableP2P(ping, pong);
        }

        fprintf(stderr, "PONG %d %s\n", pong, getGpuName(pong).c_str());
        if (p2p)
        {
            enableP2P(pong, ping);
        }

        fprintf(stdout, "count=%u\nrepeat=%u\n", count, repeat);

        fprintf(stdout, "uva=%s\n", p2p ? "false" : "true");
        fprintf(stdout, "p2p=%s\n", p2p ? "true" : "false");
        fprintf(stdout, "ping=%d %s\n",
                ping, getGpuName(ping).c_str());
        fprintf(stdout, "pong=%d %s\n",
                pong, getGpuName(pong).c_str());

        cudaDeviceProp props;
        cudaGetDeviceProperties(&props, ping);

        if (p2p)
        {
            fprintf(stdout, "memory_rate=%d kHz\n", props.memoryClockRate);
        }
        else
        {
            fprintf(stdout, "memory_rate=\n");
        }
        fprintf(stdout, "clock_rate=%d kHz\n", props.clockRate);

        fprintf(stdout, "\n");
        latencyTest(p2p, ping, pong, count, repeat);
        fprintf(stdout, "\n");
    }
    catch (const error& err)
    {
        fprintf(stderr, "%s\n", err.what());
        return 1;
    }

    return 0;
}
	#include <cuda.h>
	#include <stdexcept>
	#include <string>
	#include <vector>
	#include <memory>
	#include <cstdint>
	#include <cstdio>
	#include <unistd.h>

	using error = std::runtime_error;
	using std::string;
	using gpulist = std::vector<int>;
	//using clockptr = std::shared_ptr<clock_t>;
	//typedef clock_t clockt;
	//#define get_clock() clock()

	using clockptr = std::shared_ptr<uint64_t>;
	typedef uint64_t clockt;
	#define get_clock() clock64()

	struct gputask
	{
	int device;
	bool hostbuffer;
	uint32_t* buffer;
	cudaStream_t stream;

	gputask(bool p2p, int device, cudaStream_t stream);
	gputask(bool p2p, int device);
	~gputask();
	};


	static void freeBuffer(gputask& task)
	{
	if (!task.hostbuffer)
	{
	cudaFree(task.buffer);
	}
	else
	{
	cudaFreeHost(task.buffer);
	}
	}


	static cudaError_t allocBuffer(gputask& task)
	{
	cudaError_t err;

	if (!task.hostbuffer)
	{
	err = cudaMalloc((void**) &task.buffer, sizeof(uint32_t));
	}
	else
	{
	err = cudaHostAlloc((void**) &task.buffer, sizeof(uint32_t), cudaHostAllocMapped);
	}

	return err;
	}


	gputask::gputask(bool p2p, int device, cudaStream_t stream)
	: device(device)
	, hostbuffer(!p2p)
	, buffer(nullptr)
	, stream(stream)
	{
	auto err = cudaSetDevice(device);
	if (err != cudaSuccess)
	{
	throw error(string("Could not set device: ") + cudaGetErrorString(err));
	}

	err = allocBuffer(*this);
	if (err != cudaSuccess)
	{
	throw error(string("Could not allocate device memory: ") + cudaGetErrorString(err));
	}
	}


	gputask::gputask(bool p2p, int device)
	: device(device)
	, hostbuffer(!p2p)
	, buffer(nullptr)
	{
	auto err = cudaSetDevice(device);
	if (err != cudaSuccess)
	{
	throw error(string("Could not set device: ") + cudaGetErrorString(err));
	}

	err = allocBuffer(*this);
	if (err != cudaSuccess)
	{
	throw error(string("Could not allocate device memory: ") + cudaGetErrorString(err));
	}

	err = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
	//err = cudaStreamCreate(&stream);
	if (err != cudaSuccess)
	{
	freeBuffer(*this);
	throw error(string("Failed to create stream: ") + cudaGetErrorString(err));
	}
	}


	gputask::~gputask()
	{
	if (stream != 0)
	{
	cudaStreamDestroy(stream);
	}
	freeBuffer(*this);
	}


	__global__ static void pingKernel(volatile uint32_t* remote, volatile uint32_t* local, uint32_t n, clockt* times)
	{
	if (*local != 0)
	{
	// Assume something is wrong
	return;
	}

	clockt after;
	clockt before;

	for (uint32_t i = 1; i <= n; ++i)
	{
	before = get_clock();

	*remote = i;
	while (*local < i);
	after = get_clock();

	times[i-1] = after - before;
	}
	}


	__global__ static void pongKernel(volatile uint32_t* remote, volatile uint32_t* local, uint32_t n)
	{
	if (*remote != 0)
	{
	// Assume something is wrong
	return;
	}

	for (uint32_t i = 1; i <= n; ++i)
	{
	while (*local < i);
	*remote = i;
	}
	}


	__host__ static void resetBuffer(const gputask& task)
	{
	auto err = cudaSetDevice(task.device);
	if (err != cudaSuccess)
	{
	throw error(string("Could not set device: ") + cudaGetErrorString(err));
	}

	err = cudaMemset(task.buffer, 0, sizeof(uint32_t));
	if (err != cudaSuccess)
	{
	throw error(string("Could not memset: ") + cudaGetErrorString(err));
	}
	}


	__host__ static uint32_t getCount(const gputask& task)
	{
	uint32_t count = 0;

	auto err = cudaSetDevice(task.device);
	if (err != cudaSuccess)
	{
	throw error(string("Could not set device: ") + cudaGetErrorString(err));
	}

	err = cudaMemcpy(&count, task.buffer, sizeof(uint32_t), cudaMemcpyDeviceToHost);
	if (err != cudaSuccess)
	{
	throw error("Failed to copy count from GPU memory");
	}

	return count;
	}



	__host__ static clockptr createTimes(const gputask& ping, uint32_t count)
	{
	clockt* times = nullptr;

	auto err = cudaSetDevice(ping.device);
	if (err != cudaSuccess)
	{
	throw error(string("Could not set device: ") + cudaGetErrorString(err));
	}

	err = cudaMalloc(&times, sizeof(clockt) * count);
	if (err != cudaSuccess)
	{
	throw error(string("Could not allocate array: ") + cudaGetErrorString(err));
	}

	return clockptr(times, [times](void*) { cudaFree(times); });
	}


	__host__ static void printTimes(const gputask& ping, const clockptr dtimes, uint32_t count)
	{
	auto err = cudaSetDevice(ping.device);
	if (err != cudaSuccess)
	{
	throw error("Failed to set GPU");
	}

	cudaDeviceProp props;
	err = cudaGetDeviceProperties(&props, ping.device);
	if (err != cudaSuccess)
	{
	throw error("Failed to get device properties: " + string(cudaGetErrorString(err)));
	}

	clockt* htimes = nullptr;
	err = cudaHostAlloc(&htimes, sizeof(clockt) * count, cudaHostAllocDefault);
	if (err != cudaSuccess)
	{
	throw error(string("Failed to allocate host memory: ") + cudaGetErrorString(err));
	}

	err = cudaMemcpy(htimes, dtimes.get(), sizeof(clockt) * count, cudaMemcpyDeviceToHost);
	if (err != cudaSuccess)
	{
	cudaFreeHost(htimes);
	throw error("Failed to copy memory: " + string(cudaGetErrorString(err)));
	}

	for (uint32_t i = 0; i < count; ++i)
	{
	uint64_t cycles = htimes[i];
	double time = cycles / ((double) props.clockRate);
	fprintf(stdout, "%6lu %12.5f\n", cycles, time);
	}

	cudaFreeHost(htimes);
	}



	__host__ static void latencyTest(bool p2p, int pingDevice, int pongDevice, uint32_t count, uint32_t repeat)
	{
	cudaError_t err = cudaSuccess;
	gputask ping(p2p, pingDevice, 0);
	gputask pong(p2p, pongDevice);

	auto times = createTimes(ping, count);

	fprintf(stdout, "%-6s %12s\n", "cycles", "ms");
	for (uint32_t i = 0; i < repeat; ++i)
	{
	resetBuffer(ping);
	resetBuffer(pong);

	err = cudaSetDevice(pong.device);
	if (err != cudaSuccess)
	{
	throw error(string("Could not set device: ") + cudaGetErrorString(err));
	}

	pongKernel<<<1, 1, 0, pong.stream>>>(ping.buffer, pong.buffer, count);

	err = cudaSetDevice(ping.device);
	if (err != cudaSuccess)
	{
	throw error(string("Could not set device: ") + cudaGetErrorString(err));
	}

	pingKernel<<<1, 1, 0, ping.stream>>>(pong.buffer, ping.buffer, count, times.get());

	cudaStreamSynchronize(pong.stream);
	cudaStreamSynchronize(ping.stream);

	auto pingCount = getCount(ping);
	auto pongCount = getCount(pong);

	if (pingCount != count \|\| pongCount != count)
	{
	throw error("Pings and pongs differ from expected count");
	}

	printTimes(ping, times, count);
	}
	}


	static void enableP2P(int src, int dst)
	{
	cudaError_t err;

	err = cudaSetDevice(src);
	if (err != cudaSuccess)
	{
	throw error(string("Could not set GPU: ") + cudaGetErrorString(err));
	}

	err = cudaDeviceEnablePeerAccess(dst, 0);
	if (err != cudaSuccess)
	{
	throw error(string("Failed to enable P2P for GPU: ") + cudaGetErrorString(err));
	}
	}


	static string formatBdf(int domain, int bus, int device)
	{
	char buffer[64];
	sprintf(buffer, "%04x:%02x:%02x.0", domain, bus, device);
	return string(buffer);
	}


	static string getGpuName(int device)
	{
	auto err = cudaSetDevice(device);
	if (err != cudaSuccess)
	{
	throw error(string("Could not set GPU: ") + cudaGetErrorString(err));
	}

	cudaDeviceProp prop;
	err = cudaGetDeviceProperties(&prop, device);
	if (err != cudaSuccess)
	{
	throw error(string("Could not get GPU properties: ") + cudaGetErrorString(err));
	}

	return string(prop.name) + " " + formatBdf(prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
	}


	static bool checkP2P(int i, int j, bool use_p2p)
	{
	if (i == j)
	{
	return false;
	}

	if (!use_p2p)
	{
	return true;
	}

	int p2p = 0;
	auto err = cudaDeviceCanAccessPeer(&p2p, i, j);
	if (err != cudaSuccess)
	{
	throw error(string("Failed to check for P2P access: ") + cudaGetErrorString(err));
	}

	if (p2p != 1)
	{
	return false;
	}

	err = cudaDeviceCanAccessPeer(&p2p, j, i);
	if (err != cudaSuccess)
	{
	throw error(string("Failed to check for P2P access: ") + cudaGetErrorString(err));
	}

	if (p2p != 1)
	{
	return false;
	}

	return true;
	}



	static void findGpus(gpulist& gpus, bool p2p, int& ping, int& pong)
	{
	cudaError_t err;
	int numGpus = 0;

	err = cudaGetDeviceCount(&numGpus);
	if (err != cudaSuccess)
	{
	throw error(string("Failed to get GPU count: ") + cudaGetErrorString(err));
	}

	if (gpus.empty())
	{
	for (int i = 0; i < numGpus; ++i)
	{
	gpus.push_back(i);
	}
	}

	for (int i: gpus)
	{
	if (gpus.size() > 1)
	{
	for (int j: gpus)
	{
	if (checkP2P(i, j, p2p))
	{
	ping = i;
	pong = j;
	return;
	}
	}
	}
	else
	{
	for (int j = 0; j < numGpus; ++j)
	{
	if (checkP2P(i, j, p2p))
	{
	ping = i;
	pong = j;
	return;
	}
	}
	}
	}

	throw error("Could not find suitable GPUs, try -m option for host memory");
	}


	static string usageString(const char* name)
	{
	return string("Usage: ") + name + string(" [-c <count>] [-r <repeat>] [-m] [-g <gpu>]...");
	}



	static void parseArguments(int argc, char** argv, uint32_t& repeat, uint32_t& count, bool& p2p, gpulist& gpus)
	{
	char* str;
	int opt;
	int gpu = -1;
	int numGpus = 0;

	auto err = cudaGetDeviceCount(&numGpus);
	if (err != cudaSuccess)
	{
	throw error(string("Failed to get GPU count: ") + cudaGetErrorString(err));
	}

	while ((opt = getopt(argc, argv, ":c:r:n:hg:m")) != -1)
	{
	switch (opt)
	{
	case 'h':
	throw error(usageString(argv[0]));

	case ':':
	throw error(string("Missing value for option: ") + argv[optind-1]);

	case '?':
	throw error(string("Unknown option: ") + argv[optind-1]);

	case 'c':
	str = nullptr;
	count = strtoul(optarg, &str, 0);
	if (str == NULL \|\| *str != '\0' \|\| count == 0)
	{
	throw error("You're doing it wrong!");
	}
	break;

	case 'r':
	case 'n':
	str = nullptr;
	repeat = strtoul(optarg, &str, 0);
	if (str == NULL \|\| *str != '\0' \|\| repeat == 0)
	{
	throw error("You're doing it wrong!");
	}
	break;

	case 'm':
	p2p = false;
	break;

	case 'g':
	str = nullptr;
	gpu = strtol(optarg, &str, 10);
	if (str == NULL \|\| *str != '\0' \|\| gpu < 0 \|\| gpu >= numGpus)
	{
	throw error("Invalid GPU");
	}
	gpus.push_back(gpu);
	break;
	}
	}
	}



	int main(int argc, char** argv)
	{
	uint32_t repeat = 1;
	uint32_t count = 100;
	bool p2p = true;
	int ping = -1;
	int pong = -1;

	try
	{
	gpulist gpus;
	parseArguments(argc, argv, repeat, count, p2p, gpus);

	findGpus(gpus, p2p, ping, pong);

	fprintf(stderr, "PING %d %s\n", ping, getGpuName(ping).c_str());
	if (p2p)
	{
	enableP2P(ping, pong);
	}

	fprintf(stderr, "PONG %d %s\n", pong, getGpuName(pong).c_str());
	if (p2p)
	{
	enableP2P(pong, ping);
	}

	fprintf(stdout, "count=%u\nrepeat=%u\n", count, repeat);

	fprintf(stdout, "uva=%s\n", p2p ? "false" : "true");
	fprintf(stdout, "p2p=%s\n", p2p ? "true" : "false");
	fprintf(stdout, "ping=%d %s\n",
	ping, getGpuName(ping).c_str());
	fprintf(stdout, "pong=%d %s\n",
	pong, getGpuName(pong).c_str());

	cudaDeviceProp props;
	cudaGetDeviceProperties(&props, ping);

	if (p2p)
	{
	fprintf(stdout, "memory_rate=%d kHz\n", props.memoryClockRate);
	}
	else
	{
	fprintf(stdout, "memory_rate=\n");
	}
	fprintf(stdout, "clock_rate=%d kHz\n", props.clockRate);

	fprintf(stdout, "\n");
	latencyTest(p2p, ping, pong, count, repeat);
	fprintf(stdout, "\n");
	}
	catch (const error& err)
	{
	fprintf(stderr, "%s\n", err.what());
	return 1;
	}

	return 0;
	}