// This tests the time for inter-stream synchronisation.
// We fire up N streams per GPU on M devices and do
// waitevents between all NxM streams, to time the inter
// stream sync. These are always all-to-all syncs.
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel just spins for a certain amount of time, to allow other launches to be
// issued so that we're not host-launch limited
static __global__ void delay_kernel(long long cycles) {
    long long start = clock64();
    while (clock64() - start < cycles);
}
struct devRes {
    cudaStream_t *stream;
    cudaEvent_t *ev;
};

union SWSync {
    char gpuData[16];           // Per GPU data (each GPU writes one byte)
    struct {
        unsigned long long low;
        unsigned long long high;
    } aggData;                  // Allows reading the data in a single word
    int4 nvccData;              // Used so that NVCC will emit a single 128-bit operation
};
// The software-sync kernel writes into "syncData" at the offset indicated by its GPU ID.
// We then poll "syncData" until all GPUs have written, then exit.
// "state" is either 0 or 1, which indicates the direction we're switching the sync values.
static __global__ void SWSyncKernel(SWSync *syncData, int gpuid, int numGpus, char state) {
    SWSync inSync;
    syncData->gpuData[gpuid] = state;      // Write the new value to gpuData
    inSync.nvccData = syncData->nvccData;  // Read the sysmem data while we construct our comparison values
    unsigned long long lowcompare = 0, highcompare = 0;
    if (state != 0) {
        unsigned long long lowmask, highmask;
        if (numGpus < 8) {
            lowmask = (1ULL << (numGpus * 8)) - 1ULL;
            lowcompare = 0x0101010101010101ULL & lowmask;
            highcompare = 0;
        }
        else if (numGpus < 16) {
            lowcompare = 0x0101010101010101ULL;
            highmask = (1ULL << ((numGpus - 8) * 8)) - 1ULL;
            highcompare = 0x0101010101010101ULL & highmask;
        }
        else {
            lowcompare = 0x0101010101010101ULL;
            highcompare = 0x0101010101010101ULL;
        }
    }
    // Now spin, polling the sysmem storage until everyone's value is set.
    // Keep spinning while either half still differs from its target value.
    while ((inSync.aggData.low != lowcompare) || (inSync.aggData.high != highcompare)) {
        inSync.nvccData = syncData->nvccData;  // Re-read the sysmem data
    }
}
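// Worked example of the comparison values above (a sketch, assuming 4 GPUs and state = 1):
// lowmask = (1ULL << 32) - 1 = 0x00000000FFFFFFFF, so lowcompare = 0x0000000001010101 and
// highcompare = 0, i.e. the kernel exits once GPUs 0-3 have each written 0x01 into their
// byte of gpuData.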
// Implements all-to-all event sync between all streams (i.e. the worst-case algorithm)
void algorithm_all_to_all(int numDevices, int streamCount, devRes *gpu) {
    // First, record events in every single stream on every device
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        for (int s = 0; s < streamCount; s++) {
            //delay_kernel<<< 1, 1, 0, testStream >>>(0); // Launch a dummy kernel into every stream
            cudaEventRecord(gpu[dev].ev[s], gpu[dev].stream[s]);
        }
    }
    // Then push waits on all events into every stream on every device
    for (int dev = 0; dev < numDevices; dev++) {                      // For each device
        cudaSetDevice(dev);
        for (int s = 0; s < streamCount; s++) {                       // For each stream on that device
            for (int waitdev = 0; waitdev < numDevices; waitdev++) {  // We'll wait on each device
                for (int waitstream = 0; waitstream < streamCount; waitstream++) {  // Each stream there
                    if (waitdev != dev || waitstream != s) {          // Don't wait on ourself
                        cudaStreamWaitEvent(gpu[dev].stream[s], gpu[waitdev].ev[waitstream], 0);
                    }
                }
            }
        }
    }
}
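// Cost note: with M devices and N streams each, the loops above issue M*N event records and
// M*N*(M*N - 1) cudaStreamWaitEvent calls per sync, which is why this is the worst case.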
// This is a linear-sync algorithm: each stream waits for the stream on
// its left then hands off to the stream on its right. Finally everyone
// waits for the rightmost stream to signal.
void algorithm_linear(int numDevices, int streamCount, devRes *gpu) {
    // We sync all streams on a single device locally, then sync between devices.
    // We'll use the same "pass to the right" algorithm on each device.
    // Begin by waiting internally on each device
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        for (int s = 0; s < streamCount; s++) {
            //delay_kernel<<< 1, 1, 0, testStream >>>(0); // Launch a dummy kernel into every stream
            cudaEventRecord(gpu[dev].ev[s], gpu[dev].stream[s]);
            // Last stream doesn't push a wait on anyone
            if ((s + 1) < streamCount) {
                cudaStreamWaitEvent(gpu[dev].stream[s + 1], gpu[dev].ev[s], 0);
            }
        }
    }
    // Now wait between the last streams on each device
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        int s = streamCount - 1;
        cudaEventRecord(gpu[dev].ev[s], gpu[dev].stream[s]);  // Record "waiting finished" event
        if ((dev + 1) < numDevices) {
            cudaStreamWaitEvent(gpu[dev + 1].stream[s], gpu[dev].ev[s], 0);
        }
    }
    // Now we can signal back across local GPUs, waiting for the final device & stream
    cudaEventRecord(gpu[numDevices - 1].ev[streamCount - 1], gpu[numDevices - 1].stream[streamCount - 1]);
    for (int dev = 0; dev < numDevices - 1; dev++) {
        cudaSetDevice(dev);
        cudaStreamWaitEvent(gpu[dev].stream[0], gpu[numDevices - 1].ev[streamCount - 1], 0);
        cudaEventRecord(gpu[dev].ev[0], gpu[dev].stream[0]);
        // Make everyone on my GPU wait for this local stream 0 event (reduces cross-GPU syncs)
        for (int s = 1; s < streamCount; s++) {
            cudaStreamWaitEvent(gpu[dev].stream[s], gpu[dev].ev[0], 0);
        }
    }
}
// Here we sync within a GPU, then between stream 0 on each GPU
void algorithm_reduction(int numDevices, int streamCount, devRes *gpu) {
    // Let's do an interesting fan-in-fan-out mechanism.
    // First we sync locally to stream 0 on each GPU using events, because at least that's fast
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        for (int s = 1; s < streamCount; s++) {
            cudaEventRecord(gpu[dev].ev[s], gpu[dev].stream[s]);
            cudaStreamWaitEvent(gpu[dev].stream[0], gpu[dev].ev[s], 0);
        }
        // Record a completion event on stream 0
        cudaEventRecord(gpu[dev].ev[0], gpu[dev].stream[0]);
    }
    // Now sync between GPUs
    // 1: GPU 0 syncs on everyone then records an event
    for (int dev = 1; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        cudaStreamWaitEvent(gpu[0].stream[0], gpu[dev].ev[0], 0);
    }
    cudaEventRecord(gpu[0].ev[0], gpu[0].stream[0]);
    // 2: Then propagate that event back to the other GPUs on stream 0
    for (int dev = 1; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        cudaStreamWaitEvent(gpu[dev].stream[0], gpu[0].ev[0], 0);
        cudaEventRecord(gpu[dev].ev[0], gpu[dev].stream[0]);
    }
    // 3: Finally propagate the stream 0 event to all other streams per GPU
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        for (int s = 1; s < streamCount; s++) {
            cudaStreamWaitEvent(gpu[dev].stream[s], gpu[dev].ev[0], 0);
        }
    }
}
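// Cost note: this fan-in/fan-out needs only O(M*N) event operations per sync (two passes over
// the local streams plus two passes over the devices), versus O((M*N)^2) waits for all-to-all.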
// This is a software sync, based on writing then polling a sysmem location
// so that kernels can signal between each other
void algorithm_software(int numDevices, int streamCount, devRes *gpu, SWSync *syncData) {
    static char state = 0;
    state = !state;  // Toggle the state of the barrier
    // First we sync locally to stream 0 on each GPU using events, because at least that's fast
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        for (int s = 1; s < streamCount; s++) {
            cudaEventRecord(gpu[dev].ev[s], gpu[dev].stream[s]);
            cudaStreamWaitEvent(gpu[dev].stream[0], gpu[dev].ev[s], 0);
        }
    }
    // Now we launch our sync kernel onto each GPU's stream 0 - it'll complete when all GPUs have hit the barrier
    cudaError_t err;
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        SWSyncKernel<<<1, 1, 0, gpu[dev].stream[0]>>>(syncData, dev, numDevices, state);
        if ((err = cudaGetLastError()) != cudaSuccess) printf("Kernel launch error: %s\n", cudaGetErrorString(err));
        cudaEventRecord(gpu[dev].ev[0], gpu[dev].stream[0]);
    }
    // Finally, all other streams wait for the sync kernel's event, so everyone is held until the barrier completes
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        for (int s = 1; s < streamCount; s++) {
            cudaStreamWaitEvent(gpu[dev].stream[s], gpu[dev].ev[0], 0);
        }
    }
}
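// Cost note: the event traffic here has the same O(M*N) shape as the reduction version, but the
// cross-GPU step is replaced by one spinning SWSyncKernel per GPU polling the shared sysmem word,
// so no inter-GPU cudaStreamWaitEvent is needed.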
// Args are:
//   numStreams = number of streams per device
//   numDevices = number of devices to span (starting with 0)
//   numSyncs   = number of syncs we do during the test
//   timingAll  = 0 to time the syncs only (ignoring the test-stream fan-out), 1 to include the fan-out in the timing
void stream_sync(int numStreams, int numDevices, int numSyncs, int timingAll) {
    printf("stream_sync test: numStreams=%d, numDevices=%d, numSyncs=%d, timingAll=%d\n", numStreams, numDevices, numSyncs, timingAll);
    devRes gpu[16];  // At most 16 devices
    if (numDevices > 16) {
        printf("stream_sync supports at most 16 devices\n");
        return;
    }
    // For fun, we'll map peer access between all devices
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        // Peer map all other devices from this one
        for (int peerdev = 0; peerdev < numDevices; peerdev++) {
            if (dev != peerdev) {
                cudaDeviceEnablePeerAccess(peerdev, 0);
            }
        }
        // Create the streams for this device while we're at it
        gpu[dev].stream = new cudaStream_t[numStreams];
        for (int i = 0; i < numStreams; i++) {
            cudaStreamCreate(&gpu[dev].stream[i]);
        }
        // One event per stream on this device
        gpu[dev].ev = new cudaEvent_t[numStreams];
        for (int i = 0; i < numStreams; i++) {
            cudaEventCreate(&gpu[dev].ev[i]);
        }
    }
    cudaSetDevice(0);
    // Create 16 bytes of sysmem for the software-sync algorithm to use
    SWSync *syncData;
    if (cudaHostAlloc(&syncData, sizeof(SWSync), cudaHostAllocPortable | cudaHostAllocWriteCombined | cudaHostAllocMapped) != cudaSuccess) {
        printf("Failed to allocate sysmem: %s\n", cudaGetErrorString(cudaGetLastError()));
        exit(1);
    }
    syncData->aggData.low = syncData->aggData.high = 0;  // Reset our sync data (first transition will be 0->1)
    SWSync *devSyncData;
    cudaHostGetDevicePointer(&devSyncData, syncData, 0);
    //printf("Allocated syncData (%p) = %.16llx-%.16llx - devSyncData = %p\n", syncData, syncData->aggData.low, syncData->aggData.high, devSyncData);
    // Simple end-to-end timing
    cudaEvent_t start, end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);
    cudaStream_t testStream;
    cudaStreamCreate(&testStream);
    printf("stream_count, num_devices, num_syncs, all-to-all, linear, per-gpu, sw-kernel\n");
    fflush(stdout);
    for (int streamCount = 1; streamCount <= numStreams; streamCount++) {
        float timer[4];
        for (int algorithm = 0; algorithm < 4; algorithm++) {
            cudaSetDevice(0);
            delay_kernel<<<1, 1, 0, testStream>>>(500000000);  // Allow enough time to launch everything
            cudaEventRecord(start, testStream);
            // Start by hooking all stream 0 of each GPU into our test stream
            for (int dev = 0; dev < numDevices; dev++) {
                cudaSetDevice(dev);
                cudaStreamWaitEvent(gpu[dev].stream[0], start, 0);
                cudaEventRecord(gpu[dev].ev[0], gpu[dev].stream[0]);
                if (!timingAll) cudaStreamWaitEvent(testStream, gpu[dev].ev[0], 0);
                // Now make all other streams on this GPU wait for stream 0 - it's faster
                // than having everyone poll on GPU 0
                for (int s = 0; s < streamCount; s++) {
                    cudaStreamWaitEvent(gpu[dev].stream[s], gpu[dev].ev[0], 0);
                    // If I'm not timing the fan-out, then hook every stream back in to the test stream
                    if (!timingAll) {
                        cudaEventRecord(gpu[dev].ev[s], gpu[dev].stream[s]);
                        cudaStreamWaitEvent(testStream, gpu[dev].ev[s], 0);
                    }
                }
            }
            if (!timingAll) {
                cudaSetDevice(0);
                cudaEventRecord(start, testStream);
            }
            // Issue as many syncs as have been requested
            for (int syncid = 0; syncid < numSyncs; syncid++) {
                switch (algorithm) {
                case 0:
                    algorithm_all_to_all(numDevices, streamCount, gpu);
                    break;
                case 1:
                    algorithm_linear(numDevices, streamCount, gpu);
                    break;
                case 2:
                    algorithm_reduction(numDevices, streamCount, gpu);
                    break;
                case 3:
                    algorithm_software(numDevices, streamCount, gpu, devSyncData);
                    break;
                default:
                    printf("Unrecognised sync algorithm\n");
                    break;
                }
            }
            // To record the fastest possible timing event, hook GPU 0 stream 0 back into the test stream.
            // Note that not all other GPUs may have seen their waitevent operations yet, but we are
            // guaranteed that they have at least been issued and that GPU 0 stream 0 has seen completion
            // on all other streams & GPUs.
            if (!timingAll) {
                cudaSetDevice(0);
                cudaStreamWaitEvent(testStream, gpu[0].ev[0], 0);
                cudaEventRecord(end, testStream);
            }
            // End by hooking all test streams back into our main stream
            for (int dev = 0; dev < numDevices; dev++) {
                cudaSetDevice(dev);
                for (int s = 0; s < streamCount; s++) {
                    cudaEventRecord(gpu[dev].ev[s], gpu[dev].stream[s]);
                    cudaStreamWaitEvent(testStream, gpu[dev].ev[s], 0);
                }
            }
            // For fan-out-fan-in timing, we record the end event after everything has fanned back in
            if (timingAll) {
                cudaEventRecord(end, testStream);
            }
            cudaStreamSynchronize(testStream);
            cudaEventElapsedTime(&timer[algorithm], start, end);
        }
        printf("%d, %d, %d, %f, %f, %f, %f\n", streamCount, numDevices, numSyncs, timer[0], timer[1], timer[2], timer[3]);
        fflush(stdout);
        // Flush all work on each device, just to be sure
        for (int dev = 0; dev < numDevices; dev++) {
            cudaSetDevice(dev);
            cudaDeviceSynchronize();
        }
    }
}
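// The file above has no entry point; below is a minimal sketch of a main() that drives
// stream_sync. The parameter values (8 streams per device, 1000 syncs) are illustrative
// assumptions, not values from the original benchmark.
int main() {
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount < 1) {
        printf("No CUDA devices found\n");
        return 1;
    }
    if (deviceCount > 16) deviceCount = 16;  // stream_sync supports at most 16 devices
    // Time the sync algorithms alone, then again including the test-stream fan-out/fan-in
    stream_sync(8, deviceCount, 1000, 0);
    stream_sync(8, deviceCount, 1000, 1);
    return 0;
}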