// This tests the time for inter-stream synchronisation.
// We fire up N streams per GPU on M devices and do
// waitevents between all NxM streams, to time the inter
// stream sync. These are always all-to-all syncs.
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel just spins for a certain amount of time, to allow other launches to be
// issued so that we're not host-launch limited
static __global__ void delay_kernel(long long cycles) {
    long long start = clock64();
    while (clock64() - start < cycles);
}
struct devRes {
    cudaStream_t *stream;
    cudaEvent_t *ev;
};

union SWSync {
    char gpuData[16];           // Per GPU data (each GPU writes one byte)
    struct {
        unsigned long long low;
        unsigned long long high;
    } aggData;                  // Allows reading the data in a single word
    int4 nvccData;              // Used so that NVCC will emit a single 128-bit operation
};
// The software-sync kernel writes into "syncData" at the offset indicated by its GPU ID.
// We then poll "syncData" until all GPUs have written, then exit.
// "state" is either 0 or 1, which indicates the direction we're switching the sync values.
static __global__ void SWSyncKernel(SWSync *syncData, int gpuid, int numGpus, char state) {
    SWSync inSync;
    syncData->gpuData[gpuid] = state;      // Write the new value to gpuData
    inSync.nvccData = syncData->nvccData;  // Read the sysmem data while we construct our comparison values
    unsigned long long lowcompare = 0, highcompare = 0;
    if (state != 0) {
        unsigned long long lowmask, highmask;
        if (numGpus < 8) {
            lowmask = (1ULL << (numGpus * 8)) - 1ULL;
            lowcompare = 0x0101010101010101ULL & lowmask;
            highcompare = 0;
        }
        else if (numGpus < 16) {
            lowcompare = 0x0101010101010101ULL;
            highmask = (1ULL << ((numGpus - 8) * 8)) - 1ULL;
            highcompare = 0x0101010101010101ULL & highmask;
        }
        else {
            lowcompare = 0x0101010101010101ULL;
            highcompare = 0x0101010101010101ULL;
        }
    }
    // Now spin, polling the sysmem storage until everyone's value is set.
    // Keep spinning while either half still differs from its target value.
    while ((inSync.aggData.low != lowcompare) || (inSync.aggData.high != highcompare)) {
        inSync.nvccData = syncData->nvccData;  // Re-read the sysmem data
    }
}
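// Worked example of the comparison values above (a sketch, assuming 4 GPUs and state = 1):
// lowmask = (1ULL << 32) - 1 = 0x00000000FFFFFFFF, so lowcompare = 0x0000000001010101 and
// highcompare = 0, i.e. the kernel exits once GPUs 0-3 have each written 0x01 into their
// byte of gpuData.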
// Implements all-to-all event sync between all streams (i.e. the worst-case algorithm)
void algorithm_all_to_all(int numDevices, int streamCount, devRes *gpu) {
    // First, record events in every single stream on every device
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        for (int s = 0; s < streamCount; s++) {
            //delay_kernel<<< 1, 1, 0, testStream >>>(0); // Launch a dummy kernel into every stream
            cudaEventRecord(gpu[dev].ev[s], gpu[dev].stream[s]);
        }
    }
    // Then push waits on all events into every stream on every device
    for (int dev = 0; dev < numDevices; dev++) {                      // For each device
        cudaSetDevice(dev);
        for (int s = 0; s < streamCount; s++) {                       // For each stream on that device
            for (int waitdev = 0; waitdev < numDevices; waitdev++) {  // We'll wait on each device
                for (int waitstream = 0; waitstream < streamCount; waitstream++) {  // Each stream there
                    if (waitdev != dev || waitstream != s) {          // Don't wait on ourself
                        cudaStreamWaitEvent(gpu[dev].stream[s], gpu[waitdev].ev[waitstream], 0);
                    }
                }
            }
        }
    }
}
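// Cost note: with M devices and N streams each, the loops above issue M*N event records and
// M*N*(M*N - 1) cudaStreamWaitEvent calls per sync, which is why this is the worst case.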
// This is a linear-sync algorithm: each stream waits for the stream on
// its left then hands off to the stream on its right. Finally everyone
// waits for the rightmost stream to signal.
void algorithm_linear(int numDevices, int streamCount, devRes *gpu) {
    // We sync all streams on a single device locally, then sync between devices.
    // We'll use the same "pass to the right" algorithm on each device.
    // Begin by waiting internally on each device
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        for (int s = 0; s < streamCount; s++) {
            //delay_kernel<<< 1, 1, 0, testStream >>>(0); // Launch a dummy kernel into every stream
            cudaEventRecord(gpu[dev].ev[s], gpu[dev].stream[s]);
            // Last stream doesn't push a wait on anyone
            if ((s + 1) < streamCount) {
                cudaStreamWaitEvent(gpu[dev].stream[s + 1], gpu[dev].ev[s], 0);
            }
        }
    }
    // Now wait between the last streams on each device
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        int s = streamCount - 1;
        cudaEventRecord(gpu[dev].ev[s], gpu[dev].stream[s]);  // Record "waiting finished" event
        if ((dev + 1) < numDevices) {
            cudaStreamWaitEvent(gpu[dev + 1].stream[s], gpu[dev].ev[s], 0);
        }
    }
    // Now we can signal back across local GPUs, waiting for the final device & stream
    cudaEventRecord(gpu[numDevices - 1].ev[streamCount - 1], gpu[numDevices - 1].stream[streamCount - 1]);
    for (int dev = 0; dev < numDevices - 1; dev++) {
        cudaSetDevice(dev);
        cudaStreamWaitEvent(gpu[dev].stream[0], gpu[numDevices - 1].ev[streamCount - 1], 0);
        cudaEventRecord(gpu[dev].ev[0], gpu[dev].stream[0]);
        // Make everyone on my GPU wait for this local stream 0 event (reduces cross-GPU syncs)
        for (int s = 1; s < streamCount; s++) {
            cudaStreamWaitEvent(gpu[dev].stream[s], gpu[dev].ev[0], 0);
        }
    }
}
// Here we sync within a GPU, then between stream 0 on each GPU
void algorithm_reduction(int numDevices, int streamCount, devRes *gpu) {
    // Let's do an interesting fan-in-fan-out mechanism.
    // First we sync locally to stream 0 on each GPU using events, because at least that's fast
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        for (int s = 1; s < streamCount; s++) {
            cudaEventRecord(gpu[dev].ev[s], gpu[dev].stream[s]);
            cudaStreamWaitEvent(gpu[dev].stream[0], gpu[dev].ev[s], 0);
        }
        // Record a completion event on stream 0
        cudaEventRecord(gpu[dev].ev[0], gpu[dev].stream[0]);
    }
    // Now sync between GPUs
    // 1: GPU 0 syncs on everyone then records an event
    for (int dev = 1; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        cudaStreamWaitEvent(gpu[0].stream[0], gpu[dev].ev[0], 0);
    }
    cudaEventRecord(gpu[0].ev[0], gpu[0].stream[0]);
    // 2: Then propagate that event back to the other GPUs on stream 0
    for (int dev = 1; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        cudaStreamWaitEvent(gpu[dev].stream[0], gpu[0].ev[0], 0);
        cudaEventRecord(gpu[dev].ev[0], gpu[dev].stream[0]);
    }
    // 3: Finally propagate the stream 0 event to all other streams per GPU
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        for (int s = 1; s < streamCount; s++) {
            cudaStreamWaitEvent(gpu[dev].stream[s], gpu[dev].ev[0], 0);
        }
    }
}
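// Cost note: this fan-in/fan-out needs only O(M*N) event operations per sync (two passes over
// the local streams plus two passes over the devices), versus O((M*N)^2) waits for all-to-all.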
// This is a software sync, based on writing then polling a sysmem location
// so that kernels can signal between each other
void algorithm_software(int numDevices, int streamCount, devRes *gpu, SWSync *syncData) {
    static char state = 0;
    state = !state;  // Toggle the state of the barrier
    // First we sync locally to stream 0 on each GPU using events, because at least that's fast
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        for (int s = 1; s < streamCount; s++) {
            cudaEventRecord(gpu[dev].ev[s], gpu[dev].stream[s]);
            cudaStreamWaitEvent(gpu[dev].stream[0], gpu[dev].ev[s], 0);
        }
    }
    // Now we launch our sync kernel onto each GPU's stream 0 - it'll complete when all GPUs have hit the barrier
    cudaError_t err;
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        SWSyncKernel<<<1, 1, 0, gpu[dev].stream[0]>>>(syncData, dev, numDevices, state);
        if ((err = cudaGetLastError()) != cudaSuccess) printf("Kernel launch error: %s\n", cudaGetErrorString(err));
        cudaEventRecord(gpu[dev].ev[0], gpu[dev].stream[0]);
    }
    // Finally, all other streams wait for the sync kernel's event, so everyone is held until the barrier completes
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        for (int s = 1; s < streamCount; s++) {
            cudaStreamWaitEvent(gpu[dev].stream[s], gpu[dev].ev[0], 0);
        }
    }
}
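// Cost note: the event traffic here has the same O(M*N) shape as the reduction version, but the
// cross-GPU step is replaced by one spinning SWSyncKernel per GPU polling the shared sysmem word,
// so no inter-GPU cudaStreamWaitEvent is needed.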
// Args are:
//   numStreams = number of streams per device
//   numDevices = number of devices to span (starting with 0)
//   numSyncs   = number of syncs we do during the test
//   timingAll  = 0 to time the syncs only (ignoring the test-stream fan-out), 1 to include the fan-out in the timing
void stream_sync(int numStreams, int numDevices, int numSyncs, int timingAll) {
    printf("stream_sync test: numStreams=%d, numDevices=%d, numSyncs=%d, timingAll=%d\n", numStreams, numDevices, numSyncs, timingAll);
    devRes gpu[16];  // At most 16 devices
    if (numDevices > 16) {
        printf("stream_sync supports at most 16 devices\n");
        return;
    }
    // For fun, we'll map peer access between all devices
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        // Peer map all other devices from this one
        for (int peerdev = 0; peerdev < numDevices; peerdev++) {
            if (dev != peerdev) {
                cudaDeviceEnablePeerAccess(peerdev, 0);
            }
        }
        // Create the streams for this device while we're at it
        gpu[dev].stream = new cudaStream_t[numStreams];
        for (int i = 0; i < numStreams; i++) {
            cudaStreamCreate(&gpu[dev].stream[i]);
        }
        // One event per stream on this device
        gpu[dev].ev = new cudaEvent_t[numStreams];
        for (int i = 0; i < numStreams; i++) {
            cudaEventCreate(&gpu[dev].ev[i]);
        }
    }
    cudaSetDevice(0);
    // Create 16 bytes of sysmem for the software-sync algorithm to use
    SWSync *syncData;
    if (cudaHostAlloc(&syncData, sizeof(SWSync), cudaHostAllocPortable | cudaHostAllocWriteCombined | cudaHostAllocMapped) != cudaSuccess) {
        printf("Failed to allocate sysmem: %s\n", cudaGetErrorString(cudaGetLastError()));
        exit(1);
    }
    syncData->aggData.low = syncData->aggData.high = 0;  // Reset our sync data (first transition will be 0->1)
    SWSync *devSyncData;
    cudaHostGetDevicePointer(&devSyncData, syncData, 0);
    //printf("Allocated syncData (%p) = %.16llx-%.16llx - devSyncData = %p\n", syncData, syncData->aggData.low, syncData->aggData.high, devSyncData);
    // Simple end-to-end timing
    cudaEvent_t start, end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);
    cudaStream_t testStream;
    cudaStreamCreate(&testStream);
    printf("stream_count, num_devices, num_syncs, all-to-all, linear, per-gpu, sw-kernel\n");
    fflush(stdout);
    for (int streamCount = 1; streamCount <= numStreams; streamCount++) {
        float timer[4];
        for (int algorithm = 0; algorithm < 4; algorithm++) {
            cudaSetDevice(0);
            delay_kernel<<<1, 1, 0, testStream>>>(500000000);  // Allow enough time to launch everything
            cudaEventRecord(start, testStream);
            // Start by hooking all stream 0 of each GPU into our test stream
            for (int dev = 0; dev < numDevices; dev++) {
                cudaSetDevice(dev);
                cudaStreamWaitEvent(gpu[dev].stream[0], start, 0);
                cudaEventRecord(gpu[dev].ev[0], gpu[dev].stream[0]);
                if (!timingAll) cudaStreamWaitEvent(testStream, gpu[dev].ev[0], 0);
                // Now make all other streams on this GPU wait for stream 0 - it's faster
                // than having everyone poll on GPU 0
                for (int s = 0; s < streamCount; s++) {
                    cudaStreamWaitEvent(gpu[dev].stream[s], gpu[dev].ev[0], 0);
                    // If I'm not timing the fan-out, then hook every stream back in to the test stream
                    if (!timingAll) {
                        cudaEventRecord(gpu[dev].ev[s], gpu[dev].stream[s]);
                        cudaStreamWaitEvent(testStream, gpu[dev].ev[s], 0);
                    }
                }
            }
            if (!timingAll) {
                cudaSetDevice(0);
                cudaEventRecord(start, testStream);
            }
            // Issue as many syncs as have been requested
            for (int syncid = 0; syncid < numSyncs; syncid++) {
                switch (algorithm) {
                case 0:
                    algorithm_all_to_all(numDevices, streamCount, gpu);
                    break;
                case 1:
                    algorithm_linear(numDevices, streamCount, gpu);
                    break;
                case 2:
                    algorithm_reduction(numDevices, streamCount, gpu);
                    break;
                case 3:
                    algorithm_software(numDevices, streamCount, gpu, devSyncData);
                    break;
                default:
                    printf("Unrecognised sync algorithm\n");
                    break;
                }
            }
            // To record the fastest possible timing event, hook GPU 0 stream 0 back into the test stream.
            // Note that not all other GPUs may have seen their waitevent operations yet, but we are
            // guaranteed that they have at least been issued and that GPU 0 stream 0 has seen completion
            // on all other streams & GPUs.
            if (!timingAll) {
                cudaSetDevice(0);
                cudaStreamWaitEvent(testStream, gpu[0].ev[0], 0);
                cudaEventRecord(end, testStream);
            }
            // End by hooking all test streams back into our main stream
            for (int dev = 0; dev < numDevices; dev++) {
                cudaSetDevice(dev);
                for (int s = 0; s < streamCount; s++) {
                    cudaEventRecord(gpu[dev].ev[s], gpu[dev].stream[s]);
                    cudaStreamWaitEvent(testStream, gpu[dev].ev[s], 0);
                }
            }
            // For fan-out-fan-in timing, we record the end event after everything has fanned back in
            if (timingAll) {
                cudaEventRecord(end, testStream);
            }
            cudaStreamSynchronize(testStream);
            cudaEventElapsedTime(&timer[algorithm], start, end);
        }
        printf("%d, %d, %d, %f, %f, %f, %f\n", streamCount, numDevices, numSyncs, timer[0], timer[1], timer[2], timer[3]);
        fflush(stdout);
        // Flush all work on each device, just to be sure
        for (int dev = 0; dev < numDevices; dev++) {
            cudaSetDevice(dev);
            cudaDeviceSynchronize();
        }
    }
}
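// The file above has no entry point; below is a minimal sketch of a main() that drives
// stream_sync. The parameter values (8 streams per device, 1000 syncs) are illustrative
// assumptions, not values from the original benchmark.
int main() {
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount < 1) {
        printf("No CUDA devices found\n");
        return 1;
    }
    if (deviceCount > 16) deviceCount = 16;  // stream_sync supports at most 16 devices
    // Time the sync algorithms alone, then again including the test-stream fan-out/fan-in
    stream_sync(8, deviceCount, 1000, 0);
    stream_sync(8, deviceCount, 1000, 1);
    return 0;
}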