Test NCCL
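A minimal NCCL sanity test, taken from the NCCL developer guide's one-device-per-process example (URL in the header comment below): each MPI rank selects a GPU by its local rank on the host, rank 0 broadcasts an NCCL unique ID over MPI to bootstrap the communicator, and all ranks run a float sum all-reduce on a CUDA stream.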
// Code from http://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html#onedevprothrd
#define _BSD_SOURCE
#define _DEFAULT_SOURCE // _BSD_SOURCE alone is deprecated on glibc >= 2.20; needed for gethostname()
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h> // provides EXIT_FAILURE, so no need to redefine it
#include <unistd.h>
#include "cuda_runtime.h"
#include "nccl.h"
#include "mpi.h"
#define MPICHECK(cmd) do {                              \
  int e = cmd;                                          \
  if (e != MPI_SUCCESS) {                               \
    printf("Failed: MPI error %s:%d '%d'\n",            \
           __FILE__, __LINE__, e);                      \
    exit(EXIT_FAILURE);                                 \
  }                                                     \
} while(0)

#define CUDACHECK(cmd) do {                             \
  cudaError_t e = cmd;                                  \
  if (e != cudaSuccess) {                               \
    printf("Failed: CUDA error %s:%d '%s'\n",           \
           __FILE__, __LINE__, cudaGetErrorString(e));  \
    exit(EXIT_FAILURE);                                 \
  }                                                     \
} while(0)

#define NCCLCHECK(cmd) do {                             \
  ncclResult_t r = cmd;                                 \
  if (r != ncclSuccess) {                               \
    printf("Failed: NCCL error %s:%d '%s'\n",           \
           __FILE__, __LINE__, ncclGetErrorString(r));  \
    exit(EXIT_FAILURE);                                 \
  }                                                     \
} while(0)
static uint64_t getHostHash(const char* string) {
  // Based on DJB2: result = result * 33 + char
  uint64_t result = 5381;
  for (int c = 0; string[c] != '\0'; c++) {
    result = ((result << 5) + result) + string[c];
  }
  return result;
}
static void getHostName(char* hostname, int maxlen) {
  gethostname(hostname, maxlen);
  hostname[maxlen - 1] = '\0'; // gethostname() may not null-terminate on truncation
  // Keep only the short host name: cut at the first '.'
  for (int i = 0; i < maxlen && hostname[i] != '\0'; i++) {
    if (hostname[i] == '.') {
      hostname[i] = '\0';
      return;
    }
  }
}
int main(int argc, char* argv[]) {
  int size = 32 * 1024 * 1024; // number of float elements per buffer (128 MiB)
  int myRank, nRanks, localRank = 0;

  // Initialize MPI
  MPICHECK(MPI_Init(&argc, &argv));
  MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank));
  MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks));

  // Compute localRank, the rank's index among the ranks on the same host,
  // used below to select a GPU: hash each hostname, all-gather the hashes,
  // then count the earlier ranks that share this rank's hash.
  uint64_t hostHashs[nRanks];
  char hostname[1024];
  getHostName(hostname, 1024);
  hostHashs[myRank] = getHostHash(hostname);
  MPICHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                         hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD));
  for (int p = 0; p < nRanks; p++) {
    if (p == myRank) break;
    if (hostHashs[p] == hostHashs[myRank]) localRank++;
  }
  ncclUniqueId id;
  ncclComm_t comm;
  float *sendbuff, *recvbuff;
  cudaStream_t s;

  // Generate the NCCL unique ID at one process and broadcast it to all
  if (myRank == 0) ncclGetUniqueId(&id);
  MPICHECK(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));
  // Pick a GPU based on localRank and allocate device buffers
  CUDACHECK(cudaSetDevice(localRank));
  CUDACHECK(cudaMalloc((void**)&sendbuff, size * sizeof(float)));
  CUDACHECK(cudaMalloc((void**)&recvbuff, size * sizeof(float)));
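  // Not in the original gist: a hedged addition so the all-reduce runs over
  // initialized rather than indeterminate data. Note cudaMemset writes bytes,
  // so value 1 fills each float with the bit pattern 0x01010101 (~2.4e-38),
  // which is enough for a plumbing test.
  CUDACHECK(cudaMemset(sendbuff, 1, size * sizeof(float)));
  CUDACHECK(cudaMemset(recvbuff, 0, size * sizeof(float)));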
  CUDACHECK(cudaStreamCreate(&s));

  // Initialize NCCL
  NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank));

  // Communicate using NCCL: out-of-place sum all-reduce of `size` floats
  NCCLCHECK(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, ncclFloat, ncclSum,
                          comm, s));

  // Complete the NCCL operation by synchronizing on the CUDA stream
  CUDACHECK(cudaStreamSynchronize(s));

  // Free device buffers
  CUDACHECK(cudaFree(sendbuff));
  CUDACHECK(cudaFree(recvbuff));

  // Finalize NCCL
  ncclCommDestroy(comm);

  // Finalize MPI
  MPICHECK(MPI_Finalize());

  printf("[MPI Rank %d] Success.\n", myRank);
  return 0;
}
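A minimal sketch of how one might build and run the test. The mpicc wrapper, the CUDA/NCCL install paths, the file name nccl_test.c, and the rank count are assumptions, not part of the gist; adjust them for your system:

  mpicc -I/usr/local/cuda/include -o nccl_test nccl_test.c \
      -L/usr/local/cuda/lib64 -lnccl -lcudart
  mpirun -np 4 ./nccl_test

Launch one rank per GPU on each host; cudaSetDevice(localRank) will fail if a host has more local ranks than visible GPUs. On success, every rank prints [MPI Rank N] Success.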