-
-
Save kwen2501/186641efe951f344cd9a6d39353d19d6 to your computer and use it in GitHub Desktop.
A test that creates and destroys a NCCL communicator in a loop
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "cuda_runtime.h"
#include "nccl.h"
#include "mpi.h"
// MPICHECK(cmd): evaluates the MPI call `cmd` exactly once. If the result is
// not MPI_SUCCESS, prints the source location, the numeric error code and the
// human-readable text obtained from MPI_Error_string, then terminates the
// process with EXIT_FAILURE.
// The argument is parenthesized and the temporaries carry a trailing
// underscore so they cannot collide with identifiers used inside `cmd`.
// (No comments inside the macro body: a `//` comment would swallow the
// line-continuation backslash.)
#define MPICHECK(cmd) do {                                          \
  int mpiStatus_ = (cmd);                                           \
  if (mpiStatus_ != MPI_SUCCESS) {                                  \
    char mpiMsg_[MPI_MAX_ERROR_STRING];                             \
    int mpiMsgLen_ = 0;                                             \
    if (MPI_Error_string(mpiStatus_, mpiMsg_, &mpiMsgLen_)          \
        != MPI_SUCCESS) {                                           \
      mpiMsg_[0] = '\0';                                            \
      mpiMsgLen_ = 0;                                               \
    }                                                               \
    printf("Failed: MPI error %s:%d '%d' (%.*s)\n",                 \
        __FILE__, __LINE__, mpiStatus_, mpiMsgLen_, mpiMsg_);       \
    exit(EXIT_FAILURE);                                             \
  }                                                                 \
} while(0)
// CUDACHECK(cmd): runs the CUDA runtime call `cmd` once. On cudaSuccess the
// macro falls through; otherwise it prints the source location together with
// cudaGetErrorString() for the returned code and exits with EXIT_FAILURE.
// The `break` only leaves the macro's own do/while(0) wrapper.
#define CUDACHECK(cmd) do {                                         \
  cudaError_t cudaStatus_ = (cmd);                                  \
  if (cudaStatus_ == cudaSuccess) break;                            \
  printf("Failed: Cuda error %s:%d '%s'\n",                         \
      __FILE__, __LINE__, cudaGetErrorString(cudaStatus_));         \
  exit(EXIT_FAILURE);                                               \
} while(0)
// NCCLCHECK(cmd): runs the NCCL call `cmd` once. On ncclSuccess the macro
// falls through; otherwise it prints the source location together with
// ncclGetErrorString() for the returned code and exits with EXIT_FAILURE.
// The `break` only leaves the macro's own do/while(0) wrapper.
#define NCCLCHECK(cmd) do {                                         \
  ncclResult_t ncclStatus_ = (cmd);                                 \
  if (ncclStatus_ == ncclSuccess) break;                            \
  printf("Failed, NCCL error %s:%d '%s'\n",                         \
      __FILE__, __LINE__, ncclGetErrorString(ncclStatus_));         \
  exit(EXIT_FAILURE);                                               \
} while(0)
// Hashes a NUL-terminated string with the DJB2a scheme: the accumulator
// starts at 5381 and each character is folded in as hash = (hash * 33) ^ ch.
// Returns the 64-bit accumulator; an empty string yields 5381.
static uint64_t getHostHash(const char* string) {
  uint64_t hash = 5381;
  for (const char* cursor = string; *cursor != '\0'; ++cursor) {
    // hash * 33 is the same value as (hash << 5) + hash in 64-bit arithmetic.
    hash = (hash * 33) ^ *cursor;
  }
  return hash;
}
// Stores this machine's host name, cut at the first '.', into `hostname`.
//
//   hostname  caller-provided buffer receiving a NUL-terminated string
//   maxlen    capacity of `hostname` in bytes
//
// Does nothing when `hostname` is NULL or `maxlen` is not positive. If
// gethostname() fails, the buffer is set to the empty string so callers never
// read uninitialized bytes.
static void getHostName(char* hostname, int maxlen) {
  if (hostname == NULL || maxlen <= 0) return;

  if (gethostname(hostname, (size_t)maxlen) != 0) {
    hostname[0] = '\0';
    return;
  }

  // A name that had to be truncated is not guaranteed to be NUL-terminated,
  // so force a terminator before treating the buffer as a C string.
  hostname[maxlen - 1] = '\0';

  // Search only inside the string; bytes after the terminator are not ours.
  char* dot = strchr(hostname, '.');
  if (dot != NULL) *dot = '\0';
}
// Prints the total and free memory (in MiB) reported by cudaMemGetInfo for
// the device returned by cudaGetDevice. Any CUDA error terminates the
// process through CUDACHECK.
static void getMemInfo() {
  int device;
  CUDACHECK(cudaGetDevice(&device));

  // Named freeBytes/totalBytes so the local does not shadow libc's free().
  size_t freeBytes, totalBytes;
  CUDACHECK(cudaMemGetInfo(&freeBytes, &totalBytes));

  const size_t bytesPerMiB = 1024 * 1024;
  // size_t must be printed with %zu; %ld is a type mismatch.
  printf("GPU %d: total %zu MiB free %zu MiB\n",
      device, totalBytes / bytesPerMiB, freeBytes / bytesPerMiB);
}
int main(int argc, char* argv[]) | |
{ | |
int myRank, nRanks, localRank = 0; | |
//initializing MPI | |
MPICHECK(MPI_Init(&argc, &argv)); | |
MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank)); | |
MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks)); | |
//calculating localRank based on hostname which is used in selecting a GPU | |
uint64_t hostHashs[nRanks]; | |
char hostname[1024]; | |
getHostName(hostname, 1024); | |
hostHashs[myRank] = getHostHash(hostname); | |
MPICHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD)); | |
for (int p=0; p<nRanks; p++) { | |
if (p == myRank) break; | |
if (hostHashs[p] == hostHashs[myRank]) localRank++; | |
} | |
CUDACHECK(cudaSetDevice(localRank)); | |
// Make sure CUDA runtime is initialized | |
CUDACHECK(cudaFree(NULL)); | |
for (int i = 0; i < 10; i++) { | |
MPI_Barrier(MPI_COMM_WORLD); | |
if (myRank == 0) { | |
printf("========== After %2d (init, destroy) ==========\n", i); | |
fflush(stdout); | |
} | |
sleep(1); | |
MPI_Barrier(MPI_COMM_WORLD); | |
getMemInfo(); | |
fflush(stdout); | |
ncclUniqueId id; | |
ncclComm_t comm; | |
//get NCCL unique ID at rank 0 and broadcast it to all others | |
if (myRank == 0) ncclGetUniqueId(&id); | |
MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD)); | |
//initializing NCCL | |
NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank)); | |
//finalizing NCCL | |
ncclCommDestroy(comm); | |
} | |
//finalizing MPI | |
MPICHECK(MPI_Finalize()); | |
printf("[MPI Rank %d] Success \n", myRank); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment