Skip to content

Instantly share code, notes, and snippets.

@NanXiao
Last active December 22, 2017 09:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save NanXiao/94a16cc74c29762484c7373ac3cc47f8 to your computer and use it in GitHub Desktop.
/* compile:
* /opt/cuda/bin/nvcc -ccbin g++ -gencode=arch=compute_60,code=sm_60 -std=c++11 -O3 -I/usr/local/nccl/include/ -L/opt/cuda/lib64 -lcudart -lrt -L/usr/local/nccl/lib -lcurand -lnccl -lnvToolsExt -o nccl_ex nccl_ex.cu
*/
#include "nccl.h"
#include <stdio.h>
/* Number of GPUs (and NCCL ranks) this single-process example drives. */
#define GPU_COUNT (4)

/* Abort with file/line context if a CUDA runtime call fails.
 * Diagnostics go to stderr so they are not interleaved with normal output. */
#define CUDACHECK(cmd) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
fprintf(stderr, "Cuda failure %s:%d '%s'\n", \
__FILE__,__LINE__,cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
} while(0)

/* Abort with file/line context if an NCCL call fails. */
#define NCCLCHECK(cmd) do { \
ncclResult_t r = cmd; \
if (r!= ncclSuccess) { \
fprintf(stderr, "NCCL failure %s:%d '%s'\n", \
__FILE__,__LINE__,ncclGetErrorString(r)); \
exit(EXIT_FAILURE); \
} \
} while(0)
/* Single-process, multi-GPU NCCL example: each of GPU_COUNT devices holds one
 * int (its own index), an AllReduce sums them, and every device ends up with
 * 0 + 1 + ... + (GPU_COUNT-1).  All ranks are driven from this one thread via
 * a NCCL group, so the collective cannot deadlock on ordering. */
int main(void)
{
    /* One communicator per GPU, created in one shot by ncclCommInitAll. */
    ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t) * GPU_COUNT);
    if (comms == NULL) {
        fprintf(stderr, "malloc failure %s:%d\n", __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
    int gpuArray[GPU_COUNT];
    for (int i = 0; i < GPU_COUNT; i++)
    {
        gpuArray[i] = i; /* use CUDA devices 0 .. GPU_COUNT-1 */
    }
    NCCLCHECK(ncclCommInitAll(comms, GPU_COUNT, gpuArray));

    void* sendbuffs[GPU_COUNT];
    void* recvbuffs[GPU_COUNT];
    cudaStream_t streams[GPU_COUNT];
    for (int i = 0; i < GPU_COUNT; i++)
    {
        int count = 0, device = 0, rank = 0;
        NCCLCHECK(ncclCommCount(comms[i], &count));
        NCCLCHECK(ncclCommCuDevice(comms[i], &device));
        NCCLCHECK(ncclCommUserRank(comms[i], &rank));
        printf("count is %d, device is %d, rank is %d\n", count, device, rank);
        CUDACHECK(cudaSetDevice(i));
        /* One int per GPU.  Use sizeof(int), not sizeof(ncclInt): ncclInt is
         * an enum CONSTANT (ncclDataType_t), whose size only coincidentally
         * matches the element size. */
        CUDACHECK(cudaMalloc(sendbuffs + i, sizeof(int)));
        CUDACHECK(cudaMemcpy(sendbuffs[i], &i, sizeof(int), cudaMemcpyHostToDevice));
        CUDACHECK(cudaMalloc(recvbuffs + i, sizeof(int)));
        CUDACHECK(cudaStreamCreate(streams + i));
    }

    /* Group the per-rank submissions so one thread can feed every
     * communicator without deadlocking the collective. */
    NCCLCHECK(ncclGroupStart());
    for (int i = 0; i < GPU_COUNT; i++)
    {
        NCCLCHECK(ncclAllReduce(sendbuffs[i], recvbuffs[i], 1, ncclInt, ncclSum,
                                comms[i], streams[i]));
    }
    NCCLCHECK(ncclGroupEnd());

    /* Block until each stream drains.  cudaStreamSynchronize replaces the
     * original busy-wait loop on cudaStreamQuery, which spun a CPU core. */
    for (int i = 0; i < GPU_COUNT; i++)
    {
        CUDACHECK(cudaSetDevice(i));
        CUDACHECK(cudaStreamSynchronize(streams[i]));
    }

    for (int i = 0; i < GPU_COUNT; i++)
    {
        int res = 0;
        /* Checked copy (the original ignored this cudaMemcpy's result). */
        CUDACHECK(cudaMemcpy(&res, recvbuffs[i], sizeof(int), cudaMemcpyDeviceToHost));
        printf("res is %d\n", res); /* expect 0+1+...+(GPU_COUNT-1) everywhere */
    }

    /* Release device buffers, streams, and communicators (the original
     * leaked all of them). */
    for (int i = 0; i < GPU_COUNT; i++)
    {
        CUDACHECK(cudaSetDevice(i));
        CUDACHECK(cudaFree(sendbuffs[i]));
        CUDACHECK(cudaFree(recvbuffs[i]));
        CUDACHECK(cudaStreamDestroy(streams[i]));
        NCCLCHECK(ncclCommDestroy(comms[i]));
    }
    free(comms);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment