Zhengyang Liu zhengyang92

## reduce.c
#include<stdio.h>
#include<assert.h>
#include<stdlib.h>
#include<string.h>
#include<stdint.h>
#include "mpi.h"
#include "accl_util.h"
#include "cuda_runtime_api.h"
#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \

## reduce.c
#include<stdio.h>
#include<assert.h>
#include<stdlib.h>
#include<string.h>
#include<stdint.h>
#include "mpi.h"
#include "accl_util.h"
#include "cuda_runtime_api.h"
#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \

## gist:4bddd9b103092a572b04aa90c3e1de60
 time accl --topo NVLinkOnly --chunks 2 --size 4 --steps 4 --from-step 1  --collectives _all_reduce --features cuda_ipc --prefix /home/t-liuzhe/work/collcc/out

send 0 from 0 to 3 at time 1
send 1 from 3 to 0 at time 1
send 1 from 1 to 3 at time 0
send 2 from 2 to 0 at time 0
send 2 from 0 to 3 at time 1
send 3 from 3 to 0 at time 1
send 3 from 3 to 1 at time 0
send 4 from 3 to 2 at time 1

## gist:29f61f396791aba9213ad9c36a52bcc3
time accl --topo NVLinkOnly --chunks 2 --size 4 --steps 4 --from-step 1  --collectives _all_reduce --features cuda_ipc --prefix /home/t-liuzhe/work/collcc/out

send 0 from 3 to 1 at time 1
send 0 from 0 to 3 at time 0
send 2 from 2 to 1 at time 0
send 3 from 3 to 1 at time 1
send 5 from 1 to 0 at time 1
send 6 from 1 to 0 at time 1
send 6 from 2 to 1 at time 0
send 7 from 3 to 0 at time 0

## gist:abd5ddd889ef39156cf41ba548f47df3
#include<stdio.h>
#include<assert.h>
#include<stdlib.h>
#include<string.h>
#include<stdint.h>
#include "mpi.h"
#include "accl_util.h"
#include "cuda_runtime_api.h"
#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \

## gist:9456a9da178a0dad405ca430631793b2
NORTHAMERICA.t-liuzhe@GCR-DGX-01:~/work/omb/mpi/collective$ mpirun -np 8 osu_allreduce_accl -d cuda
Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020
Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020
Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020
Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020
Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020
Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020
Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020
Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020

## gist:94eeadb7cd9d91f6511695779207aa99
chunks == 1
bandwidth at time 0
0 1 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 1 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 1
0 0 0 0 0 0 0 0
0 0 0 0 0 1 0 0
0 0 0 0 0 0 0 0

## gist:8c8b4e70dbc8b7bd0d69793b2ab02fb0
NORTHAMERICA.t-liuzhe@GCR-DGX-01:~/work/omb/mpi/collective$ mpirun -np 8 osu_allreduce_nccl -m 48:10000000 -d cuda
Allreduce NCCL version
Allreduce NCCL version
Allreduce NCCL version
Allreduce NCCL version
Allreduce NCCL version
Allreduce NCCL version
Allreduce NCCL version
Allreduce NCCL version

## gist:2eeca192a8db87f594eafe99d14b4b5d
# OSU MPI-CUDA Allgather Latency Test v5.6.2
# Size       Avg Latency(us)
192                    75.68
384                    76.04
768                    75.72
1536                   81.04
3072                   80.32
6144                   82.26
12288                  94.67
24576                 100.29

## allreduce.cu
#include<stdio.h>
#include<assert.h>
#include<stdlib.h>
#include<string.h>
#include<stdint.h>
#include "mpi.h"
#include "accl_util.h"
#include "cuda_runtime_api.h"
#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \
	#include<stdio.h>
	#include<assert.h>
	#include<stdlib.h>
	#include<string.h>
	#include<stdint.h>
	#include "mpi.h"
	#include "accl_util.h"
	#include "cuda_runtime_api.h"
	#define CUDACHECK(cmd) do { \
	cudaError_t e = cmd; \
	time accl --topo NVLinkOnly --chunks 2 --size 4 --steps 4 --from-step 1 --collectives _all_reduce --features cuda_ipc --prefix /home/t-liuzhe/work/collcc/out

	send 0 from 0 to 3 at time 1
	send 1 from 3 to 0 at time 1
	send 1 from 1 to 3 at time 0
	send 2 from 2 to 0 at time 0
	send 2 from 0 to 3 at time 1
	send 3 from 3 to 0 at time 1
	send 3 from 3 to 1 at time 0
	send 4 from 3 to 2 at time 1
	time accl --topo NVLinkOnly --chunks 2 --size 4 --steps 4 --from-step 1 --collectives _all_reduce --features cuda_ipc --prefix /home/t-liuzhe/work/collcc/out

	send 0 from 3 to 1 at time 1
	send 0 from 0 to 3 at time 0
	send 2 from 2 to 1 at time 0
	send 3 from 3 to 1 at time 1
	send 5 from 1 to 0 at time 1
	send 6 from 1 to 0 at time 1
	send 6 from 2 to 1 at time 0
	send 7 from 3 to 0 at time 0
	NORTHAMERICA.t-liuzhe@GCR-DGX-01:~/work/omb/mpi/collective$ mpirun -np 8 osu_allreduce_accl -d cuda
	Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020
	Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020
	Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020
	Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020
	Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020
	Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020
	Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020
	Allreduce ACCL a52f3e2-dirty Mon Jun 22 12:49:29 PDT 2020
	chunks == 1
	bandwidth at time 0
	0 1 0 0 0 0 0 0
	0 0 0 0 0 0 0 0
	0 0 0 1 0 0 0 0
	0 0 0 0 0 0 0 0
	0 0 0 0 0 0 0 1
	0 0 0 0 0 0 0 0
	0 0 0 0 0 1 0 0
	0 0 0 0 0 0 0 0
	NORTHAMERICA.t-liuzhe@GCR-DGX-01:~/work/omb/mpi/collective$ mpirun -np 8 osu_allreduce_nccl -m 48:10000000 -d cuda
	Allreduce NCCL version
	Allreduce NCCL version
	Allreduce NCCL version
	Allreduce NCCL version
	Allreduce NCCL version
	Allreduce NCCL version
	Allreduce NCCL version
	Allreduce NCCL version
	# OSU MPI-CUDA Allgather Latency Test v5.6.2
	# Size Avg Latency(us)
	192 75.68
	384 76.04
	768 75.72
	1536 81.04
	3072 80.32
	6144 82.26
	12288 94.67
	24576 100.29