@bosilca
Created November 13, 2016
Quick example to check the performance of MPI_Allreduce on GPU buffers, compared against staging the data through pinned host buffers.
#include <mpi.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
/**
* mpic++ -g -Wall -I/opt/cuda/8.0/include cuda_check.cc -o cuda_check -L/opt/cuda/8.0/lib64 -lcudart
*/
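/*
 * A possible invocation, assuming an Open MPI build with CUDA support
 * (the process count and element count are illustrative, not from the gist):
 *   mpirun -np 2 ./cuda_check 1000000
 */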
#define ENV_LOCAL_RANK "OMPI_COMM_WORLD_LOCAL_RANK"
#define REPEAT 1000
#define COUNT 1000000
#define MPI_CHECK(WHAT)                                                   \
    do {                                                                  \
        int _rc = (WHAT);                                                 \
        if( MPI_SUCCESS != _rc ) {                                        \
            std::cout << "Error " << _rc << " in " << #WHAT << std::endl; \
        }                                                                 \
    } while (0)
void SetDeviceBeforeInit() {
    char *localRankStr = NULL;
    int rank = 0, devCount = 0;
    cudaError_t cudaStat;

    /* Extract the local rank from the environment before MPI_Init, so that
     * each process can bind to its own GPU. */
    if( (localRankStr = getenv(ENV_LOCAL_RANK)) != NULL ) {
        rank = atoi(localRankStr);
    }
    cudaDeviceReset();  /* cudaThreadExit() is a deprecated alias for this */
    cudaGetDeviceCount(&devCount);
    if( 0 == devCount ) {
        std::cout << "ERROR: no CUDA devices found\n";
        return;
    }
    std::cout << "Device count " << devCount << " Local Rank: " << rank % devCount << std::endl;
    cudaStat = cudaSetDevice(rank % devCount);
    if( cudaStat != cudaSuccess )
        std::cout << "ERROR DEVICE SET FAILED\n";
}
int main(int argc, char** argv) {
    // sleep(20);  /* uncomment to leave time to attach a debugger */
    SetDeviceBeforeInit();
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int strategy;
    const int count = (argc > 1) ? atoi(argv[1]) : COUNT;  /* fall back to the default */
    float *data_gpu1, *data_gpu2;
    float *data_cpu1, *data_cpu2;
    size_t data_size = count * sizeof(float);
    cudaMalloc((void**)&data_gpu1, data_size);
    cudaMalloc((void**)&data_gpu2, data_size);
    cudaMallocHost((void**)&data_cpu1, data_size);
    cudaMallocHost((void**)&data_cpu2, data_size);

    /* Let's initialize the source data with something sensible */
    srand((unsigned)time(NULL));  /* trivial randomization needs */
    for( int i = 0; i < count; i++ ) {
        data_cpu1[i] = (float)rand() / (float)RAND_MAX;
    }
    /* Upload the initialized source data to the device */
    cudaMemcpy(data_gpu1, data_cpu1, data_size, cudaMemcpyHostToDevice);
    /* Do a quick ring exchange so the MPI library sees (and can register)
     * all the GPU buffers before timing starts. */
    MPI_Request req;
    MPI_Irecv(data_gpu2, count, MPI_FLOAT, (rank - 1 + size) % size, 0, MPI_COMM_WORLD, &req);
    MPI_Send(data_gpu1, count, MPI_FLOAT, (rank + 1) % size, 0, MPI_COMM_WORLD);
    MPI_Wait(&req, MPI_STATUS_IGNORE);
    double t1, t2;
    for( strategy = 0; strategy < 2; strategy++ ) {
        MPI_Barrier(MPI_COMM_WORLD);
        t1 = MPI_Wtime();
        for( int i = 0; i < REPEAT; i++ ) {
            if( strategy == 0 ) {
                /* Reduce directly from/to GPU memory (needs a CUDA-aware MPI) */
                MPI_CHECK(MPI_Allreduce(data_gpu1, data_gpu2, count, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
            } else {
                /* Stage through pinned host memory: d2h copy, reduce, h2d copy */
                cudaMemcpy(data_cpu1, data_gpu1, data_size, cudaMemcpyDeviceToHost);
                MPI_CHECK(MPI_Allreduce(data_cpu1, data_cpu2, count, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
                cudaMemcpy(data_gpu2, data_cpu2, data_size, cudaMemcpyHostToDevice);
            }
            MPI_Barrier(MPI_COMM_WORLD);
        }
        t2 = MPI_Wtime();
        std::cout << "Rank " << rank << " Allreduce (" << count << " floats) on ";
        if( 0 == strategy )
            std::cout << "GPU buffers";
        else
            std::cout << "CPU buffers (+d2h +h2d)";
        std::cout << ": average time " << (t2 - t1) / REPEAT << " seconds.\n";
    }
    cudaFreeHost(data_cpu1);
    cudaFreeHost(data_cpu2);
    cudaFree(data_gpu1);
    cudaFree(data_gpu2);
    MPI_Finalize();
    return 0;
}
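As a sanity check (a sketch, not part of the original gist), a small helper along these lines could verify that the CUDA-aware path and the staged host path produce the same sums; the relative tolerance is an assumed bound, since the two paths may accumulate in different orders:

#include <cmath>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>

/* Compare a device buffer against a host reference, element by element.
 * Returns the number of elements that differ by more than a relative
 * tolerance (the 1e-5f bound is an assumption for single precision). */
static int count_mismatches(const float* d_buf, const float* h_ref, int count) {
    std::vector<float> h_buf(count);
    cudaMemcpy(h_buf.data(), d_buf, count * sizeof(float), cudaMemcpyDeviceToHost);
    int mismatches = 0;
    for( int i = 0; i < count; i++ ) {
        if( std::fabs(h_buf[i] - h_ref[i]) > 1e-5f * std::fabs(h_ref[i]) )
            mismatches++;
    }
    return mismatches;
}

/* Possible use right before the cleanup in main():
 *   std::cout << "Rank " << rank << ": "
 *             << count_mismatches(data_gpu2, data_cpu2, count)
 *             << " mismatching elements\n";
 */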