Quick example to check the performance of MPI_Allreduce from GPU buffers.
#include <mpi.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>

/**
 * mpic++ -g -Wall -I/opt/cuda/8.0/include cuda_check.cc -o cuda_check -L/opt/cuda/8.0/lib64 -lcudart
 */
#define ENV_LOCAL_RANK "OMPI_COMM_WORLD_LOCAL_RANK"
#define REPEAT 1000
#define COUNT  1000000

#define MPI_CHECK(WHAT) \
    do { \
        int _rc = (WHAT); \
        if( MPI_SUCCESS != _rc ) { \
            std::cout << "Error " << _rc << " in " << #WHAT << std::endl; \
        } \
    } while (0)

/* Select a GPU before MPI_Init, using the local rank that the Open MPI
 * launcher exposes through an environment variable. */
void SetDeviceBeforeInit() {
    char *localRankStr = NULL;
    int rank = 0, devCount = 0;
    cudaError_t cudaStat;

    if ((localRankStr = getenv(ENV_LOCAL_RANK)) != NULL) {
        rank = atoi(localRankStr);
    }
    cudaDeviceReset();
    cudaThreadExit();  /* deprecated alias of cudaDeviceReset */
    cudaGetDeviceCount(&devCount);
    std::cout << "Device count " << devCount << " Local Rank: " << rank % devCount << std::endl;
    cudaStat = cudaSetDevice(rank % devCount);
    if (cudaStat != cudaSuccess)
        std::cout << "ERROR DEVICE SET FAILED\n";
}

int main(int argc, char** argv) {
    //sleep(20);
    SetDeviceBeforeInit();
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Number of floats per buffer: first command-line argument, or COUNT by default */
    const int count = (argc > 1) ? atoi(argv[1]) : COUNT;

    float *data_gpu1, *data_gpu2;
    float *data_cpu1, *data_cpu2;
    size_t data_size = count * sizeof(float);
    cudaMalloc((void**)&data_gpu1, data_size);
    cudaMalloc((void**)&data_gpu2, data_size);
    cudaMallocHost((void**)&data_cpu1, data_size);
    cudaMallocHost((void**)&data_cpu2, data_size);

    /* Let's initialize the source data with something sensible */
    time_t t;
    srand(time(&t)); /* trivial randomization needs */
    for (int i = 0; i < count; i++) {
        data_cpu1[i] = (float)rand() / (float)RAND_MAX;
    }
    cudaMemcpy(data_gpu1, data_cpu1, data_size, cudaMemcpyHostToDevice);

    /* Let's do a quick data exchange to make sure the MPI library knows about
     * all the GPU buffers. */
    MPI_Request req;
    MPI_Irecv(data_gpu2, count, MPI_FLOAT, (rank - 1 + size) % size, 0, MPI_COMM_WORLD, &req);
    MPI_Send(data_gpu1, count, MPI_FLOAT, (rank + 1) % size, 0, MPI_COMM_WORLD);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    double t1, t2;
    for (int strategy = 0; strategy < 2; strategy++) {
        MPI_Barrier(MPI_COMM_WORLD);
        t1 = MPI_Wtime();
        for (int i = 0; i < REPEAT; i++) {
            if (strategy == 0) {
                /* Strategy 0: reduce directly from/to GPU buffers */
                MPI_CHECK(MPI_Allreduce(data_gpu1, data_gpu2, count, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
            } else {
                /* Strategy 1: stage through pinned host buffers around the reduction */
                cudaMemcpy(data_cpu1, data_gpu1, data_size, cudaMemcpyDeviceToHost);
                MPI_CHECK(MPI_Allreduce(data_cpu1, data_cpu2, count, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
                cudaMemcpy(data_gpu2, data_cpu2, data_size, cudaMemcpyHostToDevice);
            }
            MPI_Barrier(MPI_COMM_WORLD);
        }
        t2 = MPI_Wtime();
        std::cout << "Rank " << rank << " Allreduce (" << count << " floats) on ";
        if (0 == strategy)
            std::cout << "GPU buffers";
        else
            std::cout << "CPU buffers (+d2h +h2d)";
        std::cout << ": time " << (t2 - t1) / REPEAT << " seconds.\n";
    }

    cudaFreeHost(data_cpu1);
    cudaFreeHost(data_cpu2);
    cudaFree(data_gpu1);
    cudaFree(data_gpu2);
    MPI_Finalize();
    return 0;
}
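A minimal way to build and run the benchmark, assuming an Open MPI installation with CUDA support and the CUDA 8.0 paths from the header comment (the process count and element count below are only examples):

mpic++ -g -Wall -I/opt/cuda/8.0/include cuda_check.cc -o cuda_check -L/opt/cuda/8.0/lib64 -lcudart
mpirun -np 2 ./cuda_check 1000000

Each rank prints the average per-iteration time for both strategies: MPI_Allreduce operating directly on GPU buffers, and MPI_Allreduce on pinned host buffers with explicit device-to-host and host-to-device copies around the reduction.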