@sparticlesteve
Created January 8, 2020 03:45
import torch
import torch.distributed as dist
# Configuration
ranks_per_node = 8
shape = 2**17
dtype = torch.float32
# Initialize
dist.init_process_group(backend='mpi')
rank, n_ranks = dist.get_rank(), dist.get_world_size()
local_rank = rank % ranks_per_node
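# Note: this local_rank mapping assumes MPI ranks are placed on nodes in
# contiguous blocks of ranks_per_node (an assumption about the job launcher)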
# First, try allocating a small tensor on every GPU from every rank
for i in range(ranks_per_node):
    _ = torch.randn(1).to(torch.device('cuda', i))
# Now select this rank's GPU
device = torch.device('cuda', local_rank)
print('MPI rank', rank, 'size', n_ranks, 'device', device)
# Allocate a tensor
x = torch.randn(shape, dtype=dtype).to(device)
print('local result:', x.sum())
# Do a broadcast from rank 0
dist.broadcast(x, 0)
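# After the broadcast every rank holds rank 0's tensor,
# so all ranks should print the same sum here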
print('broadcast result:', x.sum())
# Do an allreduce
dist.all_reduce(x)
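# all_reduce defaults to a sum, so each rank's result should be
# n_ranks times the broadcast value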
print('allreduce result:', x.sum())
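
# Example launch (a sketch, assuming PyTorch was built with MPI support,
# 8 GPUs per node, and this file saved as test_dist.py -- the name is hypothetical):
#   mpirun -np 8 python test_dist.py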