@sparticlesteve
Created January 8, 2020 03:45
import torch
import torch.distributed as dist
# Configuration
ranks_per_node = 8
shape = 2**17
dtype = torch.float32
# Initialize
dist.init_process_group(backend='mpi')
rank, n_ranks = dist.get_rank(), dist.get_world_size()
local_rank = rank % ranks_per_node
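# Note: this local_rank mapping assumes MPI ranks are placed on nodes in
# contiguous blocks of ranks_per_node (an assumption about the job launcher)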
# First, try allocating a small tensor on every GPU from every rank
for i in range(ranks_per_node):
    _ = torch.randn(1).to(torch.device('cuda', i))
# Now select this rank's GPU
device = torch.device('cuda', local_rank)
print('MPI rank', rank, 'size', n_ranks, 'device', device)
# Allocate a tensor
x = torch.randn(shape, dtype=dtype).to(device)
print('local result:', x.sum())
# Do a broadcast from rank 0
dist.broadcast(x, 0)
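# After the broadcast every rank holds rank 0's tensor,
# so all ranks should print the same sum here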
print('broadcast result:', x.sum())
# Do an allreduce
dist.all_reduce(x)
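# all_reduce defaults to a sum, so each rank's result should be
# n_ranks times the broadcast value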
print('allreduce result:', x.sum())
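
# Example launch (a sketch, assuming PyTorch was built with MPI support,
# 8 GPUs per node, and this file saved as test_dist.py -- the name is hypothetical):
#   mpirun -np 8 python test_dist.py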