@csullivan
Created July 17, 2023 05:44
Minimal NCCL torch.distributed example
import os

import torch
import torch.distributed as dist


def read_file_and_all_reduce():
    # Get the rank and world size from the environment variables set by the launcher
    rank = int(os.environ['LOCAL_RANK'])
    world_size = int(os.environ['WORLD_SIZE'])

    # Initialize the NCCL process group and bind this process to its GPU
    dist.init_process_group(backend='nccl')
    torch.cuda.set_device(rank)

    # Open the file and read the digit corresponding to this rank
    with open('distribute.txt', 'r') as file:
        number = int(file.read().strip()[rank])
    tensor = torch.tensor(number, dtype=torch.float32).cuda()

    # Sum the per-rank values across all processes
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    print(f'Rank {rank} has data {tensor.item()}')

    # Clean up the process group
    dist.destroy_process_group()


def main():
    read_file_and_all_reduce()


if __name__ == "__main__":
    main()
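
A minimal launch sketch, assuming a single node with two GPUs; the script filename and the contents of distribute.txt are hypothetical, since the gist does not specify them. torchrun exports LOCAL_RANK and WORLD_SIZE for each worker process, which the script reads above.

# Hypothetical input file: one digit per rank
echo "12" > distribute.txt

# Start one process per GPU; torchrun sets LOCAL_RANK and WORLD_SIZE
torchrun --standalone --nproc_per_node=2 minimal_nccl_example.py

With that example input, rank 0 reads 1 and rank 1 reads 2, so after the SUM all-reduce every rank prints 3.0.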