
@bri25yu
Last active May 22, 2023 00:34
a18 torch.distributed.barrier hangs
# On a18, running without the nccl backend works
torchrun --nproc-per-node 8 test_barrier.py
# On a18, running with the nccl backend hangs at the barrier
torchrun --nproc-per-node 8 test_barrier.py --use_nccl
# On other nodes, the script never hangs with either backend
torchrun --nproc-per-node 8 test_barrier.py
torchrun --nproc-per-node 8 test_barrier.py --use_nccl
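
One general diagnostic suggestion (not part of the original repro): NCCL's own debug logging can show where the collective stalls. This assumes the standard NCCL_DEBUG environment variables are honored by the NCCL build on a18:

# Optional: enable NCCL debug logging to see where the barrier stalls
NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=ALL torchrun --nproc-per-node 8 test_barrier.py --use_nccl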
test_barrier.py

from os import environ
from argparse import ArgumentParser

import torch
from torch.distributed import barrier, init_process_group

parser = ArgumentParser()
parser.add_argument("--use_nccl", action="store_true")
args = parser.parse_args()

# None lets torch.distributed pick its default backend instead of forcing nccl
BACKEND = "nccl" if args.use_nccl else None

get_local_rank = lambda: int(environ["LOCAL_RANK"])
get_world_size = lambda: int(environ["WORLD_SIZE"])

local_rank = get_local_rank()
if local_rank == 0:
    print(f"pytorch version {torch.__version__}")
    if args.use_nccl:
        print(f"using nccl version {torch.cuda.nccl.version()}")
    else:
        print("not using nccl backend")

init_process_group(backend=BACKEND, world_size=get_world_size(), rank=get_local_rank())

print(local_rank, "before barrier")
barrier()
print(local_rank, "after barrier")