a18 torch.distributed.barrier hangs
# On a18, the barrier completes when the nccl backend is not used
torchrun --nproc-per-node 8 test_barrier.py

# On a18, forcing the nccl backend hangs at the barrier
torchrun --nproc-per-node 8 test_barrier.py --use_nccl

# On other nodes, this script never hangs, with or without nccl
torchrun --nproc-per-node 8 test_barrier.py
torchrun --nproc-per-node 8 test_barrier.py --use_nccl
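If more detail is needed before the hang, NCCL and torch.distributed both expose standard debug environment variables. This is a suggestion for gathering logs, not part of the original repro:

# Optional diagnostics for the hanging case (not part of the original repro)
NCCL_DEBUG=INFO TORCH_DISTRIBUTED_DEBUG=DETAIL torchrun --nproc-per-node 8 test_barrier.py --use_nccl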
test_barrier.py:
from os import environ
from argparse import ArgumentParser

import torch
from torch.distributed import barrier, init_process_group

parser = ArgumentParser()
parser.add_argument("--use_nccl", action="store_true")
args = parser.parse_args()

# None lets torch.distributed pick its default backend; "nccl" forces NCCL.
BACKEND = "nccl" if args.use_nccl else None

# torchrun sets LOCAL_RANK and WORLD_SIZE for every worker process.
local_rank = int(environ["LOCAL_RANK"])
world_size = int(environ["WORLD_SIZE"])

if local_rank == 0:
    print(f"pytorch version {torch.__version__}")
    if args.use_nccl:
        print(f"using nccl version {torch.cuda.nccl.version()}")
    else:
        print("not using nccl backend")

init_process_group(backend=BACKEND, world_size=world_size, rank=local_rank)

print(local_rank, "before barrier")
barrier()  # hangs on a18 with --use_nccl
print(local_rank, "after barrier")
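A hedged variant that may be worth testing on a18 (not part of the original gist, and untested there): with the NCCL backend, barrier() runs a collective on each process's current CUDA device, and PyTorch's distributed documentation recommends pinning each rank to its own GPU; if every rank defaults to cuda:0, NCCL collectives can deadlock. The sketch below adds torch.cuda.set_device before initialization:

from os import environ

import torch
from torch.distributed import barrier, init_process_group

local_rank = int(environ["LOCAL_RANK"])
world_size = int(environ["WORLD_SIZE"])

# Sketch, not the original repro: pin this process to its own GPU before
# initializing NCCL, so the barrier's underlying collective does not run
# on cuda:0 for every rank.
torch.cuda.set_device(local_rank)

init_process_group(backend="nccl", world_size=world_size, rank=local_rank)
print(local_rank, "before barrier")
barrier()
print(local_rank, "after barrier")

Run it with the same torchrun invocation as above.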