Created
August 11, 2020 20:22
-
-
Save froody/e37a7d818d44d7626e3ce3855bd50e61 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# code
def dist_init(rank, world_size, hostname=None):
    """Initialize torch.distributed (NCCL backend) plus the torch RPC framework.

    Sets the MASTER_ADDR/MASTER_PORT/WORLD_SIZE/RANK environment variables,
    then initializes the NCCL process group on port 10638 and the RPC layer
    on port 10639.

    Args:
        rank: global rank of this process within the job.
        world_size: total number of processes in the job.
        hostname: address of the rendezvous master; defaults to "localhost"
            when None is passed.
    """
    if hostname is None:  # fixed: was `== None`; identity comparison is the correct idiom
        hostname = "localhost"
    print(f"dist init r={rank}, world={world_size}, host={hostname}")
    os.environ["MASTER_ADDR"] = hostname
    os.environ["MASTER_PORT"] = "10638"
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["RANK"] = str(rank)
    # NOTE(review): exact-equality version gate — only the 1.6.0 release takes the
    # explicit two-port path; any other version (including 1.6.x patch builds)
    # falls through to the env-var-driven init. Confirm this is intentional.
    if torch.__version__ == "1.6.0":
        # NCCL process group rendezvous on port 10638.
        init_method = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"
        print(f"going to try init_process_group: {init_method}")
        torch.distributed.init_process_group(
            backend="nccl", rank=rank, world_size=world_size, init_method=init_method
        )
        print(f"got nccl, now barrier {rank}")
        torch.distributed.barrier()
        print(f"got nccl, now rpc {rank}")
        # Separate rendezvous for RPC on port 10639 so it does not collide with
        # the NCCL store above.
        os.environ["MASTER_ADDR"] = hostname
        os.environ["MASTER_PORT"] = "10639"
        init_method = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"
        print(f"going to try init_rpc {init_method}")
        # NOTE(review): the gloo transport underlying ProcessGroupRpcBackendOptions
        # resolves the *local* hostname for its listening address (the attached logs
        # show it trying 127.0.1.1 and failing). If workers span multiple hosts,
        # GLOO_SOCKET_IFNAME / a resolvable hostname may need to be set — confirm.
        rpc.init_rpc(
            f"Test{rank}",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(init_method=init_method),
        )
        print(f"got rpc {rank}")
    else:
        # Other torch versions: rely on the MASTER_ADDR/MASTER_PORT env vars set above.
        rpc.init_rpc(f"Test{rank}", rank=rank, world_size=world_size)
        print(f"got rpc {rank}")
# Rank 0,1
dist init r=0, world=4, host=100.96.162.234 | |
going to try init_process_group: tcp://100.96.162.234:10638 | |
dist init r=1, world=4, host=100.96.162.234 | |
going to try init_process_group: tcp://100.96.162.234:10638 | |
got nccl, now barrier 1 | |
got nccl, now barrier 0 | |
got nccl, now rpc 1 | |
got nccl, now rpc 0 | |
going to try init_rpc tcp://100.96.162.234:10639 | |
going to try init_rpc tcp://100.96.162.234:10639 | |
Traceback (most recent call last): | |
File "./benchmarks/transformer.py", line 329, in <module> | |
bench_multi_process(args) | |
File "./benchmarks/transformer.py", line 311, in bench_multi_process | |
mp.spawn(run_worker, args=(world_size, args, all_at_once), nprocs=world_size, join=True) | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 200, in spawn | |
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn') | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 158, in start_processes | |
while not context.join(): | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 119, in join | |
raise Exception(msg) | |
Exception: | |
-- Process 1 terminated with the following error: | |
Traceback (most recent call last): | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 20, in _wrap | |
fn(i, *args) | |
File "/private/home/tbirch/src/fairscale/benchmarks/transformer.py", line 286, in run_worker | |
dist_init(rank + args.rank_base, world_size, hostname=args.host) | |
File "/private/home/tbirch/src/fairscale/tests/nn/model_parallel/commons.py", line 80, in dist_init | |
rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(init_method=init_method) | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/__init__.py", line 90, in init_rpc | |
api._init_rpc_backend(backend, store, name, rank, world_size, rpc_backend_options) | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/api.py", line 299, in _init_rpc_backend | |
rpc_backend_options=rpc_backend_options, | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 94, in init_backend | |
return backend.value.init_backend_handler(*args, **kwargs) | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 138, in _process_group_init_backend_handler | |
group = _init_process_group(store, rank, world_size) | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 117, in _init_process_group | |
group = dist.ProcessGroupGloo(store, rank, world_size, process_group_timeout) | |
RuntimeError: [/opt/conda/conda-bld/pytorch_1595629403081/work/third_party/gloo/gloo/transport/tcp/pair.cc:769] connect [127.0.1.1]:1921: Connection refused | |
# Rank 2,3
dist init r=2, world=4, host=100.96.162.234 | |
going to try init_process_group: tcp://100.96.162.234:10638 | |
got nccl, now barrier 2 | |
dist init r=3, world=4, host=100.96.162.234 | |
going to try init_process_group: tcp://100.96.162.234:10638 | |
got nccl, now barrier 3 | |
got nccl, now rpc 3 | |
got nccl, now rpc 2 | |
going to try init_rpc tcp://100.96.162.234:10639 | |
going to try init_rpc tcp://100.96.162.234:10639 | |
Traceback (most recent call last): | |
File "./benchmarks/transformer.py", line 329, in <module> | |
bench_multi_process(args) | |
File "./benchmarks/transformer.py", line 311, in bench_multi_process | |
mp.spawn(run_worker, args=(world_size, args, all_at_once), nprocs=world_size, join=True) | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 200, in spawn | |
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn') | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 158, in start_processes | |
while not context.join(): | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 119, in join | |
raise Exception(msg) | |
Exception: | |
-- Process 0 terminated with the following error: | |
Traceback (most recent call last): | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 20, in _wrap | |
fn(i, *args) | |
File "/private/home/tbirch/src/fairscale/benchmarks/transformer.py", line 286, in run_worker | |
dist_init(rank + args.rank_base, world_size, hostname=args.host) | |
File "/private/home/tbirch/src/fairscale/tests/nn/model_parallel/commons.py", line 80, in dist_init | |
rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(init_method=init_method) | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/__init__.py", line 90, in init_rpc | |
api._init_rpc_backend(backend, store, name, rank, world_size, rpc_backend_options) | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/api.py", line 299, in _init_rpc_backend | |
rpc_backend_options=rpc_backend_options, | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 94, in init_backend | |
return backend.value.init_backend_handler(*args, **kwargs) | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 138, in _process_group_init_backend_handler | |
group = _init_process_group(store, rank, world_size) | |
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 117, in _init_process_group | |
group = dist.ProcessGroupGloo(store, rank, world_size, process_group_timeout) | |
RuntimeError: [/opt/conda/conda-bld/pytorch_1595629403081/work/third_party/gloo/gloo/transport/tcp/pair.cc:769] connect [127.0.1.1]:4516: Connection refused |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment