Skip to content

Instantly share code, notes, and snippets.

@froody
Created August 11, 2020 20:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save froody/e37a7d818d44d7626e3ce3855bd50e61 to your computer and use it in GitHub Desktop.
# code
def dist_init(rank, world_size, hostname=None):
    """Initialize torch.distributed (NCCL backend) and torch RPC for this process.

    Sets the MASTER_ADDR/MASTER_PORT/WORLD_SIZE/RANK environment variables,
    then performs the rendezvous. On torch 1.6.0 the NCCL process group and
    the RPC framework are initialized separately, each with its own explicit
    tcp:// init method on a distinct port; on other versions rpc.init_rpc is
    called directly and picks up the env vars.

    Args:
        rank: global rank of this process.
        world_size: total number of participating processes.
        hostname: master node address; defaults to "localhost" when None.
    """
    if hostname is None:  # identity check, not equality (PEP 8)
        hostname = "localhost"
    print(f"dist init r={rank}, world={world_size}, host={hostname}")
    os.environ["MASTER_ADDR"] = hostname
    os.environ["MASTER_PORT"] = "10638"
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["RANK"] = str(rank)

    if torch.__version__ == "1.6.0":
        # NCCL process group rendezvous on port 10638.
        init_method = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"
        print(f"going to try init_process_group: {init_method}")
        torch.distributed.init_process_group(
            backend="nccl", rank=rank, world_size=world_size, init_method=init_method
        )
        print(f"got nccl, now barrier {rank}")
        torch.distributed.barrier()
        print(f"got nccl, now rpc {rank}")

        # RPC uses a second port (10639) so its gloo-based rendezvous does
        # not collide with the NCCL group's store on 10638.
        os.environ["MASTER_ADDR"] = hostname
        os.environ["MASTER_PORT"] = "10639"
        init_method = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"
        print(f"going to try init_rpc {init_method}")
        rpc.init_rpc(
            f"Test{rank}",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(init_method=init_method),
        )
        print(f"got rpc {rank}")
    else:
        # Non-1.6.0: rely on MASTER_ADDR/MASTER_PORT env vars set above.
        # NOTE(review): the traceback below shows gloo connecting to
        # 127.0.1.1 despite MASTER_ADDR being set — presumably a reverse-DNS
        # issue on the worker hosts; verify /etc/hosts or GLOO_SOCKET_IFNAME.
        rpc.init_rpc(f"Test{rank}", rank=rank, world_size=world_size)
        print(f"got rpc {rank}")
# Rank 0,1
dist init r=0, world=4, host=100.96.162.234
going to try init_process_group: tcp://100.96.162.234:10638
dist init r=1, world=4, host=100.96.162.234
going to try init_process_group: tcp://100.96.162.234:10638
got nccl, now barrier 1
got nccl, now barrier 0
got nccl, now rpc 1
got nccl, now rpc 0
going to try init_rpc tcp://100.96.162.234:10639
going to try init_rpc tcp://100.96.162.234:10639
Traceback (most recent call last):
File "./benchmarks/transformer.py", line 329, in <module>
bench_multi_process(args)
File "./benchmarks/transformer.py", line 311, in bench_multi_process
mp.spawn(run_worker, args=(world_size, args, all_at_once), nprocs=world_size, join=True)
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 200, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 158, in start_processes
while not context.join():
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 119, in join
raise Exception(msg)
Exception:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 20, in _wrap
fn(i, *args)
File "/private/home/tbirch/src/fairscale/benchmarks/transformer.py", line 286, in run_worker
dist_init(rank + args.rank_base, world_size, hostname=args.host)
File "/private/home/tbirch/src/fairscale/tests/nn/model_parallel/commons.py", line 80, in dist_init
rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(init_method=init_method)
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/__init__.py", line 90, in init_rpc
api._init_rpc_backend(backend, store, name, rank, world_size, rpc_backend_options)
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/api.py", line 299, in _init_rpc_backend
rpc_backend_options=rpc_backend_options,
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 94, in init_backend
return backend.value.init_backend_handler(*args, **kwargs)
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 138, in _process_group_init_backend_handler
group = _init_process_group(store, rank, world_size)
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 117, in _init_process_group
group = dist.ProcessGroupGloo(store, rank, world_size, process_group_timeout)
RuntimeError: [/opt/conda/conda-bld/pytorch_1595629403081/work/third_party/gloo/gloo/transport/tcp/pair.cc:769] connect [127.0.1.1]:1921: Connection refused
# Rank 2,3
dist init r=2, world=4, host=100.96.162.234
going to try init_process_group: tcp://100.96.162.234:10638
got nccl, now barrier 2
dist init r=3, world=4, host=100.96.162.234
going to try init_process_group: tcp://100.96.162.234:10638
got nccl, now barrier 3
got nccl, now rpc 3
got nccl, now rpc 2
going to try init_rpc tcp://100.96.162.234:10639
going to try init_rpc tcp://100.96.162.234:10639
Traceback (most recent call last):
File "./benchmarks/transformer.py", line 329, in <module>
bench_multi_process(args)
File "./benchmarks/transformer.py", line 311, in bench_multi_process
mp.spawn(run_worker, args=(world_size, args, all_at_once), nprocs=world_size, join=True)
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 200, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 158, in start_processes
while not context.join():
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 119, in join
raise Exception(msg)
Exception:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 20, in _wrap
fn(i, *args)
File "/private/home/tbirch/src/fairscale/benchmarks/transformer.py", line 286, in run_worker
dist_init(rank + args.rank_base, world_size, hostname=args.host)
File "/private/home/tbirch/src/fairscale/tests/nn/model_parallel/commons.py", line 80, in dist_init
rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(init_method=init_method)
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/__init__.py", line 90, in init_rpc
api._init_rpc_backend(backend, store, name, rank, world_size, rpc_backend_options)
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/api.py", line 299, in _init_rpc_backend
rpc_backend_options=rpc_backend_options,
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 94, in init_backend
return backend.value.init_backend_handler(*args, **kwargs)
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 138, in _process_group_init_backend_handler
group = _init_process_group(store, rank, world_size)
File "/private/home/tbirch/.conda/envs/torch160/lib/python3.7/site-packages/torch/distributed/rpc/backend_registry.py", line 117, in _init_process_group
group = dist.ProcessGroupGloo(store, rank, world_size, process_group_timeout)
RuntimeError: [/opt/conda/conda-bld/pytorch_1595629403081/work/third_party/gloo/gloo/transport/tcp/pair.cc:769] connect [127.0.1.1]:4516: Connection refused
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment