Created
February 2, 2024 11:07
Error output when running distributed DGL (DistDGL) training in Docker
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(sgnn) root@sgnn_node_0:/sgnn/example/dgl# python3 ./launch.py --workspace /sgnn/example/dgl --num_trainers 1 --num_samplers 0 --num_servers 1 --part_config /sgnn/data/dgl_partition/products/ogb-product.json --ip_config ./ip_config.txt "python3 ddp_sage.py --graph_name ogbn-products --ip_config ip_config.txt --num_epochs 10 --batch_size 1000"
The number of OMP threads per trainer is set to 20 | |
cleanup process runs | |
Arguments: Namespace(backend='gloo', batch_size=1000, batch_size_eval=100000, dropout=0.5, eval_every=5, fan_out='10,25', graph_name='ogbn-products', ip_config='ip_config.txt', local_rank=None, log_every=20, lr=0.003, n_classes=0, num_epochs=10, num_gpus=0, num_hidden=16, num_layers=2, pad_data=False, part_config=None) | |
sgnn_node_2: Initializing DistDGL. | |
[11:02:58] /home/bear/workspace/dgl_0.9/src/rpc/rpc.cc:140: Sender with NetType~socket is created. | |
[11:02:58] /home/bear/workspace/dgl_0.9/src/rpc/rpc.cc:159: Receiver with NetType~socket is created. | |
Arguments: Namespace(backend='gloo', batch_size=1000, batch_size_eval=100000, dropout=0.5, eval_every=5, fan_out='10,25', graph_name='ogbn-products', ip_config='ip_config.txt', local_rank=None, log_every=20, lr=0.003, n_classes=0, num_epochs=10, num_gpus=0, num_hidden=16, num_layers=2, pad_data=False, part_config=None) | |
sgnn_node_3: Initializing DistDGL. | |
Arguments: Namespace(backend='gloo', batch_size=1000, batch_size_eval=100000, dropout=0.5, eval_every=5, fan_out='10,25', graph_name='ogbn-products', ip_config='ip_config.txt', local_rank=None, log_every=20, lr=0.003, n_classes=0, num_epochs=10, num_gpus=0, num_hidden=16, num_layers=2, pad_data=False, part_config=None) | |
sgnn_node_0: Initializing DistDGL. | |
[11:02:58] /home/bear/workspace/dgl_0.9/src/rpc/rpc.cc:140: Sender with NetType~socket is created. | |
[11:02:58] /home/bear/workspace/dgl_0.9/src/rpc/rpc.cc:159: Receiver with NetType~socket is created. | |
[11:02:58] /home/bear/workspace/dgl_0.9/src/rpc/rpc.cc:140: Sender with NetType~socket is created. | |
[11:02:58] /home/bear/workspace/dgl_0.9/src/rpc/rpc.cc:159: Receiver with NetType~socket is created. | |
Arguments: Namespace(backend='gloo', batch_size=1000, batch_size_eval=100000, dropout=0.5, eval_every=5, fan_out='10,25', graph_name='ogbn-products', ip_config='ip_config.txt', local_rank=None, log_every=20, lr=0.003, n_classes=0, num_epochs=10, num_gpus=0, num_hidden=16, num_layers=2, pad_data=False, part_config=None) | |
sgnn_node_1: Initializing DistDGL. | |
[11:02:58] /home/bear/workspace/dgl_0.9/src/rpc/rpc.cc:140: Sender with NetType~socket is created. | |
[11:02:58] /home/bear/workspace/dgl_0.9/src/rpc/rpc.cc:159: Receiver with NetType~socket is created. | |
bash: line 1: 181 Bus error (core dumped) /miniconda3/envs/sgnn/bin/python3 ddp_sage.py --graph_name ogbn-products --ip_config ip_config.txt --num_epochs 10 --batch_size 1000 | |
Called process error Command 'ssh -o StrictHostKeyChecking=no -p 22 43.0.0.9 'cd /sgnn/example/dgl; (export DGL_ROLE=server DGL_NUM_SAMPLER=0 OMP_NUM_THREADS=1 DGL_NUM_CLIENT=4 DGL_CONF_PATH=/sgnn/data/dgl_partition/products/ogb-product.json DGL_IP_CONFIG=./ip_config.txt DGL_NUM_SERVER=1 DGL_GRAPH_FORMAT=csc DGL_SERVER_ID=1; /miniconda3/envs/sgnn/bin/python3 ddp_sage.py --graph_name ogbn-products --ip_config ip_config.txt --num_epochs 10 --batch_size 1000)'' returned non-zero exit status 135. | |
bash: line 1: 287 Bus error (core dumped) /miniconda3/envs/sgnn/bin/python3 ddp_sage.py --graph_name ogbn-products --ip_config ip_config.txt --num_epochs 10 --batch_size 1000 | |
Called process error Command 'ssh -o StrictHostKeyChecking=no -p 22 43.0.0.8 'cd /sgnn/example/dgl; (export DGL_ROLE=server DGL_NUM_SAMPLER=0 OMP_NUM_THREADS=1 DGL_NUM_CLIENT=4 DGL_CONF_PATH=/sgnn/data/dgl_partition/products/ogb-product.json DGL_IP_CONFIG=./ip_config.txt DGL_NUM_SERVER=1 DGL_GRAPH_FORMAT=csc DGL_SERVER_ID=0; /miniconda3/envs/sgnn/bin/python3 ddp_sage.py --graph_name ogbn-products --ip_config ip_config.txt --num_epochs 10 --batch_size 1000)'' returned non-zero exit status 135. | |
bash: line 1: 101 Bus error (core dumped) /miniconda3/envs/sgnn/bin/python3 ddp_sage.py --graph_name ogbn-products --ip_config ip_config.txt --num_epochs 10 --batch_size 1000 | |
Called process error Command 'ssh -o StrictHostKeyChecking=no -p 22 43.0.0.10 'cd /sgnn/example/dgl; (export DGL_ROLE=server DGL_NUM_SAMPLER=0 OMP_NUM_THREADS=1 DGL_NUM_CLIENT=4 DGL_CONF_PATH=/sgnn/data/dgl_partition/products/ogb-product.json DGL_IP_CONFIG=./ip_config.txt DGL_NUM_SERVER=1 DGL_GRAPH_FORMAT=csc DGL_SERVER_ID=2; /miniconda3/envs/sgnn/bin/python3 ddp_sage.py --graph_name ogbn-products --ip_config ip_config.txt --num_epochs 10 --batch_size 1000)'' returned non-zero exit status 135. | |
bash: line 1: 101 Bus error (core dumped) /miniconda3/envs/sgnn/bin/python3 ddp_sage.py --graph_name ogbn-products --ip_config ip_config.txt --num_epochs 10 --batch_size 1000 | |
Called process error Command 'ssh -o StrictHostKeyChecking=no -p 22 43.0.0.11 'cd /sgnn/example/dgl; (export DGL_ROLE=server DGL_NUM_SAMPLER=0 OMP_NUM_THREADS=1 DGL_NUM_CLIENT=4 DGL_CONF_PATH=/sgnn/data/dgl_partition/products/ogb-product.json DGL_IP_CONFIG=./ip_config.txt DGL_NUM_SERVER=1 DGL_GRAPH_FORMAT=csc DGL_SERVER_ID=3; /miniconda3/envs/sgnn/bin/python3 ddp_sage.py --graph_name ogbn-products --ip_config ip_config.txt --num_epochs 10 --batch_size 1000)'' returned non-zero exit status 135. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment