cd /mnt/home/dberenberg/projects/metagenomics
module load slurm
source load_env
source huggingface_meta/bin/activate
salloc -N2 -p gpu --gres=gpu:v100-32gb:02
# distributed_util.py generates commands to execute on each node
python distributed_util.py | bash
Relevant scripts are:
/mnt/home/dberenberg/projects/metagenomics/distributed_util.py: generates an ssh <host> "worker.sh <args>" command for each node
/mnt/home/dberenberg/projects/metagenomics/worker.sh: loads environment variables and runs the training script
/mnt/home/dberenberg/projects/metagenomics/train_gpt2_dist.py: the training script
RuntimeError: NCCL error in: /pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:825, invalid usage, NCCL version 2.7.8
ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).
workergpu22:1810802:1810802 [0] NCCL INFO Bootstrap : Using [0]eno1:10.128.149.70<0>
workergpu22:1810802:1810802 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
workergpu22:1810802:1810802 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
workergpu22:1810802:1810802 [0] NCCL INFO NET/Socket : Using [0]eno1:10.128.149.70<0>
workergpu22:1810802:1810802 [0] NCCL INFO Using network Socket
NCCL version 2.7.8+cuda10.2
workergpu22:1810803:1810803 [0] NCCL INFO Bootstrap : Using [0]eno1:10.128.149.70<0>
workergpu22:1810803:1810803 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
workergpu22:1810803:1810803 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
workergpu22:1810803:1810803 [0] NCCL INFO NET/Socket : Using [0]eno1:10.128.149.70<0>
workergpu22:1810803:1810803 [0] NCCL INFO Using network Socket
workergpu30:2353157:2353157 [0] NCCL INFO Bootstrap : Using [0]eno1:10.128.149.78<0>
workergpu30:2353157:2353157 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
workergpu30:2353157:2353157 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
workergpu30:2353157:2353157 [0] NCCL INFO NET/Socket : Using [0]eno1:10.128.149.78<0>
workergpu30:2353157:2353157 [0] NCCL INFO Using network Socket
workergpu30:2353158:2353158 [0] NCCL INFO Bootstrap : Using [0]eno1:10.128.149.78<0>
workergpu30:2353158:2353158 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
workergpu30:2353158:2353158 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
workergpu30:2353158:2353158 [0] NCCL INFO NET/Socket : Using [0]eno1:10.128.149.78<0>
workergpu30:2353158:2353158 [0] NCCL INFO Using network Socket
workergpu30:2353158:2353391 [0] init.cc:573 NCCL WARN Duplicate GPU detected : rank 3 and rank 2 both on CUDA device 1a000
workergpu30:2353158:2353391 [0] NCCL INFO init.cc:840 -> 5
workergpu30:2353158:2353391 [0] NCCL INFO group.cc:73 -> 5 [Async thread]
workergpu22:1810803:1810929 [0] init.cc:573 NCCL WARN Duplicate GPU detected : rank 1 and rank 0 both on CUDA device 3b000
workergpu22:1810803:1810929 [0] NCCL INFO init.cc:840 -> 5
workergpu22:1810803:1810929 [0] NCCL INFO group.cc:73 -> 5 [Async thread]