Loading intelmpi version 2021.4.0
go 8
compute-od-gpu-dy-p4d-24xlarge-8 compute-od-gpu-dy-p4d-24xlarge-9 compute-od-gpu-dy-p4d-24xlarge-10 compute-od-gpu-dy-p4d-24xlarge-11 compute-od-gpu-dy-p4d-24xlarge-12 compute-od-gpu-dy-p4d-24xlarge-13 compute-od-gpu-dy-p4d-24xlarge-14 compute-od-gpu-dy-p4d-24xlarge-15
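The "go 8" line and the host list above are the launcher echoing its expanded Slurm allocation. A minimal sketch of how such a list is usually produced, assuming a standard Slurm setup ("scontrol show hostnames" is stock Slurm; the surrounding Python is illustrative, not the gist's actual script):

    # Illustrative sketch only: expand the Slurm nodelist into hostnames.
    # SLURM_JOB_NODELIST is set by Slurm inside a job; how "go 8" and the
    # host list were actually printed is an assumption.
    import os
    import subprocess

    hostnames = subprocess.check_output(
        ["scontrol", "show", "hostnames", os.environ["SLURM_JOB_NODELIST"]],
        text=True,
    ).split()

    print("go", len(hostnames))   # "go 8" in this run
    print(" ".join(hostnames))    # the compute-od-gpu-dy-p4d-24xlarge-* list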
myuser=zion
COUNT_NODE=8
LD_LIBRARY_PATH = /opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:/opt/intel/mpi/2021.4.0//libfabric/lib:/opt/intel/mpi/2021.4.0//lib/release:/opt/intel/mpi/2021.4.0//lib:/opt/intel/mpi/2021.4.0/libfabric/lib:/opt/intel/mpi/2021.4.0/lib/release:/opt/intel/mpi/2021.4.0/lib
PATH = /opt/amazon/efa/bin:/opt/intel/mpi/2021.4.0//libfabric/bin:/opt/intel/mpi/2021.4.0//bin:/opt/intel/mpi/2021.4.0/libfabric/bin:/opt/intel/mpi/2021.4.0/bin:/fsx/nousr/dalle2/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/home/zion/.vscode-server/bin/b06ae3b2d2dbfe28bca3134cc6be65935cdfea6a/bin/remote-cli:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/opt/aws/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/home/zion/.local/bin:/home/zion/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin
which mpicc /opt/intel/mpi/2021.4.0/bin/mpicc
HOSTNAMES = compute-od-gpu-dy-p4d-24xlarge-8 compute-od-gpu-dy-p4d-24xlarge-9 compute-od-gpu-dy-p4d-24xlarge-10 compute-od-gpu-dy-p4d-24xlarge-11 compute-od-gpu-dy-p4d-24xlarge-12 compute-od-gpu-dy-p4d-24xlarge-13 compute-od-gpu-dy-p4d-24xlarge-14 compute-od-gpu-dy-p4d-24xlarge-15
hostname = compute-od-gpu-dy-p4d-24xlarge-8
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8
MASTER_PORT= 12802
hostname = compute-od-gpu-dy-p4d-24xlarge-15
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8
MASTER_PORT= 12802
hostname = compute-od-gpu-dy-p4d-24xlarge-12
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8
MASTER_PORT= 12802
hostname = compute-od-gpu-dy-p4d-24xlarge-10
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8
MASTER_PORT= 12802
hostname = compute-od-gpu-dy-p4d-24xlarge-9
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8
MASTER_PORT= 12802
hostname = compute-od-gpu-dy-p4d-24xlarge-13
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8
MASTER_PORT= 12802
hostname = compute-od-gpu-dy-p4d-24xlarge-14
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8
MASTER_PORT= 12802
hostname = compute-od-gpu-dy-p4d-24xlarge-11
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8
MASTER_PORT= 12802
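At this point every node has printed the same rendezvous information: the full host list, MASTER_ADDR pointing at the first node, and MASTER_PORT. A minimal sketch, assuming a torch.distributed NCCL launch, of how these exact variables are typically consumed; the helper and its arguments are illustrative, not DALLE2-pytorch's actual code:

    # Hedged sketch: join the process group from the variables echoed above.
    # GPUS_PER_NODE = 8 matches a p4d.24xlarge; node_rank and local_rank are
    # supplied by the launcher (see the THEID lines below).
    import os
    import torch.distributed as dist

    GPUS_PER_NODE = 8

    def init_from_env(node_rank: int, local_rank: int) -> None:
        world_size = int(os.environ["COUNT_NODE"]) * GPUS_PER_NODE
        rank = node_rank * GPUS_PER_NODE + local_rank
        dist.init_process_group(
            backend="nccl",
            init_method=f"tcp://{os.environ['MASTER_ADDR'].strip()}:"
                        f"{os.environ['MASTER_PORT'].strip()}",
            world_size=world_size,
            rank=rank,
        )

With COUNT_NODE=8 and eight GPUs per node this gives world_size=64, one rank per GPU, which matches the eight [0]..[7] NCCL ranks per host seen further down.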
THEID=0
python3 version = Python 3.8.5
THEID=7
THEID=4
THEID=1
python3 version = Python 3.8.5
THEID=5
THEID=2
python3 version = Python 3.8.5
THEID=6
python3 version = Python 3.8.5
python3 version = Python 3.8.5
python3 version = Python 3.8.5
python3 version = Python 3.8.5
THEID=3
python3 version = Python 3.8.5
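THEID takes each value from 0 to 7 exactly once, one per node, which suggests it is simply the node's index in HOSTNAMES. A hypothetical reconstruction (the variable names mirror the log; the lookup itself is an assumption):

    # Assumption: THEID is the node rank, i.e. this host's position in the
    # HOSTNAMES list printed above.
    import os
    import socket

    hostnames = os.environ["HOSTNAMES"].split()
    theid = hostnames.index(socket.gethostname())  # 0..7 across the eight nodes
    print(f"THEID={theid}")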
Loading configuration from /fsx/nousr/DALLE2-pytorch/configs/prior.json
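The NCCL INFO lines that follow are debug output, which NCCL only emits when debug logging is enabled. A sketch of the relevant environment, under the assumption that the job sets NCCL_DEBUG itself (NCCL_TOPO_FILE, by contrast, is set by the aws-ofi-nccl plugin, as the log states):

    # NCCL_DEBUG=INFO is the standard switch for the INFO lines below;
    # whether this job set it explicitly is an assumption.
    import os

    os.environ["NCCL_DEBUG"] = "INFO"
    # Per the log, aws-ofi-nccl sets this itself on p4d.24xlarge:
    #   NCCL_TOPO_FILE=/opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml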
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0>
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO Using network AWS Libfabric
NCCL version 2.12.7+cuda11.4
compute-od-gpu-dy-p4d-24xlarge-8:13870:13870 [1] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0>
compute-od-gpu-dy-p4d-24xlarge-8:13872:13872 [3] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0>
compute-od-gpu-dy-p4d-24xlarge-8:13870:13870 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13870:13870 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13870:13870 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-8:13870:13870 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-8:13872:13872 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13872:13872 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13872:13872 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-8:13872:13872 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-15:13989:13989 [2] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0>
compute-od-gpu-dy-p4d-24xlarge-15:13991:13991 [4] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0>
compute-od-gpu-dy-p4d-24xlarge-12:13921:13921 [0] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0>
compute-od-gpu-dy-p4d-24xlarge-15:13989:13989 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13989:13989 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13989:13989 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-15:13989:13989 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-15:13991:13991 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13991:13991 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13991:13991 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-15:13991:13991 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-11:13771:13771 [7] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0>
compute-od-gpu-dy-p4d-24xlarge-14:13980:13980 [4] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0>
compute-od-gpu-dy-p4d-24xlarge-12:13921:13921 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13921:13921 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13921:13921 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-12:13921:13921 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-12:13927:13927 [6] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0>
compute-od-gpu-dy-p4d-24xlarge-12:13927:13927 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13927:13927 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13927:13927 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-12:13927:13927 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-14:13977:13977 [1] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0>
compute-od-gpu-dy-p4d-24xlarge-11:13771:13771 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13771:13771 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13771:13771 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-11:13771:13771 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-9:13985:13985 [1] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0>
compute-od-gpu-dy-p4d-24xlarge-14:13980:13980 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-14:13980:13980 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-14:13980:13980 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-14:13980:13980 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-14:13977:13977 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-14:13977:13977 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-14:13977:13977 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-14:13977:13977 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-13:13949:13949 [5] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0>
compute-od-gpu-dy-p4d-24xlarge-9:13985:13985 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13985:13985 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13985:13985 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-12:13926:13926 [5] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0>
compute-od-gpu-dy-p4d-24xlarge-9:13985:13985 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-11:13764:13764 [0] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0>
compute-od-gpu-dy-p4d-24xlarge-12:13926:13926 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13926:13926 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13926:13926 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-12:13926:13926 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-12:13922:13922 [1] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0>
compute-od-gpu-dy-p4d-24xlarge-11:13764:13764 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13764:13764 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13764:13764 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-11:13764:13764 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-10:13989:13989 [7] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0>
compute-od-gpu-dy-p4d-24xlarge-12:13922:13922 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13922:13922 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13922:13922 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-12:13922:13922 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-13:13949:13949 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13949:13949 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13949:13949 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-13:13949:13949 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-14:13979:13979 [3] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0>
compute-od-gpu-dy-p4d-24xlarge-10:13986:13986 [4] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0>
compute-od-gpu-dy-p4d-24xlarge-14:13979:13979 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-14:13979:13979 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-14:13979:13979 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-14:13979:13979 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-12:13923:13923 [2] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0>
compute-od-gpu-dy-p4d-24xlarge-12:13923:13923 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13923:13923 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13923:13923 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-12:13923:13923 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-11:13765:13765 [1] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0>
compute-od-gpu-dy-p4d-24xlarge-9:13987:13987 [3] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0>
compute-od-gpu-dy-p4d-24xlarge-11:13765:13765 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13765:13765 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13765:13765 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-11:13765:13765 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-10:13989:13989 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-10:13989:13989 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-10:13989:13989 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-10:13989:13989 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-10:13986:13986 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-10:13986:13986 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-10:13986:13986 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-10:13986:13986 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-9:13987:13987 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13987:13987 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13987:13987 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-9:13987:13987 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-8:13873:13873 [4] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0>
compute-od-gpu-dy-p4d-24xlarge-8:13873:13873 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13873:13873 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13873:13873 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-8:13873:13873 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-8:13876:13876 [7] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0>
compute-od-gpu-dy-p4d-24xlarge-8:13876:13876 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13876:13876 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13876:13876 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-8:13876:13876 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-8:13871:13871 [2] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0>
compute-od-gpu-dy-p4d-24xlarge-8:13875:13875 [6] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0>
compute-od-gpu-dy-p4d-24xlarge-8:13871:13871 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13871:13871 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13871:13871 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-8:13871:13871 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-8:13875:13875 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13875:13875 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13875:13875 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-8:13875:13875 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-8:13874:13874 [5] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0>
compute-od-gpu-dy-p4d-24xlarge-12:13924:13924 [3] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0>
compute-od-gpu-dy-p4d-24xlarge-8:13874:13874 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13874:13874 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-8:13874:13874 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-8:13874:13874 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-12:13924:13924 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13924:13924 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13924:13924 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-12:13924:13924 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-15:13994:13994 [7] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0>
compute-od-gpu-dy-p4d-24xlarge-12:13925:13925 [4] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0>
compute-od-gpu-dy-p4d-24xlarge-12:13925:13925 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13925:13925 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13925:13925 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-12:13925:13925 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-15:13994:13994 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13994:13994 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13994:13994 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-15:13994:13994 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-11:13769:13769 [5] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0>
compute-od-gpu-dy-p4d-24xlarge-14:13981:13981 [5] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0>
compute-od-gpu-dy-p4d-24xlarge-11:13770:13770 [6] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0>
compute-od-gpu-dy-p4d-24xlarge-11:13767:13767 [3] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0>
compute-od-gpu-dy-p4d-24xlarge-11:13766:13766 [2] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0>
compute-od-gpu-dy-p4d-24xlarge-14:13981:13981 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-14:13981:13981 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-14:13981:13981 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-11:13769:13769 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13769:13769 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13769:13769 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-14:13981:13981 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-11:13769:13769 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-11:13770:13770 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13770:13770 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13770:13770 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-11:13770:13770 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-14:13983:13983 [7] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0>
compute-od-gpu-dy-p4d-24xlarge-11:13767:13767 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13767:13767 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13767:13767 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-11:13767:13767 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-11:13766:13766 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13766:13766 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13766:13766 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-11:13766:13766 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-11:13768:13768 [4] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0>
compute-od-gpu-dy-p4d-24xlarge-12:13928:13928 [7] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0>
compute-od-gpu-dy-p4d-24xlarge-14:13978:13978 [2] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0>
compute-od-gpu-dy-p4d-24xlarge-14:13983:13983 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-14:13983:13983 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-14:13983:13983 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-14:13983:13983 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-9:13984:13984 [0] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0>
compute-od-gpu-dy-p4d-24xlarge-10:13987:13987 [5] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0>
compute-od-gpu-dy-p4d-24xlarge-14:13976:13976 [0] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0>
compute-od-gpu-dy-p4d-24xlarge-11:13768:13768 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13768:13768 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-11:13768:13768 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-11:13768:13768 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-12:13928:13928 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13928:13928 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-12:13928:13928 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-12:13928:13928 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-14:13978:13978 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13987:13987 [0] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0>
compute-od-gpu-dy-p4d-24xlarge-14:13978:13978 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-14:13978:13978 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-14:13978:13978 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-10:13984:13984 [2] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0>
compute-od-gpu-dy-p4d-24xlarge-9:13986:13986 [2] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0>
compute-od-gpu-dy-p4d-24xlarge-15:13993:13993 [6] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0>
compute-od-gpu-dy-p4d-24xlarge-14:13976:13976 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-14:13976:13976 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-14:13976:13976 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-14:13976:13976 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-9:13992:13992 [7] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0>
compute-od-gpu-dy-p4d-24xlarge-10:13987:13987 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13984:13984 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-10:13987:13987 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-10:13987:13987 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-9:13984:13984 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13984:13984 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-10:13987:13987 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-9:13984:13984 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-14:13982:13982 [6] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0>
compute-od-gpu-dy-p4d-24xlarge-15:13987:13987 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13987:13987 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13987:13987 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-15:13987:13987 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-10:13984:13984 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13986:13986 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13988:13988 [4] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0>
compute-od-gpu-dy-p4d-24xlarge-9:13986:13986 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13986:13986 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-10:13984:13984 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-10:13984:13984 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-15:13993:13993 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13993:13993 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13986:13986 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-10:13984:13984 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-15:13993:13993 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-15:13993:13993 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-9:13992:13992 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13992:13992 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13991:13991 [6] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0>
compute-od-gpu-dy-p4d-24xlarge-9:13992:13992 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-9:13992:13992 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-14:13982:13982 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-14:13982:13982 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-14:13982:13982 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-14:13982:13982 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-15:13992:13992 [5] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0>
compute-od-gpu-dy-p4d-24xlarge-9:13988:13988 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13988:13988 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13988:13988 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-9:13988:13988 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-9:13991:13991 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13991:13991 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13991:13991 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-9:13991:13991 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-15:13992:13992 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13992:13992 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13992:13992 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-15:13992:13992 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-13:13951:13951 [7] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0>
compute-od-gpu-dy-p4d-24xlarge-10:13985:13985 [3] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0>
compute-od-gpu-dy-p4d-24xlarge-15:13988:13988 [1] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0>
compute-od-gpu-dy-p4d-24xlarge-13:13951:13951 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13951:13951 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13951:13951 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-13:13951:13951 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-10:13985:13985 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-10:13985:13985 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-10:13985:13985 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-10:13983:13983 [1] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0>
compute-od-gpu-dy-p4d-24xlarge-10:13985:13985 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-15:13990:13990 [3] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0>
compute-od-gpu-dy-p4d-24xlarge-13:13945:13945 [1] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0>
compute-od-gpu-dy-p4d-24xlarge-10:13988:13988 [6] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0>
compute-od-gpu-dy-p4d-24xlarge-15:13988:13988 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13988:13988 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13988:13988 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-15:13988:13988 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-10:13983:13983 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-10:13983:13983 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-10:13983:13983 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-10:13983:13983 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-15:13990:13990 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13990:13990 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-15:13990:13990 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-15:13990:13990 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-13:13945:13945 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13945:13945 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13945:13945 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-13:13945:13945 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-10:13988:13988 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-10:13988:13988 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-10:13988:13988 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-10:13988:13988 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-10:13982:13982 [0] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0>
compute-od-gpu-dy-p4d-24xlarge-9:13989:13989 [5] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0>
compute-od-gpu-dy-p4d-24xlarge-13:13948:13948 [4] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0>
compute-od-gpu-dy-p4d-24xlarge-10:13982:13982 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-10:13982:13982 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-10:13982:13982 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-10:13982:13982 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-9:13989:13989 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13989:13989 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-9:13989:13989 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-9:13989:13989 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-13:13948:13948 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13948:13948 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13948:13948 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-13:13948:13948 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-13:13946:13946 [2] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0>
compute-od-gpu-dy-p4d-24xlarge-13:13950:13950 [6] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0>
compute-od-gpu-dy-p4d-24xlarge-13:13946:13946 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13946:13946 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13946:13946 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-13:13946:13946 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-13:13950:13950 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13950:13950 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13950:13950 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-13:13950:13950 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-13:13944:13944 [0] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0>
compute-od-gpu-dy-p4d-24xlarge-13:13944:13944 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13944:13944 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13944:13944 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-13:13944:13944 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-13:13947:13947 [3] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0>
compute-od-gpu-dy-p4d-24xlarge-13:13947:13947 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13947:13947 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.
compute-od-gpu-dy-p4d-24xlarge-13:13947:13947 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws
compute-od-gpu-dy-p4d-24xlarge-13:13947:13947 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml
compute-od-gpu-dy-p4d-24xlarge-8:13872:13872 [3] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-8:13872:13872 [3] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-8:13870:13870 [1] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-8:13870:13870 [1] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-8:13873:13873 [4] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-8:13873:13873 [4] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-8:13876:13876 [7] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-8:13876:13876 [7] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-8:13871:13871 [2] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-8:13871:13871 [2] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-8:13874:13874 [5] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-8:13874:13874 [5] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-8:13875:13875 [6] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-8:13875:13875 [6] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-13:13948:13948 [4] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-13:13945:13945 [1] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-13:13947:13947 [3] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-13:13944:13944 [0] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-13:13951:13951 [7] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-13:13946:13946 [2] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-13:13950:13950 [6] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-13:13949:13949 [5] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-13:13945:13945 [1] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-13:13944:13944 [0] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-13:13947:13947 [3] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-13:13948:13948 [4] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-13:13951:13951 [7] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-13:13946:13946 [2] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-13:13950:13950 [6] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-13:13949:13949 [5] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-10:13984:13984 [2] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-10:13988:13988 [6] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-10:13986:13986 [4] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-10:13987:13987 [5] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-10:13983:13983 [1] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-10:13985:13985 [3] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-10:13982:13982 [0] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-10:13988:13988 [6] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-10:13984:13984 [2] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-10:13986:13986 [4] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-10:13987:13987 [5] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-10:13983:13983 [1] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-10:13985:13985 [3] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-10:13982:13982 [0] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-10:13989:13989 [7] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-10:13989:13989 [7] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-12:13924:13924 [3] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-12:13924:13924 [3] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-12:13928:13928 [7] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-12:13923:13923 [2] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-12:13928:13928 [7] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-12:13922:13922 [1] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-12:13923:13923 [2] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-12:13922:13922 [1] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-12:13926:13926 [5] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-12:13927:13927 [6] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-12:13921:13921 [0] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-12:13925:13925 [4] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-12:13926:13926 [5] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-12:13927:13927 [6] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-12:13921:13921 [0] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-12:13925:13925 [4] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-9:13988:13988 [4] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-9:13986:13986 [2] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-9:13988:13988 [4] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-9:13986:13986 [2] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-9:13992:13992 [7] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-9:13992:13992 [7] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-9:13991:13991 [6] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-9:13987:13987 [3] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-9:13989:13989 [5] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-9:13991:13991 [6] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-9:13985:13985 [1] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-9:13987:13987 [3] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-9:13989:13989 [5] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-9:13985:13985 [1] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-9:13984:13984 [0] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-9:13984:13984 [0] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-14:13978:13978 [2] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-14:13977:13977 [1] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-14:13983:13983 [7] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-14:13982:13982 [6] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-14:13980:13980 [4] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-14:13978:13978 [2] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-14:13979:13979 [3] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-14:13977:13977 [1] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-14:13982:13982 [6] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-14:13976:13976 [0] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-14:13980:13980 [4] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-14:13983:13983 [7] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-14:13979:13979 [3] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-14:13976:13976 [0] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-14:13981:13981 [5] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-14:13981:13981 [5] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-11:13765:13765 [1] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-11:13768:13768 [4] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-11:13771:13771 [7] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-11:13769:13769 [5] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-11:13767:13767 [3] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-11:13766:13766 [2] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-11:13769:13769 [5] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-11:13765:13765 [1] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-11:13771:13771 [7] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-11:13768:13768 [4] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-11:13767:13767 [3] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-11:13766:13766 [2] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-11:13764:13764 [0] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-11:13770:13770 [6] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-11:13764:13764 [0] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-11:13770:13770 [6] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-15:13990:13990 [3] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-15:13993:13993 [6] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-15:13989:13989 [2] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-15:13994:13994 [7] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-15:13990:13990 [3] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-15:13993:13993 [6] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-15:13989:13989 [2] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-15:13994:13994 [7] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-15:13988:13988 [1] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-15:13991:13991 [4] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-15:13988:13988 [1] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-15:13987:13987 [0] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-15:13992:13992 [5] NCCL INFO NET/OFI Selected Provider is efa
compute-od-gpu-dy-p4d-24xlarge-15:13987:13987 [0] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-15:13991:13991 [4] NCCL INFO Using network AWS Libfabric
compute-od-gpu-dy-p4d-24xlarge-15:13992:13992 [5] NCCL INFO Using network AWS Libfabric
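At this point all 64 ranks (8 GPUs on each of the 8 nodes) report the same pair of lines: the OFI plugin selected the efa libfabric provider, and NCCL will carry inter-node traffic over "AWS Libfabric" rather than falling back to TCP sockets. If a rank ever comes up on NET/Socket instead, the first thing worth checking is whether the EFA provider is visible to libfabric at all; a minimal preflight sketch (hypothetical helper, not part of this job) using the fi_info utility that ships with libfabric:

```python
# Preflight sanity check: ask libfabric whether the EFA provider is present.
# fi_info ships with libfabric; a non-zero exit means no matching provider.
import subprocess

result = subprocess.run(["fi_info", "-p", "efa"],
                        capture_output=True, text=True)
if result.returncode != 0:
    raise SystemExit(f"EFA provider not found: {result.stderr.strip()}")

# The first stanza of fi_info output names the provider, e.g. "provider: efa".
print(result.stdout.splitlines()[0])
```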
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Setting affinity for GPU 0 to ffffff
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Setting affinity for GPU 1 to ffffff
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Setting affinity for GPU 0 to ffffff
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Setting affinity for GPU 1 to ffffff
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Setting affinity for GPU 3 to ffffff
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Setting affinity for GPU 2 to ffffff
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Setting affinity for GPU 1 to ffffff
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Setting affinity for GPU 3 to ffffff
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Setting affinity for GPU 3 to ffffff
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Setting affinity for GPU 2 to ffffff
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Setting affinity for GPU 0 to ffffff
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Setting affinity for GPU 2 to ffffff
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Setting affinity for GPU 1 to ffffff
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Setting affinity for GPU 2 to ffffff
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Setting affinity for GPU 0 to ffffff
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Setting affinity for GPU 3 to ffffff
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Setting affinity for GPU 2 to ffffff
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Setting affinity for GPU 1 to ffffff
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Setting affinity for GPU 0 to ffffff
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Setting affinity for GPU 3 to ffffff
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Setting affinity for GPU 3 to ffffff
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Setting affinity for GPU 1 to ffffff
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Setting affinity for GPU 3 to ffffff
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Setting affinity for GPU 1 to ffffff
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Setting affinity for GPU 2 to ffffff
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Setting affinity for GPU 0 to ffffff
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Setting affinity for GPU 2 to ffffff
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Setting affinity for GPU 0 to ffffff
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Setting affinity for GPU 2 to ffffff
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Setting affinity for GPU 3 to ffffff
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Setting affinity for GPU 1 to ffffff
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Setting affinity for GPU 0 to ffffff
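The affinity masks are Linux cpumask strings: comma-separated 32-bit hex words, most significant word first. Decoded, "ffffff" is CPUs 0-23 and "ffff,ff000000" is CPUs 24-47, so GPUs 0-3 on every node are pinned to the first CPU socket and GPUs 4-7 to the second, matching the p4d topology file loaded earlier. A small decoder (illustrative only) makes the format concrete:

```python
# Decode NCCL's cpumask strings: comma-separated 32-bit hex words,
# most significant word first, one bit per logical CPU.
def decode_cpumask(mask: str) -> list[int]:
    value = 0
    for word in mask.split(","):
        value = (value << 32) | int(word, 16)
    return [bit for bit in range(value.bit_length()) if (value >> bit) & 1]

print(decode_cpumask("ffffff"))         # CPUs 0-23  -> GPUs 0-3
print(decode_cpumask("ffff,ff000000"))  # CPUs 24-47 -> GPUs 4-7
```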
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Trees [0] 41/-1/-1->40->49 [1] 41/-1/-1->40->47 [2] 41/-1/-1->40->47 [3] 41/-1/-1->40->47 [4] 41/32/-1->40->25 [5] 41/-1/-1->40->47 [6] 41/-1/-1->40->47 [7] 41/-1/-1->40->47
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Trees [0] 42/-1/-1->41->40 [1] -1/-1/-1->41->40 [2] 42/-1/-1->41->40 [3] 42/-1/-1->41->40 [4] 42/48/-1->41->40 [5] -1/-1/-1->41->40 [6] 42/-1/-1->41->40 [7] 42/-1/-1->41->40
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Trees [0] 43/-1/-1->42->41 [1] 43/-1/-1->42->51 [2] 43/-1/-1->42->41 [3] 43/-1/-1->42->41 [4] 43/-1/-1->42->41 [5] 43/34/-1->42->27 [6] 43/-1/-1->42->41 [7] 43/-1/-1->42->41
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Trees [0] 39/-1/-1->38->37 [1] 39/-1/-1->38->37 [2] 39/-1/-1->38->37 [3] 39/54/-1->38->6 [4] 39/-1/-1->38->37 [5] 39/-1/-1->38->37 [6] 39/-1/-1->38->37 [7] 39/-1/-1->38->46
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Trees [0] -1/-1/-1->39->38 [1] 32/-1/-1->39->38 [2] 32/-1/-1->39->38 [3] 32/22/-1->39->38 [4] -1/-1/-1->39->38 [5] 32/-1/-1->39->38 [6] 32/-1/-1->39->38 [7] 32/-1/-1->39->38
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Trees [0] 38/-1/-1->37->36 [1] 38/-1/-1->37->36 [2] 38/20/-1->37->36 [3] -1/-1/-1->37->36 [4] 38/-1/-1->37->36 [5] 38/-1/-1->37->36 [6] 38/-1/-1->37->36 [7] -1/-1/-1->37->36
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Trees [0] 37/-1/-1->36->35 [1] 37/-1/-1->36->35 [2] 37/52/-1->36->4 [3] 37/-1/-1->36->35 [4] 37/-1/-1->36->35 [5] 37/-1/-1->36->35 [6] 37/-1/-1->36->44 [7] 37/-1/-1->36->35
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Trees [0] 36/-1/-1->35->34 [1] 36/18/-1->35->34 [2] -1/-1/-1->35->34 [3] 36/-1/-1->35->34 [4] 36/-1/-1->35->34 [5] 36/-1/-1->35->34 [6] -1/-1/-1->35->34 [7] 36/-1/-1->35->34
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Trees [0] 44/-1/-1->43->42 [1] 44/-1/-1->43->42 [2] -1/-1/-1->43->42 [3] 44/-1/-1->43->42 [4] 44/-1/-1->43->42 [5] 44/50/-1->43->42 [6] -1/-1/-1->43->42 [7] 44/-1/-1->43->42
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Trees [0] 35/-1/-1->34->33 [1] 35/50/-1->34->2 [2] 35/-1/-1->34->33 [3] 35/-1/-1->34->33 [4] 35/-1/-1->34->33 [5] 35/-1/-1->34->42 [6] 35/-1/-1->34->33 [7] 35/-1/-1->34->33
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Trees [0] -1/-1/-1->31->30 [1] 24/-1/-1->31->30 [2] 24/-1/-1->31->30 [3] 24/-1/-1->31->30 [4] -1/-1/-1->31->30 [5] 24/-1/-1->31->30 [6] 24/-1/-1->31->30 [7] 24/46/-1->31->30
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Trees [0] 31/-1/-1->30->29 [1] 31/-1/-1->30->29 [2] 31/-1/-1->30->29 [3] 31/-1/-1->30->22 [4] 31/-1/-1->30->29 [5] 31/-1/-1->30->29 [6] 31/-1/-1->30->29 [7] 31/14/-1->30->62
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Trees [0] 46/-1/-1->45->44 [1] 46/-1/-1->45->44 [2] 46/-1/-1->45->44 [3] -1/-1/-1->45->44 [4] 46/-1/-1->45->44 [5] 46/-1/-1->45->44 [6] 46/52/-1->45->44 [7] -1/-1/-1->45->44
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Trees [0] -1/-1/-1->47->46 [1] 40/-1/-1->47->46 [2] 40/-1/-1->47->46 [3] 40/-1/-1->47->46 [4] -1/-1/-1->47->46 [5] 40/-1/-1->47->46 [6] 40/-1/-1->47->46 [7] 40/54/-1->47->46
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Trees [0] 45/-1/-1->44->43 [1] 45/-1/-1->44->43 [2] 45/-1/-1->44->53 [3] 45/-1/-1->44->43 [4] 45/-1/-1->44->43 [5] 45/-1/-1->44->43 [6] 45/36/-1->44->29 [7] 45/-1/-1->44->43
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Trees [0] 34/16/-1->33->32 [1] -1/-1/-1->33->32 [2] 34/-1/-1->33->32 [3] 34/-1/-1->33->32 [4] 34/-1/-1->33->32 [5] -1/-1/-1->33->32 [6] 34/-1/-1->33->32 [7] 34/-1/-1->33->32
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Trees [0] 33/48/-1->32->0 [1] 33/-1/-1->32->39 [2] 33/-1/-1->32->39 [3] 33/-1/-1->32->39 [4] 33/-1/-1->32->40 [5] 33/-1/-1->32->39 [6] 33/-1/-1->32->39 [7] 33/-1/-1->32->39
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Trees [0] 30/-1/-1->29->28 [1] 30/-1/-1->29->28 [2] 30/-1/-1->29->28 [3] -1/-1/-1->29->28 [4] 30/-1/-1->29->28 [5] 30/-1/-1->29->28 [6] 30/44/-1->29->28 [7] -1/-1/-1->29->28
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Trees [0] 26/-1/-1->25->24 [1] -1/-1/-1->25->24 [2] 26/-1/-1->25->24 [3] 26/-1/-1->25->24 [4] 26/40/-1->25->24 [5] -1/-1/-1->25->24 [6] 26/-1/-1->25->24 [7] 26/-1/-1->25->24
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Trees [0] 47/-1/-1->46->45 [1] 47/-1/-1->46->45 [2] 47/-1/-1->46->45 [3] 47/-1/-1->46->55 [4] 47/-1/-1->46->45 [5] 47/-1/-1->46->45 [6] 47/-1/-1->46->45 [7] 47/38/-1->46->31
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Trees [0] 51/-1/-1->50->49 [1] 51/58/-1->50->34 [2] 51/-1/-1->50->49 [3] 51/-1/-1->50->49 [4] 51/-1/-1->50->49 [5] 51/-1/-1->50->43 [6] 51/-1/-1->50->49 [7] 51/-1/-1->50->49
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 8/-1/-1->15->14 [2] 8/-1/-1->15->14 [3] 8/-1/-1->15->14 [4] -1/-1/-1->15->14 [5] 8/-1/-1->15->14 [6] 8/-1/-1->15->14 [7] 8/22/-1->15->14
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 15/-1/-1->14->13 [3] 15/-1/-1->14->23 [4] 15/-1/-1->14->13 [5] 15/-1/-1->14->13 [6] 15/-1/-1->14->13 [7] 15/6/-1->14->30
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Trees [0] -1/-1/-1->23->22 [1] 16/-1/-1->23->22 [2] 16/-1/-1->23->22 [3] 16/14/-1->23->22 [4] -1/-1/-1->23->22 [5] 16/-1/-1->23->22 [6] 16/-1/-1->23->22 [7] 16/-1/-1->23->22
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Trees [0] 23/-1/-1->22->21 [1] 23/-1/-1->22->21 [2] 23/-1/-1->22->21 [3] 23/30/-1->22->39 [4] 23/-1/-1->22->21 [5] 23/-1/-1->22->21 [6] 23/-1/-1->22->21 [7] 23/-1/-1->22->15
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Trees [0] 50/40/-1->49->48 [1] -1/-1/-1->49->48 [2] 50/-1/-1->49->48 [3] 50/-1/-1->49->48 [4] 50/-1/-1->49->48 [5] -1/-1/-1->49->48 [6] 50/-1/-1->49->48 [7] 50/-1/-1->49->48
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Trees [0] 49/56/-1->48->32 [1] 49/-1/-1->48->55 [2] 49/-1/-1->48->55 [3] 49/-1/-1->48->55 [4] 49/-1/-1->48->41 [5] 49/-1/-1->48->55 [6] 49/-1/-1->48->55 [7] 49/-1/-1->48->55
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Trees [0] 25/-1/-1->24->16 [1] 25/-1/-1->24->31 [2] 25/-1/-1->24->31 [3] 25/-1/-1->24->31 [4] 25/8/-1->24->56 [5] 25/-1/-1->24->31 [6] 25/-1/-1->24->31 [7] 25/-1/-1->24->31
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->12 [2] 14/-1/-1->13->12 [3] -1/-1/-1->13->12 [4] 14/-1/-1->13->12 [5] 14/-1/-1->13->12 [6] 14/20/-1->13->12 [7] -1/-1/-1->13->12
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Trees [0] 22/-1/-1->21->20 [1] 22/-1/-1->21->20 [2] 22/12/-1->21->20 [3] -1/-1/-1->21->20 [4] 22/-1/-1->21->20 [5] 22/-1/-1->21->20 [6] 22/-1/-1->21->20 [7] -1/-1/-1->21->20
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Trees [0] 21/-1/-1->20->19 [1] 21/-1/-1->20->19 [2] 21/28/-1->20->37 [3] 21/-1/-1->20->19 [4] 21/-1/-1->20->19 [5] 21/-1/-1->20->19 [6] 21/-1/-1->20->13 [7] 21/-1/-1->20->19
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Trees [0] 19/-1/-1->18->17 [1] 19/26/-1->18->35 [2] 19/-1/-1->18->17 [3] 19/-1/-1->18->17 [4] 19/-1/-1->18->17 [5] 19/-1/-1->18->11 [6] 19/-1/-1->18->17 [7] 19/-1/-1->18->17
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Trees [0] 20/-1/-1->19->18 [1] 20/10/-1->19->18 [2] -1/-1/-1->19->18 [3] 20/-1/-1->19->18 [4] 20/-1/-1->19->18 [5] 20/-1/-1->19->18 [6] -1/-1/-1->19->18 [7] 20/-1/-1->19->18
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] 0/-1/-1->7->6 [2] 0/-1/-1->7->6 [3] 0/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] 0/-1/-1->7->6 [6] 0/-1/-1->7->6 [7] 0/-1/-1->7->6
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Trees [0] 27/-1/-1->26->25 [1] 27/-1/-1->26->18 [2] 27/-1/-1->26->25 [3] 27/-1/-1->26->25 [4] 27/-1/-1->26->25 [5] 27/10/-1->26->58 [6] 27/-1/-1->26->25 [7] 27/-1/-1->26->25
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Trees [0] 29/-1/-1->28->27 [1] 29/-1/-1->28->27 [2] 29/-1/-1->28->20 [3] 29/-1/-1->28->27 [4] 29/-1/-1->28->27 [5] 29/-1/-1->28->27 [6] 29/12/-1->28->60 [7] 29/-1/-1->28->27
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Trees [0] 28/-1/-1->27->26 [1] 28/-1/-1->27->26 [2] -1/-1/-1->27->26 [3] 28/-1/-1->27->26 [4] 28/-1/-1->27->26 [5] 28/42/-1->27->26 [6] -1/-1/-1->27->26 [7] 28/-1/-1->27->26
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Trees [0] 13/-1/-1->12->11 [1] 13/-1/-1->12->11 [2] 13/-1/-1->12->21 [3] 13/-1/-1->12->11 [4] 13/-1/-1->12->11 [5] 13/-1/-1->12->11 [6] 13/4/-1->12->28 [7] 13/-1/-1->12->11
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Trees [0] 12/-1/-1->11->10 [1] 12/-1/-1->11->10 [2] -1/-1/-1->11->10 [3] 12/-1/-1->11->10 [4] 12/-1/-1->11->10 [5] 12/18/-1->11->10 [6] -1/-1/-1->11->10 [7] 12/-1/-1->11->10
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Trees [0] 17/24/-1->16->33 [1] 17/-1/-1->16->23 [2] 17/-1/-1->16->23 [3] 17/-1/-1->16->23 [4] 17/-1/-1->16->9 [5] 17/-1/-1->16->23 [6] 17/-1/-1->16->23 [7] 17/-1/-1->16->23
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/38/-1->6->-1 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->14
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] -1/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] -1/-1/-1->5->4
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Trees [0] -1/-1/-1->55->54 [1] 48/-1/-1->55->54 [2] 48/-1/-1->55->54 [3] 48/46/-1->55->54 [4] -1/-1/-1->55->54 [5] 48/-1/-1->55->54 [6] 48/-1/-1->55->54 [7] 48/-1/-1->55->54
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Trees [0] 55/-1/-1->54->53 [1] 55/-1/-1->54->53 [2] 55/-1/-1->54->53 [3] 55/62/-1->54->38 [4] 55/-1/-1->54->53 [5] 55/-1/-1->54->53 [6] 55/-1/-1->54->53 [7] 55/-1/-1->54->47
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->19 [2] 11/-1/-1->10->9 [3] 11/-1/-1->10->9 [4] 11/-1/-1->10->9 [5] 11/2/-1->10->26 [6] 11/-1/-1->10->9 [7] 11/-1/-1->10->9
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] -1/-1/-1->9->8 [2] 10/-1/-1->9->8 [3] 10/-1/-1->9->8 [4] 10/16/-1->9->8 [5] -1/-1/-1->9->8 [6] 10/-1/-1->9->8 [7] 10/-1/-1->9->8
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Trees [0] 18/8/-1->17->16 [1] -1/-1/-1->17->16 [2] 18/-1/-1->17->16 [3] 18/-1/-1->17->16 [4] 18/-1/-1->17->16 [5] -1/-1/-1->17->16 [6] 18/-1/-1->17->16 [7] 18/-1/-1->17->16
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/36/-1->4->-1 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->12 [7] 5/-1/-1->4->3
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Trees [0] 54/-1/-1->53->52 [1] 54/-1/-1->53->52 [2] 54/44/-1->53->52 [3] -1/-1/-1->53->52 [4] 54/-1/-1->53->52 [5] 54/-1/-1->53->52 [6] 54/-1/-1->53->52 [7] -1/-1/-1->53->52
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Trees [0] 52/-1/-1->51->50 [1] 52/42/-1->51->50 [2] -1/-1/-1->51->50 [3] 52/-1/-1->51->50 [4] 52/-1/-1->51->50 [5] 52/-1/-1->51->50 [6] -1/-1/-1->51->50 [7] 52/-1/-1->51->50
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Trees [0] -1/-1/-1->63->62 [1] 56/-1/-1->63->62 [2] 56/-1/-1->63->62 [3] 56/-1/-1->63->62 [4] -1/-1/-1->63->62 [5] 56/-1/-1->63->62 [6] 56/-1/-1->63->62 [7] 56/-1/-1->63->62
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Trees [0] 58/-1/-1->57->56 [1] -1/-1/-1->57->56 [2] 58/-1/-1->57->56 [3] 58/-1/-1->57->56 [4] 58/-1/-1->57->56 [5] -1/-1/-1->57->56 [6] 58/-1/-1->57->56 [7] 58/-1/-1->57->56
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Trees [0] 57/-1/-1->56->48 [1] 57/-1/-1->56->63 [2] 57/-1/-1->56->63 [3] 57/-1/-1->56->63 [4] 57/24/-1->56->-1 [5] 57/-1/-1->56->63 [6] 57/-1/-1->56->63 [7] 57/-1/-1->56->63
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Trees [0] 59/-1/-1->58->57 [1] 59/-1/-1->58->50 [2] 59/-1/-1->58->57 [3] 59/-1/-1->58->57 [4] 59/-1/-1->58->57 [5] 59/26/-1->58->-1 [6] 59/-1/-1->58->57 [7] 59/-1/-1->58->57
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Trees [0] 9/-1/-1->8->17 [1] 9/-1/-1->8->15 [2] 9/-1/-1->8->15 [3] 9/-1/-1->8->15 [4] 9/0/-1->8->24 [5] 9/-1/-1->8->15 [6] 9/-1/-1->8->15 [7] 9/-1/-1->8->15
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/34/-1->2->-1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->10 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] 4/-1/-1->3->2
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Trees [0] 53/-1/-1->52->51 [1] 53/-1/-1->52->51 [2] 53/60/-1->52->36 [3] 53/-1/-1->52->51 [4] 53/-1/-1->52->51 [5] 53/-1/-1->52->51 [6] 53/-1/-1->52->45 [7] 53/-1/-1->52->51
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Trees [0] 62/-1/-1->61->60 [1] 62/-1/-1->61->60 [2] 62/-1/-1->61->60 [3] -1/-1/-1->61->60 [4] 62/-1/-1->61->60 [5] 62/-1/-1->61->60 [6] 62/-1/-1->61->60 [7] -1/-1/-1->61->60
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Trees [0] 63/-1/-1->62->61 [1] 63/-1/-1->62->61 [2] 63/-1/-1->62->61 [3] 63/-1/-1->62->54 [4] 63/-1/-1->62->61 [5] 63/-1/-1->62->61 [6] 63/-1/-1->62->61 [7] 63/30/-1->62->-1
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 00/08 : 0 7 6 5 4 3 2 1 8 15 14 13 12 11 10 9 16 23 22 21
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Trees [0] 61/-1/-1->60->59 [1] 61/-1/-1->60->59 [2] 61/-1/-1->60->52 [3] 61/-1/-1->60->59 [4] 61/-1/-1->60->59 [5] 61/-1/-1->60->59 [6] 61/28/-1->60->-1 [7] 61/-1/-1->60->59
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 01/08 : 0 3 10 15 14 13 12 9 8 11 18 23 22 21 20 17 16 19 26 31
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Trees [0] 60/-1/-1->59->58 [1] 60/-1/-1->59->58 [2] -1/-1/-1->59->58 [3] 60/-1/-1->59->58 [4] 60/-1/-1->59->58 [5] 60/-1/-1->59->58 [6] -1/-1/-1->59->58 [7] 60/-1/-1->59->58
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 02/08 : 0 7 6 5 12 11 10 9 8 15 14 13 20 19 18 17 16 23 22 21
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] -1/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] -1/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 03/08 : 0 5 4 7 14 11 10 9 8 13 12 15 22 19 18 17 16 21 20 23
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 04/08 : 0 7 6 5 4 3 2 1 8 15 14 13 12 11 10 9 16 23 22 21
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 05/08 : 0 3 10 15 14 13 12 9 8 11 18 23 22 21 20 17 16 19 26 31
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 06/08 : 0 7 6 5 12 11 10 9 8 15 14 13 20 19 18 17 16 23 22 21
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 07/08 : 0 5 4 7 14 11 10 9 8 13 12 15 22 19 18 17 16 21 20 23
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Trees [0] 1/32/-1->0->-1 [1] 1/-1/-1->0->7 [2] 1/-1/-1->0->7 [3] 1/-1/-1->0->7 [4] 1/-1/-1->0->8 [5] 1/-1/-1->0->7 [6] 1/-1/-1->0->7 [7] 1/-1/-1->0->7
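With the network up, NCCL prints its communication graph. Ranks are global (0-63 here), assigned host-by-host, so rank = node_index * 8 + local GPU index. Each "Trees [c] a/b/d->r->p" entry gives, for channel c, rank r's up-to-three children (a/b/d, -1 meaning none) and its parent p (-1 marking a tree root), while rank 0's "Channel XX/08" lines list the ring order for each of the 8 channels. In the connection lines that follow, "via P2P/IPC/read" is an intra-node hop over CUDA IPC, and "[send]/[receive] via NET/AWS Libfabric/N/GDRDMA" is an inter-node hop over the plugin's network device N (a p4d exposes four EFA adapters) using GPUDirect RDMA. A tiny helper (hypothetical, with the node ordering read off this log) maps a global rank back to its host and GPU:

```python
# Hypothetical helper: recover (host, local GPU) from a global NCCL rank,
# assuming 8 GPUs per node and the node ordering observed in this log
# (compute-od-gpu-dy-p4d-24xlarge-8 holds ranks 0-7, ..., -15 holds 56-63).
HOSTS = [f"compute-od-gpu-dy-p4d-24xlarge-{n}" for n in range(8, 16)]
GPUS_PER_NODE = 8

def locate(rank: int) -> tuple[str, int]:
    return HOSTS[rank // GPUS_PER_NODE], rank % GPUS_PER_NODE

print(locate(40))  # ('compute-od-gpu-dy-p4d-24xlarge-13', 0), matching the log
```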
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 01 : 40[101c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 01 : 50[201c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 01 : 42[201c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 03 : 36[901c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 03 : 20[901c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 01 : 56[101c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 01 : 48[101c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 01 : 58[201c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 03 : 4[901c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 01 : 24[101c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 01 : 18[201c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 03 : 12[901c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 03 : 44[901c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 01 : 2[201c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 01 : 34[201c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 01 : 26[201c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 01 : 10[201c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 05 : 40[101c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 03 : 60[901c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 03 : 28[901c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 01 : 32[101c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 01 : 0[101c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 01 : 16[101c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 05 : 50[201c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 03 : 52[901c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 00/0 : 41[101d0] -> 48[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 05 : 42[201c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 04/0 : 41[101d0] -> 48[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 01 : 8[101c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 00/0 : 49[101d0] -> 56[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 05 : 48[101c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 04/0 : 49[101d0] -> 56[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 05 : 56[101c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 07 : 36[901c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 07 : 20[901c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 05 : 58[201c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 07 : 4[901c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 00/0 : 25[101d0] -> 32[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 05 : 24[101c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 04/0 : 25[101d0] -> 32[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 07 : 44[901c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 00/0 : 57[101d0] -> 0[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 04/0 : 57[101d0] -> 0[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 05 : 18[201c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 05 : 34[201c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 05 : 2[201c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 05 : 26[201c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 07 : 12[901c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 07 : 28[901c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 05 : 10[201c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 07 : 60[901c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 05 : 32[101c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 05 : 0[101c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 00/0 : 1[101d0] -> 8[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 04/0 : 1[101d0] -> 8[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 05 : 16[101c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 07 : 52[901c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 00/0 : 33[101d0] -> 40[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 04/0 : 33[101d0] -> 40[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 00/0 : 17[101d0] -> 24[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 04/0 : 17[101d0] -> 24[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 05 : 8[101c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 00/0 : 9[101d0] -> 16[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 04/0 : 9[101d0] -> 16[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 03 : 48[101c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 03 : 40[101c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 03 : 24[101c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 07 : 40[101c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 03 : 32[101c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 03 : 16[101c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 07 : 48[101c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 03 : 0[101c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 03 : 8[101c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 07 : 24[101c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 07 : 32[101c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 07 : 0[101c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 01/0 : 51[201d0] -> 58[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 07 : 8[101c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 07 : 16[101c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 05/0 : 51[201d0] -> 58[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 01/0 : 43[201d0] -> 50[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 05/0 : 43[201d0] -> 50[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 01/0 : 27[201d0] -> 34[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 05/0 : 27[201d0] -> 34[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 03 : 56[101c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 01/0 : 35[201d0] -> 42[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 05/0 : 35[201d0] -> 42[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 01/0 : 3[201d0] -> 10[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 07 : 56[101c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 05/0 : 3[201d0] -> 10[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 01/0 : 11[201d0] -> 18[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 05/0 : 11[201d0] -> 18[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 01/0 : 19[201d0] -> 26[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 05/0 : 19[201d0] -> 26[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 03/0 : 23[a01d0] -> 30[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 07/0 : 23[a01d0] -> 30[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 03/0 : 63[a01d0] -> 6[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 07/0 : 63[a01d0] -> 6[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 03/0 : 31[a01d0] -> 38[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 07/0 : 31[a01d0] -> 38[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 03/0 : 7[a01d0] -> 14[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 07/0 : 7[a01d0] -> 14[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 03/0 : 47[a01d0] -> 54[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 07/0 : 47[a01d0] -> 54[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 01/0 : 59[201d0] -> 2[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 03/0 : 15[a01d0] -> 22[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 05/0 : 59[201d0] -> 2[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 07/0 : 15[a01d0] -> 22[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 03/0 : 39[a01d0] -> 46[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 07/0 : 39[a01d0] -> 46[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 03/0 : 55[a01d0] -> 62[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 07/0 : 55[a01d0] -> 62[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 02/0 : 45[901d0] -> 52[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 06/0 : 45[901d0] -> 52[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 02/0 : 53[901d0] -> 60[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 02/0 : 29[901d0] -> 36[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 06/0 : 29[901d0] -> 36[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 02/0 : 37[901d0] -> 44[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 06/0 : 37[901d0] -> 44[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 02/0 : 13[901d0] -> 20[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 06/0 : 13[901d0] -> 20[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 02/0 : 5[901d0] -> 12[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 06/0 : 5[901d0] -> 12[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 02/0 : 21[901d0] -> 28[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 06/0 : 21[901d0] -> 28[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 03/0 : 47[a01d0] -> 54[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 07/0 : 47[a01d0] -> 54[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 03/0 : 55[a01d0] -> 62[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 03/0 : 31[a01d0] -> 38[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 07/0 : 31[a01d0] -> 38[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 03/0 : 23[a01d0] -> 30[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 03/0 : 15[a01d0] -> 22[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 07/0 : 15[a01d0] -> 22[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 02/0 : 61[901d0] -> 4[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 02/0 : 29[901d0] -> 36[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 02/0 : 13[901d0] -> 20[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 02/0 : 61[901d0] -> 4[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 02/0 : 37[901d0] -> 44[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 02/0 : 5[901d0] -> 12[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 02/0 : 45[901d0] -> 52[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 02/0 : 21[901d0] -> 28[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 02/0 : 53[901d0] -> 60[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 00/0 : 33[101d0] -> 40[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 01/0 : 27[201d0] -> 34[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 00/0 : 17[101d0] -> 24[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 01/0 : 59[201d0] -> 2[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 01/0 : 3[201d0] -> 10[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 00/0 : 41[101d0] -> 48[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 01/0 : 11[201d0] -> 18[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 05/0 : 11[201d0] -> 18[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 01/0 : 35[201d0] -> 42[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 01/0 : 43[201d0] -> 50[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 00/0 : 57[101d0] -> 0[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 00/0 : 25[101d0] -> 32[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 01/0 : 19[201d0] -> 26[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 00/0 : 9[101d0] -> 16[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 05/0 : 19[201d0] -> 26[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 00/0 : 1[101d0] -> 8[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 04/0 : 1[101d0] -> 8[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 00 : 8[101c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 01/0 : 51[201d0] -> 58[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 02 : 8[101c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 04 : 8[101c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 00/0 : 49[101d0] -> 56[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 06 : 8[101c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 06/0 : 53[901d0] -> 60[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 03/0 : 39[a01d0] -> 46[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 07/0 : 39[a01d0] -> 46[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 03/0 : 7[a01d0] -> 14[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 07/0 : 7[a01d0] -> 14[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 07/0 : 23[a01d0] -> 30[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 06/0 : 37[901d0] -> 44[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 06/0 : 61[901d0] -> 4[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 07/0 : 55[a01d0] -> 62[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 06/0 : 5[901d0] -> 12[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 03/0 : 63[a01d0] -> 6[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 07/0 : 63[a01d0] -> 6[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 06/0 : 45[901d0] -> 52[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 06/0 : 29[901d0] -> 36[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 06/0 : 61[901d0] -> 4[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 06/0 : 13[901d0] -> 20[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 04/0 : 33[101d0] -> 40[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 05/0 : 35[201d0] -> 42[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 00 : 40[101c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 06/0 : 21[901d0] -> 28[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 04/0 : 17[101d0] -> 24[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 00 : 24[101c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 06/0 : 53[901d0] -> 60[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 05/0 : 51[201d0] -> 58[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 02 : 40[101c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 02 : 24[101c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 04 : 40[101c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 04/0 : 25[101d0] -> 32[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 05/0 : 27[201d0] -> 34[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 00 : 32[101c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 04 : 24[101c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 06 : 40[101c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 06 : 24[101c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 02 : 32[101c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 05/0 : 3[201d0] -> 10[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 04 : 32[101c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 06 : 32[101c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 04/0 : 41[101d0] -> 48[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 05/0 : 43[201d0] -> 50[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 00 : 48[101c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 02 : 48[101c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 04 : 48[101c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 04/0 : 57[101d0] -> 0[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 05/0 : 59[201d0] -> 2[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 00 : 0[101c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 06 : 48[101c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 02 : 0[101c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 04/0 : 9[101d0] -> 16[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 00 : 16[101c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 04 : 0[101c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 06 : 0[101c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 02 : 16[101c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 04 : 16[101c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 04/0 : 49[101d0] -> 56[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 06 : 16[101c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 00 : 56[101c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 02 : 56[101c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 04 : 56[101c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 06 : 56[101c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 00 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 01 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 02 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 04 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 05 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 06 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 03 : 22[a01c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 07 : 22[a01c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 00 : 18[201c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 00 : 26[201c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 02 : 18[201c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 02 : 26[201c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 03 : 26[201c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 04 : 26[201c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 06 : 26[201c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 07 : 26[201c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 03 : 18[201c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 04 : 18[201c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 06 : 18[201c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 07 : 18[201c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 00 : 37[901d0] -> 36[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 00 : 5[901d0] -> 4[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 01 : 5[901d0] -> 4[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 00 : 19[201d0] -> 18[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 03 : 5[901d0] -> 4[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 04 : 5[901d0] -> 4[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 01 : 37[901d0] -> 36[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 05 : 5[901d0] -> 4[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 02 : 19[201d0] -> 18[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 07 : 5[901d0] -> 4[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 03 : 37[901d0] -> 36[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 00 : 61[901d0] -> 60[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 01 : 61[901d0] -> 60[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 03 : 61[901d0] -> 60[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 03 : 19[201d0] -> 18[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 00 : 22[a01c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 04 : 19[201d0] -> 18[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 01 : 22[a01c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 04 : 61[901d0] -> 60[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 04 : 37[901d0] -> 36[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 05 : 61[901d0] -> 60[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 07 : 61[901d0] -> 60[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 06 : 19[201d0] -> 18[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 02 : 22[a01c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 00 : 13[901d0] -> 12[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 05 : 37[901d0] -> 36[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 01 : 13[901d0] -> 12[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 03 : 13[901d0] -> 12[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 04 : 13[901d0] -> 12[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 04 : 22[a01c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 05 : 13[901d0] -> 12[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 07 : 37[901d0] -> 36[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 07 : 13[901d0] -> 12[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 03 : 54[a01c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 07 : 19[201d0] -> 18[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 05 : 22[a01c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 00 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 07 : 54[a01c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 00 : 45[901d0] -> 44[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 06 : 22[a01c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 03 : 38[a01c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 03 : 14[a01c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 00 : 29[901d0] -> 28[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 00 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 07 : 38[a01c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 07 : 14[a01c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 01 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 01 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 01 : 45[901d0] -> 44[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 00 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 01 : 29[901d0] -> 28[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 02 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 04 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 03 : 30[a01c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 01 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 03 : 29[901d0] -> 28[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 05 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 02 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 00 : 21[901d0] -> 20[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 00 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 07 : 30[a01c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 02 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 06 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 04 : 29[901d0] -> 28[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 03 : 45[901d0] -> 44[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 01 : 4[901c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 04 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 05 : 4[901c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 00 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 01 : 21[901d0] -> 20[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 01 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 05 : 29[901d0] -> 28[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 03 : 21[901d0] -> 20[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 04 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 02 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 04 : 21[901d0] -> 20[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 01 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 04 : 45[901d0] -> 44[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 01 : 12[901c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 04 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 05 : 21[901d0] -> 20[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 05 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 05 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 07 : 29[901d0] -> 28[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 07 : 21[901d0] -> 20[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 05 : 12[901c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 00 : 53[901d0] -> 52[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 06 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 05 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 02 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 01 : 20[901c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 06 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 05 : 45[901d0] -> 44[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 00 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 05 : 20[901c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 01 : 1[101d0] -> 0[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 04 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 01 : 53[901d0] -> 52[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 00 : 4[901c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 06 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 01 : 44[901c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 07 : 45[901d0] -> 44[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 00 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 00 : 14[a01c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 02 : 1[101d0] -> 0[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 01 : 28[901c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 01 : 36[901c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 05 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 01 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 05 : 44[901c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 02 : 4[901c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 03 : 53[901d0] -> 52[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 01 : 14[a01c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 05 : 28[901c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 00 : 27[201d0] -> 26[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 01 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 00 : 35[201d0] -> 34[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 02 : 14[a01c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 05 : 36[901c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 06 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 04 : 4[901c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 02 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 01 : 52[901c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 04 : 14[a01c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 04 : 53[901d0] -> 52[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 00 : 11[201d0] -> 10[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 00 : 51[201d0] -> 50[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 02 : 27[201d0] -> 26[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 06 : 4[901c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 02 : 35[201d0] -> 34[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 05 : 14[a01c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 04 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 02 : 11[201d0] -> 10[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 05 : 52[901c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 02 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 01 : 60[901c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 05 : 53[901d0] -> 52[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 06 : 14[a01c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 03 : 27[201d0] -> 26[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 03 : 46[a01c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 01 : 9[101d0] -> 8[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 00 : 38[a01c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 03 : 35[201d0] -> 34[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 03 : 11[201d0] -> 10[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 00 : 12[901c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 02 : 51[201d0] -> 50[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 00 : 30[a01c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 04 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 05 : 60[901c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 03 : 62[a01c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 02 : 9[101d0] -> 8[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 05 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 07 : 46[a01c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 04 : 11[201d0] -> 10[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 03 : 1[101d0] -> 0[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 02 : 12[901c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 05 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 07 : 53[901d0] -> 52[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 03 : 6[a01c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 07 : 62[a01c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 01 : 30[a01c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 00 : 54[a01c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 03 : 51[201d0] -> 50[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 05 : 1[101d0] -> 0[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 04 : 27[201d0] -> 26[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 03 : 9[101d0] -> 8[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 01 : 38[a01c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 07 : 6[a01c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 04 : 35[201d0] -> 34[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 06 : 11[201d0] -> 10[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 06 : 1[101d0] -> 0[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 06 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 02 : 30[a01c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 04 : 12[901c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 06 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 06 : 27[201d0] -> 26[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 00 : 10[201c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 02 : 38[a01c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 06 : 35[201d0] -> 34[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 04 : 30[a01c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 07 : 1[101d0] -> 0[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 07 : 27[201d0] -> 26[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 05 : 9[101d0] -> 8[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 04 : 51[201d0] -> 50[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 01 : 54[a01c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 07 : 11[201d0] -> 10[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 06 : 12[901c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 04 : 38[a01c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 07 : 35[201d0] -> 34[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 05 : 30[a01c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 02 : 10[201c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 06 : 51[201d0] -> 50[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 02 : 54[a01c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 06 : 30[a01c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 05 : 38[a01c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 06 : 9[101d0] -> 8[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 01 : 25[101d0] -> 24[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 00 : 28[901c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 00 : 42[201c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 07 : 51[201d0] -> 50[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 06 : 38[a01c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 04 : 54[a01c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 02 : 25[101d0] -> 24[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 00 : 34[201c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 02 : 28[901c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 03 : 10[201c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 00 : 2[201c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 01 : 41[101d0] -> 40[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 01 : 17[101d0] -> 16[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 00 : 20[901c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 02 : 20[901c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 04 : 20[901c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 02 : 42[201c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 00 : 44[901c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 05 : 54[a01c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 02 : 34[201c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 07 : 9[101d0] -> 8[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 02 : 41[101d0] -> 40[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 02 : 2[201c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 04 : 28[901c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 03 : 42[201c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 02 : 44[901c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 01 : 33[101d0] -> 32[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 00 : 36[901c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 00 : 58[201c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 00 : 43[201d0] -> 42[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 06 : 54[a01c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 04 : 10[201c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 03 : 41[101d0] -> 40[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 03 : 2[201c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 06 : 20[901c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 02 : 17[101d0] -> 16[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 03 : 34[201c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 04 : 42[201c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 04 : 44[901c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 03 : 17[101d0] -> 16[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 03 : 25[101d0] -> 24[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 02 : 43[201d0] -> 42[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 06 : 28[901c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 00 : 50[201c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 00 : 3[201d0] -> 2[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 02 : 58[201c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 05 : 41[101d0] -> 40[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 02 : 33[101d0] -> 32[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 04 : 2[201c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 02 : 36[901c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 06 : 44[901c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 06 : 42[201c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 03 : 43[201d0] -> 42[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 00 : 46[a01c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 05 : 25[101d0] -> 24[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 01 : 49[101d0] -> 48[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 02 : 3[201d0] -> 2[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 03 : 58[201c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 00 : 6[a01c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 05 : 17[101d0] -> 16[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 04 : 34[201c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 06 : 41[101d0] -> 40[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 00 : 52[901c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 02 : 50[201c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 07 : 42[201c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 06 : 10[201c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 06 : 2[201c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 00 : 0[101c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 04 : 43[201d0] -> 42[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 01 : 46[a01c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 03 : 3[201d0] -> 2[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 00 : 59[201d0] -> 58[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 03 : 33[101d0] -> 32[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 04 : 36[901c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 07 : 41[101d0] -> 40[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 01 : 6[a01c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 04 : 58[201c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 07 : 2[201c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 01 : 0[101c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 06 : 43[201d0] -> 42[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 02 : 46[a01c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 02 : 49[101d0] -> 48[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 04 : 3[201d0] -> 2[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 06 : 34[201c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 01 : 57[101d0] -> 56[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 00 : 60[901c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 03 : 50[201c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 02 : 52[901c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 02 : 6[a01c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 06 : 25[101d0] -> 24[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 02 : 59[201d0] -> 58[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 02 : 0[101c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 06 : 58[201c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 07 : 43[201d0] -> 42[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 04 : 46[a01c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 06 : 17[101d0] -> 16[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 06 : 36[901c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 02 : 60[901c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 02 : 57[101d0] -> 56[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 04 : 6[a01c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 00 : 62[a01c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 03 : 59[201d0] -> 58[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 03 : 0[101c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 07 : 58[201c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 03 : 49[101d0] -> 48[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 07 : 34[201c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 04 : 50[201c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 04 : 52[901c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 05 : 6[a01c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 04 : 60[901c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 03 : 57[101d0] -> 56[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 05 : 46[a01c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 01 : 62[a01c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 04 : 0[101c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 07 : 17[101d0] -> 16[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 05 : 57[101d0] -> 56[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 06 : 60[901c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 07 : 25[101d0] -> 24[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 06 : 6[a01c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 02 : 62[a01c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 05 : 49[101d0] -> 48[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 05 : 0[101c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 06 : 57[101d0] -> 56[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 06 : 52[901c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 06 : 50[201c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 04 : 62[a01c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 05 : 33[101d0] -> 32[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 07 : 57[101d0] -> 56[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 05 : 62[a01c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 06 : 62[a01c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 06 : 0[101c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 06 : 3[201d0] -> 2[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 06 : 49[101d0] -> 48[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 00 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 07 : 50[201c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 04 : 59[201d0] -> 58[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 07 : 10[201c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 06 : 33[101d0] -> 32[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 07 : 49[101d0] -> 48[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 07 : 0[101c0] -> 1[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 07 : 3[201d0] -> 2[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 06 : 46[a01c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 07 : 33[101d0] -> 32[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 01 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 06 : 59[201d0] -> 58[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 00 : 40[101c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 00 : 8[101c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 02 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 01 : 40[101c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 07 : 59[201d0] -> 58[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 03 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 01 : 8[101c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 00 : 16[101c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 04 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 01 : 16[101c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 05 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 02 : 8[101c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 02 : 16[101c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 06 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 00 : 48[101c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 03 : 16[101c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 07 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 03 : 8[101c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 00 : 20[901c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 02 : 40[101c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 04 : 16[101c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 01 : 20[901c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 00 : 24[101c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 04 : 8[101c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 05 : 16[101c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 00 : 56[101c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 01 : 48[101c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 00 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 05 : 8[101c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 02 : 48[101c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 00 : 32[101c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 03 : 40[101c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 01 : 56[101c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 01 : 24[101c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 02 : 20[901c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 00 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 03 : 48[101c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 01 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 03 : 20[901c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 06 : 16[101c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 06 : 8[101c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 02 : 24[101c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 04 : 40[101c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 01 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 04 : 48[101c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 02 : 56[101c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 02 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 04 : 20[901c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 07 : 16[101c0] -> 17[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 00 : 12[901c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 07 : 8[101c0] -> 9[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 00 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 03 : 24[101c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 02 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 01 : 32[101c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 05 : 48[101c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 03 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 00 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 01 : 12[901c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 00 : 21[901d0] -> 22[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 00 : 19[201d0] -> 20[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 05 : 20[901c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 05 : 40[101c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 03 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 04 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 04 : 24[101c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 01 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 02 : 12[901c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 01 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 02 : 32[101c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 01 : 21[901d0] -> 22[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 01 : 19[201d0] -> 20[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 06 : 20[901c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 04 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 00 : 18[201c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 00 : 52[901c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 05 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 05 : 24[101c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 03 : 12[901c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 02 : 21[901d0] -> 22[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 03 : 19[201d0] -> 20[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 07 : 20[901c0] -> 21[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 00 : 13[901d0] -> 14[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 02 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 01 : 18[201c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 02 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 06 : 40[101c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 05 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 01 : 52[901c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 06 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 03 : 32[101c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 04 : 12[901c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 04 : 21[901d0] -> 22[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 04 : 19[201d0] -> 20[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 06 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 00 : 28[901c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 06 : 24[101c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 00 : 17[101d0] -> 18[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 02 : 18[201c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 01 : 13[901d0] -> 14[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 02 : 52[901c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 03 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 03 : 56[101c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 05 : 21[901d0] -> 22[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 05 : 19[201d0] -> 20[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 07 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 07 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 03 : 18[201c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 02 : 17[101d0] -> 18[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 03 : 52[901c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 00 : 50[201c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 05 : 12[901c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 03 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 07 : 40[101c0] -> 41[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 02 : 13[901d0] -> 14[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 06 : 21[901d0] -> 22[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 04 : 32[101c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 01 : 28[901c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 07 : 24[101c0] -> 25[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 07 : 19[201d0] -> 20[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 00 : 10[201c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 06 : 48[101c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 04 : 18[201c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 04 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 04 : 52[901c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 03 : 17[101d0] -> 18[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 01 : 50[201c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 00 : 36[901c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 06 : 12[901c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 00 : 11[201d0] -> 12[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 04 : 13[901d0] -> 14[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 00 : 27[201d0] -> 28[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 04 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 05 : 18[201c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 01 : 10[201c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 00 : 49[101d0] -> 50[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 05 : 32[101c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 02 : 28[901c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 07 : 48[101c0] -> 49[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 05 : 52[901c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 04 : 17[101d0] -> 18[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 05 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 07 : 12[901c0] -> 13[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 01 : 36[901c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 02 : 50[201c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 05 : 13[901d0] -> 14[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 01 : 11[201d0] -> 12[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 04 : 56[101c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 00 : 1[101d0] -> 2[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 00 : 9[101d0] -> 10[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 02 : 10[201c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 01 : 27[201d0] -> 28[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 05 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 03 : 28[901c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 06 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 06 : 18[201c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 06 : 13[901d0] -> 14[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 02 : 36[901c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 03 : 11[201d0] -> 12[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 02 : 1[101d0] -> 2[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 03 : 27[201d0] -> 28[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 00 : 26[201c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 03 : 10[201c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 06 : 32[101c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 02 : 9[101d0] -> 10[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 00 : 35[201d0] -> 36[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 06 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 04 : 28[901c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 00 : 29[901d0] -> 30[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 06 : 17[101d0] -> 18[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 07 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 00 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 00 : 44[901c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 03 : 36[901c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 04 : 11[201d0] -> 12[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 03 : 1[101d0] -> 2[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 04 : 27[201d0] -> 28[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 01 : 26[201c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 02 : 49[101d0] -> 50[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 07 : 32[101c0] -> 33[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 01 : 35[201d0] -> 36[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 04 : 10[201c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 03 : 9[101d0] -> 10[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 05 : 28[901c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 01 : 29[901d0] -> 30[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 05 : 56[101c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 06 : 52[901c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 07 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 07 : 18[201c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 03 : 50[201c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 00 : 4[901c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 01 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 04 : 1[101d0] -> 2[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 01 : 44[901c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 05 : 27[201d0] -> 28[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 04 : 36[901c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 02 : 26[201c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 07 : 17[101d0] -> 18[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 00 : 37[901d0] -> 38[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 03 : 35[201d0] -> 36[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 00 : 42[201c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 03 : 49[101d0] -> 50[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 00 : 25[101d0] -> 26[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 06 : 28[901c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 02 : 29[901d0] -> 30[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 05 : 11[201d0] -> 12[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 01 : 4[901c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 02 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 07 : 52[901c0] -> 53[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 04 : 50[201c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 06 : 1[101d0] -> 2[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 00 : 41[101d0] -> 42[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 00 : 2[201c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 05 : 36[901c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 05 : 10[201c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 04 : 9[101d0] -> 10[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 02 : 44[901c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 00 : 43[201d0] -> 44[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 07 : 27[201d0] -> 28[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 03 : 26[201c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 00 : 5[901d0] -> 6[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 06 : 56[101c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 01 : 42[201c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 00 : 51[201d0] -> 52[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 00 : 53[901d0] -> 54[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 02 : 25[101d0] -> 26[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 07 : 28[901c0] -> 29[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 02 : 4[901c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 04 : 29[901d0] -> 30[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 03 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 04 : 49[101d0] -> 50[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 00 : 3[201d0] -> 4[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 07 : 1[101d0] -> 2[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 01 : 2[201c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 02 : 41[101d0] -> 42[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Connected all rings
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 05 : 50[201c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 03 : 44[901c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 01 : 43[201d0] -> 44[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 01 : 5[901d0] -> 6[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 02 : 42[201c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 04 : 35[201d0] -> 36[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 01 : 37[901d0] -> 38[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 03 : 4[901c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 00 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 01 : 51[201d0] -> 52[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 01 : 53[901d0] -> 54[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 04 : 26[201c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 04 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 06 : 49[101d0] -> 50[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 07 : 56[101c0] -> 57[101d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 00 : 34[201c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 01 : 3[201d0] -> 4[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 02 : 2[201c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 03 : 41[101d0] -> 42[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 06 : 50[201c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 04 : 44[901c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 03 : 25[101d0] -> 26[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 05 : 29[901d0] -> 30[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 03 : 43[201d0] -> 44[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 02 : 5[901d0] -> 6[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 06 : 36[901c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 03 : 42[201c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 03 : 51[201d0] -> 52[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 02 : 53[901d0] -> 54[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 01 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 07 : 49[101d0] -> 50[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 05 : 35[201d0] -> 36[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 02 : 37[901d0] -> 38[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 04 : 4[901c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 05 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 05 : 26[201c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 07 : 50[201c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 04 : 41[101d0] -> 42[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 03 : 3[201d0] -> 4[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 04 : 25[101d0] -> 26[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 06 : 29[901d0] -> 30[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 05 : 44[901c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 01 : 34[201c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 03 : 2[201c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 04 : 43[201d0] -> 44[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 04 : 51[201d0] -> 52[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 04 : 53[901d0] -> 54[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 07 : 11[201d0] -> 12[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 04 : 42[201c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 00 : 45[901d0] -> 46[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 02 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 07 : 36[901c0] -> 37[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 04 : 5[901d0] -> 6[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 06 : 10[201c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 06 : 9[101d0] -> 10[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 06 : 26[201c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 05 : 4[901c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 06 : 25[101d0] -> 26[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 00 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 00 : 33[101d0] -> 34[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 07 : 35[201d0] -> 36[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 06 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 04 : 37[901d0] -> 38[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 06 : 41[101d0] -> 42[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 06 : 44[901c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 05 : 51[201d0] -> 52[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 05 : 43[201d0] -> 44[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 05 : 53[901d0] -> 54[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 04 : 3[201d0] -> 4[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 02 : 34[201c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 05 : 42[201c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 04 : 2[201c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 03 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 01 : 45[901d0] -> 46[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 05 : 5[901d0] -> 6[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 01 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 07 : 41[101d0] -> 42[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 07 : 44[901c0] -> 45[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 07 : 43[201d0] -> 44[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 02 : 33[101d0] -> 34[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 06 : 4[901c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 05 : 37[901d0] -> 38[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 07 : 26[201c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 07 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 03 : 34[201c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 06 : 42[201c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 04 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 02 : 45[901d0] -> 46[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 05 : 3[201d0] -> 4[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 07 : 51[201d0] -> 52[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 07 : 25[101d0] -> 26[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 07 : 10[201c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 07 : 9[101d0] -> 10[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 06 : 53[901d0] -> 54[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 05 : 2[201c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 00 : 61[901d0] -> 62[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 02 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 03 : 33[101d0] -> 34[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 06 : 37[901d0] -> 38[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 06 : 5[901d0] -> 6[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 00 : 60[901c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 04 : 34[201c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 07 : 42[201c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 05 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 04 : 45[901d0] -> 46[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 07 : 4[901c0] -> 5[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 00 : 57[101d0] -> 58[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 07 : 3[201d0] -> 4[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 04 : 33[101d0] -> 34[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 03 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 01 : 61[901d0] -> 62[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 06 : 2[201c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 05 : 34[201c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 01 : 60[901c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 06 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 05 : 45[901d0] -> 46[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 00 : 58[201c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 02 : 57[101d0] -> 58[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 04 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 02 : 61[901d0] -> 62[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 00 : 59[201d0] -> 60[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 02 : 60[901c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 01 : 0[101c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 06 : 33[101d0] -> 34[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 01 : 58[201c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 03 : 57[101d0] -> 58[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 06 : 34[201c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 07 : 2[201c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 05 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 04 : 61[901d0] -> 62[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 06 : 45[901d0] -> 46[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 07 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 03 : 60[901c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 01 : 59[201d0] -> 60[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 02 : 58[201c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 04 : 57[101d0] -> 58[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 07 : 33[101d0] -> 34[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 06 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 05 : 61[901d0] -> 62[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 04 : 60[901c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 03 : 59[201d0] -> 60[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 03 : 0[101c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 03 : 58[201c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 07 : 34[201c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 06 : 57[101d0] -> 58[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 07 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 06 : 61[901d0] -> 62[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 05 : 60[901c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 04 : 59[201d0] -> 60[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 04 : 58[201c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 07 : 57[101d0] -> 58[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 06 : 60[901c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 05 : 59[201d0] -> 60[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 05 : 58[201c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 01 : 48[101c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 07 : 60[901c0] -> 61[901d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 07 : 59[201d0] -> 60[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 05 : 0[101c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 06 : 58[201c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 03 : 48[101c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 01 : 40[101c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 07 : 58[201c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 07 : 0[101c0] -> 7[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 01 : 16[101c0] -> 23[a01d0] via P2P/IPC/read
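
Up to this point every "Channel XX : a[busid] -> b[busid]" line is an intra-node hop between GPUs on the same p4d instance (via P2P/IPC/read); from the next line on, [send]/[receive] pairs over NET/AWS Libfabric/<nic>/GDRDMA begin to appear as NCCL stitches the rings and trees across nodes over EFA with GPUDirect RDMA. Below is a minimal Python sketch for pulling these connection edges out of a log like this one; the regex is inferred only from the lines shown here, and "nccl.log" is a hypothetical file name, so treat it as an illustration rather than a canonical NCCL log parser.

import re

# Pattern inferred from this log's connection lines; other NCCL
# message variants (e.g. "Connected all rings") are ignored.
PATTERN = re.compile(
    r"NCCL INFO Channel (?P<chan>[\d/]+) : "
    r"(?P<src>\d+)\[[0-9a-f]+\] -> (?P<dst>\d+)\[[0-9a-f]+\]"
    r"(?: \[(?:send|receive)\])? via (?P<transport>.+)"
)

def edges(lines):
    """Yield (channel, src_rank, dst_rank, transport) per connection line."""
    for line in lines:
        m = PATTERN.search(line)
        if m:
            yield (m.group("chan"), int(m.group("src")),
                   int(m.group("dst")), m.group("transport"))

# Example: count intra-node vs. inter-node hops in a saved log.
# with open("nccl.log") as f:  # hypothetical path
#     transports = [t for *_, t in edges(f)]
# print(sum(t.startswith("P2P") for t in transports), "P2P/IPC hops")
# print(sum(t.startswith("NET") for t in transports), "EFA/GDRDMA hops")
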
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 02/0 : 20[901c0] -> 28[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 03/0 : 22[a01c0] -> 30[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 02/0 : 12[901c0] -> 21[901d0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 05 : 48[101c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 03 : 16[101c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 03 : 40[101c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 07/0 : 6[a01c0] -> 14[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 06/0 : 20[901c0] -> 13[901d0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 05 : 40[101c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 05 : 16[101c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 01 : 8[101c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 01/0 : 10[201c0] -> 19[201d0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 01/0 : 18[201c0] -> 26[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 07 : 48[101c0] -> 55[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 00/0 : 8[101c0] -> 17[101d0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 07 : 16[101c0] -> 23[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 03 : 8[101c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 01 : 32[101c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 01 : 24[101c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 00/0 : 40[101c0] -> 49[101d0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 03/0 : 14[a01c0] -> 23[a01d0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 01 : 56[101c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 05 : 8[101c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 07/0 : 38[a01c0] -> 46[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 06/0 : 4[901c0] -> 12[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 07 : 40[101c0] -> 47[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 03 : 32[101c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 07 : 8[101c0] -> 15[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 03 : 24[101c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 05 : 32[101c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 05/0 : 18[201c0] -> 11[201d0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 05/0 : 2[201c0] -> 10[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 04/0 : 16[101c0] -> 9[101d0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 03 : 56[101c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 02/0 : 20[901c0] -> 28[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 03/0 : 22[a01c0] -> 30[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 06/0 : 29[901d0] -> 44[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 05 : 24[101c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 01/0 : 18[201c0] -> 35[201d0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 07/0 : 6[a01c0] -> 14[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 07 : 24[101c0] -> 31[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 02 : 5[901d0] -> 4[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 04/0 : 48[101c0] -> 41[101d0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 07 : 32[101c0] -> 39[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 05/0 : 50[201c0] -> 43[201d0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 06/0 : 4[901c0] -> 12[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 05/0 : 34[201c0] -> 42[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 05/0 : 27[201d0] -> 42[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 03/0 : 22[a01c0] -> 39[a01d0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 02/0 : 20[901c0] -> 37[901d0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 01/0 : 42[201c0] -> 51[201d0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 01/0 : 18[201c0] -> 26[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 01/0 : 50[201c0] -> 58[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 05 : 56[101c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 02/0 : 52[901c0] -> 60[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 04/0 : 25[101d0] -> 40[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 03/0 : 54[a01c0] -> 62[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 02/0 : 44[901c0] -> 53[901d0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 03/0 : 46[a01c0] -> 55[a01d0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 06 : 5[901d0] -> 4[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 03/0 : 14[a01c0] -> 23[a01d0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 06/0 : 36[901c0] -> 44[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 06/0 : 52[901c0] -> 45[901d0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 07/0 : 38[a01c0] -> 46[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 01/0 : 18[201c0] -> 35[201d0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 02/0 : 12[901c0] -> 21[901d0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 00 : 1[101d0] -> 0[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 06/0 : 12[901c0] -> 28[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 07/0 : 14[a01c0] -> 30[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 03/0 : 22[a01c0] -> 39[a01d0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 03/0 : 54[a01c0] -> 62[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 05/0 : 2[201c0] -> 10[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 07 : 56[101c0] -> 63[a01d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 01/0 : 35[201d0] -> 18[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 07/0 : 14[a01c0] -> 30[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 07/0 : 31[a01d0] -> 46[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 01 : 7[a01d0] -> 0[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 03/0 : 38[a01c0] -> 54[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 03/0 : 38[a01c0] -> 54[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 04 : 1[101d0] -> 0[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 03/0 : 39[a01d0] -> 22[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 05/0 : 10[201c0] -> 26[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 03/0 : 38[a01c0] -> 6[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 03/0 : 6[a01c0] -> 38[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 03/0 : 46[a01c0] -> 55[a01d0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 06/0 : 12[901c0] -> 28[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 02/0 : 36[901c0] -> 4[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 02/0 : 4[901c0] -> 36[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 02 : 7[a01d0] -> 0[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 07/0 : 62[a01c0] -> 30[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 07/0 : 30[a01c0] -> 62[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 03/0 : 54[a01c0] -> 38[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 03 : 7[a01d0] -> 0[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 01/0 : 10[201c0] -> 19[201d0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 01/0 : 34[201c0] -> 2[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 01/0 : 2[201c0] -> 34[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 05 : 7[a01d0] -> 0[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 01 : 3[201d0] -> 2[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 06/0 : 60[901c0] -> 28[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 06/0 : 28[901c0] -> 60[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 01/0 : 35[201d0] -> 18[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 06 : 7[a01d0] -> 0[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 05 : 3[201d0] -> 2[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 07/0 : 31[a01d0] -> 46[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 07 : 7[a01d0] -> 0[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 03/0 : 6[a01c0] -> 38[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 03/0 : 38[a01c0] -> 6[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 02 : 61[901d0] -> 60[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 02/0 : 52[901c0] -> 60[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 05/0 : 10[201c0] -> 26[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 02/0 : 21[901d0] -> 12[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 07/0 : 14[a01c0] -> 6[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 01/0 : 19[201d0] -> 10[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 06 : 61[901d0] -> 60[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 05/0 : 58[201c0] -> 26[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 03/0 : 55[a01d0] -> 46[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 05/0 : 26[201c0] -> 58[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 07/0 : 30[a01c0] -> 62[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 07/0 : 62[a01c0] -> 30[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 00 : 57[101d0] -> 56[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 07/0 : 46[a01c0] -> 31[a01d0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 00/0 : 16[101c0] -> 24[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 02/0 : 36[901c0] -> 52[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 01 : 35[201d0] -> 34[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 01/0 : 50[201c0] -> 58[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 05 : 35[201d0] -> 34[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 01 : 63[a01d0] -> 56[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 06/0 : 36[901c0] -> 44[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 02/0 : 20[901c0] -> 37[901d0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 04 : 57[101d0] -> 56[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 02 : 63[a01d0] -> 56[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 02/0 : 37[901d0] -> 20[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 01/0 : 34[201c0] -> 50[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 02/0 : 44[901c0] -> 53[901d0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 03 : 63[a01d0] -> 56[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 02/0 : 36[901c0] -> 52[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 06/0 : 28[901c0] -> 60[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 06/0 : 60[901c0] -> 28[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 02/0 : 37[901d0] -> 20[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 05 : 63[a01d0] -> 56[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 00/0 : 16[101c0] -> 33[101d0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 01 : 59[201d0] -> 58[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 02/0 : 52[901c0] -> 36[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 05/0 : 26[201c0] -> 58[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 05/0 : 58[201c0] -> 26[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 06/0 : 29[901d0] -> 44[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 06 : 63[a01d0] -> 56[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 05 : 59[201d0] -> 58[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 02/0 : 53[901d0] -> 44[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 02 : 37[901d0] -> 36[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 05/0 : 34[201c0] -> 42[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 06 : 37[901d0] -> 36[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 07 : 63[a01d0] -> 56[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 07/0 : 22[a01c0] -> 15[a01d0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 01 : 15[a01d0] -> 8[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 02 : 15[a01d0] -> 8[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 03 : 15[a01d0] -> 8[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 05 : 15[a01d0] -> 8[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 06 : 15[a01d0] -> 8[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 00/0 : 48[101c0] -> 56[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 07 : 15[a01d0] -> 8[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 01/0 : 34[201c0] -> 50[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 04/0 : 0[101c0] -> 8[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 01/0 : 2[201c0] -> 34[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 01/0 : 34[201c0] -> 2[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 04/0 : 32[101c0] -> 40[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 06/0 : 44[901c0] -> 29[901d0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 01/0 : 50[201c0] -> 34[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 04/0 : 32[101c0] -> 40[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 07/0 : 54[a01c0] -> 47[a01d0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 01 : 47[a01d0] -> 40[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 00/0 : 16[101c0] -> 24[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 01/0 : 42[201c0] -> 51[201d0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 02 : 47[a01d0] -> 40[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 03 : 47[a01d0] -> 40[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 05 : 47[a01d0] -> 40[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 06 : 47[a01d0] -> 40[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 04/0 : 0[101c0] -> 8[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 07 : 47[a01d0] -> 40[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 00/0 : 32[101c0] -> 48[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 00/0 : 16[101c0] -> 33[101d0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 04/0 : 8[101c0] -> 24[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 00/0 : 32[101c0] -> 0[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 00/0 : 0[101c0] -> 32[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 03/0 : 23[a01d0] -> 14[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 06/0 : 28[901c0] -> 12[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 00/0 : 40[101c0] -> 49[101d0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 00/0 : 8[101c0] -> 17[101d0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 07/0 : 30[a01c0] -> 14[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 00/0 : 33[101d0] -> 16[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 05/0 : 27[201d0] -> 42[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 03/0 : 39[a01d0] -> 22[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 00/0 : 17[101d0] -> 8[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 01/0 : 51[201d0] -> 42[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 07/0 : 46[a01c0] -> 31[a01d0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 00/0 : 49[101d0] -> 40[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 01 : 39[a01d0] -> 32[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 03/0 : 54[a01c0] -> 38[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 02 : 39[a01d0] -> 32[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 05/0 : 42[201c0] -> 27[201d0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 03 : 39[a01d0] -> 32[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 05/0 : 26[201c0] -> 10[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 03/0 : 55[a01d0] -> 46[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 05 : 39[a01d0] -> 32[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 01/0 : 26[201c0] -> 18[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 06 : 39[a01d0] -> 32[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 03/0 : 62[a01c0] -> 54[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 00/0 : 48[101c0] -> 56[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 07 : 39[a01d0] -> 32[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 07/0 : 46[a01c0] -> 38[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 07/0 : 46[a01c0] -> 38[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 01 : 31[a01d0] -> 24[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 02 : 31[a01d0] -> 24[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 03 : 31[a01d0] -> 24[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 05 : 31[a01d0] -> 24[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 01 : 55[a01d0] -> 48[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 06 : 31[a01d0] -> 24[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 07 : 31[a01d0] -> 24[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 02 : 55[a01d0] -> 48[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 03 : 55[a01d0] -> 48[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 04/0 : 24[101c0] -> 56[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 04/0 : 56[101c0] -> 24[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 05 : 55[a01d0] -> 48[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 06 : 55[a01d0] -> 48[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 07 : 55[a01d0] -> 48[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 07/0 : 30[a01c0] -> 14[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 00/0 : 32[101c0] -> 48[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 03/0 : 30[a01c0] -> 22[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 03/0 : 23[a01d0] -> 14[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 00/0 : 48[101c0] -> 32[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 06/0 : 44[901c0] -> 29[901d0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 02/0 : 4[901c0] -> 36[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 02/0 : 36[901c0] -> 4[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 01 : 23[a01d0] -> 16[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 02/0 : 53[901d0] -> 44[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 02 : 23[a01d0] -> 16[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 03/0 : 62[a01c0] -> 54[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 03 : 23[a01d0] -> 16[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 05 : 23[a01d0] -> 16[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 06 : 23[a01d0] -> 16[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 07 : 23[a01d0] -> 16[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 06/0 : 44[901c0] -> 36[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 02/0 : 28[901c0] -> 20[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 02 : 53[901d0] -> 52[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 02 : 29[901d0] -> 28[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 06 : 53[901d0] -> 52[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 06 : 29[901d0] -> 28[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 05/0 : 26[201c0] -> 10[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 06/0 : 28[901c0] -> 12[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 01/0 : 58[201c0] -> 50[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 02/0 : 60[901c0] -> 52[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 01/0 : 50[201c0] -> 34[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 01/0 : 58[201c0] -> 50[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 05/0 : 42[201c0] -> 34[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 01/0 : 26[201c0] -> 18[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 02/0 : 28[901c0] -> 20[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 01/0 : 19[201d0] -> 10[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 02/0 : 21[901d0] -> 12[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 05/0 : 10[201c0] -> 2[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 04/0 : 8[101c0] -> 24[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 05/0 : 18[201c0] -> 11[201d0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 04/0 : 25[101d0] -> 40[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 06/0 : 20[901c0] -> 13[901d0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 04/0 : 24[101c0] -> 8[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 01 : 28[901c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 00/0 : 33[101d0] -> 16[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 03 : 28[901c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 04/0 : 40[101c0] -> 25[101d0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 05 : 28[901c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 05/0 : 42[201c0] -> 27[201d0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 02 : 21[901d0] -> 20[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 07 : 28[901c0] -> 27[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 04/0 : 40[101c0] -> 25[101d0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 00 : 33[101d0] -> 32[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 04 : 33[101d0] -> 32[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 01/0 : 51[201d0] -> 42[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 06 : 21[901d0] -> 20[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 03/0 : 30[a01c0] -> 22[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 02 : 13[901d0] -> 12[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 00/0 : 49[101d0] -> 40[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 01 : 11[201d0] -> 10[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 05 : 11[201d0] -> 10[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 00/0 : 24[101c0] -> 16[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 05/0 : 42[201c0] -> 34[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 01 : 20[901c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 01 : 19[201d0] -> 18[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 01 : 51[201d0] -> 50[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 06 : 13[901d0] -> 12[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 00 : 49[101d0] -> 48[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 03 : 20[901c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 07/0 : 22[a01c0] -> 15[a01d0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 05 : 51[201d0] -> 50[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 05 : 19[201d0] -> 18[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 04 : 49[101d0] -> 48[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 05 : 20[901c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 07 : 20[901c0] -> 19[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 00 : 25[101d0] -> 24[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 00/0 : 0[101c0] -> 32[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 04 : 25[101d0] -> 24[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 00/0 : 32[101c0] -> 0[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 01 : 27[201d0] -> 26[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 05 : 27[201d0] -> 26[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 07/0 : 14[a01c0] -> 6[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 00/0 : 48[101c0] -> 32[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 00/0 : 56[101c0] -> 48[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 06/0 : 12[901c0] -> 4[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 07/0 : 54[a01c0] -> 47[a01d0] [send] via NET/AWS Libfabric/3/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 02/0 : 52[901c0] -> 36[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 06/0 : 44[901c0] -> 36[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 04/0 : 56[101c0] -> 24[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 04/0 : 24[101c0] -> 56[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 05/0 : 50[201c0] -> 43[201d0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 05/0 : 10[201c0] -> 2[201c0] [send] via NET/AWS Libfabric/1/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 01 : 36[901c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 03 : 36[901c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 01 : 44[901c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 05 : 36[901c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 07 : 36[901c0] -> 35[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 03 : 44[901c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 06/0 : 12[901c0] -> 4[901c0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 05 : 44[901c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 07 : 44[901c0] -> 43[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 01 : 43[201d0] -> 42[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 05 : 43[201d0] -> 42[201c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 01 : 12[901c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 01 : 4[901c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 03 : 12[901c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 03 : 4[901c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 05 : 12[901c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 05 : 4[901c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 07 : 12[901c0] -> 11[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 07 : 4[901c0] -> 3[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 04/0 : 40[101c0] -> 32[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 04/0 : 8[101c0] -> 0[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 04/0 : 40[101c0] -> 32[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 02/0 : 60[901c0] -> 52[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 06/0 : 52[901c0] -> 45[901d0] [send] via NET/AWS Libfabric/2/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 03 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 01 : 60[901c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 03 : 60[901c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 07 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 05 : 60[901c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 07 : 60[901c0] -> 59[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 01 : 52[901c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 03 : 52[901c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 05 : 52[901c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 07 : 52[901c0] -> 51[201d0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 04/0 : 24[101c0] -> 8[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 02 : 45[901d0] -> 44[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 03 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 06 : 45[901d0] -> 44[901c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 07 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 00/0 : 24[101c0] -> 16[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 00/0 : 17[101d0] -> 8[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 00/0 : 56[101c0] -> 48[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 00 : 17[101d0] -> 16[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 04 : 17[101d0] -> 16[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 03 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 07 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 03 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 07 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO comm 0x7f4a68000f60 rank 62 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO comm 0x7fcb68000f60 rank 63 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO comm 0x7fc204000f60 rank 61 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO comm 0x7f0394000f60 rank 57 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO comm 0x7f93b0000f60 rank 59 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO comm 0x7fcb08000f60 rank 56 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO comm 0x7f3c9c000f60 rank 58 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO comm 0x7f5490000f60 rank 60 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO comm 0x7fe294000f60 rank 32 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO comm 0x7efe98000f60 rank 36 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO comm 0x7f5290000f60 rank 33 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO comm 0x7fd830000f60 rank 34 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO comm 0x7ff3e0000f60 rank 38 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO comm 0x7f7304000f60 rank 37 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO comm 0x7fb6ec000f60 rank 39 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO comm 0x7f67d4000f60 rank 35 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 04/0 : 8[101c0] -> 0[101c0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 04/0 : 16[101c0] -> 9[101d0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 04/0 : 48[101c0] -> 41[101d0] [send] via NET/AWS Libfabric/0/GDRDMA
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 03 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 07 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 00 : 9[101d0] -> 8[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 03 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 07 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 04 : 9[101d0] -> 8[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 00 : 41[101d0] -> 40[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 04 : 41[101d0] -> 40[101c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 03 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO comm 0x7fb184000f60 rank 31 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO comm 0x7f6858000f60 rank 25 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO comm 0x7ff32c000f60 rank 27 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO comm 0x7fcb50000f60 rank 29 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO comm 0x7f66cc000f60 rank 24 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 07 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO comm 0x7f6dec000f60 rank 26 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO comm 0x7f271c000f60 rank 28 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO comm 0x7f8de0000f60 rank 30 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO comm 0x7f7298000f60 rank 41 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO comm 0x7faf64000f60 rank 47 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO comm 0x7fc704000f60 rank 45 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO comm 0x7f2400000f60 rank 43 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO comm 0x7f411c000f60 rank 40 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO comm 0x7fcf64000f60 rank 42 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO comm 0x7f4ee4000f60 rank 44 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO comm 0x7fb2f0000f60 rank 46 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 03 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO comm 0x7f6760000f60 rank 17 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO comm 0x7ff7b0000f60 rank 23 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO comm 0x7fd9c8000f60 rank 21 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO comm 0x7faf40000f60 rank 16 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO comm 0x7f0040000f60 rank 19 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 07 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO comm 0x7ff02c000f60 rank 18 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO comm 0x7f6d80000f60 rank 20 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO comm 0x7f4d7c000f60 rank 22 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Connected all trees
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO comm 0x7fdabc000f60 rank 49 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO comm 0x7fbcc8000f60 rank 55 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO comm 0x7f4870000f60 rank 51 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO comm 0x7f5528000f60 rank 53 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO comm 0x7f761c000f60 rank 48 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO comm 0x7fd0f8000f60 rank 50 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO comm 0x7f663c000f60 rank 52 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO comm 0x7f6f48000f60 rank 54 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO comm 0x7f2bb4000f60 rank 8 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO comm 0x7f841c000f60 rank 9 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO comm 0x7fea58000f60 rank 15 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO comm 0x7f2038000f60 rank 13 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO comm 0x7f3234000f60 rank 11 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO comm 0x7fbc50000f60 rank 1 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO comm 0x7f5418000f60 rank 0 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO comm 0x7fbff8000f60 rank 5 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO comm 0x7f6564000f60 rank 10 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO comm 0x7fa078000f60 rank 2 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO comm 0x7f9168000f60 rank 7 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO comm 0x7f259c000f60 rank 3 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO comm 0x7fa2c4000f60 rank 12 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO comm 0x7f3f08000f60 rank 14 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO comm 0x7f4d84000f60 rank 4 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO comm 0x7fdf68000f60 rank 6 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO Launch mode Parallel
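
At this point all 64 ranks (8 nodes x 8 GPUs) report "Init COMPLETE": intra-node traffic went over P2P/IPC and inter-node traffic over NET/AWS Libfabric with GDRDMA, as the channel lines above show. For orientation, a minimal sketch (not the gist's actual launcher) of the torch.distributed setup that produces this kind of NCCL init log; RANK / WORLD_SIZE / LOCAL_RANK are assumed to be supplied by the launcher (e.g. mpirun or torchrun), and NCCL_DEBUG=INFO is what makes NCCL emit the topology lines:

```python
# Hedged sketch: a minimal NCCL process-group init, one rank per GPU.
# Env vars are assumed to come from the launcher, not set here.
import os
import torch
import torch.distributed as dist

def init_distributed():
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    local_rank = int(os.environ["LOCAL_RANK"])

    torch.cuda.set_device(local_rank)        # maps to "cudaDev N" in the log
    dist.init_process_group(backend="nccl",  # NCCL over EFA/libfabric here
                            rank=rank,
                            world_size=world_size)
    return rank, world_size

if __name__ == "__main__":
    rank, world_size = init_distributed()
    # A first collective forces communicator setup, i.e. the channel/tree
    # negotiation logged above ("Connected all trees", "Init COMPLETE").
    t = torch.ones(1, device="cuda")
    dist.all_reduce(t)
    print(f"rank {rank}/{world_size} ok: {t.item()}")
```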
Logging to wandb run nousr_laion/dalle2_diffusion_prior/2d8un1bt-valiant-cherry-75
Saving checkpoint locally
Saving to huggingface repo laion/DALLE2-PyTorch
Saving prior_config.json checkpoint to local path prior_config.json
Saving prior_config.json model to huggingface repo laion/DALLE2-PyTorch
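
The two "Saving prior_config.json" lines correspond to a local write plus a push to the laion/DALLE2-PyTorch Hub repo. A hedged sketch of that upload step, assuming huggingface_hub authentication and write access (the DALLE2-pytorch trainer wraps this differently, but `upload_file` is the underlying kind of call):

```python
# Hedged sketch of the config upload logged above; assumes prior
# `huggingface-cli login` and write access to the repo.
from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="prior_config.json",  # local path from the log
    path_in_repo="prior_config.json",
    repo_id="laion/DALLE2-PyTorch",
    repo_type="model",
)
```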
[
TRAINING HERE
]
[E ProcessGroupNCCL.cpp:737] [Rank 49] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807920 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 53] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807922 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 50] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807912 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 55] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807924 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 42] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807943 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 40] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807949 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 41] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807948 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 44] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807949 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 46] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807949 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807935 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 43] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807950 milliseconds before timing out.
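(Timeout(ms)=1800000 in the lines above is torch.distributed's default collective timeout of 30 minutes: every listed rank sat in the same BROADCAST (SeqNum 29487/29488) for the full window before the watchdog fired. If the stall were a benign straggler, e.g. one rank blocked on slow first-batch I/O, rather than a true hang, the window could be widened where the process group is created. A sketch of the two usual knobs, assuming the accelerate-based setup this trainer uses; the two-hour value is illustrative only:

from datetime import timedelta
from accelerate import Accelerator
from accelerate.utils import InitProcessGroupKwargs

# accelerate path: forward a longer timeout into init_process_group
accelerator = Accelerator(
    kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(hours=2))]
)

# bare torch.distributed equivalent:
# import torch.distributed as dist
# dist.init_process_group(backend="nccl", timeout=timedelta(hours=2))
)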
compute-od-gpu-dy-p4d-24xlarge-13:13944:14108 [0] NCCL INFO [Service thread] Connection closed by localRank 6
compute-od-gpu-dy-p4d-24xlarge-13:13946:14101 [2] NCCL INFO [Service thread] Connection closed by localRank 6
compute-od-gpu-dy-p4d-24xlarge-13:13948:14102 [4] NCCL INFO [Service thread] Connection closed by localRank 6
compute-od-gpu-dy-p4d-24xlarge-9:13984:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 4
compute-od-gpu-dy-p4d-24xlarge-9:13986:14137 [2] NCCL INFO [Service thread] Connection closed by localRank 4
compute-od-gpu-dy-p4d-24xlarge-9:13991:14133 [6] NCCL INFO [Service thread] Connection closed by localRank 4
compute-od-gpu-dy-p4d-24xlarge-13:13950:14027 [0] NCCL INFO comm 0x7fb2f0000f60 rank 46 nranks 64 cudaDev 6 busId a01c0 - Abort COMPLETE
compute-od-gpu-dy-p4d-24xlarge-9:13988:14056 [0] NCCL INFO comm 0x7fa2c4000f60 rank 12 nranks 64 cudaDev 4 busId 901c0 - Abort COMPLETE
compute-od-gpu-dy-p4d-24xlarge-13:13944:14108 [0] NCCL INFO [Service thread] Connection closed by localRank 3
compute-od-gpu-dy-p4d-24xlarge-13:13946:14101 [2] NCCL INFO [Service thread] Connection closed by localRank 3
compute-od-gpu-dy-p4d-24xlarge-13:13948:14102 [4] NCCL INFO [Service thread] Connection closed by localRank 3
compute-od-gpu-dy-p4d-24xlarge-13:13944:14108 [0] NCCL INFO [Service thread] Connection closed by localRank 2
compute-od-gpu-dy-p4d-24xlarge-13:13948:14102 [4] NCCL INFO [Service thread] Connection closed by localRank 2
compute-od-gpu-dy-p4d-24xlarge-13:13944:14108 [0] NCCL INFO [Service thread] Connection closed by localRank 1
compute-od-gpu-dy-p4d-24xlarge-13:13948:14102 [4] NCCL INFO [Service thread] Connection closed by localRank 1
compute-od-gpu-dy-p4d-24xlarge-13:13944:14108 [0] NCCL INFO [Service thread] Connection closed by localRank 4
libfabric:13945:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f729146b300
compute-od-gpu-dy-p4d-24xlarge-13:13945:14026 [0] NCCL INFO comm 0x7f7298000f60 rank 41 nranks 64 cudaDev 1 busId 101d0 - Abort COMPLETE
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
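(This [E ProcessGroupNCCL.cpp:414] message is the watchdog deliberately tearing the process down after the timeout. In the 1.12-era PyTorch used here that behaviour is, as far as I know, gated by the NCCL_ASYNC_ERROR_HANDLING environment variable, which the elastic launcher visible in the tracebacks below typically sets to 1 by default. A sketch, assuming one wanted the setting explicit in the launch script:

import os

# 1 = abort NCCL communicators and crash the process on a timed-out collective
# (the behaviour seen in this log); leaving it unset/0 typically leaves ranks
# blocked inside the collective instead of failing fast.
os.environ.setdefault("NCCL_ASYNC_ERROR_HANDLING", "1")
)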
compute-od-gpu-dy-p4d-24xlarge-13:13946:14030 [0] NCCL INFO comm 0x7fcf64000f60 rank 42 nranks 64 cudaDev 2 busId 201c0 - Abort COMPLETE
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 41] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807948 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007f7193fff700 (most recent call first):
<no Python frame>
Thread 0x00007f71ad3fd700 (most recent call first):
<no Python frame>
Thread 0x00007f71addfe700 (most recent call first):
<no Python frame>
Thread 0x00007f71ae7ff700 (most recent call first):
<no Python frame>
Thread 0x00007f71c4b35700 (most recent call first):
<no Python frame>
Thread 0x00007f71c5536700 (most recent call first):
<no Python frame>
Thread 0x00007f71c7357700 (most recent call first):
<no Python frame>
Thread 0x00007f7295cbc700 (most recent call first):
<no Python frame>
Thread 0x00007f70b0dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f70b2bfd700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f70b35fe700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f70cf5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f70ccdfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f70cebfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f70e8dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f70e97fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f70eb5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f7104dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f7122bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f71075fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f7106bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f6fb61fc700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f7350819700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f737bfff700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f7436a8c000 (most recent call first):
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/rotary_embedding_torch/rotary_embedding_torch.py", line 47 in apply_rotary_emb
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/rotary_embedding_torch/rotary_embedding_torch.py", line 95 in rotate_queries_or_keys
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 728 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 806 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 944 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1144 in p_losses
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1254 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 969 in _run_ddp_forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 394 in forward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
compute-od-gpu-dy-p4d-24xlarge-13:13947:14031 [0] NCCL INFO comm 0x7f2400000f60 rank 43 nranks 64 cudaDev 3 busId 201d0 - Abort COMPLETE
compute-od-gpu-dy-p4d-24xlarge-13:13948:14025 [0] NCCL INFO comm 0x7f4ee4000f60 rank 44 nranks 64 cudaDev 4 busId 901c0 - Abort COMPLETE
compute-od-gpu-dy-p4d-24xlarge-14:13976:14129 [0] NCCL INFO [Service thread] Connection closed by localRank 1
compute-od-gpu-dy-p4d-24xlarge-14:13976:14129 [0] NCCL INFO [Service thread] Connection closed by localRank 2
compute-od-gpu-dy-p4d-24xlarge-14:13980:14137 [4] NCCL INFO [Service thread] Connection closed by localRank 1
compute-od-gpu-dy-p4d-24xlarge-14:13982:14134 [6] NCCL INFO [Service thread] Connection closed by localRank 1
compute-od-gpu-dy-p4d-24xlarge-14:13976:14129 [0] NCCL INFO [Service thread] Connection closed by localRank 5
compute-od-gpu-dy-p4d-24xlarge-14:13980:14137 [4] NCCL INFO [Service thread] Connection closed by localRank 2
compute-od-gpu-dy-p4d-24xlarge-14:13982:14134 [6] NCCL INFO [Service thread] Connection closed by localRank 2
compute-od-gpu-dy-p4d-24xlarge-14:13980:14137 [4] NCCL INFO [Service thread] Connection closed by localRank 5
compute-od-gpu-dy-p4d-24xlarge-14:13982:14134 [6] NCCL INFO [Service thread] Connection closed by localRank 5
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676b50 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 252, dev: 2, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676b98 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 253, dev: 2, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676c28 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 2, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd836676a88 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 251, dev: 1, size: 0, state: CREATED, direction: SEND }
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676a30 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 248, dev: 2, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676b08 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 251, dev: 2, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd8366769b0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 248, dev: 1, size: 0, state: CREATED, direction: SEND }
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676be0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 254, dev: 2, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd836676b60 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 254, dev: 1, size: 0, state: CREATED, direction: SEND }
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676a78 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 249, dev: 2, size: 0, state: CREATED, direction: SEND }
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd836676ba8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 1, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676c50 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 254, dev: 3, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676aa0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 248, dev: 3, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676b30 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 250, dev: 3, size: 0, state: CREATED, direction: SEND }
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676ae8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 249, dev: 3, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676b78 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 251, dev: 3, size: 0, state: CREATED, direction: SEND }
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676ac0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 250, dev: 2, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676c98 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 3, size: 0, state: CREATED, direction: SEND }
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676bc0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 252, dev: 3, size: 0, state: CREATED, direction: SEND }
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd836676b18 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 253, dev: 1, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676c08 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 253, dev: 3, size: 0, state: CREATED, direction: SEND }
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd836676a40 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 250, dev: 1, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd836676ad0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 252, dev: 1, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676a78 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 249, dev: 2, size: 0, state: CREATED, direction: SEND }
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd8366769f8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 249, dev: 1, size: 0, state: CREATED, direction: SEND }
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd836676ba8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 1, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676c08 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 253, dev: 3, size: 0, state: CREATED, direction: SEND }
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-14:13981:14062 [0] NCCL INFO comm 0x7f5528000f60 rank 53 nranks 64 cudaDev 5 busId 901d0 - Abort COMPLETE
compute-od-gpu-dy-p4d-24xlarge-14:13977:14060 [0] NCCL INFO comm 0x7fdabc000f60 rank 49 nranks 64 cudaDev 1 busId 101d0 - Abort COMPLETE
compute-od-gpu-dy-p4d-24xlarge-14:13978:14066 [0] NCCL INFO comm 0x7fd0f8000f60 rank 50 nranks 64 cudaDev 2 busId 201c0 - Abort COMPLETE
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 49] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807920 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007fd9b6bfd700 (most recent call first):
<no Python frame>
Thread 0x00007fd9b75fe700 (most recent call first):
<no Python frame>
Thread 0x00007fd9b7fff700 (most recent call first):
<no Python frame>
Thread 0x00007fd9d0bfd700 (most recent call first):
<no Python frame>
Thread 0x00007fd9d15fe700 (most recent call first):
<no Python frame>
Thread 0x00007fd9d1fff700 (most recent call first):
<no Python frame>
Thread 0x00007fd9ecbff700 (most recent call first):
<no Python frame>
Thread 0x00007fdb1b357700 (most recent call first):
<no Python frame>
Thread 0x00007fd84bfff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd8497fb700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd864dfa700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd8661fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd8f35fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd90cdfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd8f3fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd90e1fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd90f5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd928dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd90ffff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd92a1fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd92abfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd7dbfff700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fdb64e22700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fdb7bfff700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fdc5f4b7000 (most recent call first):
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/functional.py", line 360 in einsum
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 761 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 806 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 944 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1144 in p_losses
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1254 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 969 in _run_ddp_forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 394 in forward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
Traceback (most recent call last):
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757, in <module>
main()
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130, in __call__
return self.main(*args, **kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055, in main
rv = self.invoke(ctx)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760, in invoke
return __callback(*args, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753, in main
initialize_training(config_file, accelerator)
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736, in initialize_training
train(
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503, in train
loss = trainer(text=txt, image_embed=img)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107, in inner
out = fn(model, *args, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400, in forward
self.accelerator.backward(loss)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736, in backward
loss.backward(**kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: NCCL communicator was aborted on rank 46. Original reason for failure was: [Rank 46] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807949 milliseconds before timing out.
Traceback (most recent call last):
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757, in <module>
main()
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130, in __call__
return self.main(*args, **kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055, in main
rv = self.invoke(ctx)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760, in invoke
return __callback(*args, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753, in main
initialize_training(config_file, accelerator)
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736, in initialize_training
train(
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503, in train
loss = trainer(text=txt, image_embed=img)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107, in inner
out = fn(model, *args, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400, in forward
self.accelerator.backward(loss)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736, in backward
loss.backward(**kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: NCCL communicator was aborted on rank 12. Original reason for failure was: [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807935 milliseconds before timing out.
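(Both tracebacks surface inside loss.backward() because DDP's gradient collectives run on the same, now-aborted communicator: once the watchdog aborts it, every later NCCL call raises instead of communicating. A hypothetical guard around the training step (trainer/txt/img as in train() above); it can only checkpoint and exit, since an aborted process group cannot be reused in-place:

try:
    loss = trainer(text=txt, image_embed=img)
except RuntimeError as err:
    if "NCCL communicator was aborted" in str(err):
        # flush logs / save whatever local state is still trustworthy, then
        # exit non-zero so the launcher (slurm + elastic) can restart the job
        raise SystemExit(1)
    raise
)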
compute-od-gpu-dy-p4d-24xlarge-14:13976:14129 [0] NCCL INFO [Service thread] Connection closed by localRank 7
compute-od-gpu-dy-p4d-24xlarge-14:13980:14137 [4] NCCL INFO [Service thread] Connection closed by localRank 7
compute-od-gpu-dy-p4d-24xlarge-14:13982:14134 [6] NCCL INFO [Service thread] Connection closed by localRank 7
Traceback (most recent call last):
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757, in <module>
main()
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130, in __call__
return self.main(*args, **kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055, in main
rv = self.invoke(ctx)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760, in invoke
return __callback(*args, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753, in main
initialize_training(config_file, accelerator)
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736, in initialize_training
train(
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503, in train
loss = trainer(text=txt, image_embed=img)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107, in inner
out = fn(model, *args, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400, in forward
self.accelerator.backward(loss)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736, in backward
loss.backward(**kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward
Traceback (most recent call last):
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757, in <module>
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: NCCL communicator was aborted on rank 43. Original reason for failure was: [Rank 43] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807950 milliseconds before timing out.
main()
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130, in __call__
compute-od-gpu-dy-p4d-24xlarge-14:13983:14063 [0] NCCL INFO comm 0x7fbcc8000f60 rank 55 nranks 64 cudaDev 7 busId a01d0 - Abort COMPLETE
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
return self.main(*args, **kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055, in main
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 55] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807924 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007fbbcd5fe700 (most recent call first):
<no Python frame>
Thread 0x00007fbbcdfff700 (most recent call first):
<no Python frame>
Thread 0x00007fbbe8bfd700 (most recent call first):
<no Python frame>
Thread 0x00007fbbe95fe700 (most recent call first):
<no Python frame>
Thread 0x00007fbbe9fff700 (most recent call first):
<no Python frame>
Thread 0x00007fbc051fe700 (most recent call first):
<no Python frame>
Thread 0x00007fbc217fb700 (most recent call first):
<no Python frame>
Thread 0x00007fbc23357700 (most recent call first):
<no Python frame>
Thread 0x00007fbad3fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbaeffff700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acqui rv = self.invoke(ctx)
re
File "/fsx File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404, in invoke
/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3 return ctx.invoke(self.callback, **ctx.params)
.8/threadi File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760, in invoke
ng.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbb08dfa700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/po return __callback(*args, **kwargs)
ol.py File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753, in main
", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbb0bfff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbb097fb700 (most recent call first):
File "/ initialize_training(config_file, accelerator)
usr/lib64/python3 File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736, in initialize_training
.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbb261fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File " train(
/usr/ File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503, in train
lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbb0abfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbb40dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in wor loss = trainer(text=txt, image_embed=img)
ker
File "/usr File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbb26bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbb417fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool return forward_call(*input, **kwargs)
.py", line 114 in wor File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107, in inner
ker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbb421fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.p out = fn(model, *args, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400, in forward
y", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbb435fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/libTraceback (most recent call last):
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757, in <module>
64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbb5cdfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing self.accelerator.backward(loss)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736, in backward
/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fb9f21fc700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line main()
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threadin loss.backward(**kwargs)
g.py" File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396, in backward
, line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbd41cbc700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/ return f(*args, **kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130, in __call__
lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbdad8e3700 (most recent call first):
File "/usr/lib64/python3.8/ torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward
selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbe6a55a000 (most recent call first):
File "/fsx/nousr/ Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward passreturn self.main(*args, **kwargs)DALLE2-pyt
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055, in main
RuntimeErrororch/: NCCL communicator was aborted on rank 42. Original reason for failure was: [Rank 42] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807943 milliseconds before timing out.dalle2_
pytorch/dalle2_pytorch.py", line 546 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/container.py", line 139 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2- rv = self.invoke(ctx)
pytorch/dalle2_ File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404, in invoke
pytorch/dalle2_pytorch.py", line 709 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 806 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 944 in forward
File "/fsx/nousr/dalle2/l return ctx.invoke(self.callback, **ctx.params)
ib64/p File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760, in invoke
ython3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1144 in p_losses
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1254 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/dalle2/lib64/python3.8/site- return __callback(*args, **kwargs)
packa File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753, in main
ges/torch/nn/parallel/distributed.py", line 969 in _run_ddp_forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 394 in forward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/traine initialize_training(config_file, accelerator)
r.py", line 107 in in File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736, in initialize_training
ner
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8 train(
/site-pack File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503, in train
ages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torc loss = trainer(text=txt, image_embed=img)
h/dis File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
tributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
return forward_call(*input, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107, in inner
out = fn(model, *args, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400, in forward
self.accelerator.backward(loss)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736, in backward
loss.backward(**kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: NCCL communicator was aborted on rank 44. Original reason for failure was: [Rank 44] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807949 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of '[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
std::runtime_error'
what(): [Rank 42] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807943 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007fce4bfff700 (most recent call first):
<no Python frame>
terminate called after throwing an instance of 'Thread 0x00007fce64bfd700 (most recent call first):
std::runtime_error<no Python frame>
'
Thread 0x00007fce655fe700 (most recent call first):
<no Python frame>
Thread 0x what(): 00007fce65fff700[Rank 43] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807950 milliseconds before timing out. (most recent call first):
<no Python frame>
Thread 0x00007fce80bfd700 (most recent call first):
<no Python frame>
Thread 0x00007fce815fe700 (most recent call first):
<no Python frame>
Thread 0x00007fce81fff700 (most recent call first):
<no Python frame>
Thread 0x00007fcfdd014700 (most recent call first):
<no Python frame>
Thread 0x00007fcda35fe700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd04d8e3700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd10a697000 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 1027 in _wait_for_tstate_lock
File "/usr/lib64/python3.8/threading.py", line 1011 in join
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 717 in _terminate_pool
File "/usr/lib64/python3.8/multiprocessing/util.py", line 224 in __call__
File "/usr/lib64/python3.8/multiprocessing/util.py", line 300 in _run_finalizers
File "/usr/lib64/python3.8/multiprocessing/util.py", line 334 in _exit_function
Fatal Python error: Aborted
Thread 0x00007f23055fe700 (most recent call first):
<no Python frame>
Thread 0x00007f2305fff700 (most recent call first):
<no Python frame>
Thread 0x00007f231d3fd700 (most recent call first):
<no Python frame>
Thread 0x00007f231ddfe700 (most recent call first):
<no Python frame>
Thread 0x00007f231e7ff700 (most recent call first):
<no Python frame>
Thread 0x00007f2374bff700 (most recent call first):
<no Python frame>
Thread 0x00007f23f2f7b700 (most recent call first):
<no Python frame>
Thread 0x00007f23fd357700 (most recent call first):
<no Python frame>
Thread 0x00007f22257fb700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f24bbfff700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f259f096000 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 1027 in _wait_for_tstate_lock
File "/usr/lib64/python3.8/threading.py", line 1011 in join
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 717 in _terminate_pool
File "/usr/lib64/python3.8/multiprocessing/util.py", line 224 in __call__
File "/usr/lib64/python3.8/multiprocessing/util.py", line 300 in _run_finalizers
File "/usr/lib64/python3.8/multiprocessing/util.py", line 334 in _exit_function
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 50] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807912 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007fcfecbfd700 (most recent call first):
<no Python frame>
Thread 0x00007fcfed5fe700 (most recent call first):
<no Python frame>
Thread 0x00007fcfedfff700 (most recent call first):
<no Python frame>
Thread 0x00007fd008bfd700 (most recent call first):
<no Python frame>
Thread 0x00007fd0095fe700 (most recent call first):
<no Python frame>
Thread 0x00007fd009fff700 (most recent call first):
<no Python frame>
Thread 0x00007fd0253ff700 (most recent call first):
<no Python frame>
Thread 0x00007fd0f5014700 (most recent call first):
<no Python frame>
Thread 0x00007fcf2bfff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fcf461fc700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fcf44dfa700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fcf997fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fcf475fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fcf47fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fcf60dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fcf621fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fcf7f5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fcf635fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fcf7d7fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fcf7ffff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fcf7e1fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fce12bfd700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd1a39bb700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd1b7fff700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd29b4ed000 (most recent call first):
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 397 in forward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
[E ProcessGroupNCCL.cpp:737] [Rank 22] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808550 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 16] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808546 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 20] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808543 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 13] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808550 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808550 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 5] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808556 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 4] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808539 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808553 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 21] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808543 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 33] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808550 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 37] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808545 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 34] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808553 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 17] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808546 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 18] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808551 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808549 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 62] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808555 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 56] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808556 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 57] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808551 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 61] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808541 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 63] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808556 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 58] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808552 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 9] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808553 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 59] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808551 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 60] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808552 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29489, OpType=ALLREDUCE, Timeout(ms)=1800000) ran for 1808239 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:737] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29489, OpType=ALLREDUCE, Timeout(ms)=1800000) ran for 1808246 milliseconds before timing out.
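Note the sequence numbers above: most ranks are blocked in BROADCAST at SeqNum=29487 or 29488, while ranks 0 and 7 have already moved on to ALLREDUCE at SeqNum=29489, i.e. the ranks are no longer executing the same collective. The switches below (real PyTorch/NCCL environment variables, set before the process group is created) make this kind of mismatch easier to diagnose on a rerun; whether they apply cleanly to this exact launcher is an assumption:

    import os

    # Must run before torch.distributed / the Accelerator initializes.
    os.environ["NCCL_DEBUG"] = "INFO"                 # verbose NCCL logging
    os.environ["NCCL_DEBUG_SUBSYS"] = "COLL"          # focus on collective calls
    os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"     # fail fast on async NCCL errors
    os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"  # report mismatched collectives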
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807935 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007fa1b55fe700 (most recent call first):
<no Python frame>
Thread 0x00007fa1b5fff700 (most recent call first):
<no Python frame>
Thread 0x00007fa1d0bfd700 (most recent call first):
<no Python frame>
Thread 0x00007fa1d15fe700 (most recent call first):
<no Python frame>
Thread 0x00007fa1d1fff700 (most recent call first):
<no Python frame>
Thread 0x00007fa1ecbff700 (most recent call first):
<no Python frame>
Thread 0x00007fa321089700 (most recent call first):
<no Python frame>
Thread 0x00007fa321a8a700 (most recent call first):
<no Python frame>
Thread 0x00007fa0f2bfd700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fa3ab685700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fa4660e7000 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 1027 in _wait_for_tstate_lock
File "/usr/lib64/python3.8/threading.py", line 1011 in join
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 717 in _terminate_pool
File "/usr/lib64/python3.8/multiprocessing/util.py", line 224 in __call__
File "/usr/lib64/python3.8/multiprocessing/util.py", line 300 in _run_finalizers
File "/usr/lib64/python3.8/multiprocessing/util.py", line 334 in _exit_function
compute-od-gpu-dy-p4d-24xlarge-8:13871:14033 [2] NCCL INFO [Service thread] Connection closed by localRank 5
compute-od-gpu-dy-p4d-24xlarge-8:13869:14030 [0] NCCL INFO [Service thread] Connection closed by localRank 5
compute-od-gpu-dy-p4d-24xlarge-8:13871:14033 [2] NCCL INFO [Service thread] Connection closed by localRank 4
compute-od-gpu-dy-p4d-24xlarge-8:13869:14030 [0] NCCL INFO [Service thread] Connection closed by localRank 4
compute-od-gpu-dy-p4d-24xlarge-8:13875:14029 [6] NCCL INFO [Service thread] Connection closed by localRank 5
compute-od-gpu-dy-p4d-24xlarge-8:13875:14029 [6] NCCL INFO [Service thread] Connection closed by localRank 4
compute-od-gpu-dy-p4d-24xlarge-8:13874:13956 [0] NCCL INFO comm 0x7fbff8000f60 rank 5 nranks 64 cudaDev 5 busId 901d0 - Abort COMPLETE
compute-od-gpu-dy-p4d-24xlarge-8:13873:13951 [0] NCCL INFO comm 0x7f4d84000f60 rank 4 nranks 64 cudaDev 4 busId 901c0 - Abort COMPLETE
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 44] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807949 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007f4dfcdfc700 (most recent call first):
<no Python frame>
Thread 0x00007f4dfd7fd700 (most recent call first):
<no Python frame>
Thread 0x00007f4dfe1fe700 (most recent call first):
<no Python frame>
Thread 0x00007f4dfebff700 (most recent call first):
<no Python frame>
Thread 0x00007f4e54bfd700 (most recent call first):
<no Python frame>
Thread 0x00007f4e555fe700 (most recent call first):
<no Python frame>
Thread 0x00007f4e55fff700 (most recent call first):
<no Python frame>
Thread 0x00007f4f44c62700 (most recent call first):
<no Python frame>
Thread 0x00007f4d057fb700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f4fcdd88700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f508abf1000 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 1027 in _wait_for_tstate_lock
File "/usr/lib64/python3.8/threading.py", line 1011 in join
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 717 in _terminate_pool
File "/usr/lib64/python3.8/multiprocessing/util.py", line 224 in __call__
File "/usr/lib64/python3.8/multiprocessing/util.py", line 300 in _run_finalizers
File "/usr/lib64/python3.8/multiprocessing/util.py", line 334 in _exit_function
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 46] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807949 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007fb20a1fe700 (most recent call first):
<no Python frame>
Thread 0x00007fb20abff700 (most recent call first):
<no Python frame>
Thread 0x00007fb221f55700 (most recent call first):
<no Python frame>
Thread 0x00007fb223fff700 (most recent call first):
<no Python frame>
Thread 0x00007fb268bfd700 (most recent call first):
<no Python frame>
Thread 0x00007fb269fff700 (most recent call first):
<no Python frame>
Thread 0x00007fb2eab7c700 (most recent call first):
<no Python frame>
Thread 0x00007fb350bfd700 (most recent call first):
<no Python frame>
Thread 0x00007fb10d7fb700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fb3da1e1700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
py", line 890 in _bootstrap
Thread 0x00007fb494a6a000 (most recent call first):
File "/usr/lterminate called after throwing an instance of 'ib64/std::runtime_error'
pytho what(): [Rank 53] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807922 milliseconds before timing out.n3.8/threading.py", line
1027 in _wFatal Python error: Aborted
ait_fThread 0x00007f54417fb700 (most recent call first):
or_ts<no Python frame>
Thread 0xtate_00007f54421fc700 (most recent call first):
<no Python frame>
lock
Thread 0x00007f5442bfd700 (most recent call first):
<no Python frame>
File "/u
Thread 0x00007f54435fe700 (most recent call first):
<no Python frame>
sr/li
Thread 0x00007f5443fff700 (most recent call first):
b64/p<no Python frame>
Thread 0x00007f545907d700 (most recent call first):
ython<no Python frame>
Thread 0x00007f545b5fe700 (most recent call first):
3.8/t<no Python frame>
Thread 0x00007f545bfff700 (most recent call first):
hreading.py", line 1011<no Python frame>
Thread 0x00007f532b5fe700 in joi (most recent call first):
File "n
File "/usr/usr/li/lib6b64/python3.8/mu4/python3.ltipr8/multocessiproceing/pssingool.p/pool.y", line 717 in py"_term, line 576 in _handleinate_resu_poollts
File
File "/"/usr/lusr/ib64/lib64/pythopython3.8/multin3.8/procethreassingding./utilpy", line 870.py", line in run224 in __
File "/ucall_sr/lib64/_
File "pytho/usr/n3.8/lib64threa/pythding.on3.8py", line 932/mult in _bootstrap_iniprocessiner
File ng/ut"/usril.py/lib64/p", line 300 in ython_run_3.8/tfinalizhreading.pyers
File "/usr", line 890 in /lib6_boot4/pytstraphon3.
Thread 0x00007f5344dfa700 (most recent call first):
8/mul File "/utiprocessing/sr/lib64/putil.ythonpy", line 3343.8/t in _exhreadit_fuing.pnctioy", line 302 in n
wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f53461fc700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f5346bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f53475fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f53621fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f537ffff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f537d7fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f5363fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f53997fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f539a1fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f537cdfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f537f5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f524abfd700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f55d3fff700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f56126e2700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f56cce44000 (most recent call first):
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173 in backward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396 in backward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736 in backward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400 in forward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
compute-od-gpu-dy-p4d-24xlarge-8:13869:14030 [0] NCCL INFO [Service thread] Connection closed by localRank 2
compute-od-gpu-dy-p4d-24xlarge-8:13875:14029 [6] NCCL INFO [Service thread] Connection closed by localRank 2
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
compute-od-gpu-dy-p4d-24xlarge-15:13987:14145 [0] NCCL INFO [Service thread] Connection closed by localRank 2
compute-od-gpu-dy-p4d-24xlarge-15:13987:14145 [0] NCCL INFO [Service thread] Connection closed by localRank 1
compute-od-gpu-dy-p4d-24xlarge-15:13987:14145 [0] NCCL INFO [Service thread] Connection closed by localRank 3
compute-od-gpu-dy-p4d-24xlarge-15:13991:14142 [4] NCCL INFO [Service thread] Connection closed by localRank 1
compute-od-gpu-dy-p4d-24xlarge-15:13993:14146 [6] NCCL INFO [Service thread] Connection closed by localRank 1
compute-od-gpu-dy-p4d-24xlarge-15:13991:14142 [4] NCCL INFO [Service thread] Connection closed by localRank 2
compute-od-gpu-dy-p4d-24xlarge-15:13993:14146 [6] NCCL INFO [Service thread] Connection closed by localRank 2
compute-od-gpu-dy-p4d-24xlarge-15:13993:14146 [6] NCCL INFO [Service thread] Connection closed by localRank 3
compute-od-gpu-dy-p4d-24xlarge-15:13991:14142 [4] NCCL INFO [Service thread] Connection closed by localRank 3
compute-od-gpu-dy-p4d-24xlarge-8:13871:13952 [0] NCCL INFO comm 0x7fa078000f60 rank 2 nranks 64 cudaDev 2 busId 201c0 - Abort COMPLETE
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 5] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808556 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007fbf095fe700 (most recent call first):
<no Python frame>
Thread 0x00007fbf09fff700 (most recent call first):
<no Python frame>
Thread 0x00007fbf253fd700 (most recent call first):
<no Python frame>
Thread 0x00007fbf25dfe700 (most recent call first):
<no Python frame>
Thread 0x00007fbf267ff700 (most recent call first):
<no Python frame>
Thread 0x00007fbf42bfd700 (most recent call first):
<no Python frame>
Thread 0x00007fbf43fff700 (most recent call first):
<no Python frame>
Thread 0x00007fbf5df57700 (most recent call first):
<no Python frame>
Thread 0x00007fbe0ffff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbe2a1fc700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbe461fc700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbe2abfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbe297fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbe2b5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbe2bfff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbe457fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbe44dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114compute-od-gpu-dy-p4d-24xlarge-15:13987:14145 [0] NCCL INFO [Service thread] Connection closed by localRank 4
in worker
File "/usr/lib64/python3.8/threading.pycompute-od-gpu-dy-p4d-24xlarge-15:13993:14146 [6] NCCL INFO [Service thread] Connection closed by localRank 4
", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbe60dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbe62bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbe63fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbe621fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbd2cdfa700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc0b4b1f700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc0e18e3700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc19e62b000 (most recent call first):
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 397 in forward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
compute-od-gpu-dy-p4d-24xlarge-9:13984:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 6
compute-od-gpu-dy-p4d-24xlarge-9:13986:14137 [2] NCCL INFO [Service thread] Connection closed by localRank 6
compute-od-gpu-dy-p4d-24xlarge-9:13984:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 5
compute-od-gpu-dy-p4d-24xlarge-9:13986:14137 [2] NCCL INFO [Service thread] Connection closed by localRank 5
compute-od-gpu-dy-p4d-24xlarge-15:13988:14074 [0] NCCL INFO comm 0x7f0394000f60 rank 57 nranks 64 cudaDev 1 busId 101d0 - Abort COMPLETE
compute-od-gpu-dy-p4d-24xlarge-15:13990:14076 [0] NCCL INFO comm 0x7f93b0000f60 rank 59 nranks 64 cudaDev 3 busId 201d0 - Abort COMPLETE
compute-od-gpu-dy-p4d-24xlarge-15:13989:14075 [0] NCCL INFO comm 0x7f3c9c000f60 rank 58 nranks 64 cudaDev 2 busId 201c0 - Abort COMPLETE
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 57] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808551 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007f02a21fc700 (most recent call first):
<no Python frame>
Thread 0x00007f02a2bfd700 (most recent call first):
<no Python frame>
Thread 0x00007f02a35fe700 (most recent call first):
<no Python frame>
Thread 0x00007f02a3fff700 (most recent call first):
<no Python frame>
Thread 0x00007f02bd3fd700 (most recent call first):
<no Python frame>
Thread 0x00007f02bddfe700 (most recent call first):
<no Python frame>
Thread 0x00007f02be7ff700 (most recent call first):
<no Python frame>
Thread 0x00007f02f5f57700 (most recent call first):
<no Python frame>
Thread 0x00007f018b5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f01a4dfa700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f01a75fe700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f01de1fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f01df5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f01c35fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f01dcdfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f01f8dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f01fa1fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f01fb5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f0214dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f0232bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f02175fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f00abfff700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f0406f45700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f047b19d700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 58] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808552 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007f3bacffd700 (most recent call first):
<no Python frame>
Thread 0x00007f3bad9fe700 (most recent call first):
<no Python frame>
Thread 0x00007f3bae3ff700 (most recent call first):
<no Python frame>
Thread 0x00007f3c8ef7b700 (most recent call first):
<no Python frame>
Thread 0x00007f3c8f97c700 (most recent call first):
<no Python frame>
Thread 0x00007f3ca0bfd700 (most recent call first):
<no Python frame>
Thread 0x00007f3ca1fff700 (most recent call first):
<no Python frame>
Thread 0x00007f3cf97ff700 (most recent call first):
<no Python frame>
Thread 0x00007f3a975fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3accdfa700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3ab0dfa700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3ab17fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3ab21fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3ab2bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3ab3fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3ace1fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3acebfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3acffff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3ae8dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3ae97fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3aea1fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f39b6bfd700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3d44f0e700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3d826e2700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
compute-od-gpu-dy-p4d-24xlarge-15:13991:14077 [0] NCCL INFO comm 0x7f5490000f60 rank 60 nranks 64 cudaDev 4 busId 901c0 - Abort COMPLETE
compute-od-gpu-dy-p4d-24xlarge-10:13982:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 5
compute-od-gpu-dy-p4d-24xlarge-10:13984:14140 [2] NCCL INFO [Service thread] Connection closed by localRank 5
compute-od-gpu-dy-p4d-24xlarge-10:13986:14133 [4] NCCL INFO [Service thread] Connection closed by localRank 5
compute-od-gpu-dy-p4d-24xlarge-10:13988:14141 [6] NCCL INFO [Service thread] Connection closed by localRank 5
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 60] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808552 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007f5378bfd700 (most recent call first):
<no Python frame>
Thread 0x00007f53795fe700 (most recent call first):
<no Python frame>
Thread 0x00007f5379fff700 (most recent call first):
<no Python frame>
Thread 0x00007f5394bfd700 (most recent call first):
<no Python frame>
Thread 0x00007f53955fe700 (most recent call first):
<no Python frame>
Thread 0x00007f5395fff700 (most recent call first):
<no Python frame>
Thread 0x00007f53acbff700 (most recent call first):
/8<no Python frame>
u/
saThread 0xrs00007f5405357700/ (most recent call first):
ylni<no Python frame>
cb
i6Thread 0x4o00007f529bfff700// (most recent call first):
bp File yats"eh_o/envu3es.nr8t//sa.lspyyin"b, line c5706i in o4r/u/bnpay_stefh_oorenve3ev.net8rs/
.mp File uy"l/t"uis, line p570rr in /orlcuiebsn6s_4f/ionprygetv/heproo
on File l"3./.pu8ys/"rt/h, line lri576eb in a6_d4hi/apncompute-od-gpu-dy-p4d-24xlarge-10:13982:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 1
nydgt.hlpoeyn_"3r, line .870e8 in s/urtlhutrnse
a File File d"i"/nu/gsu.rscompute-od-gpu-dy-p4d-24xlarge-10:13984:14140 [2] NCCL INFO [Service thread] Connection closed by localRank 1
/prly/"lii, line bb68706compute-od-gpu-dy-p4d-24xlarge-10:13986:14133 [4] NCCL INFO [Service thread] Connection closed by localRank 1
4 in 4/rp/compute-od-gpu-dy-p4d-24xlarge-10:13988:14141 [6] NCCL INFO [Service thread] Connection closed by localRank 1
uyptnyh
toh File on"n3/3..u88s//rtt/hhlirrbee6aa4dd/ipinnyggt.p.hypo"yn", line 3932., line in 8870/_ in tbrhouronet
sa File tdr"ia/pnug_s.irpn/lniyeb"r6, line 4
932/ in File p_"yb/utoshorots/ntlr3.ia8bp/6_4itn/hnpreyertah
do File in"n3g/..up8sy/"rt, line /hlr932ie in ba6_4db/iopotnysgtt.hrpoaypn"_3, line i.890n in n8_e/brt
o File ho"rt/eussatdrri/anlgp.i
pby
6"Thread 0x4, line 00007f0535aef000/ (most recent call first):
890p in File y_"/tbfhosooxtn/s3n.to8rua/strp/h
rd
eaaThread 0xld00007f3e3ce00000lie (most recent call first):
2n File /"gl/.ifbps6yx4"//p, line ny890ot in huo_snrb3/o.doa8t/lssltiert2ae/p-l
pi
abThread 0x6c00007f52b61fc7004ka (most recent call first):
/gp File ye"st//uhstoron/r3li.bc86h/4/s/inptynte/h-moopdnauc3lk.ae8gse//st/mhtordeouarldeci.hnp/gyn."np, line /y1198f"u in , line n_302c_ in tgiweotanaiatt
lt.r File p_"y_"/
, line u1252 File s in r"d/r/lofpsioxbu/6tn4
o/u File sp"r/fy/stxDhA/oLnnLo3Eu.s28r/-t/pdyhatrloelreac2dh/i/lngid.bap6ly4"l/, line pe433y2 in t_ahpyctoqonru3chi./r8de/a
sl File lie"t2e/_-fppsyaxtco/krancgoheu.ssp/ryt/"od, line ar546lc in lhfeo/2rn/nli/bwm6ao4rd/dup
ylt File eh"so//fdnsr3xo./p8on/ousuitts.epr-y/pd"aa, line cl58kl in aefg2oe/rlswia/bre6md4b
e/ File dp"dy/iftsnhxgo/_nnroeua3sd.re8/rd//saiptlalee-r2pq/aucelktiab_g6ne4s/u/tmpoyrpctyh_h/ronenna/d3meo.r8d./upslyie"tse/, line -m121poad in cukpaligeeecse._/pgtyeo"nr, line ec1130rha in /t_ncnoa/rl
mlo File _d"ium/lpuelss
r File //m"lo/idfbus6lx4e//.pnpyoyu"ts, line hr1130o/n in d3_ac.la8lll/e_m2iu/mllptili
bp6 File r4"/o/cpfeysstxsh/ionnno3gu./s8rp//osDoiAltL.eLp-Ey2p-a"pcy, line kt388oa in g_regcsh/u/tadoarrldclehed/2n__ntp/yamtsookdr_ucglhee/snde/aclroalnteti2a_oipnny
etro File r.c"ph.y"py/, line "140u in , line s806fr in o/lrfiowbra6wr4ad/r
dp File
y" File /t"h/offsnsxx3//.n8no/oumususlrr//tdiadplalrlloeec22/e/lslisbiib6n644g//pp/yyptthhoooonnl33...88p//sysi"tiet, line -e532 in -pp_aahccakknaadggleesse//_ttootrracschkh/sn
/nn/n File m/"omo/dduulsuerless///mmlooiddbuu6ll4ee/..pppyyyt"h", line o, line 11301130 in n in _3_c.8c/aatllhllr__iemiapmdpll
i
File n File "g"/./fpfsysxx"//, line nn870oo in uurssurrn//
DD File AA"L/LuLsLr/ElE2i-2bp6-y4pty/oprtyocrthchh/od/ndaa3lll.lee822/__tpphyyrtteooarrdcchih/n/ddagal.lpllye2e"_2_p, line py932ytt in oo_rbrcocohht..spptyyra"p", line _, line 718i in 944nf in noefrrow
ra File rw"da/r
ud File s"
r/ File /f"ls/ifxbs/x6n/4on/oupsuysrrt//hddonaa3ll.lle8e/2/2lt/ihlrbie6ba4d6/i4p/nypgtyh.otpnhy3o".n, line 83890/. in s8_/bisotioett-esp-taracpkpaa
cg
keThread 0xsa00007f52d0dfa700/g (most recent call first):
etos File /r"tco/hruc/snhrn///lmnionbd/6mu4ol/depsuy/ltmehosod/num3lo.d8eu/l.speey."pl, line ye1130"c in , line t1130o_ in rc_sac.lpalyl_"li, line m_p415il in m
spe File ll"
e/ File cf"t
/s File xf/"sn/xou/usnsroru//slriD/bADLA6LL4E/Lp2Ey2-tpy-thopoynrt3oc.hr/8cd/haml/uldleat2li_lpepy2rt_oocrpecyshts/oirndcalhlg/e/d2ac_olpnlytneeo2c_rtpcyicompute-od-gpu-dy-p4d-24xlarge-12:13921:14078 [0] NCCL INFO [Service thread] Connection closed by localRank 5
thoo.nrpcompute-od-gpu-dy-p4d-24xlarge-12:13923:14077 [2] NCCL INFO [Service thread] Connection closed by localRank 5
cy.h".p, line pyy"806" in , line , line 931f1144 in o in rwpaw_ialtro
ds File
s File compute-od-gpu-dy-p4d-24xlarge-12:13925:14080 [4] NCCL INFO [Service thread] Connection closed by localRank 5
"e"s//
fu File ss"xr///fnlsoixubs6/r4/n/dopuaylstlrhe/o2D/compute-od-gpu-dy-p4d-24xlarge-12:13927:14079 [6] NCCL INFO [Service thread] Connection closed by localRank 5
lAinLb3L6.E24-8/p/pymytuotlrthcoinhp/3r.do8c/aeslsliseti2e_-nppgayc/tkpaoogroecls.h//tpdoayrl"cl, line he/4992n in _n_p/wymatoiodtur_lcfehos.r/p_myuo"pd, line du1254al in tefe.ospr
yw" File a, line r1130" in d/
_u File cs"ar/l/flls_iixbm/6pn4lo
/u File p"ys/trfh/osndx3/a.ln8l/oemu2us/rllt/iibDp6Ar4Lo/LcpEey2tsh-sopiynn3t.go8/r/spciohto/edl-a.plpalyec2"k_, line ap519gye in tso_rh/chat/ndodarllcleeh2_/_wnponyrt/kmeoorrdsuc
lhe. File sp"/ym/"oud, line su944rl in /efl.oiprbwy6a"4r, line /dp1130
y in File t_hc"oa/llnf_3si.xm8/pn/lo
tu File sh"rr/e/afddsaixlnl/gen.2po/yu"ls, line ir870/bd in 6arl4ul/npe
2y/ File tl"hio/bnu634.s/8rp/y/tlshioinbt36e.48-//ppsaycitkthaeo-gpnea3sc./8kt/aotgrhecrshe//andtnio/nmrgc.ohp/dyu"nl, line n932/e in sp/_ambrooadolullteesl.t/pdryia"sp, line t_1130r in ii_nbcuntaelerld
_.ip File myp""l, line /
969u in File s_r"r//ufnls_ixdb/d6npo4_u/fspro/yrDtwAhaoLrnLd3E
.28 File -/p"t/yhftrsoerxc/ahnd/oidunsagrl./pldeya2"_lp, line ly890et in 2o_/rblocohit/bsd6t4ar/lplayep2t_
hp
yoThread 0xtn00007f52d17fb700o3 (most recent call first):
.r8 File /c"sh/iut.sprey/-"pl, line ai1144cb in k6ap4g_/leposys/ttsheoosnr
3 File c."8h///fmnsunxl//tnpioapurrsaorlcle/esDlsA/idLniLgsE/2tp-ropioybtlou.rtpceyhd"/.pd, line ya114"l in , line l1008we in o2f_ropkryetwroa
rr File cd"h
// File u"sd/arfl/sllexi2/b_n6p4oy/uptsoyrrt/chdho.anp3lyl."e8, line 2/1254/ in ltfihobr6re4wa/adpridyn
tg File h."op/nfy3s".x, line 8/870/n in soriuutsne
r- File /p"/daaucllsker2a//glleiisbb/66t44o//rppcyyhtt/hhnoonn/n33m..o88d//utslhiertesea/-dmpioandcgku.lapegy.e"sp, line /y932t" in o, line _r1130b in co_hoc/atnlsntl/r_maiopmd_upilln
en File se/"rm/
of File sd"x//uunlsoer.u/plsyir"b/, line D6A1130L4 in L_/cEap2l-ylpt_yhitomonprlc3
h./8 File d/a"t/lhlfers2ex_/apndyiontugos.rrpc/yhd/"at, line lrla890ei2n in /e_lrib.bop6o4yt"/spt, line ry394ta in phfo
no3
rThread 0x.w8a00007f52d21fc700/r (most recent call first):
sd File
i"t File /e"-u/psafrsc/xk/lanigboe6u4srs///DptyAotLrhLcoEh2n-/3.pn8ynt//omprucalrht/aidlpalrleolclee/2ds_spiysittnrogi/rbpuoctohel/d..ptpyry"a", line i, line 114n969 in in ew_ror.rpuykn"e_, line rd107
d in File pi"n_n/feuro
sr File rw"/a/lrfidsb
x6/4 File n/"op/uyfstsrhx//odnnao3lu.ls8er//t2dh/arlleilabde6i24n//gpl.yiptbyh6"4o, line n/8703p. in 8yr/tuhsnio
tn File e3"-.p/8aus/crskcompute-od-gpu-dy-p4d-24xlarge-10:13982:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 6
/ialtcompute-od-gpu-dy-p4d-24xlarge-10:13984:14140 [2] NCCL INFO [Service thread] Connection closed by localRank 6
compute-od-gpu-dy-p4d-24xlarge-10:13986:14133 [4] NCCL INFO [Service thread] Connection closed by localRank 6
gieeb-s6p/4at/copkryactghhe/onsnn/3t/.om8ro/ctdhhur/lenaedns/i/npmaogr.adpluyll"eel, line ./932dp in iys_"tb, line or1130o in i_tcsbatulrtlea_pdi._pmiyp"nl, line n
1008e File in r"f/
orfw File as"rx/d/
un File so"ru//slfris/bxD6A/4Ln/opLuyEstr2/-hdpoanyl3tlo.re8c/2ht//hlrtierb6aa4di/ipnny_tgdh.iopfnyf3"u., line 8s890/i in so_inbt_oeop-tpsratcirkoaarpg.e
ps
/yt"Thread 0xo, line r00007f52d2bfd700c503 (most recent call first):
h in File /n"tn/r/uamsiornd/
ull File ie"bs/6/f4ms/oxpd/yulnteoh.opunsy3r"., line /81130D/ in A_mLcuLaEll2l-_tpiimypptrloo
File cre"cs/hsf/itsnrxga//ninpo_oduoislf.rfp/uysD"iA, line oL114nL in _Ewp2or-rpkiyotero.rrp
cyh File /""d/a, line u736l in lsiern2i/_tlipaiylbti6oz4re/_cpthyr/ttahrioannii3nn.ge8r
/. File t"hp/ryfe"sax, line d/394in in onfugos.rrpw/Dya"ArL, line dL870E
in File 2r"-upn/y
fts File x"o//rnucohsu/strr/rD/aAliiLnLb_E6d42if-/fppyuytsotrihcohon/n3d_.apr8li/loterh2.r_peypa"ydt, line io753n in rgmc.ahp/iytn"r
, line a File 932i" in /nf_esbrxo/.onptyos"uts, line rr107/a in dp_iainlnnlneeer2r/
l File File "i/"bf6/su4xs//nrop/ulysibtr6/h4od/anpl3yl.te8h2//oslnii3tb.6e84-//pptayhctrkheaoagdneis3n./8gc/.lspiiyct"ke, line -/890pc in ao_crbkoaego.etpss/yt"tro, line ar760cp in
hi
/nnvThread 0xno00007f52effff700/k (most recent call first):
meo
File d File ""u//lufesssrx///mlnoiodbu6usl4re//p.dypatlyhl"oe, line n113023 in /._l8icba/6l4ml/_upilymtptlih
po File nr"3o./c8fe/sssxsi/itnneo-ugsp/rap/cDokAoaLlLgE.e2p-syp/"yct, line lo114ri in ccwhko//rctkroearrie
n. File _p"dyi/"fuf, line us1404sr in i/iolnnivbo_6kp4er/
ip File yo"tr/h.fopsnyx3"/., line n8503o/ in uttshrrra/idenaa
dl File i"ln/egf2.s/plxiy/"bn, line 6o4870/ in uprsyurtn/h
Do File An"L3L/.Eu2s8-/rps/iylttieob-r6pc4ha//cpkytarthgaoeinsn3_/.dci8l/fiftchukrs/eiconao_drpiern.igpo.yr"p, line .y1055p"y in , line m"a, line 932i736 in n in _
ibn File oi"toi/taslftiszerx_a/pnt_roiunasniren/rid
nagl File
l" File e"/2/uf/ssrlx//inlobiu6sb46r4///DppyyAttLhhoLnoE3n2.3-8./p8sy/itttoherr-cepahadc/iktnargag.iepnsy_/"dci, line lf890if in uc_skbioo/noc_topsrtreri.apopry
."
, line Thread 0xp113000007f52d35fe700y in (most recent call first):
_"_, line File c753"a in l/mlua_sir_n/
l File File i""b/6/f4fss/xpx/yn/tonuhsooru/dnsa3lr.l/8ed/a2ml/ullleit2bi/6pl4ri/obpce6ys4th/sopinyn3t.hg8o//npso3io.tl8e/-.sppiaytc"ke, line a114g- in epswa/occrlkkiecarkg
e/ File sc"//oturoserr.c/phly/i"dbi6, line st7604r in /iipbnyvuottehkdeo/n
e File 3l".a/8sf/sttxih/crn/eomuausdlrit/ndgia.plpryol"ec, line 2e870/s in slriiunbng
6/4e File /r"pry/otuhrsosr//n_l3_.ii8nb/i6ts_4i_/.tppey-ytp"ha, line oc345kn in a3w.g8rea/stp/hpcrleeairdc
ki/ File nc"o/gf.rsepxy/."np, line oy932u in "s_, line rb1404/o in DoiAtLnsvLotEkr2ea-
pp File _y"i/tnofnrsecxrh/
/n File to"ru/ausirsn/_rdda/illflifeb2u/6sli4ibo/6np4_y/ptprhyoitnho3o.rn83./p.t8yh"/rs, line eiatd757e in i-<npmgaoc.dkpuaylg"ee, line s>890/
in c_libcoko/tcsotrrea.pp
y
"Thread 0x, line 105500007f52ee1fc700 in (most recent call first):
m File a"i/nu
s File r"//lfisbx6/4n/opuystrh/odna3l.l8e/2m/ullitbi6p4r/opcyestshionng3/.p8o/osl.iptye"-, line p114a in cwkoargkeesr/
c File l"i/cuks/rc/olrieb.6p4/yp"y, line t1130h in o_n_3c.a8/ltlh_r_e
a File din"g./pfys"x/, line n870o in ursurn/
d File a"l/lues2r//lliibb6644//pyptyhtohno3n.38./8s/itther-ecompute-od-gpu-dy-p4d-24xlarge-10:13982:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 2
paadciknagg.epsy/"t, line o932r in c_hb/odoitsstrtirbaupte_di/nenlears
t File i"c//ucompute-od-gpu-dy-p4d-24xlarge-10:13986:14133 [4] NCCL INFO [Service thread] Connection closed by localRank 2
msur/lltiibp6r4o/cepsystihnogn/3er.r8o/rtsh/r_e_iandiitn_g_..ppyy"", line , line 345890 in in w_rabpopoetrs
t File r"a/pf
s
xThread 0x/00007f53097fb700n (most recent call first):
ou File s"r//uDsArL/LEl2i-bp6y4t/oprycthh/otrna3i.8n/_mudlitfifpursoicoens_spirnigo/rp.opoyl".p, line y757" in , line <114m in owdourlkee>r
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f53261fc700 (most recent call first):
File "/usr/lib64compute-od-gpu-dy-p4d-24xlarge-10:13982:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 4
/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f53257fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f52eebfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f51b97fb700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f5543e1d700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threacompute-od-gpu-dy-p4d-24xlarge-12:13921:14078 [0] NCCL INFO [Service thread] Connection closed by localRank 2
ding.py", line 890 in _bootstrap
Thread 0x00007f558121c700 (most recent call first):
File "/uscompute-od-gpu-dy-p4d-24xlarge-12:13925:14080 [4] NCCL INFO [Service thread] Connection closed by localRank 2
compute-od-gpu-dy-p4d-24xlarge-12:13927:14079 [6] NCCL INFO [Service thread] Connection closed by localRank 2
r/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/ascompute-od-gpu-dy-p4d-24xlarge-9:13984:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 1
yncio/base_events.py", line 570 in run_forever
File "/usr/lib64/pcompute-od-gpu-dy-p4d-24xlarge-9:13986:14137 [2] NCCL INFO [Service thread] Connection closed by localRank 1
ython3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f563796c000 (most recent call first):
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 546 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 710 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 806 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 944 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1144 in p_losses
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1254 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 969 in _run_ddp_forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 394 in forward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
compute-od-gpu-dy-p4d-24xlarge-12:13921:14078 [0] NCCL INFO [Service thread] Connection closed by localRank 1
compute-od-gpu-dy-p4d-24xlarge-12:13925:14080 [4] NCCL INFO [Service thread] Connection closed by localRank 1
compute-od-gpu-dy-p4d-24xlarge-12:13927:14079 [6] NCCL INFO [Service thread] Connection closed by localRank 1
compute-od-gpu-dy-p4d-24xlarge-9:13984:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 3
compute-od-gpu-dy-p4d-24xlarge-9:13986:14137 [2] NCCL INFO [Service thread] Connection closed by localRank 3
compute-od-gpu-dy-p4d-24xlarge-15:13987:14145 [0] NCCL INFO [Service thread] Connection closed by localRank 6
compute-od-gpu-dy-p4d-24xlarge-15:13987:14145 [0] NCCL INFO [Service thread] Connection closed by localRank 5
compute-od-gpu-dy-p4d-24xlarge-15:13987:14145 [0] NCCL INFO [Service thread] Connection closed by localRank 7
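The per-thread tracebacks throughout this log are Python faulthandler output, emitted as each rank aborted. A minimal sketch, assuming the entry point can be edited and a Unix signal is free, of arming the same dump on demand so a hung rank can be inspected before the watchdog fires:

import faulthandler
import signal

faulthandler.enable()                  # dump all threads on fatal signals, as seen above
faulthandler.register(signal.SIGUSR1)  # `kill -USR1 <pid>` dumps threads without killing the rank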
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13976 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13978 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13979 closing signal SIGTERM
compute-od-gpu-dy-p4d-24xlarge-15:13992:14072 [0] NCCL INFO comm 0x7fc204000f60 rank 61 nranks 64 cudaDev 5 busId 901d0 - Abort COMPLETE
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13980 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13981 closing signal SIGTERM
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 61] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808541 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007fc1075fe700 (most recent call first):
<no Python frame>
Thread 0x00007fc107fff700 (most recent call first):
<no Python frame>
Thread 0x00007fc1213fd700 (most recent call first):
<no Python frame>
Thread 0x00007fc121dfe700 (most recent call first):
<no Python frame>
Thread 0x00007fc1227ff700 (most recent call first):
<no Python frame>
Thread 0x00007fc1e4bff700 (most recent call first):
<no Python frame>
Thread 0x00007fc2015fe700 (most recent call first):
<no Python frame>
Thread 0x00007fc25ef57700 (most recent call first):
<no Python frame>
Thread 0x00007fc00abfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc0275fe700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc05cdfa700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13982 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13983 closing signal SIGTERM
wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc026bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc042bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc040dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc0421fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc043fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc05d7fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc05e1fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc05ebfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc07b5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc0797fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fbf297fb700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc297a7b700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc2ed7e0700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc3a40a0000 (most recent call first):
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 752 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 806 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 944 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1144 in p_losses
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1254 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 969 in _run_ddp_forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 394 in forward
compute-od-gpu-dy-p4d-24xlarge-15:13993:14071 [0] NCCL INFO comm 0x7f4a68000f60 rank 62 nranks 64 cudaDev 6 busId a01c0 - Abort COMPLETE
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train
File "/fsx/nousr/DALL[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
E2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousterminate called after throwing an instance of 'r/DALstd::runtime_errorL'
E2-pytorch/train_d what(): if[Rank 62] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808555 milliseconds before timing out.fus
ion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
Fatal Python error: Aborted
Thread 0x00007f4959fff700 (most recent call first):
<no Python frame>
Thread 0x00007f4974ffd700 (most recent call first):
<no Python frame>
Thread 0x00007f49759fe700 (most recent call first):
<no Python frame>
Thread 0x00007f49763ff700 (most recent call first):
<no Python frame>
Thread 0x00007f4990b38700 (most recent call first):
<no Python frame>
Thread 0x00007f4991539700 (most recent call first):
<no Python frame>
Thread 0x00007f4993357700 (most recent call first):
<no Python frame>
Thread 0x00007f4ad9532700 (most recent call first):
<no Python frame>
Thread 0x00007f48b21fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f487bfff700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiproccompute-od-gpu-dy-p4d-24xlarge-15:13994:14073 [0] NCCL INFO comm 0x7fcb68000f60 rank 63 nranks 64 cudaDev 7 busId a01d0 - Abort COMPLETE
essing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f48957fb700 (most recent call first):
File "/usr/[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/pythonterminate called after throwing an instance of '3.8std::runtime_error/'
multiprocessin what(): g[Rank 63] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808556 milliseconds before timing out./c
onnection.py", line 931 in Fatal Python error: wAborteda
it
Thread 0x File 00007fca2f5fe700" (most recent call first):
/<no Python frame>
us
rThread 0x/l00007fca2ffff700i (most recent call first):
b<no Python frame>
6
4Thread 0x/00007fca48bfd700p (most recent call first):
y<no Python frame>
t
hThread 0xo00007fca495fe700n (most recent call first):
3<no Python frame>
.
8Thread 0x/00007fca49fff700m (most recent call first):
u<no Python frame>
l
tThread 0xi00007fca653ff700p (most recent call first):
r<no Python frame>
o
Thread 0xce00007fca7db56700s (most recent call first):
s<no Python frame>
iThread 0xn00007fca7e557700g (most recent call first):
/<no Python frame>
p
Thread 0xoo00007fcbd72bb700l (most recent call first):
. File py""/, line u499s in r_/wlaiibt6_4f/poyrt_uhpodna3t.e8s/
mu File l"t/uisprr/olciebs6s4i/npgy/tpohooln.3p.y8"/, line mu576l in t_ihparnodclee_srseisnugl/tpso
o File l."p/yu"s, line r519/ in l_ihba6n4d/lpey_twhoornk3e.r8s/
t File h"r/euasdri/lnig.bp6y4"/, line p870y in trhuonn
3. File 8"//tuhsrr/elaidbi6n4g/.ppyyt"h, line o870n in 3r.u8n/
t File h"r/euadsirn/gl.ipby6"4, line /932p in y_tbhoootns3.t8r/apth_rienandeir
n File g"./puys"r/, line l932i in b_6b4o/optytsthroanp3_[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
.i8n/ntehrr
e File a"d/iunsgr./plyi"b, line 68904 in /_pbyotohtosnt3r.a8p/
t
hThread 0xr00007fcbd7cbc700e (most recent call first):
a File d"i/nugs.rp/yl"i, line b8906 in 4/_pbyotohtosnt3r.a8p/
t
hThread 0xr00007f48ccdfa700e (most recent call first):
File a"d/iunsgr./plyi"b, line 63024 in /wpayitth
o File n"3/.u8sr//mluilbt6i4p/rpoyctehsosni3n.g8//pthoroela.dpiyn"g, line .114p in yw"o, line r433k in earcq
u File i"r/eu
s File r"//lfisbx6/4n/opuystrh/odna3l.l8e/2t/hlreiabd6i4n/gp.yptyh"o, line n8703 in .r8u/ns
i File t"e/-upsarc/klaigbe6s4//epmybtehdodni3n.g8_/rtehardeeard/ipnagr.pqyu"e, line t932_ in n_ubmopoyt_srteraadpe_ri.npnye"r, line
121 File in "p/iuescre/_lgiebn6e4ra/tpoyrt
h File "o/nu3s.r/8/ltihbr6e4a/dpiyntgh.opny3."8, line /890m in u_lbtoioptrsotcresaspi
n
gThread 0x/00007f487abfd700p (most recent call first):
o File o"l/.uspyr"/l, line i388b in 6_4g/upayrtdheod_nt3a.s8k/_mguelnteirpartocieosns
i File n"g/u/spro/olli.bp6y4"/p, line y114t in hwoornk3e.8r/
m File u"l/tuisprr/olciebs6s4i/ngp/yptoholo.npy3"., line 8/532t in h_rheaanddilneg_.tpays"ks, line
870 File in "r/uuns
r File /"l/iubs6r4//lpiybt6h4on/3p.y8t/htohnr3e.a8d/itnhgr.epayd", line i870n in gr.upny
" File , line "932/ in u_sbro/oltistbr6a4p/_piyntnheorn
3 File ."8//utshrr/elaidbi6n4g/.ppyyt"h, line o932n in 3_.b8o/otthsrteraadpi_nign.pnye"r, line
890 File " in /_ubsoro/tlsitbr6a4p/
p
yThread 0xt00007f48b0dfa700h (most recent call first):
o File n"3/u.s8r//tlhirbe6a4d/ipnygt.hpoyn"3, line .8908 in /_mbuolottistprraopc
e
sThread 0xs00007fcc04bfd700i (most recent call first):
n File g"//puosorl/.lpiyb6"4, line /114p in ywtorhkoenr3
. File 8"//sueslre/cltiobr6s4./pyp"y, line t415h in osne3l.e8c/tt
hr File e"a/duisnrg/.lpiyb"6, line 4870/ in pruynt
h File o"n/3u.s8r//mluiltbi6p4r/opcyetshsoinn3g./8c/otnhnreecatdiionng..ppyy"", line , line 931932 in in w_abioto
ts File t"r/aups_r/ilnibn6e4r/
p File y"t/huosnr3/.l8i/bm6u4l/tpiyptrhoocne3.s8s/itnhgr/epaodoiln.gp.yp"y", line , line 499 in 890_ in w_abioto_tfsotrr_aupp
d
aThread 0xt00007f48e8dfa700e (most recent call first):
s File
" File /"u/surs/rl/ilbi6b46/4p/yptythhoonn33..88//mmuullttiipprroocceessssiinngg//ppooooll..ppyy"", line , line 519114 in in _whoarnkdelre
_ File w"o/rukserr/sl
i File b"6/4u/spry/tlhiobn634/.p8y/tthhorne3a.d8i/tnhgr.epadyi"n, line g.870p in yr"u, line n
870 File in r"u/nu
s File r"//luibs6r4//lpiybt6h4o/np3y.t8h/otnh3r.e8a/dtihnrge.adpiyn"g, line .932p in y_"b, line o932o in t_sbtoroatps_tirnanepr_
i File n"n/eurs
r File /"l/iubs64r//lpiybt6h4on/3p.y8t/htohnr3e.a8d/itnhrge.apdyi"n, line g890. in p_yb"oo, line t890s in t_rbaopo
t
sThread 0xt00007f48975fe700r (most recent call first):
ap File
"
/Thread 0xus00007fcc055fe700r (most recent call first):
/l File i"b/6us4r//plyitbh6o4n/3p.y8t/hmounl3t.i8p/rmoucletsispirnogc/epsosoiln.gp/yp", line o114o in lw.opryk"e, line r114
in File w"o/rukserr/
l File i"b6/4u/srp/yltihbo6n43/.p8y/tthhorne3a.d8i/ntgh.rpye"ad, line i870n in gr.upny
" File , line "870/ in ursunr/
l File i"b/6u4s/rp/yltihbo64n/3p.yt8h/otnh3r.8e/atdhirnega.dpinyg".p, line y932" in , line _932b in o_obtosottrsatpr_aipn_nienrn
er File "
/ File u"s/ru/srl/ilbi6b46/4p/yptyhtohno3n.38./8t/htrheraedaidningg..ppyy"", line , line 890890 in in __bboooottssttrraapp
Thread 0xThread 0x00007fcc05fff70000007f48961fc700 (most recent call first):
(most recent call first):
File File ""//uussrr//lliibb646/4p/yptyhtohno3n.38.8/m/umlutlitpirporcoecsessisnign/gp/opoolo.lp.yp"y, line "114, line in 114w in owrokrekre
r File
"/ File u"s/ru/slri/bl6i4b/6p4y/tphyotnh3o.n83/.t8h/rtehardeiandgi.npgy."p, line y870" in , line r870u in n
r File u"n/
u File s"r//ulsirb/6l4i/bp6y4t/hopny3t.h8o/nt3h.r8e/atdhirnega.dpyi"n, line g932. in p_yb"o, line o932t in s_tbroapo_tisntnrearp
_ File i"n/nuesrr/
l File i"b/6u4s/rp/ytlhiobn36.48//ptyhtrehaodni3n.g8./ptyh"r, line e890a in d_ibnogo.tpsyt"r, line a890p in
_
bThread 0xo00007fcc0eb11700o (most recent call first):
t File "s/truaspr
/
lThread 0xi00007f4896bfd700b (most recent call first):
6 File 4"//upsyrt/hloinb36.48//pmyutlthiopnr3o.c8e/smsuilntgi/pprooocle.spsyi"n, line g114/ in pwooorlk.erp
y File "", line /114u in swro/rlkibe6r4
/ File p"y/tuhsorn3/.l8i/bt6h4r/epaydtihnogn.3p.y"8, line /870t in hrruen
a File d"i/nugs.rp/yl"ib, line 68704 in /rpuynth
o File n"3/.u8s/rth/rleiabd6i4n/gp.yptyh"o, line n9323 in ._8b/otohtrsteraadpi_nignn.epry
File "", line /932u in s_rb/oloitbs6t4r/apypt_hionnn3e.r8
/ File t"h/ruesard/ilnigb.6p4y/"p, line terminate called after throwing an instance of 'std::runtime_error'
y890t in h_ what(): [Rank 4] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808539 milliseconds before timing out.
obno3o.Fatal Python error: Aborted
t8s/tThread 0x00007f4c9d3fd700 (most recent call first):
<no Python frame>
Thread 0x00007f4c9ddfe700 (most recent call first):
<no Python frame>
trharp
Thread 0x00007f4c9e7ff700 (most recent call first):
<no Python frame>
Thread 0x00007f4cb4dfc700 (most recent call first):
<no Python frame>
Thread 0x
e
aThread 0xd00007f4cb57fd700 (most recent call first):
<no Python frame>
Thread 0x00007f4cb7fff700 (most recent call first):
<no Python frame>
Thread 0x00007f4cfcbfd700 (most recent call first):
<no Python frame>
00007fcc0f512700i (most recent call first):
n File gThread 0x00007f4d81357700 (most recent call first):
<no Python frame>
Thread 0x00007f4c2d7fb700 (most recent call first):
File "/us."p/yu"sr/lib64/python3, line r890/.8/multiprocessin in l_ibbo6o4/tpsyttrhaog/pool.py", line 576 in _hpn
3
andle_results
File .Thread 0x800007f48cffff700"/usr/lib64/pyth/ (most recent call first):
m File uon3.8/threadi"l/tng.py", line 870 in run
uisprr File "/usr/lib64/p/olciebython3.8/threads6s4ing.py", line 932 in _booti/npgystrap_inner
/tpho File "/usoonl3r/lib6..p84/pyty/"m, line u114l in twiohon3.8prrkoe/threrc
e File ading.s"s/ipy", line 890unsgr/ in _boop/oltstrapoilb
Thread 0x00007f4bbe1fc700 (most recent call first):
.6p4/y File "/usp"y, line t114hr/lib6 in owno4/pyt3r.khon3.e8r/8/thre
t File h"r/euasading.dri/nlgpy", line 302.ipby in wait"6, line 4870
File "/u/ in prsr/lib6uynt
h4/pyth File o"n/on3.83u.sr/thre8//ladingitbh.py", line 6r4e/433 in acqapdyuire
tihnogn.3p.y8" File "/f/, line t870h in sx/norreuandusr/d
i File nalle2"g/.upsyr"/, line 932l in i/lib6_bb6o4o/t4/pytpsytthon3.rhaopn8/site-packages/embedding_reader/parquet_numpy_reader.p_3i.ny", line 121 in piece_gen8e/rnerator
File "/ust
h File r"r/lib64/pythone/audsi3.8/multiprocern/gl.ssing/pool.py"ipby6, line 388 in _guarded"4, line /_task_generati932p in y_on
File "/usr/ltbhooib64/python3.8/ont3s.t8rmultiprocess/atph_ing/pool.py", line 532 in rien_handle_tasksandeir
File "/usr/lib64/n
g File ."ppython3.8/th/yu"sreading.py", line 870 in , line r890/ in l_run
File "/usribbo6o4t/lib64/sptyrtahpo
/pytho
nThread 0x300007fcc0ff13700. (most recent call first):
n3.8/8 File /"t/huthrearserding.pa/dliy", line 932 in inbg6_boot.4p/yp"strapy, line t890h in o_inne_nb3o.o8tr
File "//smturlusr/liatpi
b64/pp
rThread 0xoython00007f48e97fb700c (most recent call first):
es File s"i/3.8/tnugs/rp/olohreadili.bp6ng.pyy4"/, line ", line 890 in _p114y in tbootswhoornk3e.r8
/ File m"trap
u/luThread 0x00007f4bdbfff700 (most recent call first):
File stri/"/usrplrib/lib6o6c4e4/pyt/spshon3.8yitnh/seleogn/3p.o8ctors.o/ltpy", line 415.hprye in sele"a, line d114ct
File i in wn"/usrogr.kpeyr"/lib64
, line File 870/pyth" in /ruuon3.8/ns
r File /"multil/iuprocebs6rssing//4l/ipby6connet4h/ctionpoynt3h.py", line .o8n/931 in wai3t.h8t
File "/r/etausr/lidhirneb64/pga.dpython3iyng"..8/mup, line y870ltipr" in , line r932uocessi in n_
bng/po File o"ot/sutsrra/ol.py"pl_iibn, line 499 in _w6ne4r/
ait_fp File y"tor_upd/huosnrates
3/.l8i/bt6h4 File "/usr/epayr/libdtihnogn.3p.y864/pyt"/, line t932hhon3. in r_ebaodo8/multitnsgt.raiprocppy_essing"i, line n890n in /pool_ebr.py", line o
o519 in _ha File t"s/ndle_wtursorkerarp/
l
s
File "/uThread 0xib00007fcc12abd7006sr/li (most recent call first):
4 File /"/b64/ppuystython3rh/olni3b.68.8/th4//tphyreadinrtheoag.py"nd3i.n8, line 870 in rung/.mpuy
File "/ul"t, line isr/li890p in r_b64/pobcoython3eost.8/thsstireadinrnagp/g.py"
p
oThread 0xo00007f48b2bfd700l, line 932 in _b. (most recent call first):
py File ""ootstr, line /114u in swap_inro/rlner
File kiebr6"/usr/
4 File /"plib64/yut/pythoshro/nn3.8/3l.i8b/6m4u/lpthreadtyiting.pphroy", line 890 in onc3e._boots8s/itstrap
nhgr/
Thread 0x00007f4bf7fff700 (most recent call first):
epao File "/usdoiln.gr/libp.yp"64/pyty, line "114, line in 870w in orruknhon3.e
r File 8/mult
" File /iproc"u/suressings/rl/ilb/pooli6b46.py", line /4p/yp114 in worytthhoker
File onn"/usr/33..88//tthhlib64rreeaa/pythoddiinnggn3.8/..ppythready"", line , line 932870 in in _ing.prbuony", line 870 in ro
t File s"t/ruaun
File "spr_/usr//ilnilib64/nbe6r4pytho
/ File pn3.8/"y/tuhthreadsorning.p/3l.i8b/6t4h/ry", line 932 in peyatdh_bootsionntrap_3g..inner
8p/yt" File "/ushr, line e932a in r/libd_ibnogo.t64/pytpsython3.8"r, line a890/threp in __iading.bnonpy", line 890oetrs
t File r" in _boo/aups
tstrap
rThread 0x/00007fcc134be700l
Thread 0x00007f4c13fff700 (most recent call first):
(most recent call first):
i File b"6/4u/spry/t File "/uslhiobr/libn634.64/pyt8//pthon3.8yhtrhe/multoand3i.n8giproce./pmyussing/"l, line tpool.pi890p in r_y", line 114 in obcoworkeeostr
File "/sstirusr/linagp/b64/p
p
oThread 0xoython300007f477ffff700l (most recent call first):
. File p"y/"u.8/th, line s114r in /wloireadinrbk6e4g.py", line r/
p870 in runy File t"h/
File "/uouns3sr/lir./8l/ib64/pytbh6thon3.r4e/ap8/thrdyitneadinghgo..py", line np3y.932 in _bo"8, line /306t in hwraeiotstraatd
ip_inn File ng".er
File "p/y/usr/lu"s, line r870ib64/p/ in lriythonubn6
43.8/t File /"p/yhreadituhsng.py"orn/3l.i8b/6t4h/r, line 890 in _bpeyatdhionngootstr3..p8y/ap
Thread 0x"th, line re55800007f4c117fb700 (most recent call first):
File "/a in dwusr/lainigt.ib64/p
py File ython"", line /932f in s_3.8/mxb/onoultiprotussocesstrr/adping/poa_linlnee2r/
ol.pyl File i"b", line 114 in w/6us4orker
r//ply File "/usitbh6or/libn43/.p8y/tsh64/pytiotne3-hon3..p8a8/thre/cthading.kraegpy", line 870aedsi in run
/ntgq.pdy File "/usrm"/, line /lib64/python3.8/threading.py", line 932 in _bootstrap_inner_890m in
File "/usr/lib64o_nbi/python3.8/threotootrading.py", line 890 in _bos.tprotstrap
Thread 0x00007f4bbebfd700 (most recent call first):
File ya"p"/usr/lib64/pyth, line
60
in Thread 0xon3.8/multiprocr00007fcbc95fe700u (most recent call first):
n File
" File essing/pool.py", line /"u/su114 in worker
File "/usrs/rl/ir/lib64/python3.8lb6i/threading.py4b/6p4y/tphyo", line 870 in run
File "/usr/tnh3o.lib64/pythonn83/.m3.8/threading.p8u/lttihprreoacdy", line 932 in _bootstreisnsap_inner
File "/usrign.gp/lib64/python3/yp"o, line ol932. in py_.8/thre"b, line o114o in twsading.otrrkapy", line 890epr_
i in _boot File n"n/estrapurs
r File
Thread 0x00007f4bbffff700 (most recent call first):
"//lu File "/usisbr6r/lib/4l/ipby6t64/pyt4h/ophon3.ny3t.h8/mult8o/ntiproc3h.r8eessinga/dt/poolihnrge.py", line 114.apd in workyi"n, line g870.er
File " in pryu"n/usr/l, line
890 File in ib64/"_/buopythonsort/sltir3.8/tba6phreadi4
/
ng.pypThread 0xy00007f4b0dfff700", line 870 in rt (most recent call first):
h File on3"./un
File "8u/st/usr/lrh/rib64/leiapythonbd6i3.8/t4n/ghreadip.yptyh"o, line ng.py932n in 3", line 932 in __.b8o/bootstoctosntrcaupr_rrap_iiennntnner
File e/rf
"/usru File t"/lib6/uurs4/pythers/on3.8/ltih/threbr6e4a/ading.dp.ytphpy", line 890yo"n in _boot3, line .788strap in /_t
Thread 0x00007f4bd97fb700 (most recent call first):
hwro File "/userakder/libir
n64/pytg File ."hon3.p/yu"s, line r890/ in l_8/mulibbo6o4ts/tpryatiproctph
essino
nThread 0xg/pool300007fcbc8bfd700. (most recent call first):
8.py", line File /"t/114 in woruhsrre/ker
File aldi"/usribn6g4/lib64./ppyy"/pytht, line h870o in on3.8/nr3u.threan8
/ding.pm File u"y", line 870 in l/tuirun
File psrro"/usr/c/ellib64isbs6i4n/pytho/gp/ypn3.8/tohooln.3p.y8"/, line t114h in rweothrearakdeding.ri
npy", line 932 File g". in _boot/puys"rstrap, line /932l_innei in b_6br
File "/4o/optusr/liystthrb64/poanp3_ython3.i8n/n.8/thtehrr
e File adreadi"i/nng.py"ugs., line 890 in _brp/yl"iootstr, line b8706 in 4rap
Thread 0x/upn00007f4bf4dfa700 (most recent call first):
File "/y
t File husr/l"o/nu3ib64/ps.r8ython//ltihbr6e4a3.8/mu/dpiynltiprtgh.oocesspn3y."8ing/p, line /890t in ool.pyh_rbeoa", line 114 in wodtiorkernsgt.r
File "/upayp"
sr/li, line
932Thread 0x in 00007f4b53685700_ (most recent call first):
bob64/py File o"t/sthon3utsr.8/thrra/pleadin_iibng.py", line 6n4e870 in run/rp
File "/u File y"t/huosnsr/libr/3l.64/pyi8b/6s4thon3.e/lpeyctthoor8/thrns3..8py/"t, line heading468r in esa.py", line deilne932 in _bogc.tp
otstray File "", line /p_inn890u in _sbro/er
File "/olitbusr/l6s4t/ib64/prpayptython
h
oThread 0xn00007fcb4dfff7003.8/t3 (most recent call first):
. File 8hreadi"//aung.pyssyrn/", line 890 in _cliibootstob/6rap
4b/aThread 0x00007f4bf61fc700 (most recent call first):
File psyet"/usr_heo/lib6vne3n4/pyth.t8son3.8./pmy/multiu"l, line tproce1823i in pssing/_rroucne_sopool.snicnepy", line 114g
/ File p"o/o in workeuls.r
File "/pry/"lusr/l, line i114b in 6wib64/p4o/rpythonkyetr3.8/th
o File nhread"3/.u8s/ra/ing.pyslyinbc6i4", line 870 in r/op/ybun
File "tahsoe/usr/ln_3e.ib64/v8e/pythonntthsr3.8/t.epayhreadd"i, line n570g in .rpuyning.py"_, line f870o in ", line 932 in _rreuvbootsne
r File trap_i
" File /"nner
u/surs/ File "/usrl/ilbr/lib6i6b46/4/pyt4p/ypthon3.8yhtohno3/thren.38./ading8t/htrherae.py", line daid890 in _bonign.gp.yp"yotstr, line "932, line in 870_ in brouap
Thread 0xont
s File 00007f4bdb5fe700 (most recent call first):
File "/t"r/ausr/lups_rib64/pi/nlnieythonbr
6 File 4"3.8/mu//pultiprsyrt/ocessihloing/ponb36ol.py".48//p, line 114 in wotyhtrheoandrker
i3n. File "/us8g/.tr/libhpry64/pyte"a, line d932ihon3. in n_g8/threb.opoyading"t, line s.py", line 890t in r_abpo_870 in runoitn
File "/usntesr/librra
p64/py File
"
/Thread 0xthon3.u00007fc88abfd700s (most recent call first):
r8/thr/ File l"ieading/bu6s4r//.py", line plyi932 in _botbh6o4n/3p.y8t/htohnr3e.otstra8d/itnhgr.ap_innepayd"er
File ", line i890n in /usr/g_.bpolib64/yo"t, line spytho306t in rwan3.8/tapi
t
Thread 0x File hread00007f4c0df6b000" (most recent call first):
/ing.p File us"y", line 890 in r//flsixb/6n4_bootso/uptrap
syrt/
Thread 0x00007f4bf75fe700 (most recent call first):
File hdoa"/usrnl3l/lib6.e82/4/pytt/hlhon3.8irbea/mult6d4i/iprocengp.yssingptyh"o, line /pooln5583 in ..py", line w8a/isti
t File e"-114 in wor/pafcker
File skxa"/usr/gneo/lib64su/srro/pyth/tdaaon3.8/rlylthrea_e2eding./mlbiebpy", line 870d6d4i/ in run
File pnyg"/usrt_ht/lib6oorn4/pythc3h./8r/oson3.8tiatre/threay-_pding.eamcpy", line 932bkead in _bootgdeisnstrap/gt_qtdomr/c_inne_hm.opr
File "/nyi"usr/lt, line o38r. in pryo"t, line a60t in reu_nh
aib64/p File l"f/
uythonsr/li3.8/thb64 File /"preadiy/tfhsong.py"xn/3.n8o/ut, line 890 in _bshrr/ootstedaadlrap
Thread 0xilneg00007f4aa61fc700 (most recent call first):
File "/2./plyi"usr/lb, line 69324 in ib64//_pbyopythontohtos3.8/tnt3r.a8hreadip/_siinng.pytnee-r", line 306 in wp
ac File ait
File k"a/guessr"/usr//lr/lib6iobt6a4/pytr4y/p_yhon3.8teh/thremobne3d.ading.d8i/py", line 558ntgh in wait_rteoa
File "/frdcihnsx/no/gr.optya"r, line usr/day890_ in elle2/_mbblib64/oeodtdsitnrgpytho_atpo
rn3.8/
cThread 0xh00007fcc13fff700. (most recent call first):
site-p File y"packag, line "47/es/tqu in sardm/_mop/pllnitoriyb_6r.py", line 604o/tpayr in run
tyh_oenm3b.
8 File /"c/ofns File "/uscxu/rnrr/liboeunst64/pyt/rf/udhon3.atlul8/threree2s//ladingitbh6.py", line r4e/apd932 in _booy.tphtstrayo"n, line 378. in 8_/p_innewsoirtr
File "/kee-rusr/lp
a File cib64/k"a/guespythonrs//l3.8/triobt6hreadi4a/rng.pypyy_t", line 890 in _heomnbbootste3d.d8rap
i/ntgh_rteoardcThread 0x00007f4dfffff700 (most recent call first):
File ih/nrg"/usro.tpa/lib6yr"y, line 4/pyth_870e in mron3.8buend
/concu File d"irrent/nugs_/futurt/olrires/thcbh6.read.4p/yp"y, line tpy", line 7895h in orno3t.8 in _wora/tteker
File h_qru"/usreear/lib6diine4/pythgs._poyon3.8r"_, line k/threa932e in y_sbding.
o File opy", line 870"t/s in run
ftsrxa/ File "/usrpn_oiunsnre//lib6rD
A File 4/pyth"L/LuEon3.8s2r-//threplyiading.tbo6py", line 932r4c/h in _boop/ydtatstrahlolne3p_inne2._8p/r
File "/ytthorrecausr/lhd/idnaib64/pgl.lpythoney2"_3.8/t, line p890y in hreadit_obrong.pycoht.", line 890 in _sptyr"a, line p762bootst
in f
rap
Thread 0xo00007fcc521e1700r (most recent call first):
wThread 0x00007f4e6b685700 (most recent call first):
File a File r"d/"/usr
u File s/lib64r"//lfisbx6/pyth/4n/opuyson3.8/trh/odnseleca3l.ltors.8e/2spy", line 468/ell in seleeicbt6o4rct
File "/sp.yp/usr/tyho"n, line 3468. in 8s/lib64/eslietctepytho
- File p"an3.8/a/cuksarsynci/gleo/baseisb/6t_even4o/rpcyht/hnts.py"onn/3m.o8d/ualseys, line 1823 in _/nmcorun_oidou/lbnce
File ea.spe"/usry_"ev, line e1130n in t/lib64_sc./pythaplyon3.8/l"_, line i1823m in asyncp_lr
uio/ba File n"_/se_eveofnscxnts.pe/
n File y", line 570 in o"u/surrun_fs/rdorever/all
File "/uilbe624/sr/li/lpiyb64/pbt6hython34o/np.8/th3y.t8hreadino/na3s.g.py"y8n/cs, line 870 in ruiiotn
File "/e/-bpausr/lasce_kib64/peavgeenst/s.ythontpoyr3.8/th"ch, line readi570/ in nrnng.pyu/nm_", line 932 in _ofdobootsurleevtrap_se/rc
o File inner
n"t/ File "/usauisr/libnre/rl.ipyb"64/pyt6, line 4139/hon3.p in yfth8/threoornadingw3a..py", line 8r/d890 in _boot
h File rtstra"e/ap
Thread 0x00007f4f25ef8000fdisn (most recent call first):
File "/fxg/.pnyo"ussx/no, line r870/ in usr/dadraulnl
lle2/e File 2"/lib64//luisbpythor6/4l/n3.8/sipby6tite-p4h/opnackagey3t.h8os/tor/ns3ich/aut.t8e/-ogradtpharce/__inikaadt__.pgiensy", line 173 in g/.tpoybackwr"c, line h932/ard
File in n_nb"/fsx/omoot/nousdsutlr/dallrease2/lip/_miondnub64/pelre
Thread 0x00007fcd0ca49000 (most recent call first):
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396 in backward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736 in backward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400 in forward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808553 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007f9f0d5fe700 (most recent call first):
<no Python frame>
Thread 0x00007f9f0dfff700 (most recent call first):
<no Python frame>
Thread 0x00007f9f28bfd700 (most recent call first):
<no Python frame>
Thread 0x00007f9f295fe700 (most recent call first):
<no Python frame>
Thread 0x00007f9f29fff700 (most recent call first):
<no Python frame>
Thread 0x00007f9f41141700 (most recent call first):
<no Python frame>
Thread 0x00007f9f43357700 (most recent call first):
<no Python frame>
Thread 0x00007fa01397c700 (most recent call first):
<no Python frame>
Thread 0x00007f9e2ebfd700 (most recent call first):
File "/usr/liLnEn2/-mopdyutb64/python3.8/mloerscultiprocessinh//mtordg/pool.py", line 576 in _auilne_.dpiyhandle_resultsf"f, line
File "/usr/lib64u1130s in i/python3.8/thr_ocna_leading.py", line 870plr_ in run
File "/usr/libiiomrp64/python3.8/t.l
p File yhreading.py", line ""/f, line s932 in _bootstrap_inn753x in /er
File "/usr/lib64/mnaoipython3.8/threaduns
r File ing.py"/"D/AfLsLxE/2n-, line 890 in _bopuysootsttro/rrap
Thread 0xdcah00007f9e497fb700 (most recent call first):
File "l/lte/usr/lr2a/ilnib64/i_bd6ipython4f/f3.8/thpuystihreadioonn_3p.r8ng.py"i/osr, line 302 in wi.tpeait
File y-"p, line a503"/usr/c in ktalib64rgaeisn/pytho
/ File c"li/cfks/xn3.8/tc/onhreadroeuing.pys.rp", line 433 in /yD"A, line LacquiL760E in 2i-re
File npvyo"/fsx/tkoer
ch File nousr/"tr//dallefasi2/libxn/_ndoiufsfru/s64/pdiaolythonnl_e3.8/sip2r/iloite-parb.6p4yckages/"p, line y/embet736h in dding_oinnreade3i.t8r/parqi/asluet_niitzumpy_ree_-tpraaicneaderkiangge.py"
s File /"c/, line 121 in pilfiscece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line xk//n388 in _guarded_task_gcooursereneration
File "/./pDyusr/lib64/pythonA"L, line 3.8/multiprocesL1404E in 2i-nsing/pool.py", line 532pvyotko in _handle_tasker
cs
File "/usr/lib64/pyth File /"t/hon3.8/threarfasixnding.py", line 870 in run/_ndo
File "/usr/lib64iufsfr/python3.8/threadu/siding.py", line 932 in _boanl_lpootstrap_innerer2i/o
File "/usr/lib64/lri.bppython3.8/threading6y4"/.py", line 890 in _bootst, line p753y in trap
Thread 0x00007f9e4bfff700 (most recent call first):
File "/usmhaoinr/lib64/pyn3
. File 8thon3."//sfi8/selestxe/-nctorspoau.py", line cskra/gdeasl/415 in sellcel2ect
File i/clk"/usri/bc6o4r/ep/lib6.ypyt"h4/pyth, line o1055n in 3m.a8on3.8i/ns
i File /multit"e/-fprocepsaxc/kssing/naogueconnessr/ction./cdlapy", line ilclke931 in wait/2c/ol
File "/urieb.6p4y/psr/li"y, line t760hb64/py in oinn3thon3v.o8k.8/mul/es
i File tiproct"eessing/-fpsax/pool.c/knagoepy", line 499uss/rc/ldia in _waiclklt_for_e/2c/olrieupdatb.6p4yes
File "/"p, line y1130t in h/usr/_o_nc3a.l8lib64//ls_pythoi_t
e File n3.8/m-"pa/cultipfksaxrocessg/ening/posu/sool.pycrl/idc", line 519 in _akl/lcehandlo2r/el.ipby6"e_wor4, line /1404p in kers
File yitn"/usrvhoo/lib64kne3
./pyth File 8"/on3.8//sfithreastxe/nding.p-opuascrk/daaglelse/2y", line 870 in t/olrrun
File icbh"/usr6/4d//lib64ipsy/pythtthron3.8/iobnuthrea3t.eding.p8d//sy", line 932 in eiltaes_boot-tpiacc/kmastrap_ugletsinneri/pcrloi
File "/uccke/scsosr/libirneg64/py./peyrthon3."r, line o1055r in 8/thrmsa/eadingi_n_
i File n"i.py", line /tf_s_890 in _box./pnyootstrau"s, line r345/p
Thread 0x00007f9e821fc700 in dwa (most recent call first):
File "/rlalusr/lipep2b64/p/elrib
ython36 File 4"//pf.8/muystxhltipro/onno3u.s8rcessi//sDiAtng/pooLeL-Epl.py"a2c-kpa, line 114 in wygteosorker
r/cchl/i File "/usctkr/r/libacior64/pytne._phon3.dyi"f, line f1130u in s_8/thre_icoanlading_lp_r_.py", line
i File or870 in run
"./p File "/usfys"x/, line n757o in u<smrr/libo/ddu64/pytallel>hon3.e
2/l8/threib64/adingpytho.py", line n3.8/932 in _boosite-tstrapackap_inneges/tr
File "/orch/dusr/liistributed/eb64/plastiython3c/mul.8/thtiprocreadinessing.py"g/err, line 890 in _boors/_otstr_initap
Thread 0x__.py00007f9e661fc700 (most recent call first):
File "/", line 345 in usr/lwrappib64/per
File "/fsx/noythonusr/D3.8/muALLE2ltipr-pytorocessich/trng/poain_dol.py"iffus, line 114 in woion_prker
rior. File "/usrpy", line 757/lib6 in <mod4/pythule>
on3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f9e66bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f9e9cdfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f9e675fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f9e80dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f9e67fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f9e9e1fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f9e82bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f9ebb5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f9d335fe700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fa0b61c7700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fa10521c700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fa1bb8b7000 (most recent call first):
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 397 in forward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13944 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13946 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13947 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13948 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13949 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13950 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13951 closing signal SIGTERM
compute-od-gpu-dy-p4d-24xlarge-9:13991:14066 [0] NCCL INFO comm 0x7f3f08000f60 rank 14 nranks 64 cudaDev 6 busId a01c0 - Abort COMPLETE
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808550 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007f3e095fe700 (most recent call first):
<no Python frame>
Thread 0x00007f3e09fff700 (most recent call first):
<no Python frame>
Thread 0x00007f3e253fd700 (most recent call first):
<no Python frame>
Thread 0x00007f3e25dfe700 (most recent call first):
<no Python frame>
Thread 0x00007f3e267ff700 (most recent call first):
<no Python frame>
Thread 0x00007f3e40dfc700 (most recent call first):
<no Python frame>
Thread 0x00007f3e417fd700 (most recent call first):
<no Python frame>
Thread 0x00007f3e5df55700 (most recent call first):
<no Python frame>
Thread 0x00007f3d46bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3d2bfff700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3d457fb700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3d475fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3d461fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3d60dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3d47fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3d62bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3d7d7fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3dd3fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3d63fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3d635fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3d7cdfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3c2cdfa700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3fa0a59700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f3fc0e2a700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f40a8583000 (most recent call first):
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 546 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/container.py", line 139 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 709 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 806 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 944 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1144 in p_losses
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1254 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 969 in _run_ddp_forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 394 in forward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13984 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13985 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13986 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13987 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13989 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13991 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13992 closing signal SIGTERM
Traceback (most recent call last):
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757, in <module>
main()
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130, in __call__
return self.main(*args, **kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055, in main
rv = self.invoke(ctx)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760, in invoke
return __callback(*args, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753, in main
initialize_training(config_file, accelerator)
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736, in initialize_training
train(
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503, in train
loss = trainer(text=txt, image_embed=img)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107, in inner
out = fn(model, *args, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400, in forward
self.accelerator.backward(loss)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736, in backward
loss.backward(**kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: NCCL communicator was aborted on rank 59. Original reason for failure was: [Rank 59] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808551 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 59] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808551 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007f92b3fff700 (most recent call first):
<no Python frame>
Thread 0x00007f92ccbfd700 (most recent call first):
<no Python frame>
Thread 0x00007f92cd5fe700 (most recent call first):
<no Python frame>
Thread 0x00007f92cdfff700 (most recent call first):
<no Python frame>
Thread 0x00007f92e8bff700 (most recent call first):
<no Python frame>
Thread 0x00007f9306bfd700 (most recent call first):
<no Python frame>
Thread 0x00007f93075fe700 (most recent call first):
<no Python frame>
Thread 0x00007f9307fff700 (most recent call first):
<no Python frame>
Thread 0x00007f91d35fe700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f94958e3700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f9552761000 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 1027 in _wait_for_tstate_lock
File "/usr/lib64/python3.8/threading.py", line 1011 in join
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 717 in _terminate_pool
File "/usr/lib64/python3.8/multiprocessing/util.py", line 224 in __call__
File "/usr/lib64/python3.8/multiprocessing/util.py", line 300 in _run_finalizers
File "/usr/lib64/python3.8/multiprocessing/util.py", line 334 in _exit_function
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea06090 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 252, dev: 3, size: 0, state: CREATED, direction: SEND }
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea06120 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 254, dev: 3, size: 0, state: CREATED, direction: SEND }
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea06000 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 250, dev: 3, size: 0, state: CREATED, direction: SEND }
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea05f70 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 248, dev: 3, size: 0, state: CREATED, direction: SEND }
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea060d8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 253, dev: 3, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea06048 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 251, dev: 3, size: 0, state: CREATED, direction: SEND }
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea05fb8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 249, dev: 3, size: 0, state: CREATED, direction: SEND }
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea06168 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 3, size: 0, state: CREATED, direction: SEND }
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 1 (pid: 13945) of binary: /fsx/nousr/dalle2/bin/python3.8
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa07158 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 253, dev: 1, size: 0, state: CREATED, direction: SEND }
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa07080 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 250, dev: 1, size: 0, state: CREATED, direction: SEND }
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa07038 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 249, dev: 1, size: 0, state: CREATED, direction: SEND }
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa071a0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 254, dev: 1, size: 0, state: CREATED, direction: SEND }
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa070c8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 251, dev: 1, size: 0, state: CREATED, direction: SEND }
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa07110 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 252, dev: 1, size: 0, state: CREATED, direction: SEND }
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa06ff0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 248, dev: 1, size: 0, state: CREATED, direction: SEND }
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa071e8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 1, size: 0, state: CREATED, direction: SEND }
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 1 (pid: 13977) of binary: /fsx/nousr/dalle2/bin/python3.8
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 4 (pid: 13988) of binary: /fsx/nousr/dalle2/bin/python3.8
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea06168 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 3, size: 0, state: CREATED, direction: SEND }
compute-od-gpu-dy-p4d-24xlarge-8:13869:14030 [0] NCCL INFO [Service thread] Connection closed by localRank 7
compute-od-gpu-dy-p4d-24xlarge-8:13875:14029 [6] NCCL INFO [Service thread] Connection closed by localRank 7
compute-od-gpu-dy-p4d-24xlarge-8:13876:13957 [0] NCCL INFO comm 0x7f9168000f60 rank 7 nranks 64 cudaDev 7 busId a01d0 - Abort COMPLETE
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29489, OpType=ALLREDUCE, Timeout(ms)=1800000) ran for 1808239 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007f9033fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f91643f9700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/fsspec/asyn.py", line 54 in sync
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/fsspec/asyn.py", line 86 in wrapper
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/s3fs/core.py", line 2173 in _fetch_range
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/s3fs/core.py", line 2030 in _fetch_range
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/fsspec/caching.py", line 377 in _fetch
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/fsspec/spec.py", line 1578 in read
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/pyarrow/parquet.py", line 1766 in __init__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/pyarrow/parquet.py", line 1960 in read_table
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 126 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f9164dfa700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f91657fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f91661fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f9166bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f91675fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f9167fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f91e193b700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f91e233c700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f91e2d3d700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f92216fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f9223131700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f9069fff700 (most recent call first):
<no Python frame>
Thread 0x00007f9084bfd700 (most recent call first):
<no Python frame>
Thread 0x00007f90855fe700 (most recent call first):
<no Python frame>
Thread 0x00007f9085fff700 (most recent call first):
<no Python frame>
Thread 0x00007f90bcb53700 (most recent call first):
<no Python frame>
Thread 0x00007f90bebfd700 (most recent call first):
<no Python frame>
Thread 0x00007f90bffff700 (most recent call first):
<no Python frame>
Thread 0x00007f90ecbff700 (most recent call first):
<no Python frame>
Thread 0x00007f8f521fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f8f37fff700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f8f50dfa700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f8f361fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f8fa7fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f8fa6bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f8fc0dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f8fc17fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f8fc21fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f8fde1fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f8ffa1fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f8fdcdfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f8fdf5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f8e8ebfd700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f91d7f89700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f924e1e1700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f930896a000 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 856 in next
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 149 in __call__
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dataloaders/prior_loader.py", line 75 in get_sample
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dataloaders/prior_loader.py", line 63 in __next__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 39 in fetch
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/utils/data/dataloader.py", line 692 in _next_data
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/utils/data/dataloader.py", line 652 in __next__
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 222 in report_validation_loss
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 450 in eval_model
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 552 in train
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fcaa946d700
compute-od-gpu-dy-p4d-24xlarge-15:13987:14070 [0] NCCL INFO comm 0x7fcb08000f60 rank 56 nranks 64 cudaDev 0 busId 101c0 - Abort COMPLETE
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 56] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808556 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007fc9a5fff700 (most recent call first):
<no Python frame>
Thread 0x00007fc9c13fd700 (most recent call first):
<no Python frame>
Thread 0x00007fc9c3fff700 (most recent call first):
<no Python frame>
Thread 0x00007fca10bfd700 (most recent call first):
<no Python frame>
Thread 0x00007fca115fe700 (most recent call first):
<no Python frame>
Thread 0x00007fca11fff700 (most recent call first):
<no Python frame>
Thread 0x00007fcaa0c38700 (most recent call first):
<no Python frame>
Thread 0x00007fcaa1639700 (most recent call first):
<no Python frame>
Thread 0x00007fc8febfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc8fffff700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc9197fb700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc91b5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc936bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc91bfff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc934dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc9357fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc937fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc950dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc9521fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc952bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc9535fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fc7c97fb700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fcb1bfff700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fcb9521c700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fcc4baa7000 (most recent call first):
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 546 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 712 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 806 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 944 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1144 in p_losses
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1254 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 969 in _run_ddp_forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008 in forward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 394 in forward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15)
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa071e8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 1, size: 0, state: CREATED, direction: SEND }
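
Note: the "RC: 15" completion error above is surfaced by the aws-ofi-nccl plugin on top of libfabric/EFA. A sketch of turning up transport logging to localize such failures; the variable names are standard NCCL/libfabric knobs, the values are illustrative, and they must be set before the first collective:

    import os

    os.environ.setdefault("NCCL_DEBUG", "INFO")             # NCCL-level tracing
    os.environ.setdefault("NCCL_DEBUG_SUBSYS", "INIT,NET")  # focus on init + transport
    os.environ.setdefault("FI_LOG_LEVEL", "warn")           # libfabric (EFA provider) logging
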
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13869 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13870 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13872 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13875 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 3 (pid: 13990) of binary: /fsx/nousr/dalle2/bin/python3.8
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 2 (pid: 13871) of binary: /fsx/nousr/dalle2/bin/python3.8
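
Note: a negative exitcode from torchelastic means the worker died from a signal; -6 here is SIGABRT, matching the "Fatal Python error: Aborted" dumps below. A quick way to decode it:

    import signal

    print(signal.Signals(6).name)  # -> SIGABRT (exitcode -6)
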
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d3b40
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d2740
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d29c0
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d42c0
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d38c0
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d3dc0
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d3640
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d4540
compute-od-gpu-dy-p4d-24xlarge-12:13926:14011 [0] NCCL INFO comm 0x7f7304000f60 rank 37 nranks 64 cudaDev 5 busId 901d0 - Abort COMPLETE
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 37] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808545 milliseconds before timing out.
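
Note: Timeout(ms)=1800000 is the default 30-minute process-group timeout; the broadcast at SeqNum=29488 never completed on some rank, so the NCCL watchdog aborted the process. A sketch of overriding that limit at init time (the 60-minute value is illustrative; a shorter one makes hangs fail faster instead of stalling for half an hour):

    from datetime import timedelta
    import torch.distributed as dist

    dist.init_process_group(backend="nccl", timeout=timedelta(minutes=60))
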
Fatal Python error: Aborted
Thread 0x00007f71af5fe700 (most recent call first):
<no Python frame>
Thread 0x00007f71affff700 (most recent call first):
<no Python frame>
Thread 0x00007f71c8bff700 (most recent call first):
<no Python frame>
Thread 0x00007f71e5083700 (most recent call first):
<no Python frame>
Thread 0x00007f71e6bfd700 (most recent call first):
<no Python frame>
Thread 0x00007f71e75fe700 (most recent call first):
<no Python frame>
Thread 0x00007f7290bfd700 (most recent call first):
<no Python frame>
Thread 0x00007f7291fff700 (most recent call first):
<no Python frame>
Thread 0x00007f70cffff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f70eb5fe700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f7104dfa700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f70e97fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f70e8dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f71057fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f7107fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f70ea1fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f7120dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f71217fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f70ebfff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f70eabfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f713f5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f6fd17fb700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f734bfff700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f738d19d700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f7449e47000 (most recent call first):
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173 in backward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396 in backward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736 in backward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400 in forward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546a8c0
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546c940
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd8354676c0
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546a140
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd835468fc0
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546cbc0
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546b540
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd8354685c0
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd835467940
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd835468840
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546b2c0
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546a640
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546ba40
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546c1c0
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd835468ac0
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546c6c0
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546b040
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd835469240
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd8354699c0
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd8354694c0
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546adc0
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546a3c0
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd835469ec0
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd835469c40
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd83667c800
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd83667d100
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd83667b300
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd83667c500
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd836679200
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd83667ce00
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd83667a400
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd836678000
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd83667a100
compute-od-gpu-dy-p4d-24xlarge-12:13923:14009 [0] NCCL INFO comm 0x7fd830000f60 rank 34 nranks 64 cudaDev 2 busId 201c0 - Abort COMPLETE
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 34] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808553 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007fd6b9fff700 (most recent call first):
<no Python frame>
Thread 0x00007fd6d4bfd700 (most recent call first):
<no Python frame>
Thread 0x00007fd6d55fe700 (most recent call first):
<no Python frame>
Thread 0x00007fd6d5fff700 (most recent call first):
<no Python frame>
Thread 0x00007fd6f35fe700 (most recent call first):
<no Python frame>
Thread 0x00007fd6f3fff700 (most recent call first):
<no Python frame>
Thread 0x00007fd708dfa700 (most recent call first):
<no Python frame>
Thread 0x00007fd7b1357700 (most recent call first):
<no Python frame>
Thread 0x00007fd516bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd4fbfff700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd514dfa700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd5175fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd5161fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd530dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd517fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd5321fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd532bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd5317fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd533fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd54cdfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd54d7fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd4f97fb700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd84f9bf700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd88ce2a700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd9745a4000 (most recent call first):
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173 in backward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396 in backward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736 in backward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400 in forward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1098 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 360 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():628<warn> Closing ep with unmatched unexpected tagged rx_entry: 0x7fefcd46b540 pkt_entry 0x7fefcd976fc0
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd4685c0
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd4676c0
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd469240
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd469740
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd468ac0
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd469c40
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd46b2c0
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd467e40
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd46ab40
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd468340
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd468d40
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd46bcc0
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd4699c0
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd46ba40
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd468840
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd46a3c0
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fefce67cb80
compute-od-gpu-dy-p4d-24xlarge-10:13984:14065 [0] NCCL INFO comm 0x7ff02c000f60 rank 18 nranks 64 cudaDev 2 busId 201c0 - Abort COMPLETE
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 18] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808551 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007fee9ffff700 (most recent call first):
<no Python frame>
Thread 0x00007feeb8bfd700 (most recent call first):
<no Python frame>
Thread 0x00007feeb95fe700 (most recent call first):
<no Python frame>
Thread 0x00007feeb9fff700 (most recent call first):
<no Python frame>
Thread 0x00007feed4bfd700 (most recent call first):
<no Python frame>
Thread 0x00007feed55fe700 (most recent call first):
<no Python frame>
Thread 0x00007feed5fff700 (most recent call first):
<no Python frame>
Thread 0x00007feef3357700 (most recent call first):
<no Python frame>
Thread 0x00007fecfb5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fefc797c700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007ff02bfff700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fefc6f7b700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007ff030bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fefd12bb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fed157fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007ff07ac78700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007ff076e22700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007ff031fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fed175fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fed317fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fed321fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fecf8dfa700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007ff07b7f9700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007ff0b97e0700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007ff17005a000 (most recent call first):
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173 in backward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396 in backward
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736 in backward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400 in forward
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d29c0
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d2ec0
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d3b40
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d3140
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d2c40
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d33c0
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d42c0
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d4540
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():628<warn> Closing ep with unmatched unexpected tagged rx_entry: 0x7f6d85469740 pkt_entry 0x7f6d85984340
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d854694c0
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d85469c40
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d85468fc0
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d8546a640
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d8546b7c0
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d8546b040
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d85469240
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d854680c0
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d8546c6c0
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d85467bc0
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d8546c440
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d854676c0
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d85468ac0
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d854685c0
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d8546adc0
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d8546ab40
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d86678680
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d8667b080
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d8667aa80
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d8667c880
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d86679280
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d86678c80
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d86678080
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d86679b80
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():628<warn> Closing ep with unmatched unexpected tagged rx_entry: 0x7f4d81466680 pkt_entry 0x7f4d8197c680
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81467a80
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d8146b180
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81469100
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81467f80
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81466400
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d8146a780
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81466e00
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d8146b680
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81467800
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d8146ac80
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81469600
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d8146af00
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81467d00
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81469b00
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81467080
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81468700
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d8267cc40
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d82677540
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d8267c940
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d8267c340
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d82679c40
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d8267c040
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d82678d40
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d8267b440
compute-od-gpu-dy-p4d-24xlarge-10:13987:14066 [0] NCCL INFO comm 0x7fd9c8000f60 rank 21 nranks 64 cudaDev 5 busId 901d0 - Abort COMPLETE
compute-od-gpu-dy-p4d-24xlarge-10:13986:14071 [0] NCCL INFO comm 0x7f6d80000f60 rank 20 nranks 64 cudaDev 4 busId 901c0 - Abort COMPLETE
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 21] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808543 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007fd86abfd700 (most recent call first):
<no Python frame>
Thread 0x00007fd86b5fe700 (most recent call first):
<no Python frame>
Thread 0x00007fd86bfff700 (most recent call first):
<no Python frame>
Thread 0x00007fd948bfd700 (most recent call first):
<no Python frame>
Thread 0x00007fd9495fe700 (most recent call first):
<no Python frame>
Thread 0x00007fd949fff700 (most recent call first):
<no Python frame>
Thread 0x00007fd95f97c700 (most recent call first):
<no Python frame>
Thread 0x00007fd9dcbff700 (most recent call first):
<no Python frame>
Thread 0x00007fd773fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in compute-od-gpu-dy-p4d-24xlarge-10:13988:14068 [0] NCCL INFO comm 0x7f4d7c000f60 rank 22 nranks 64 cudaDev 6 busId a01c0 - Abort COMPLETE
_bootstrap
Thread 0x00007fd7717fb700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd7c57fb700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd7abfff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd7ab5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd7e35fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd7a8dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd7c4dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd7e17fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd7a97fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd7c61fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd7e0dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd8357fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fd6917fb700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fda14a24700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fda281e1700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007fdb0c81f000 (most recent call first):
File "/usr/lib64/python3.8/tokenize.py", line 392 in open
File "/usr/lib64/python3.8/linecache.py", line 136 in updatecache
File "/usr/lib64/python3.8/linecache.py", line 47 in getlines
File "/usr/lib64/python3.8/linecache.py", line 16 in getline
File "/usr/lib64/python3.8/traceback.py", line 288 in line
File "/usr/lib64/python3.8/traceback.py", line 366 in extract
File "/usr/lib64/python3.8/traceback.py", line 509 in __init__
File "/usr/lib64/python3.8/traceback.py", line 120 in format_exception
File "/usr/lib64/python3.8/traceback.py", line 167 in format_exc
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/error_handler.py", line 75 in record_exception
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 360 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
what(): [Rank 20] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808543 milliseconds before timing out.
Fatal Python error: Aborted
Thread 0x00007f6c755fe700 (most recent call first):
<no Python frame>
Thread 0x00007f6c75fff700 (most recent call first):
<no Python frame>
Thread 0x00007f6c90bfd700 (most recent call first):
<no Python frame>
Thread 0x00007f6c915fe700 (most recent call first):
<no Python frame>
Thread 0x00007f6c91fff700 (most recent call first):
<no Python frame>
Thread 0x00007f6cad3ff700 (most recent call first):
<no Python frame>
Thread 0x00007f6cc7fff700 (most recent call first):
<no Python frame>
Thread 0x00007f6cf5357700 (most recent call first):
<no Python frame>
Thread 0x00007f6bb35fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f6be8dfa700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 302 in wait
File "/usr/lib64/python3.8/threading.py", line 433 in acquire
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f6bcd7fb700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 415 in select
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f6beabfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f6b975fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f6bcebfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f6bb2bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f6c217fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f6c061fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f6bcffff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f6bb3fff700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f6be97fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f6bea1fc700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f6a9b5fe700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in [E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File terminate called after throwing an instance of '"/ustd::runtime_errors'
r/lib64/python3 what(): .[Rank 22] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808550 milliseconds before timing out.8/
threading.py", line 932 in _booFatal Python error: tAborteds
trap_inneThread 0xr
File 00007f4c6dfff700" (most recent call first):
/<no Python frame>
us
rThread 0x/00007f4c88bfd700l (most recent call first):
i<no Python frame>
b
6Thread 0x400007f4c895fe700/ (most recent call first):
p<no Python frame>
y
tThread 0xh00007f4c89fff700o (most recent call first):
n<no Python frame>
3
.Thread 0x800007f4cbcb4d700 (most recent call first):
/<no Python frame>
t
hThread 0xr00007f4cbf357700ea (most recent call first):
d<no Python frame>
i
nThread 0xg00007f4cecbfd700. (most recent call first):
p<no Python frame>
yThread 0x"00007f4cedfff700, line (most recent call first):
890<no Python frame>
in
_Thread 0xb00007f4b8f5fe700o (most recent call first):
o File ts"t/ruaspr
/
Thread 0xlib00007f6e395377006 (most recent call first):
4 File /p"y/tuhsorn3/.l8i/bm6u4l/tpiyptrhoocne3s.s8i/ncgo/npcooulr.rpeyn"t, line /f576u in t_uhraensd/lteh_rreesaudlt.sp
y File "", line /78u in s_rw/olrikbe6r4
/ File p"y/thuons3r./8/ltihbr6e4a/dpiyntgh.opny3"., line 8870/ in trhurne
a File d"/iunsgr./plyi"b, line 68704 in /rpuynt
h File o"n/3u.s8r/t/hlrieba6d4i/npg.yptyh"o, line n9323 in ._8b/otohtrsteraadp_iinngn.epry
" File , line "932/ in u_sbro/oltisbt6r4a/pp_ytihnonne3r.
8/ File t"h/ruesard/ilnigb.6p4y/"p, line y890t in h_obno3o.t8s/ttrharpe
a
dThread 0xi00007f4be21fc700n (most recent call first):
g File ."p/yu"sr, line /890l in i_bb6o4ot/sptyrtahpo
n
3Thread 0x.00007f6e3ffff7008 (most recent call first):
/ File th"r/euadsirn/gl.ipby6"4, line 302/ in pwyatihto
n File 3"./u8s/r/sleilbe6c4t/poyrtsh.opny3"., line 8468/ in tsherleeacdti
n File g"./puy"s, line r433/ in laicqbu6i4re/
p File yt"h/ofns3x./8n/oausysnrc/idoal/lbea2s/eli_be6v4e/pnyttsh.opny3."8, line /1823s in it_er-upna_conkacgee
s/ File e"m/buesdrd/ilnibg6_4r/eapdyetrh/opna3r.q8ue/ta_snyunmcpiyo_/rbeaadseer_.epvye"n, line t121s in .ppyi"ec, line e570_ in greunenr_afotroerv
e File r"
/u File s"r//ulsirb6/4l/ipby6t4h/opny3t.h8o/nmu3l.t8i/ptrhorceeasdsiinngg./ppyo"ol, line .870p in yr"u, line n388
in File _"g/uuasrrd/eldi_bt6a4s/kp_ygtehnoenr3.a8ti/otnh
r File e"a/duisnrg/.lpiyb"6, line 4932/ in p_ybtohootns3t.r8a/pm_uilntnieprr
o File c"e/sussirn/gl/ipboo6l4./pyp"y, line t532h in on_3ha.n8d/lteh_rteaasdkisn
g. File p"y/"u, line s890r in /_lbiobo64t/sptyrthaopn
3
.Thread 0x800007f6f22fda000/t (most recent call first):
hr File ea"d/inugs.rp/yl"i, line b870 in 6r4u/np
y File t"h/ouns3r./8l/itbo6k4e/npiyzteh.opny3"., line 8321/ in trheraeda_doirn_gs.toppy
" File , line "932 in /_ubsoort/sltirba6p_4in/npeyrt
h File o"n/3u.s8r//tloikbe6n4i/pzyet.hpyo"n, line 3363. in 8d/tehtreecatd_ienngc.opdyi"n, line g890
in _ File b"o/outssrt/rlaipb
6
4Thread 0x/00007f4b8e1fc700p (most recent call first):
yt File ho"/nu3s.r8//ltiobk6e4n/piyzteh.opny3"., line 8394/ in soepleecnt
o File rs"./puys"r, line /415l in isbe6l4e/cpty
t File h"o/nu3s.r8//lliib6n4e/cpaycthhoen.3p.y8"/, line m136u in ltuippdraotceecsascihneg
/ File c"o/nunserc/tlioinb.6p4y/", line p931y in twhaoint3
. File 8"//luisnre/lciabc64h/ep.yptyh"o, line n347. in 8g/emtullitniepsr
o File c"e/sussirn/gl/ipbo6ol4./ppyy"t, line h499o in n_3w.a8i/tl_ifonre_cuapcdhaet.esp
y" File , line "16/ in ugsert/lliinbe6
4/ File p"y/tuhsorn/3l.8i/bm6ul4t/ippryotcheosns3i.n8g//ptoroalc.epbya"c, line k519. in p_yh"a, line nd288l in el_wionrek
e File rs"
/ File u"s/ru/slri/bl6i4b6/4p/yptyhtohno3n.38./8t/rtahcreebadaicnkg..ppyy"", line , line 366870 in in erxutnr
a File c"t/
u File s"r//luibs6r4//lpiybt6h4o/np3y.t8h/otnhr3e.ad8i/ntgra.cpeyb"a, line 932c in k_.bpoyo"ts, line t509r in a_p__iinnnietr_
_ File
"/ File u"s/ru/slri/bl64i/bpy6t4h/opny3.t8h/otnh3r.e8a/dtirnagc.epbya"c, line k890. in p_ybo"o, line t120s in trfapo
r
mThread 0x00007f4baa1fc700a (most recent call first):
t File _"e/xucserp/tliiobn6
4 File /p"/yuthsorn/3l.i8b/6m4u/lptyiptrhocoens3s.i8n/gt/rpaoocle.bpayc"k, line .114p in yw"o, line r167k in efro
r File ma"t/_uesxrc/l
i File b6"4//fpsyxt/hnono3u.s8r//tdharlelaed2i/nlgi.bp6y4"/, line p870y in trhuonn
3 File ."8/u/ssri/tlei-bp64a/cpkyatgheosn/3t.o8r/tchhr/edaidsitnrgi.bpuyt"e, line d932/ in e_lbaosottisctr/ampu_litninperro
c File e"/sussirn/gli/be6r4r/poyrtsh/oenr3r.8o/rth_rheaanddilnegr..ppyy"", line , line 890 in 75_ in broeoctostrrda_pe
x
cThread 0xe00007f4be0dfa700p (most recent call first):
t File i"o/nu
sr File /l"i/bf6s4x//pnyotuhsorn3/.d8a/mlullet2i/plriobc6e4s/spiyntgh/opnoo3l..8p/ys"i, line t114 in ew-oprakcekra
File g"e/su/srt/olricbh64//dpiystthroinb3u.t8e/tdh/reealdainsgt.ipcy/"m, line u870l in triupnr
File o"c/eusssri/linbg6/4/epryrtohrosn3/._8_/itnhiret_a_d.inpgy."py, line "360, line in 932w in r_abpopoetrs
t File ra"p/_fisnnxe/rn
o File u"s/ru/sDr/AlLiLb6E42/-ppyytthoonr3c.h8//ttrhariena_ddiinfgf.upsyi"o, line n890_ in p_rbiooorts.tprya"p, line
757
in Thread 0x<00007f4be35fe700m (most recent call first):
o File d"u/lues>r
/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f4be17fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f4ba8dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f4bfd7fb700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f4c18dfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f4bff5fe700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f4bfcdfa700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f4c36bfd700 (most recent call first):
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f4a935fe700 (most recent call first):
File "/usr/lib64/python3.8/threading.py", line 306 in wait
File "/usr/lib64/python3.8/threading.py", line 558 in wait
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f4e355d9700 (most recent call first):
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f4e3bfff700 (most recent call first):
File "/usr/lib64/python3.8/selectors.py", line 468 in select
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever
File "/usr/lib64/python3.8/threading.py", line 870 in run
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f4f1eecc000 (most recent call first):
File "/usr/lib64/python3.8/tokenize.py", line 321 in read_or_stop
File "/usr/lib64/python3.8/tokenize.py", line 363 in detect_encoding
File "/usr/lib64/python3.8/tokenize.py", line 394 in open
File "/usr/lib64/python3.8/linecache.py", line 136 in updatecache
File "/usr/lib64/python3.8/linecache.py", line 47 in getlines
File "/usr/lib64/python3.8/linecache.py", line 16 in getline
File "/usr/lib64/python3.8/traceback.py", line 288 in line
File "/usr/lib64/python3.8/traceback.py", line 366 in extract
File "/usr/lib64/python3.8/traceback.py", line 509 in __init__
File "/usr/lib64/python3.8/traceback.py", line 120 in format_exception
File "/usr/lib64/python3.8/traceback.py", line 167 in format_exc
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/error_handler.py", line 75 in record_exception
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 360 in wrapper
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module>
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13921 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13922 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13924 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13925 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13927 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13928 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13982 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13983 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13985 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13989 closing signal SIGTERM
/usr/lib64/python3.8/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 6 leaked semaphore objects to clean up at shutdown
warnings.warn('resource_tracker: There appear to be %d '
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 2 (pid: 13984) of binary: /fsx/nousr/dalle2/bin/python3.8
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 2 (pid: 13923) of binary: /fsx/nousr/dalle2/bin/python3.8
ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 311.0644772052765 seconds
Traceback (most recent call last):
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier
store_util.barrier(
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier
synchronize(store, data, rank, world_size, key_prefix, barrier_timeout)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize
agent_data = get_all(store, rank, key_prefix, world_size)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all
data = store.get(f"{prefix}{idx}")
RuntimeError: Socket Timeout
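
The Socket Timeout comes from the elastic agent's exit barrier: after the first rank fails, the agents get a fixed window (300 s by default; 311 s elapsed here) to check in before the wait is abandoned. For context, a minimal sketch of the programmatic equivalent of the torchrun launch that launch_agent is servicing; the entrypoint name is illustrative, and the rendezvous values mirror the command shown at the end of this log:

    from torch.distributed.launcher.api import LaunchConfig, elastic_launch

    def main_per_rank():
        # illustrative stand-in for the real training entrypoint
        ...

    # 8 nodes x 8 GPUs, static rendezvous on the master node, as in this run.
    config = LaunchConfig(
        min_nodes=8,
        max_nodes=8,
        nproc_per_node=8,
        rdzv_backend="static",
        rdzv_endpoint="compute-od-gpu-dy-p4d-24xlarge-8:12802",
    )
    elastic_launch(config, main_per_rank)()
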
ERROR:torch.distributed.elastic.multiprocessing.errors.error_handler:no error file defined for parent, to copy child error file (/tmp/torchelastic_h37v8tb9/none_qmhc8b_7/attempt_0/4/error.json)
Traceback (most recent call last):
File "/fsx/nousr/dalle2/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/run.py", line 761, in main
run(args)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/run.py", line 752, in run
elastic_launch(
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2022-07-16_00:26:36
host : compute-od-gpu-dy-p4d-24xlarge-9.hpc-1click-production2.pcluster.
rank : 12 (local_rank: 4)
exitcode : -6 (pid: 13988)
error_file: /tmp/torchelastic_h37v8tb9/none_qmhc8b_7/attempt_0/4/error.json
traceback : Traceback (most recent call last):
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130, in __call__
return self.main(*args, **kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055, in main
rv = self.invoke(ctx)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760, in invoke
return __callback(*args, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753, in main
initialize_training(config_file, accelerator)
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736, in initialize_training
train(
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503, in train
loss = trainer(text=txt, image_embed=img)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107, in inner
out = fn(model, *args, **kwargs)
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400, in forward
self.accelerator.backward(loss)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736, in backward
loss.backward(**kwargs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: NCCL communicator was aborted on rank 12. Original reason for failure was: [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807935 milliseconds before timing out.
============================================================
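
The root cause, then: some rank never entered (or never finished) broadcast SeqNum=29488, the NCCL watchdog on the other ranks fired at the 30-minute limit (Timeout(ms)=1800000), and the whole job was torn down. When the stall is a slow-but-healthy rank (checkpointing, data loading) rather than a crash, one common mitigation is to raise the process-group timeout. A sketch of doing that through accelerate, which this script launches with; the 2-hour figure is arbitrary, not what this run used:

    from datetime import timedelta

    from accelerate import Accelerator, InitProcessGroupKwargs

    # Raise the collective timeout from the default 30 min so the NCCL watchdog
    # does not kill ranks that are merely slow.
    accelerator = Accelerator(
        kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(hours=2))]
    )
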
Traceback (most recent call last):
File "/fsx/nousr/dalle2/bin/accelerate", line 8, in <module>
sys.exit(main())
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/commands/accelerate_cli.py", line 43, in main
args.func(args)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/commands/launch.py", line 562, in launch_command
multi_gpu_launcher(args)
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/commands/launch.py", line 306, in multi_gpu_launcher
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
subprocess.CalledProcessError: Command '['torchrun', '--nproc_per_node', '8', '--nnodes', '8', '--node_rank', '1', '--master_addr', 'compute-od-gpu-dy-p4d-24xlarge-8', '--master_port', '12802', '/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py', '--config_file', '/fsx/nousr/DALLE2-pytorch/configs/prior.json']' returned non-zero exit status 1.
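
When rerunning to diagnose which rank stalled, a usual next step is to turn on distributed debug output. These are standard NCCL / torch.distributed environment variables, not specific to this gist; they must be set before the process group initializes (a shell export works equally well):

    import os

    os.environ["NCCL_DEBUG"] = "INFO"                 # per-rank NCCL log lines
    os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"     # surface NCCL errors promptly
    os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"  # report mismatched collectives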