Created
July 18, 2022 15:29
-
-
Save nousr/be343a6084305ec9c3d793543520159e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading intelmpi version 2021.4.0 | |
go 8 | |
compute-od-gpu-dy-p4d-24xlarge-8 compute-od-gpu-dy-p4d-24xlarge-9 compute-od-gpu-dy-p4d-24xlarge-10 compute-od-gpu-dy-p4d-24xlarge-11 compute-od-gpu-dy-p4d-24xlarge-12 compute-od-gpu-dy-p4d-24xlarge-13 compute-od-gpu-dy-p4d-24xlarge-14 compute-od-gpu-dy-p4d-24xlarge-15 | |
myuser=zion | |
COUNT_NODE=8 | |
LD_LIBRARY_PATH = /opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:/opt/intel/mpi/2021.4.0//libfabric/lib:/opt/intel/mpi/2021.4.0//lib/release:/opt/intel/mpi/2021.4.0//lib:/opt/intel/mpi/2021.4.0/libfabric/lib:/opt/intel/mpi/2021.4.0/lib/release:/opt/intel/mpi/2021.4.0/lib | |
PATH = /opt/amazon/efa/bin:/opt/intel/mpi/2021.4.0//libfabric/bin:/opt/intel/mpi/2021.4.0//bin:/opt/intel/mpi/2021.4.0/libfabric/bin:/opt/intel/mpi/2021.4.0/bin:/fsx/nousr/dalle2/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/home/zion/.vscode-server/bin/b06ae3b2d2dbfe28bca3134cc6be65935cdfea6a/bin/remote-cli:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/opt/aws/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/home/zion/.local/bin:/home/zion/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin | |
which mpicc /opt/intel/mpi/2021.4.0/bin/mpicc | |
HOSTNAMES = compute-od-gpu-dy-p4d-24xlarge-8 compute-od-gpu-dy-p4d-24xlarge-9 compute-od-gpu-dy-p4d-24xlarge-10 compute-od-gpu-dy-p4d-24xlarge-11 compute-od-gpu-dy-p4d-24xlarge-12 compute-od-gpu-dy-p4d-24xlarge-13 compute-od-gpu-dy-p4d-24xlarge-14 compute-od-gpu-dy-p4d-24xlarge-15 | |
hostname = compute-od-gpu-dy-p4d-24xlarge-8 | |
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8 | |
MASTER_PORT= 12802 | |
myuser=zion | |
COUNT_NODE=8 | |
LD_LIBRARY_PATH = /opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:/opt/intel/mpi/2021.4.0//libfabric/lib:/opt/intel/mpi/2021.4.0//lib/release:/opt/intel/mpi/2021.4.0//lib:/opt/intel/mpi/2021.4.0/libfabric/lib:/opt/intel/mpi/2021.4.0/lib/release:/opt/intel/mpi/2021.4.0/lib | |
PATH = /opt/amazon/efa/bin:/opt/intel/mpi/2021.4.0//libfabric/bin:/opt/intel/mpi/2021.4.0//bin:/opt/intel/mpi/2021.4.0/libfabric/bin:/opt/intel/mpi/2021.4.0/bin:/fsx/nousr/dalle2/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/home/zion/.vscode-server/bin/b06ae3b2d2dbfe28bca3134cc6be65935cdfea6a/bin/remote-cli:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/opt/aws/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/home/zion/.local/bin:/home/zion/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin | |
myuser=zion | |
COUNT_NODE=8 | |
LD_LIBRARY_PATH = /opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:/opt/intel/mpi/2021.4.0//libfabric/lib:/opt/intel/mpi/2021.4.0//lib/release:/opt/intel/mpi/2021.4.0//lib:/opt/intel/mpi/2021.4.0/libfabric/lib:/opt/intel/mpi/2021.4.0/lib/release:/opt/intel/mpi/2021.4.0/lib | |
PATH = /opt/amazon/efa/bin:/opt/intel/mpi/2021.4.0//libfabric/bin:/opt/intel/mpi/2021.4.0//bin:/opt/intel/mpi/2021.4.0/libfabric/bin:/opt/intel/mpi/2021.4.0/bin:/fsx/nousr/dalle2/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/home/zion/.vscode-server/bin/b06ae3b2d2dbfe28bca3134cc6be65935cdfea6a/bin/remote-cli:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/opt/aws/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/home/zion/.local/bin:/home/zion/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin | |
myuser=zion | |
COUNT_NODE=8 | |
LD_LIBRARY_PATH = /opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:/opt/intel/mpi/2021.4.0//libfabric/lib:/opt/intel/mpi/2021.4.0//lib/release:/opt/intel/mpi/2021.4.0//lib:/opt/intel/mpi/2021.4.0/libfabric/lib:/opt/intel/mpi/2021.4.0/lib/release:/opt/intel/mpi/2021.4.0/lib | |
PATH = /opt/amazon/efa/bin:/opt/intel/mpi/2021.4.0//libfabric/bin:/opt/intel/mpi/2021.4.0//bin:/opt/intel/mpi/2021.4.0/libfabric/bin:/opt/intel/mpi/2021.4.0/bin:/fsx/nousr/dalle2/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/home/zion/.vscode-server/bin/b06ae3b2d2dbfe28bca3134cc6be65935cdfea6a/bin/remote-cli:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/opt/aws/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/home/zion/.local/bin:/home/zion/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin | |
which mpicc /opt/intel/mpi/2021.4.0/bin/mpicc | |
HOSTNAMES = compute-od-gpu-dy-p4d-24xlarge-8 compute-od-gpu-dy-p4d-24xlarge-9 compute-od-gpu-dy-p4d-24xlarge-10 compute-od-gpu-dy-p4d-24xlarge-11 compute-od-gpu-dy-p4d-24xlarge-12 compute-od-gpu-dy-p4d-24xlarge-13 compute-od-gpu-dy-p4d-24xlarge-14 compute-od-gpu-dy-p4d-24xlarge-15 | |
myuser=zion | |
myuser=zion | |
COUNT_NODE=8 | |
LD_LIBRARY_PATH = /opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:/opt/intel/mpi/2021.4.0//libfabric/lib:/opt/intel/mpi/2021.4.0//lib/release:/opt/intel/mpi/2021.4.0//lib:/opt/intel/mpi/2021.4.0/libfabric/lib:/opt/intel/mpi/2021.4.0/lib/release:/opt/intel/mpi/2021.4.0/lib | |
COUNT_NODE=8 | |
LD_LIBRARY_PATH = /opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:/opt/intel/mpi/2021.4.0//libfabric/lib:/opt/intel/mpi/2021.4.0//lib/release:/opt/intel/mpi/2021.4.0//lib:/opt/intel/mpi/2021.4.0/libfabric/lib:/opt/intel/mpi/2021.4.0/lib/release:/opt/intel/mpi/2021.4.0/lib | |
PATH = /opt/amazon/efa/bin:/opt/intel/mpi/2021.4.0//libfabric/bin:/opt/intel/mpi/2021.4.0//bin:/opt/intel/mpi/2021.4.0/libfabric/bin:/opt/intel/mpi/2021.4.0/bin:/fsx/nousr/dalle2/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/home/zion/.vscode-server/bin/b06ae3b2d2dbfe28bca3134cc6be65935cdfea6a/bin/remote-cli:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/opt/aws/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/home/zion/.local/bin:/home/zion/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin | |
PATH = /opt/amazon/efa/bin:/opt/intel/mpi/2021.4.0//libfabric/bin:/opt/intel/mpi/2021.4.0//bin:/opt/intel/mpi/2021.4.0/libfabric/bin:/opt/intel/mpi/2021.4.0/bin:/fsx/nousr/dalle2/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/home/zion/.vscode-server/bin/b06ae3b2d2dbfe28bca3134cc6be65935cdfea6a/bin/remote-cli:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/opt/aws/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/home/zion/.local/bin:/home/zion/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin | |
which mpicc /opt/intel/mpi/2021.4.0/bin/mpicc | |
HOSTNAMES = compute-od-gpu-dy-p4d-24xlarge-8 compute-od-gpu-dy-p4d-24xlarge-9 compute-od-gpu-dy-p4d-24xlarge-10 compute-od-gpu-dy-p4d-24xlarge-11 compute-od-gpu-dy-p4d-24xlarge-12 compute-od-gpu-dy-p4d-24xlarge-13 compute-od-gpu-dy-p4d-24xlarge-14 compute-od-gpu-dy-p4d-24xlarge-15 | |
myuser=zion | |
COUNT_NODE=8 | |
LD_LIBRARY_PATH = /opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:/opt/intel/mpi/2021.4.0//libfabric/lib:/opt/intel/mpi/2021.4.0//lib/release:/opt/intel/mpi/2021.4.0//lib:/opt/intel/mpi/2021.4.0/libfabric/lib:/opt/intel/mpi/2021.4.0/lib/release:/opt/intel/mpi/2021.4.0/lib | |
PATH = /opt/amazon/efa/bin:/opt/intel/mpi/2021.4.0//libfabric/bin:/opt/intel/mpi/2021.4.0//bin:/opt/intel/mpi/2021.4.0/libfabric/bin:/opt/intel/mpi/2021.4.0/bin:/fsx/nousr/dalle2/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/home/zion/.vscode-server/bin/b06ae3b2d2dbfe28bca3134cc6be65935cdfea6a/bin/remote-cli:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/opt/aws/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/home/zion/.local/bin:/home/zion/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin | |
which mpicc /opt/intel/mpi/2021.4.0/bin/mpicc | |
HOSTNAMES = compute-od-gpu-dy-p4d-24xlarge-8 compute-od-gpu-dy-p4d-24xlarge-9 compute-od-gpu-dy-p4d-24xlarge-10 compute-od-gpu-dy-p4d-24xlarge-11 compute-od-gpu-dy-p4d-24xlarge-12 compute-od-gpu-dy-p4d-24xlarge-13 compute-od-gpu-dy-p4d-24xlarge-14 compute-od-gpu-dy-p4d-24xlarge-15 | |
which mpicc /opt/intel/mpi/2021.4.0/bin/mpicc | |
HOSTNAMES = compute-od-gpu-dy-p4d-24xlarge-8 compute-od-gpu-dy-p4d-24xlarge-9 compute-od-gpu-dy-p4d-24xlarge-10 compute-od-gpu-dy-p4d-24xlarge-11 compute-od-gpu-dy-p4d-24xlarge-12 compute-od-gpu-dy-p4d-24xlarge-13 compute-od-gpu-dy-p4d-24xlarge-14 compute-od-gpu-dy-p4d-24xlarge-15 | |
which mpicc /opt/intel/mpi/2021.4.0/bin/mpicc | |
HOSTNAMES = compute-od-gpu-dy-p4d-24xlarge-8 compute-od-gpu-dy-p4d-24xlarge-9 compute-od-gpu-dy-p4d-24xlarge-10 compute-od-gpu-dy-p4d-24xlarge-11 compute-od-gpu-dy-p4d-24xlarge-12 compute-od-gpu-dy-p4d-24xlarge-13 compute-od-gpu-dy-p4d-24xlarge-14 compute-od-gpu-dy-p4d-24xlarge-15 | |
hostname = compute-od-gpu-dy-p4d-24xlarge-15 | |
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8 | |
MASTER_PORT= 12802 | |
myuser=zion | |
COUNT_NODE=8 | |
LD_LIBRARY_PATH = /opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:/opt/intel/mpi/2021.4.0//libfabric/lib:/opt/intel/mpi/2021.4.0//lib/release:/opt/intel/mpi/2021.4.0//lib:/opt/intel/mpi/2021.4.0/libfabric/lib:/opt/intel/mpi/2021.4.0/lib/release:/opt/intel/mpi/2021.4.0/lib | |
PATH = /opt/amazon/efa/bin:/opt/intel/mpi/2021.4.0//libfabric/bin:/opt/intel/mpi/2021.4.0//bin:/opt/intel/mpi/2021.4.0/libfabric/bin:/opt/intel/mpi/2021.4.0/bin:/fsx/nousr/dalle2/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/home/zion/.vscode-server/bin/b06ae3b2d2dbfe28bca3134cc6be65935cdfea6a/bin/remote-cli:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/sbin:/bin:/usr/sbin:/opt/aws/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin:/home/zion/.local/bin:/home/zion/bin:/opt/parallelcluster/pyenv/versions/3.7.10/envs/awsbatch_virtualenv/bin:/opt/slurm/bin | |
hostname = compute-od-gpu-dy-p4d-24xlarge-12 | |
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8 | |
MASTER_PORT= 12802 | |
which mpicc /opt/intel/mpi/2021.4.0/bin/mpicc | |
HOSTNAMES = compute-od-gpu-dy-p4d-24xlarge-8 compute-od-gpu-dy-p4d-24xlarge-9 compute-od-gpu-dy-p4d-24xlarge-10 compute-od-gpu-dy-p4d-24xlarge-11 compute-od-gpu-dy-p4d-24xlarge-12 compute-od-gpu-dy-p4d-24xlarge-13 compute-od-gpu-dy-p4d-24xlarge-14 compute-od-gpu-dy-p4d-24xlarge-15 | |
hostname = compute-od-gpu-dy-p4d-24xlarge-10 | |
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8 | |
MASTER_PORT= 12802 | |
hostname = compute-od-gpu-dy-p4d-24xlarge-9 | |
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8 | |
MASTER_PORT= 12802 | |
hostname = compute-od-gpu-dy-p4d-24xlarge-13 | |
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8 | |
MASTER_PORT= 12802 | |
which mpicc /opt/intel/mpi/2021.4.0/bin/mpicc | |
HOSTNAMES = compute-od-gpu-dy-p4d-24xlarge-8 compute-od-gpu-dy-p4d-24xlarge-9 compute-od-gpu-dy-p4d-24xlarge-10 compute-od-gpu-dy-p4d-24xlarge-11 compute-od-gpu-dy-p4d-24xlarge-12 compute-od-gpu-dy-p4d-24xlarge-13 compute-od-gpu-dy-p4d-24xlarge-14 compute-od-gpu-dy-p4d-24xlarge-15 | |
hostname = compute-od-gpu-dy-p4d-24xlarge-14 | |
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8 | |
MASTER_PORT= 12802 | |
hostname = compute-od-gpu-dy-p4d-24xlarge-11 | |
MASTER_ADDR= compute-od-gpu-dy-p4d-24xlarge-8 | |
MASTER_PORT= 12802 | |
THEID=0 | |
python3 version = Python 3.8.5 | |
THEID=7 | |
THEID=4 | |
THEID=1 | |
python3 version = Python 3.8.5 | |
THEID=5 | |
THEID=2 | |
python3 version = Python 3.8.5 | |
THEID=6 | |
python3 version = Python 3.8.5 | |
python3 version = Python 3.8.5 | |
python3 version = Python 3.8.5 | |
python3 version = Python 3.8.5 | |
THEID=3 | |
python3 version = Python 3.8.5 | |
Loading configuration from /fsx/nousr/DALLE2-pytorch/configs/prior.json | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0> | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO Using network AWS Libfabric | |
NCCL version 2.12.7+cuda11.4 | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:13870 [1] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0> | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:13872 [3] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0> | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:13870 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:13870 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:13870 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:13870 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:13872 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:13872 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:13872 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:13872 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:13989 [2] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0> | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:13991 [4] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0> | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:13921 [0] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0> | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:13989 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:13989 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:13989 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:13989 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:13991 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:13991 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:13991 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:13991 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13771 [7] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0> | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:13980 [4] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0> | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:13921 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:13921 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:13921 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:13921 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:13927 [6] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0> | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:13927 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:13927 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:13927 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:13927 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:13977 [1] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0> | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13771 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13771 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13771 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13771 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:13985 [1] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:13980 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:13980 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:13980 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:13980 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:13977 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:13977 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:13977 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:13977 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:13949 [5] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0> | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:13985 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:13985 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:13985 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:13926 [5] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0> | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:13985 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13764 [0] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0> | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:13926 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:13926 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:13926 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:13926 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:13922 [1] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0> | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13764 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13764 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13764 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13764 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:13989 [7] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:13922 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:13922 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:13922 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:13922 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:13949 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:13949 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:13949 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:13949 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:13979 [3] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0> | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:13986 [4] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:13979 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:13979 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:13979 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:13979 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:13923 [2] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0> | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:13923 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:13923 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:13923 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:13923 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13765 [1] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0> | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:13987 [3] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13765 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13765 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13765 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13765 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:13989 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:13989 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:13989 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:13989 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:13986 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:13986 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:13986 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:13986 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:13987 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:13987 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:13987 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:13987 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:13873 [4] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0> | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:13873 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:13873 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:13873 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:13873 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:13876 [7] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0> | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:13876 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:13876 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:13876 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:13876 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:13871 [2] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0> | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:13875 [6] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0> | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:13871 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:13871 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:13871 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:13871 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:13875 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:13875 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:13875 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:13875 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:13874 [5] NCCL INFO Bootstrap : Using eth0:172.31.234.190<0> | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:13924 [3] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0> | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:13874 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:13874 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:13874 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:13874 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:13924 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:13924 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:13924 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:13924 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:13994 [7] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0> | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:13925 [4] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0> | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:13925 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:13925 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:13925 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:13925 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:13994 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:13994 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:13994 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:13994 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13769 [5] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0> | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:13981 [5] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0> | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13770 [6] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0> | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13767 [3] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0> | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13766 [2] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0> | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:13981 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:13981 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:13981 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13769 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13769 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13769 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:13981 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13769 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13770 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13770 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13770 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13770 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:13983 [7] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0> | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13767 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13767 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13767 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13767 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13766 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13766 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13766 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13766 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13768 [4] NCCL INFO Bootstrap : Using eth0:172.31.236.214<0> | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:13928 [7] NCCL INFO Bootstrap : Using eth0:172.31.230.141<0> | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:13978 [2] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0> | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:13983 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:13983 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:13983 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:13983 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:13984 [0] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:13987 [5] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:13976 [0] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0> | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13768 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13768 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13768 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13768 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:13928 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:13928 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:13928 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:13928 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:13978 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:13987 [0] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0> | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:13978 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:13978 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:13978 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:13984 [2] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:13986 [2] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:13993 [6] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0> | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:13976 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:13976 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:13976 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:13976 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:13992 [7] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:13987 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:13984 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:13987 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:13987 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:13984 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:13984 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:13987 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:13984 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:13982 [6] NCCL INFO Bootstrap : Using eth0:172.31.233.218<0> | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:13987 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:13987 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:13987 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:13987 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:13984 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:13986 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:13988 [4] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:13986 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:13986 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:13984 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:13984 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:13993 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:13993 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:13986 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:13984 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:13993 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:13993 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:13992 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:13992 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:13991 [6] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:13992 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:13992 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:13982 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:13982 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:13982 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:13982 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:13992 [5] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0> | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:13988 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:13988 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:13988 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:13988 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:13991 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:13991 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:13991 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:13991 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:13992 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:13992 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:13992 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:13992 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:13951 [7] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0> | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:13985 [3] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:13988 [1] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0> | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:13951 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:13951 [7] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:13951 [7] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:13951 [7] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:13985 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:13985 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:13985 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:13983 [1] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:13985 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:13990 [3] NCCL INFO Bootstrap : Using eth0:172.31.229.104<0> | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:13945 [1] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0> | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:13988 [6] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:13988 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:13988 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:13988 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:13988 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:13983 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:13983 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:13983 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:13983 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:13990 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:13990 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:13990 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:13990 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:13945 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:13945 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:13945 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:13945 [1] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:13988 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:13988 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:13988 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:13988 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:13982 [0] NCCL INFO Bootstrap : Using eth0:172.31.239.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:13989 [5] NCCL INFO Bootstrap : Using eth0:172.31.225.29<0> | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:13948 [4] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0> | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:13982 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:13982 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:13982 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:13982 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:13989 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:13989 [5] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:13989 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:13989 [5] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:13948 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:13948 [4] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:13948 [4] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:13948 [4] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:13946 [2] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0> | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:13950 [6] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0> | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:13946 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:13946 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:13946 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:13946 [2] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:13950 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:13950 [6] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:13950 [6] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:13950 [6] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:13944 [0] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0> | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:13944 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:13944 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:13944 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:13944 [0] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:13947 [3] NCCL INFO Bootstrap : Using eth0:172.31.232.149<0> | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:13947 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:13947 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:13947 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.4.0aws | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:13947 [3] NCCL INFO NET/OFI Running on p4d.24xlarge platform, Setting NCCL_TOPO_FILE environment variable to /opt/aws-ofi-nccl/share/aws-ofi-nccl/xml/p4d-24xl-topo.xml | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:13872 [3] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:13872 [3] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:13870 [1] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:13870 [1] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:13873 [4] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:13873 [4] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:13876 [7] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:13876 [7] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:13871 [2] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:13871 [2] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:13874 [5] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:13874 [5] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:13875 [6] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:13875 [6] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:13948 [4] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:13945 [1] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:13947 [3] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:13944 [0] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:13951 [7] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:13946 [2] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:13950 [6] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:13949 [5] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:13945 [1] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:13944 [0] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:13947 [3] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:13948 [4] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:13951 [7] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:13946 [2] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:13950 [6] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:13949 [5] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:13984 [2] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:13988 [6] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:13986 [4] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:13987 [5] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:13983 [1] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:13985 [3] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:13982 [0] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:13988 [6] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:13984 [2] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:13986 [4] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:13987 [5] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:13983 [1] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:13985 [3] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:13982 [0] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:13989 [7] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:13989 [7] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:13924 [3] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:13924 [3] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:13928 [7] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:13923 [2] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:13928 [7] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:13922 [1] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:13923 [2] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:13922 [1] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:13926 [5] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:13927 [6] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:13921 [0] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:13925 [4] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:13926 [5] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:13927 [6] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:13921 [0] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:13925 [4] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:13988 [4] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:13986 [2] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:13988 [4] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:13986 [2] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:13992 [7] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:13992 [7] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:13991 [6] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:13987 [3] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:13989 [5] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:13991 [6] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:13985 [1] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:13987 [3] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:13989 [5] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:13985 [1] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:13984 [0] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:13984 [0] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:13978 [2] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:13977 [1] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:13983 [7] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:13982 [6] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:13980 [4] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:13978 [2] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:13979 [3] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:13977 [1] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:13982 [6] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:13976 [0] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:13980 [4] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:13983 [7] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:13979 [3] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:13976 [0] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:13981 [5] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:13981 [5] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13765 [1] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13768 [4] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13771 [7] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13769 [5] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13767 [3] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13766 [2] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13769 [5] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13765 [1] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13771 [7] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13768 [4] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13767 [3] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13766 [2] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13764 [0] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13770 [6] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13764 [0] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13770 [6] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:13990 [3] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:13993 [6] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:13989 [2] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:13994 [7] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:13990 [3] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:13993 [6] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:13989 [2] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:13994 [7] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:13988 [1] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:13991 [4] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:13988 [1] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:13987 [0] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:13992 [5] NCCL INFO NET/OFI Selected Provider is efa | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:13987 [0] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:13991 [4] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:13992 [5] NCCL INFO Using network AWS Libfabric | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Setting affinity for GPU 0 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Setting affinity for GPU 1 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Setting affinity for GPU 0 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Setting affinity for GPU 1 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Setting affinity for GPU 3 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Setting affinity for GPU 2 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Setting affinity for GPU 1 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Setting affinity for GPU 3 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Setting affinity for GPU 3 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Setting affinity for GPU 2 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Setting affinity for GPU 0 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Setting affinity for GPU 2 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Setting affinity for GPU 1 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Setting affinity for GPU 2 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Setting affinity for GPU 0 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Setting affinity for GPU 3 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Setting affinity for GPU 2 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Setting affinity for GPU 1 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Setting affinity for GPU 0 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Setting affinity for GPU 3 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Setting affinity for GPU 3 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Setting affinity for GPU 1 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Setting affinity for GPU 3 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Setting affinity for GPU 1 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Setting affinity for GPU 2 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Setting affinity for GPU 0 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Setting affinity for GPU 2 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Setting affinity for GPU 0 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Setting affinity for GPU 2 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Setting affinity for GPU 3 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ff000000 | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Setting affinity for GPU 1 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Setting affinity for GPU 0 to ffffff | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Trees [0] 41/-1/-1->40->49 [1] 41/-1/-1->40->47 [2] 41/-1/-1->40->47 [3] 41/-1/-1->40->47 [4] 41/32/-1->40->25 [5] 41/-1/-1->40->47 [6] 41/-1/-1->40->47 [7] 41/-1/-1->40->47 | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Trees [0] 42/-1/-1->41->40 [1] -1/-1/-1->41->40 [2] 42/-1/-1->41->40 [3] 42/-1/-1->41->40 [4] 42/48/-1->41->40 [5] -1/-1/-1->41->40 [6] 42/-1/-1->41->40 [7] 42/-1/-1->41->40 | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Trees [0] 43/-1/-1->42->41 [1] 43/-1/-1->42->51 [2] 43/-1/-1->42->41 [3] 43/-1/-1->42->41 [4] 43/-1/-1->42->41 [5] 43/34/-1->42->27 [6] 43/-1/-1->42->41 [7] 43/-1/-1->42->41 | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Trees [0] 39/-1/-1->38->37 [1] 39/-1/-1->38->37 [2] 39/-1/-1->38->37 [3] 39/54/-1->38->6 [4] 39/-1/-1->38->37 [5] 39/-1/-1->38->37 [6] 39/-1/-1->38->37 [7] 39/-1/-1->38->46 | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Trees [0] -1/-1/-1->39->38 [1] 32/-1/-1->39->38 [2] 32/-1/-1->39->38 [3] 32/22/-1->39->38 [4] -1/-1/-1->39->38 [5] 32/-1/-1->39->38 [6] 32/-1/-1->39->38 [7] 32/-1/-1->39->38 | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Trees [0] 38/-1/-1->37->36 [1] 38/-1/-1->37->36 [2] 38/20/-1->37->36 [3] -1/-1/-1->37->36 [4] 38/-1/-1->37->36 [5] 38/-1/-1->37->36 [6] 38/-1/-1->37->36 [7] -1/-1/-1->37->36 | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Trees [0] 37/-1/-1->36->35 [1] 37/-1/-1->36->35 [2] 37/52/-1->36->4 [3] 37/-1/-1->36->35 [4] 37/-1/-1->36->35 [5] 37/-1/-1->36->35 [6] 37/-1/-1->36->44 [7] 37/-1/-1->36->35 | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Trees [0] 36/-1/-1->35->34 [1] 36/18/-1->35->34 [2] -1/-1/-1->35->34 [3] 36/-1/-1->35->34 [4] 36/-1/-1->35->34 [5] 36/-1/-1->35->34 [6] -1/-1/-1->35->34 [7] 36/-1/-1->35->34 | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Trees [0] 44/-1/-1->43->42 [1] 44/-1/-1->43->42 [2] -1/-1/-1->43->42 [3] 44/-1/-1->43->42 [4] 44/-1/-1->43->42 [5] 44/50/-1->43->42 [6] -1/-1/-1->43->42 [7] 44/-1/-1->43->42 | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Trees [0] 35/-1/-1->34->33 [1] 35/50/-1->34->2 [2] 35/-1/-1->34->33 [3] 35/-1/-1->34->33 [4] 35/-1/-1->34->33 [5] 35/-1/-1->34->42 [6] 35/-1/-1->34->33 [7] 35/-1/-1->34->33 | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Trees [0] -1/-1/-1->31->30 [1] 24/-1/-1->31->30 [2] 24/-1/-1->31->30 [3] 24/-1/-1->31->30 [4] -1/-1/-1->31->30 [5] 24/-1/-1->31->30 [6] 24/-1/-1->31->30 [7] 24/46/-1->31->30 | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Trees [0] 31/-1/-1->30->29 [1] 31/-1/-1->30->29 [2] 31/-1/-1->30->29 [3] 31/-1/-1->30->22 [4] 31/-1/-1->30->29 [5] 31/-1/-1->30->29 [6] 31/-1/-1->30->29 [7] 31/14/-1->30->62 | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Trees [0] 46/-1/-1->45->44 [1] 46/-1/-1->45->44 [2] 46/-1/-1->45->44 [3] -1/-1/-1->45->44 [4] 46/-1/-1->45->44 [5] 46/-1/-1->45->44 [6] 46/52/-1->45->44 [7] -1/-1/-1->45->44 | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Trees [0] -1/-1/-1->47->46 [1] 40/-1/-1->47->46 [2] 40/-1/-1->47->46 [3] 40/-1/-1->47->46 [4] -1/-1/-1->47->46 [5] 40/-1/-1->47->46 [6] 40/-1/-1->47->46 [7] 40/54/-1->47->46 | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Trees [0] 45/-1/-1->44->43 [1] 45/-1/-1->44->43 [2] 45/-1/-1->44->53 [3] 45/-1/-1->44->43 [4] 45/-1/-1->44->43 [5] 45/-1/-1->44->43 [6] 45/36/-1->44->29 [7] 45/-1/-1->44->43 | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Trees [0] 34/16/-1->33->32 [1] -1/-1/-1->33->32 [2] 34/-1/-1->33->32 [3] 34/-1/-1->33->32 [4] 34/-1/-1->33->32 [5] -1/-1/-1->33->32 [6] 34/-1/-1->33->32 [7] 34/-1/-1->33->32 | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Trees [0] 33/48/-1->32->0 [1] 33/-1/-1->32->39 [2] 33/-1/-1->32->39 [3] 33/-1/-1->32->39 [4] 33/-1/-1->32->40 [5] 33/-1/-1->32->39 [6] 33/-1/-1->32->39 [7] 33/-1/-1->32->39 | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Trees [0] 30/-1/-1->29->28 [1] 30/-1/-1->29->28 [2] 30/-1/-1->29->28 [3] -1/-1/-1->29->28 [4] 30/-1/-1->29->28 [5] 30/-1/-1->29->28 [6] 30/44/-1->29->28 [7] -1/-1/-1->29->28 | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Trees [0] 26/-1/-1->25->24 [1] -1/-1/-1->25->24 [2] 26/-1/-1->25->24 [3] 26/-1/-1->25->24 [4] 26/40/-1->25->24 [5] -1/-1/-1->25->24 [6] 26/-1/-1->25->24 [7] 26/-1/-1->25->24 | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Trees [0] 47/-1/-1->46->45 [1] 47/-1/-1->46->45 [2] 47/-1/-1->46->45 [3] 47/-1/-1->46->55 [4] 47/-1/-1->46->45 [5] 47/-1/-1->46->45 [6] 47/-1/-1->46->45 [7] 47/38/-1->46->31 | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Trees [0] 51/-1/-1->50->49 [1] 51/58/-1->50->34 [2] 51/-1/-1->50->49 [3] 51/-1/-1->50->49 [4] 51/-1/-1->50->49 [5] 51/-1/-1->50->43 [6] 51/-1/-1->50->49 [7] 51/-1/-1->50->49 | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 8/-1/-1->15->14 [2] 8/-1/-1->15->14 [3] 8/-1/-1->15->14 [4] -1/-1/-1->15->14 [5] 8/-1/-1->15->14 [6] 8/-1/-1->15->14 [7] 8/22/-1->15->14 | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 15/-1/-1->14->13 [3] 15/-1/-1->14->23 [4] 15/-1/-1->14->13 [5] 15/-1/-1->14->13 [6] 15/-1/-1->14->13 [7] 15/6/-1->14->30 | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Trees [0] -1/-1/-1->23->22 [1] 16/-1/-1->23->22 [2] 16/-1/-1->23->22 [3] 16/14/-1->23->22 [4] -1/-1/-1->23->22 [5] 16/-1/-1->23->22 [6] 16/-1/-1->23->22 [7] 16/-1/-1->23->22 | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Trees [0] 23/-1/-1->22->21 [1] 23/-1/-1->22->21 [2] 23/-1/-1->22->21 [3] 23/30/-1->22->39 [4] 23/-1/-1->22->21 [5] 23/-1/-1->22->21 [6] 23/-1/-1->22->21 [7] 23/-1/-1->22->15 | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Trees [0] 50/40/-1->49->48 [1] -1/-1/-1->49->48 [2] 50/-1/-1->49->48 [3] 50/-1/-1->49->48 [4] 50/-1/-1->49->48 [5] -1/-1/-1->49->48 [6] 50/-1/-1->49->48 [7] 50/-1/-1->49->48 | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Trees [0] 49/56/-1->48->32 [1] 49/-1/-1->48->55 [2] 49/-1/-1->48->55 [3] 49/-1/-1->48->55 [4] 49/-1/-1->48->41 [5] 49/-1/-1->48->55 [6] 49/-1/-1->48->55 [7] 49/-1/-1->48->55 | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Trees [0] 25/-1/-1->24->16 [1] 25/-1/-1->24->31 [2] 25/-1/-1->24->31 [3] 25/-1/-1->24->31 [4] 25/8/-1->24->56 [5] 25/-1/-1->24->31 [6] 25/-1/-1->24->31 [7] 25/-1/-1->24->31 | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->12 [2] 14/-1/-1->13->12 [3] -1/-1/-1->13->12 [4] 14/-1/-1->13->12 [5] 14/-1/-1->13->12 [6] 14/20/-1->13->12 [7] -1/-1/-1->13->12 | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Trees [0] 22/-1/-1->21->20 [1] 22/-1/-1->21->20 [2] 22/12/-1->21->20 [3] -1/-1/-1->21->20 [4] 22/-1/-1->21->20 [5] 22/-1/-1->21->20 [6] 22/-1/-1->21->20 [7] -1/-1/-1->21->20 | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Trees [0] 21/-1/-1->20->19 [1] 21/-1/-1->20->19 [2] 21/28/-1->20->37 [3] 21/-1/-1->20->19 [4] 21/-1/-1->20->19 [5] 21/-1/-1->20->19 [6] 21/-1/-1->20->13 [7] 21/-1/-1->20->19 | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Trees [0] 19/-1/-1->18->17 [1] 19/26/-1->18->35 [2] 19/-1/-1->18->17 [3] 19/-1/-1->18->17 [4] 19/-1/-1->18->17 [5] 19/-1/-1->18->11 [6] 19/-1/-1->18->17 [7] 19/-1/-1->18->17 | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Trees [0] 20/-1/-1->19->18 [1] 20/10/-1->19->18 [2] -1/-1/-1->19->18 [3] 20/-1/-1->19->18 [4] 20/-1/-1->19->18 [5] 20/-1/-1->19->18 [6] -1/-1/-1->19->18 [7] 20/-1/-1->19->18 | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] 0/-1/-1->7->6 [2] 0/-1/-1->7->6 [3] 0/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] 0/-1/-1->7->6 [6] 0/-1/-1->7->6 [7] 0/-1/-1->7->6 | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Trees [0] 27/-1/-1->26->25 [1] 27/-1/-1->26->18 [2] 27/-1/-1->26->25 [3] 27/-1/-1->26->25 [4] 27/-1/-1->26->25 [5] 27/10/-1->26->58 [6] 27/-1/-1->26->25 [7] 27/-1/-1->26->25 | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Trees [0] 29/-1/-1->28->27 [1] 29/-1/-1->28->27 [2] 29/-1/-1->28->20 [3] 29/-1/-1->28->27 [4] 29/-1/-1->28->27 [5] 29/-1/-1->28->27 [6] 29/12/-1->28->60 [7] 29/-1/-1->28->27 | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Trees [0] 28/-1/-1->27->26 [1] 28/-1/-1->27->26 [2] -1/-1/-1->27->26 [3] 28/-1/-1->27->26 [4] 28/-1/-1->27->26 [5] 28/42/-1->27->26 [6] -1/-1/-1->27->26 [7] 28/-1/-1->27->26 | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Trees [0] 13/-1/-1->12->11 [1] 13/-1/-1->12->11 [2] 13/-1/-1->12->21 [3] 13/-1/-1->12->11 [4] 13/-1/-1->12->11 [5] 13/-1/-1->12->11 [6] 13/4/-1->12->28 [7] 13/-1/-1->12->11 | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Trees [0] 12/-1/-1->11->10 [1] 12/-1/-1->11->10 [2] -1/-1/-1->11->10 [3] 12/-1/-1->11->10 [4] 12/-1/-1->11->10 [5] 12/18/-1->11->10 [6] -1/-1/-1->11->10 [7] 12/-1/-1->11->10 | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Trees [0] 17/24/-1->16->33 [1] 17/-1/-1->16->23 [2] 17/-1/-1->16->23 [3] 17/-1/-1->16->23 [4] 17/-1/-1->16->9 [5] 17/-1/-1->16->23 [6] 17/-1/-1->16->23 [7] 17/-1/-1->16->23 | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/38/-1->6->-1 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->14 | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] -1/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] -1/-1/-1->5->4 | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Trees [0] -1/-1/-1->55->54 [1] 48/-1/-1->55->54 [2] 48/-1/-1->55->54 [3] 48/46/-1->55->54 [4] -1/-1/-1->55->54 [5] 48/-1/-1->55->54 [6] 48/-1/-1->55->54 [7] 48/-1/-1->55->54 | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Trees [0] 55/-1/-1->54->53 [1] 55/-1/-1->54->53 [2] 55/-1/-1->54->53 [3] 55/62/-1->54->38 [4] 55/-1/-1->54->53 [5] 55/-1/-1->54->53 [6] 55/-1/-1->54->53 [7] 55/-1/-1->54->47 | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->19 [2] 11/-1/-1->10->9 [3] 11/-1/-1->10->9 [4] 11/-1/-1->10->9 [5] 11/2/-1->10->26 [6] 11/-1/-1->10->9 [7] 11/-1/-1->10->9 | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] -1/-1/-1->9->8 [2] 10/-1/-1->9->8 [3] 10/-1/-1->9->8 [4] 10/16/-1->9->8 [5] -1/-1/-1->9->8 [6] 10/-1/-1->9->8 [7] 10/-1/-1->9->8 | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Trees [0] 18/8/-1->17->16 [1] -1/-1/-1->17->16 [2] 18/-1/-1->17->16 [3] 18/-1/-1->17->16 [4] 18/-1/-1->17->16 [5] -1/-1/-1->17->16 [6] 18/-1/-1->17->16 [7] 18/-1/-1->17->16 | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/36/-1->4->-1 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->12 [7] 5/-1/-1->4->3 | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Trees [0] 54/-1/-1->53->52 [1] 54/-1/-1->53->52 [2] 54/44/-1->53->52 [3] -1/-1/-1->53->52 [4] 54/-1/-1->53->52 [5] 54/-1/-1->53->52 [6] 54/-1/-1->53->52 [7] -1/-1/-1->53->52 | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Trees [0] 52/-1/-1->51->50 [1] 52/42/-1->51->50 [2] -1/-1/-1->51->50 [3] 52/-1/-1->51->50 [4] 52/-1/-1->51->50 [5] 52/-1/-1->51->50 [6] -1/-1/-1->51->50 [7] 52/-1/-1->51->50 | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Trees [0] -1/-1/-1->63->62 [1] 56/-1/-1->63->62 [2] 56/-1/-1->63->62 [3] 56/-1/-1->63->62 [4] -1/-1/-1->63->62 [5] 56/-1/-1->63->62 [6] 56/-1/-1->63->62 [7] 56/-1/-1->63->62 | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Trees [0] 58/-1/-1->57->56 [1] -1/-1/-1->57->56 [2] 58/-1/-1->57->56 [3] 58/-1/-1->57->56 [4] 58/-1/-1->57->56 [5] -1/-1/-1->57->56 [6] 58/-1/-1->57->56 [7] 58/-1/-1->57->56 | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Trees [0] 57/-1/-1->56->48 [1] 57/-1/-1->56->63 [2] 57/-1/-1->56->63 [3] 57/-1/-1->56->63 [4] 57/24/-1->56->-1 [5] 57/-1/-1->56->63 [6] 57/-1/-1->56->63 [7] 57/-1/-1->56->63 | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Trees [0] 59/-1/-1->58->57 [1] 59/-1/-1->58->50 [2] 59/-1/-1->58->57 [3] 59/-1/-1->58->57 [4] 59/-1/-1->58->57 [5] 59/26/-1->58->-1 [6] 59/-1/-1->58->57 [7] 59/-1/-1->58->57 | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Trees [0] 9/-1/-1->8->17 [1] 9/-1/-1->8->15 [2] 9/-1/-1->8->15 [3] 9/-1/-1->8->15 [4] 9/0/-1->8->24 [5] 9/-1/-1->8->15 [6] 9/-1/-1->8->15 [7] 9/-1/-1->8->15 | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/34/-1->2->-1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->10 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] 4/-1/-1->3->2 | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Trees [0] 53/-1/-1->52->51 [1] 53/-1/-1->52->51 [2] 53/60/-1->52->36 [3] 53/-1/-1->52->51 [4] 53/-1/-1->52->51 [5] 53/-1/-1->52->51 [6] 53/-1/-1->52->45 [7] 53/-1/-1->52->51 | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Trees [0] 62/-1/-1->61->60 [1] 62/-1/-1->61->60 [2] 62/-1/-1->61->60 [3] -1/-1/-1->61->60 [4] 62/-1/-1->61->60 [5] 62/-1/-1->61->60 [6] 62/-1/-1->61->60 [7] -1/-1/-1->61->60 | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Trees [0] 63/-1/-1->62->61 [1] 63/-1/-1->62->61 [2] 63/-1/-1->62->61 [3] 63/-1/-1->62->54 [4] 63/-1/-1->62->61 [5] 63/-1/-1->62->61 [6] 63/-1/-1->62->61 [7] 63/30/-1->62->-1 | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 00/08 : 0 7 6 5 4 3 2 1 8 15 14 13 12 11 10 9 16 23 22 21 | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Trees [0] 61/-1/-1->60->59 [1] 61/-1/-1->60->59 [2] 61/-1/-1->60->52 [3] 61/-1/-1->60->59 [4] 61/-1/-1->60->59 [5] 61/-1/-1->60->59 [6] 61/28/-1->60->-1 [7] 61/-1/-1->60->59 | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 01/08 : 0 3 10 15 14 13 12 9 8 11 18 23 22 21 20 17 16 19 26 31 | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Trees [0] 60/-1/-1->59->58 [1] 60/-1/-1->59->58 [2] -1/-1/-1->59->58 [3] 60/-1/-1->59->58 [4] 60/-1/-1->59->58 [5] 60/-1/-1->59->58 [6] -1/-1/-1->59->58 [7] 60/-1/-1->59->58 | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 02/08 : 0 7 6 5 12 11 10 9 8 15 14 13 20 19 18 17 16 23 22 21 | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] -1/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] -1/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 03/08 : 0 5 4 7 14 11 10 9 8 13 12 15 22 19 18 17 16 21 20 23 | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 04/08 : 0 7 6 5 4 3 2 1 8 15 14 13 12 11 10 9 16 23 22 21 | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 05/08 : 0 3 10 15 14 13 12 9 8 11 18 23 22 21 20 17 16 19 26 31 | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 06/08 : 0 7 6 5 12 11 10 9 8 15 14 13 20 19 18 17 16 23 22 21 | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 07/08 : 0 5 4 7 14 11 10 9 8 13 12 15 22 19 18 17 16 21 20 23 | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Trees [0] 1/32/-1->0->-1 [1] 1/-1/-1->0->7 [2] 1/-1/-1->0->7 [3] 1/-1/-1->0->7 [4] 1/-1/-1->0->8 [5] 1/-1/-1->0->7 [6] 1/-1/-1->0->7 [7] 1/-1/-1->0->7 | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 01 : 40[101c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 01 : 50[201c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 01 : 42[201c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 03 : 36[901c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 03 : 20[901c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 01 : 56[101c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 01 : 48[101c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 01 : 58[201c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 03 : 4[901c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 01 : 24[101c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 01 : 18[201c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 03 : 12[901c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 03 : 44[901c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 01 : 2[201c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 01 : 34[201c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 01 : 26[201c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 01 : 10[201c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 05 : 40[101c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 03 : 60[901c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 03 : 28[901c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 01 : 32[101c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 01 : 0[101c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 01 : 16[101c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 05 : 50[201c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 03 : 52[901c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 00/0 : 41[101d0] -> 48[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 05 : 42[201c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 04/0 : 41[101d0] -> 48[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 01 : 8[101c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 00/0 : 49[101d0] -> 56[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 05 : 48[101c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 04/0 : 49[101d0] -> 56[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 05 : 56[101c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 07 : 36[901c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 07 : 20[901c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 05 : 58[201c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 07 : 4[901c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 00/0 : 25[101d0] -> 32[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 05 : 24[101c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 04/0 : 25[101d0] -> 32[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 07 : 44[901c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 00/0 : 57[101d0] -> 0[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 04/0 : 57[101d0] -> 0[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 05 : 18[201c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 05 : 34[201c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 05 : 2[201c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 05 : 26[201c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 07 : 12[901c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 07 : 28[901c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 05 : 10[201c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 07 : 60[901c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 05 : 32[101c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 05 : 0[101c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 00/0 : 1[101d0] -> 8[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 04/0 : 1[101d0] -> 8[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 05 : 16[101c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 07 : 52[901c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 00/0 : 33[101d0] -> 40[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 04/0 : 33[101d0] -> 40[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 00/0 : 17[101d0] -> 24[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 04/0 : 17[101d0] -> 24[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 05 : 8[101c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 00/0 : 9[101d0] -> 16[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 04/0 : 9[101d0] -> 16[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 03 : 48[101c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 03 : 40[101c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 03 : 24[101c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 07 : 40[101c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 03 : 32[101c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 03 : 16[101c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 07 : 48[101c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 03 : 0[101c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 03 : 8[101c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 07 : 24[101c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 07 : 32[101c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 07 : 0[101c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 01/0 : 51[201d0] -> 58[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 07 : 8[101c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 07 : 16[101c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 05/0 : 51[201d0] -> 58[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 01/0 : 43[201d0] -> 50[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 05/0 : 43[201d0] -> 50[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 01/0 : 27[201d0] -> 34[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 05/0 : 27[201d0] -> 34[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 03 : 56[101c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 01/0 : 35[201d0] -> 42[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 05/0 : 35[201d0] -> 42[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 01/0 : 3[201d0] -> 10[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 07 : 56[101c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 05/0 : 3[201d0] -> 10[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 01/0 : 11[201d0] -> 18[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 05/0 : 11[201d0] -> 18[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 01/0 : 19[201d0] -> 26[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 05/0 : 19[201d0] -> 26[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 03/0 : 23[a01d0] -> 30[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 07/0 : 23[a01d0] -> 30[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 03/0 : 63[a01d0] -> 6[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 07/0 : 63[a01d0] -> 6[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 03/0 : 31[a01d0] -> 38[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 07/0 : 31[a01d0] -> 38[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 03/0 : 7[a01d0] -> 14[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 07/0 : 7[a01d0] -> 14[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 03/0 : 47[a01d0] -> 54[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 07/0 : 47[a01d0] -> 54[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 01/0 : 59[201d0] -> 2[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 03/0 : 15[a01d0] -> 22[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 05/0 : 59[201d0] -> 2[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 07/0 : 15[a01d0] -> 22[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 03/0 : 39[a01d0] -> 46[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 07/0 : 39[a01d0] -> 46[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 03/0 : 55[a01d0] -> 62[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 07/0 : 55[a01d0] -> 62[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 02/0 : 45[901d0] -> 52[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 06/0 : 45[901d0] -> 52[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 02/0 : 53[901d0] -> 60[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 02/0 : 29[901d0] -> 36[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 06/0 : 29[901d0] -> 36[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 02/0 : 37[901d0] -> 44[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 06/0 : 37[901d0] -> 44[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 02/0 : 13[901d0] -> 20[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 06/0 : 13[901d0] -> 20[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 02/0 : 5[901d0] -> 12[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 06/0 : 5[901d0] -> 12[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 02/0 : 21[901d0] -> 28[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 06/0 : 21[901d0] -> 28[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 03/0 : 47[a01d0] -> 54[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 07/0 : 47[a01d0] -> 54[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 03/0 : 55[a01d0] -> 62[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 03/0 : 31[a01d0] -> 38[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 07/0 : 31[a01d0] -> 38[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 03/0 : 23[a01d0] -> 30[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 03/0 : 15[a01d0] -> 22[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 07/0 : 15[a01d0] -> 22[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 02/0 : 61[901d0] -> 4[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 02/0 : 29[901d0] -> 36[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 02/0 : 13[901d0] -> 20[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 02/0 : 61[901d0] -> 4[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 02/0 : 37[901d0] -> 44[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 02/0 : 5[901d0] -> 12[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 02/0 : 45[901d0] -> 52[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 02/0 : 21[901d0] -> 28[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 02/0 : 53[901d0] -> 60[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 00/0 : 33[101d0] -> 40[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 01/0 : 27[201d0] -> 34[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 00/0 : 17[101d0] -> 24[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 01/0 : 59[201d0] -> 2[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 01/0 : 3[201d0] -> 10[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 00/0 : 41[101d0] -> 48[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 01/0 : 11[201d0] -> 18[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 05/0 : 11[201d0] -> 18[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 01/0 : 35[201d0] -> 42[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 01/0 : 43[201d0] -> 50[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 00/0 : 57[101d0] -> 0[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 00/0 : 25[101d0] -> 32[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 01/0 : 19[201d0] -> 26[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 00/0 : 9[101d0] -> 16[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 05/0 : 19[201d0] -> 26[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 00/0 : 1[101d0] -> 8[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 04/0 : 1[101d0] -> 8[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 00 : 8[101c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 01/0 : 51[201d0] -> 58[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 02 : 8[101c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 04 : 8[101c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 00/0 : 49[101d0] -> 56[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 06 : 8[101c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 06/0 : 53[901d0] -> 60[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 03/0 : 39[a01d0] -> 46[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 07/0 : 39[a01d0] -> 46[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 03/0 : 7[a01d0] -> 14[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 07/0 : 7[a01d0] -> 14[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 07/0 : 23[a01d0] -> 30[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 06/0 : 37[901d0] -> 44[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 06/0 : 61[901d0] -> 4[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 07/0 : 55[a01d0] -> 62[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 06/0 : 5[901d0] -> 12[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 03/0 : 63[a01d0] -> 6[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 07/0 : 63[a01d0] -> 6[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 06/0 : 45[901d0] -> 52[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 06/0 : 29[901d0] -> 36[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 06/0 : 61[901d0] -> 4[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 06/0 : 13[901d0] -> 20[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 04/0 : 33[101d0] -> 40[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 05/0 : 35[201d0] -> 42[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 00 : 40[101c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 06/0 : 21[901d0] -> 28[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 04/0 : 17[101d0] -> 24[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 00 : 24[101c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 06/0 : 53[901d0] -> 60[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 05/0 : 51[201d0] -> 58[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 02 : 40[101c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 02 : 24[101c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 04 : 40[101c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 04/0 : 25[101d0] -> 32[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 05/0 : 27[201d0] -> 34[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 00 : 32[101c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 04 : 24[101c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 06 : 40[101c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 06 : 24[101c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 02 : 32[101c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 05/0 : 3[201d0] -> 10[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 04 : 32[101c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 06 : 32[101c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 04/0 : 41[101d0] -> 48[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 05/0 : 43[201d0] -> 50[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 00 : 48[101c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 02 : 48[101c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 04 : 48[101c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 04/0 : 57[101d0] -> 0[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 05/0 : 59[201d0] -> 2[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 00 : 0[101c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 06 : 48[101c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 02 : 0[101c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 04/0 : 9[101d0] -> 16[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 00 : 16[101c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 04 : 0[101c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 06 : 0[101c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 02 : 16[101c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 04 : 16[101c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 04/0 : 49[101d0] -> 56[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 06 : 16[101c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 00 : 56[101c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 02 : 56[101c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 04 : 56[101c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 06 : 56[101c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 00 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 01 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 02 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 04 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 05 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 06 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 03 : 22[a01c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 07 : 22[a01c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 00 : 18[201c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 00 : 26[201c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 02 : 18[201c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 02 : 26[201c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 03 : 26[201c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 04 : 26[201c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 06 : 26[201c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 07 : 26[201c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 03 : 18[201c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 04 : 18[201c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 06 : 18[201c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 07 : 18[201c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 00 : 37[901d0] -> 36[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 00 : 5[901d0] -> 4[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 01 : 5[901d0] -> 4[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 00 : 19[201d0] -> 18[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 03 : 5[901d0] -> 4[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 04 : 5[901d0] -> 4[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 01 : 37[901d0] -> 36[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 05 : 5[901d0] -> 4[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 02 : 19[201d0] -> 18[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 07 : 5[901d0] -> 4[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 03 : 37[901d0] -> 36[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 00 : 61[901d0] -> 60[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 01 : 61[901d0] -> 60[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 03 : 61[901d0] -> 60[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 03 : 19[201d0] -> 18[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 00 : 22[a01c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 04 : 19[201d0] -> 18[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 01 : 22[a01c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 04 : 61[901d0] -> 60[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 04 : 37[901d0] -> 36[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 05 : 61[901d0] -> 60[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 07 : 61[901d0] -> 60[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 06 : 19[201d0] -> 18[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 02 : 22[a01c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 00 : 13[901d0] -> 12[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 05 : 37[901d0] -> 36[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 01 : 13[901d0] -> 12[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 03 : 13[901d0] -> 12[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 04 : 13[901d0] -> 12[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 04 : 22[a01c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 05 : 13[901d0] -> 12[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 07 : 37[901d0] -> 36[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 07 : 13[901d0] -> 12[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 03 : 54[a01c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 07 : 19[201d0] -> 18[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 05 : 22[a01c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 00 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 07 : 54[a01c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 00 : 45[901d0] -> 44[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 06 : 22[a01c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 03 : 38[a01c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 03 : 14[a01c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 00 : 29[901d0] -> 28[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 00 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 07 : 38[a01c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 07 : 14[a01c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 01 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 01 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 01 : 45[901d0] -> 44[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 00 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 01 : 29[901d0] -> 28[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 02 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 04 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 03 : 30[a01c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 01 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 03 : 29[901d0] -> 28[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 05 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 02 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 00 : 21[901d0] -> 20[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 00 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 07 : 30[a01c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 02 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 06 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 04 : 29[901d0] -> 28[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 03 : 45[901d0] -> 44[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 01 : 4[901c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 04 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 05 : 4[901c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 00 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 01 : 21[901d0] -> 20[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 01 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 05 : 29[901d0] -> 28[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 03 : 21[901d0] -> 20[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 04 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 02 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 04 : 21[901d0] -> 20[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 01 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 04 : 45[901d0] -> 44[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 01 : 12[901c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 04 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 05 : 21[901d0] -> 20[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 05 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 05 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 07 : 29[901d0] -> 28[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 07 : 21[901d0] -> 20[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 05 : 12[901c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 00 : 53[901d0] -> 52[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 06 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 05 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 02 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 01 : 20[901c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 06 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 05 : 45[901d0] -> 44[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 00 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 05 : 20[901c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 01 : 1[101d0] -> 0[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 04 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 01 : 53[901d0] -> 52[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 00 : 4[901c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 06 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 01 : 44[901c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 07 : 45[901d0] -> 44[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 00 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 00 : 14[a01c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 02 : 1[101d0] -> 0[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 01 : 28[901c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 01 : 36[901c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 05 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 01 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 05 : 44[901c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 02 : 4[901c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 03 : 53[901d0] -> 52[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 01 : 14[a01c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 05 : 28[901c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 00 : 27[201d0] -> 26[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 01 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 00 : 35[201d0] -> 34[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 02 : 14[a01c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 05 : 36[901c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 06 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 04 : 4[901c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 02 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 01 : 52[901c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 04 : 14[a01c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 04 : 53[901d0] -> 52[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 00 : 11[201d0] -> 10[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 00 : 51[201d0] -> 50[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 02 : 27[201d0] -> 26[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 06 : 4[901c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 02 : 35[201d0] -> 34[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 05 : 14[a01c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 04 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 02 : 11[201d0] -> 10[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 05 : 52[901c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 02 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 01 : 60[901c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 05 : 53[901d0] -> 52[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 06 : 14[a01c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 03 : 27[201d0] -> 26[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 03 : 46[a01c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 01 : 9[101d0] -> 8[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 00 : 38[a01c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 03 : 35[201d0] -> 34[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 03 : 11[201d0] -> 10[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 00 : 12[901c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 02 : 51[201d0] -> 50[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 00 : 30[a01c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 04 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 05 : 60[901c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 03 : 62[a01c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 02 : 9[101d0] -> 8[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 05 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 07 : 46[a01c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 04 : 11[201d0] -> 10[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 03 : 1[101d0] -> 0[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 02 : 12[901c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 05 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 07 : 53[901d0] -> 52[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 03 : 6[a01c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 07 : 62[a01c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 01 : 30[a01c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 00 : 54[a01c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 03 : 51[201d0] -> 50[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 05 : 1[101d0] -> 0[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 04 : 27[201d0] -> 26[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 03 : 9[101d0] -> 8[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 01 : 38[a01c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 07 : 6[a01c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 04 : 35[201d0] -> 34[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 06 : 11[201d0] -> 10[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 06 : 1[101d0] -> 0[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 06 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 02 : 30[a01c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 04 : 12[901c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 06 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 06 : 27[201d0] -> 26[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 00 : 10[201c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 02 : 38[a01c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 06 : 35[201d0] -> 34[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 04 : 30[a01c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 07 : 1[101d0] -> 0[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 07 : 27[201d0] -> 26[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 05 : 9[101d0] -> 8[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 04 : 51[201d0] -> 50[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 01 : 54[a01c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 07 : 11[201d0] -> 10[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 06 : 12[901c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 04 : 38[a01c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 07 : 35[201d0] -> 34[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 05 : 30[a01c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 02 : 10[201c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 06 : 51[201d0] -> 50[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 02 : 54[a01c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 06 : 30[a01c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 05 : 38[a01c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 06 : 9[101d0] -> 8[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 01 : 25[101d0] -> 24[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 00 : 28[901c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 00 : 42[201c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 07 : 51[201d0] -> 50[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 06 : 38[a01c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 04 : 54[a01c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 02 : 25[101d0] -> 24[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 00 : 34[201c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 02 : 28[901c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 03 : 10[201c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 00 : 2[201c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 01 : 41[101d0] -> 40[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 01 : 17[101d0] -> 16[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 00 : 20[901c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 02 : 20[901c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 04 : 20[901c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 02 : 42[201c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 00 : 44[901c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 05 : 54[a01c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 02 : 34[201c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 07 : 9[101d0] -> 8[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 02 : 41[101d0] -> 40[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 02 : 2[201c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 04 : 28[901c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 03 : 42[201c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 02 : 44[901c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 01 : 33[101d0] -> 32[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 00 : 36[901c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 00 : 58[201c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 00 : 43[201d0] -> 42[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 06 : 54[a01c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 04 : 10[201c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 03 : 41[101d0] -> 40[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 03 : 2[201c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 06 : 20[901c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 02 : 17[101d0] -> 16[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 03 : 34[201c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 04 : 42[201c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 04 : 44[901c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 03 : 17[101d0] -> 16[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 03 : 25[101d0] -> 24[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 02 : 43[201d0] -> 42[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 06 : 28[901c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 00 : 50[201c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 00 : 3[201d0] -> 2[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 02 : 58[201c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 05 : 41[101d0] -> 40[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 02 : 33[101d0] -> 32[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 04 : 2[201c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 02 : 36[901c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 06 : 44[901c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 06 : 42[201c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 03 : 43[201d0] -> 42[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 00 : 46[a01c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 05 : 25[101d0] -> 24[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 01 : 49[101d0] -> 48[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 02 : 3[201d0] -> 2[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 03 : 58[201c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 00 : 6[a01c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 05 : 17[101d0] -> 16[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 04 : 34[201c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 06 : 41[101d0] -> 40[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 00 : 52[901c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 02 : 50[201c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 07 : 42[201c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 06 : 10[201c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 06 : 2[201c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 00 : 0[101c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 04 : 43[201d0] -> 42[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 01 : 46[a01c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 03 : 3[201d0] -> 2[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 00 : 59[201d0] -> 58[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 03 : 33[101d0] -> 32[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 04 : 36[901c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 07 : 41[101d0] -> 40[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 01 : 6[a01c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 04 : 58[201c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 07 : 2[201c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 01 : 0[101c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 06 : 43[201d0] -> 42[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 02 : 46[a01c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 02 : 49[101d0] -> 48[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 04 : 3[201d0] -> 2[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 06 : 34[201c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 01 : 57[101d0] -> 56[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 00 : 60[901c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 03 : 50[201c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 02 : 52[901c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 02 : 6[a01c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 06 : 25[101d0] -> 24[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 02 : 59[201d0] -> 58[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 02 : 0[101c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 06 : 58[201c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 07 : 43[201d0] -> 42[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 04 : 46[a01c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 06 : 17[101d0] -> 16[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 06 : 36[901c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 02 : 60[901c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 02 : 57[101d0] -> 56[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 04 : 6[a01c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 00 : 62[a01c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 03 : 59[201d0] -> 58[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 03 : 0[101c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 07 : 58[201c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 03 : 49[101d0] -> 48[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 07 : 34[201c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 04 : 50[201c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 04 : 52[901c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 05 : 6[a01c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 04 : 60[901c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 03 : 57[101d0] -> 56[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 05 : 46[a01c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 01 : 62[a01c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 04 : 0[101c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 07 : 17[101d0] -> 16[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 05 : 57[101d0] -> 56[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 06 : 60[901c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 07 : 25[101d0] -> 24[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 06 : 6[a01c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 02 : 62[a01c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 05 : 49[101d0] -> 48[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 05 : 0[101c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 06 : 57[101d0] -> 56[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 06 : 52[901c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 06 : 50[201c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 04 : 62[a01c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 05 : 33[101d0] -> 32[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 07 : 57[101d0] -> 56[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 05 : 62[a01c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 06 : 62[a01c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 06 : 0[101c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 06 : 3[201d0] -> 2[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 06 : 49[101d0] -> 48[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 00 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 07 : 50[201c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 04 : 59[201d0] -> 58[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 07 : 10[201c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 06 : 33[101d0] -> 32[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 07 : 49[101d0] -> 48[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 07 : 0[101c0] -> 1[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 07 : 3[201d0] -> 2[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 06 : 46[a01c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 07 : 33[101d0] -> 32[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 01 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 06 : 59[201d0] -> 58[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 00 : 40[101c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 00 : 8[101c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 02 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 01 : 40[101c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 07 : 59[201d0] -> 58[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 03 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 01 : 8[101c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 00 : 16[101c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 04 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 01 : 16[101c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 05 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 02 : 8[101c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 02 : 16[101c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 06 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 00 : 48[101c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 03 : 16[101c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 07 : 22[a01c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 03 : 8[101c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 00 : 20[901c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 02 : 40[101c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 04 : 16[101c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 01 : 20[901c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 00 : 24[101c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 04 : 8[101c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 05 : 16[101c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 00 : 56[101c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 01 : 48[101c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 00 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 05 : 8[101c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 02 : 48[101c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 00 : 32[101c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 03 : 40[101c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 01 : 56[101c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 01 : 24[101c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 02 : 20[901c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 00 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 03 : 48[101c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 01 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 03 : 20[901c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 06 : 16[101c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 06 : 8[101c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 02 : 24[101c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 04 : 40[101c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 01 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 04 : 48[101c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 02 : 56[101c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 02 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 04 : 20[901c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 07 : 16[101c0] -> 17[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 00 : 12[901c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 07 : 8[101c0] -> 9[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 00 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 03 : 24[101c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 02 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 01 : 32[101c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 05 : 48[101c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 03 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 00 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 01 : 12[901c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 00 : 21[901d0] -> 22[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 00 : 19[201d0] -> 20[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 05 : 20[901c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 05 : 40[101c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 03 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 04 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 04 : 24[101c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 01 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 02 : 12[901c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 01 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 02 : 32[101c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 01 : 21[901d0] -> 22[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 01 : 19[201d0] -> 20[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 06 : 20[901c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 04 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 00 : 18[201c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 00 : 52[901c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 05 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 05 : 24[101c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 03 : 12[901c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 02 : 21[901d0] -> 22[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 03 : 19[201d0] -> 20[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 07 : 20[901c0] -> 21[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 00 : 13[901d0] -> 14[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 02 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 01 : 18[201c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 02 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 06 : 40[101c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 05 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 01 : 52[901c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 06 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 03 : 32[101c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 04 : 12[901c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 04 : 21[901d0] -> 22[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 04 : 19[201d0] -> 20[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 06 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 00 : 28[901c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 06 : 24[101c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 00 : 17[101d0] -> 18[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 02 : 18[201c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 01 : 13[901d0] -> 14[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 02 : 52[901c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 03 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 03 : 56[101c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 05 : 21[901d0] -> 22[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 05 : 19[201d0] -> 20[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 07 : 14[a01c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 07 : 54[a01c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 03 : 18[201c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 02 : 17[101d0] -> 18[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 03 : 52[901c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 00 : 50[201c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 05 : 12[901c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 03 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 07 : 40[101c0] -> 41[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 02 : 13[901d0] -> 14[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 06 : 21[901d0] -> 22[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 04 : 32[101c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 01 : 28[901c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 07 : 24[101c0] -> 25[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 07 : 19[201d0] -> 20[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 00 : 10[201c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 06 : 48[101c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 04 : 18[201c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 04 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 04 : 52[901c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 03 : 17[101d0] -> 18[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 01 : 50[201c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 00 : 36[901c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 06 : 12[901c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 00 : 11[201d0] -> 12[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 04 : 13[901d0] -> 14[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 00 : 27[201d0] -> 28[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 04 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 05 : 18[201c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 01 : 10[201c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 00 : 49[101d0] -> 50[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 05 : 32[101c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 02 : 28[901c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 07 : 48[101c0] -> 49[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 05 : 52[901c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 04 : 17[101d0] -> 18[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 05 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 07 : 12[901c0] -> 13[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 01 : 36[901c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 02 : 50[201c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 05 : 13[901d0] -> 14[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 01 : 11[201d0] -> 12[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 04 : 56[101c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 00 : 1[101d0] -> 2[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 00 : 9[101d0] -> 10[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 02 : 10[201c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 01 : 27[201d0] -> 28[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 05 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 03 : 28[901c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 06 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 06 : 18[201c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 06 : 13[901d0] -> 14[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 02 : 36[901c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 03 : 11[201d0] -> 12[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 02 : 1[101d0] -> 2[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 03 : 27[201d0] -> 28[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 00 : 26[201c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 03 : 10[201c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 06 : 32[101c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 02 : 9[101d0] -> 10[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 00 : 35[201d0] -> 36[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 06 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 04 : 28[901c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 00 : 29[901d0] -> 30[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 06 : 17[101d0] -> 18[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 07 : 30[a01c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 00 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 00 : 44[901c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 03 : 36[901c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 04 : 11[201d0] -> 12[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 03 : 1[101d0] -> 2[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 04 : 27[201d0] -> 28[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 01 : 26[201c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 02 : 49[101d0] -> 50[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 07 : 32[101c0] -> 33[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 01 : 35[201d0] -> 36[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 04 : 10[201c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 03 : 9[101d0] -> 10[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 05 : 28[901c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 01 : 29[901d0] -> 30[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 05 : 56[101c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 06 : 52[901c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 07 : 38[a01c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 07 : 18[201c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 03 : 50[201c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 00 : 4[901c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 01 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 04 : 1[101d0] -> 2[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 01 : 44[901c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 05 : 27[201d0] -> 28[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 04 : 36[901c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 02 : 26[201c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 07 : 17[101d0] -> 18[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 00 : 37[901d0] -> 38[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 03 : 35[201d0] -> 36[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 00 : 42[201c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 03 : 49[101d0] -> 50[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 00 : 25[101d0] -> 26[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 06 : 28[901c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 02 : 29[901d0] -> 30[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 05 : 11[201d0] -> 12[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 01 : 4[901c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 02 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 07 : 52[901c0] -> 53[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 04 : 50[201c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 06 : 1[101d0] -> 2[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 00 : 41[101d0] -> 42[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 00 : 2[201c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 05 : 36[901c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 05 : 10[201c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 04 : 9[101d0] -> 10[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 02 : 44[901c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 00 : 43[201d0] -> 44[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 07 : 27[201d0] -> 28[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 03 : 26[201c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 00 : 5[901d0] -> 6[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 06 : 56[101c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 01 : 42[201c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 00 : 51[201d0] -> 52[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 00 : 53[901d0] -> 54[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 02 : 25[101d0] -> 26[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 07 : 28[901c0] -> 29[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 02 : 4[901c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 04 : 29[901d0] -> 30[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 03 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 04 : 49[101d0] -> 50[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 00 : 3[201d0] -> 4[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 07 : 1[101d0] -> 2[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 01 : 2[201c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 02 : 41[101d0] -> 42[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Connected all rings | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 05 : 50[201c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 03 : 44[901c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 01 : 43[201d0] -> 44[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 01 : 5[901d0] -> 6[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 02 : 42[201c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 04 : 35[201d0] -> 36[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 01 : 37[901d0] -> 38[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 03 : 4[901c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 00 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 01 : 51[201d0] -> 52[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 01 : 53[901d0] -> 54[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 04 : 26[201c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 04 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 06 : 49[101d0] -> 50[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 07 : 56[101c0] -> 57[101d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 00 : 34[201c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 01 : 3[201d0] -> 4[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 02 : 2[201c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 03 : 41[101d0] -> 42[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 06 : 50[201c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 04 : 44[901c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 03 : 25[101d0] -> 26[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 05 : 29[901d0] -> 30[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 03 : 43[201d0] -> 44[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 02 : 5[901d0] -> 6[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 06 : 36[901c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 03 : 42[201c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 03 : 51[201d0] -> 52[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 02 : 53[901d0] -> 54[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 01 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 07 : 49[101d0] -> 50[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 05 : 35[201d0] -> 36[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 02 : 37[901d0] -> 38[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 04 : 4[901c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 05 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 05 : 26[201c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 07 : 50[201c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 04 : 41[101d0] -> 42[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 03 : 3[201d0] -> 4[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 04 : 25[101d0] -> 26[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 06 : 29[901d0] -> 30[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 05 : 44[901c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 01 : 34[201c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 03 : 2[201c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 04 : 43[201d0] -> 44[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 04 : 51[201d0] -> 52[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 04 : 53[901d0] -> 54[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 07 : 11[201d0] -> 12[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 04 : 42[201c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 00 : 45[901d0] -> 46[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 02 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 07 : 36[901c0] -> 37[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 04 : 5[901d0] -> 6[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 06 : 10[201c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 06 : 9[101d0] -> 10[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 06 : 26[201c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 05 : 4[901c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 06 : 25[101d0] -> 26[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 00 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 00 : 33[101d0] -> 34[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 07 : 35[201d0] -> 36[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 06 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 04 : 37[901d0] -> 38[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 06 : 41[101d0] -> 42[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 06 : 44[901c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 05 : 51[201d0] -> 52[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 05 : 43[201d0] -> 44[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 05 : 53[901d0] -> 54[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 04 : 3[201d0] -> 4[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 02 : 34[201c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 05 : 42[201c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 04 : 2[201c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 03 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 01 : 45[901d0] -> 46[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 05 : 5[901d0] -> 6[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 01 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 07 : 41[101d0] -> 42[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 07 : 44[901c0] -> 45[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 07 : 43[201d0] -> 44[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 02 : 33[101d0] -> 34[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 06 : 4[901c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 05 : 37[901d0] -> 38[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 07 : 26[201c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 07 : 6[a01c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 03 : 34[201c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 06 : 42[201c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 04 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 02 : 45[901d0] -> 46[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 05 : 3[201d0] -> 4[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 07 : 51[201d0] -> 52[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 07 : 25[101d0] -> 26[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 07 : 10[201c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 07 : 9[101d0] -> 10[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 06 : 53[901d0] -> 54[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 05 : 2[201c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 00 : 61[901d0] -> 62[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 02 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 03 : 33[101d0] -> 34[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 06 : 37[901d0] -> 38[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 06 : 5[901d0] -> 6[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 00 : 60[901c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 04 : 34[201c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 07 : 42[201c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 05 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 04 : 45[901d0] -> 46[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 07 : 4[901c0] -> 5[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 00 : 57[101d0] -> 58[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 07 : 3[201d0] -> 4[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 04 : 33[101d0] -> 34[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 03 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 01 : 61[901d0] -> 62[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 06 : 2[201c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 05 : 34[201c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 01 : 60[901c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 06 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 05 : 45[901d0] -> 46[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 00 : 58[201c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 02 : 57[101d0] -> 58[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 04 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 02 : 61[901d0] -> 62[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 00 : 59[201d0] -> 60[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 02 : 60[901c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 01 : 0[101c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 06 : 33[101d0] -> 34[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 01 : 58[201c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 03 : 57[101d0] -> 58[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 06 : 34[201c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 07 : 2[201c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 05 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 04 : 61[901d0] -> 62[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 06 : 45[901d0] -> 46[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 07 : 46[a01c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 03 : 60[901c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 01 : 59[201d0] -> 60[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 02 : 58[201c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 04 : 57[101d0] -> 58[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 07 : 33[101d0] -> 34[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 06 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 05 : 61[901d0] -> 62[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 04 : 60[901c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 03 : 59[201d0] -> 60[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 03 : 0[101c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 03 : 58[201c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 07 : 34[201c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 06 : 57[101d0] -> 58[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 07 : 62[a01c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 06 : 61[901d0] -> 62[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 05 : 60[901c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 04 : 59[201d0] -> 60[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 04 : 58[201c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 07 : 57[101d0] -> 58[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 06 : 60[901c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 05 : 59[201d0] -> 60[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 05 : 58[201c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 01 : 48[101c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 07 : 60[901c0] -> 61[901d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 07 : 59[201d0] -> 60[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 05 : 0[101c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 06 : 58[201c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 03 : 48[101c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 01 : 40[101c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 07 : 58[201c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 07 : 0[101c0] -> 7[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 01 : 16[101c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 02/0 : 20[901c0] -> 28[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 03/0 : 22[a01c0] -> 30[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 02/0 : 12[901c0] -> 21[901d0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 05 : 48[101c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 03 : 16[101c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 03 : 40[101c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 07/0 : 6[a01c0] -> 14[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 06/0 : 20[901c0] -> 13[901d0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 05 : 40[101c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 05 : 16[101c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 01 : 8[101c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 01/0 : 10[201c0] -> 19[201d0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 01/0 : 18[201c0] -> 26[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 07 : 48[101c0] -> 55[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 00/0 : 8[101c0] -> 17[101d0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 07 : 16[101c0] -> 23[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 03 : 8[101c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 01 : 32[101c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 01 : 24[101c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 00/0 : 40[101c0] -> 49[101d0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 03/0 : 14[a01c0] -> 23[a01d0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 01 : 56[101c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 05 : 8[101c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 07/0 : 38[a01c0] -> 46[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 06/0 : 4[901c0] -> 12[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 07 : 40[101c0] -> 47[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 03 : 32[101c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 07 : 8[101c0] -> 15[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 03 : 24[101c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 05 : 32[101c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 05/0 : 18[201c0] -> 11[201d0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 05/0 : 2[201c0] -> 10[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 04/0 : 16[101c0] -> 9[101d0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 03 : 56[101c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 02/0 : 20[901c0] -> 28[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 03/0 : 22[a01c0] -> 30[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 06/0 : 29[901d0] -> 44[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 05 : 24[101c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 01/0 : 18[201c0] -> 35[201d0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 07/0 : 6[a01c0] -> 14[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 07 : 24[101c0] -> 31[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 02 : 5[901d0] -> 4[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 04/0 : 48[101c0] -> 41[101d0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 07 : 32[101c0] -> 39[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 05/0 : 50[201c0] -> 43[201d0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 06/0 : 4[901c0] -> 12[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 05/0 : 34[201c0] -> 42[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 05/0 : 27[201d0] -> 42[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 03/0 : 22[a01c0] -> 39[a01d0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 02/0 : 20[901c0] -> 37[901d0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 01/0 : 42[201c0] -> 51[201d0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 01/0 : 18[201c0] -> 26[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 01/0 : 50[201c0] -> 58[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 05 : 56[101c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 02/0 : 52[901c0] -> 60[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 04/0 : 25[101d0] -> 40[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 03/0 : 54[a01c0] -> 62[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 02/0 : 44[901c0] -> 53[901d0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 03/0 : 46[a01c0] -> 55[a01d0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Channel 06 : 5[901d0] -> 4[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 03/0 : 14[a01c0] -> 23[a01d0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 06/0 : 36[901c0] -> 44[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 06/0 : 52[901c0] -> 45[901d0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 07/0 : 38[a01c0] -> 46[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 01/0 : 18[201c0] -> 35[201d0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 02/0 : 12[901c0] -> 21[901d0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 00 : 1[101d0] -> 0[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 06/0 : 12[901c0] -> 28[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 07/0 : 14[a01c0] -> 30[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 03/0 : 22[a01c0] -> 39[a01d0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 03/0 : 54[a01c0] -> 62[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 05/0 : 2[201c0] -> 10[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 07 : 56[101c0] -> 63[a01d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 01/0 : 35[201d0] -> 18[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 07/0 : 14[a01c0] -> 30[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 07/0 : 31[a01d0] -> 46[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 01 : 7[a01d0] -> 0[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 03/0 : 38[a01c0] -> 54[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 03/0 : 38[a01c0] -> 54[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Channel 04 : 1[101d0] -> 0[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 03/0 : 39[a01d0] -> 22[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 05/0 : 10[201c0] -> 26[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 03/0 : 38[a01c0] -> 6[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 03/0 : 6[a01c0] -> 38[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 03/0 : 46[a01c0] -> 55[a01d0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 06/0 : 12[901c0] -> 28[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 02/0 : 36[901c0] -> 4[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 02/0 : 4[901c0] -> 36[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 02 : 7[a01d0] -> 0[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 07/0 : 62[a01c0] -> 30[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 07/0 : 30[a01c0] -> 62[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 03/0 : 54[a01c0] -> 38[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 03 : 7[a01d0] -> 0[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 01/0 : 10[201c0] -> 19[201d0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 01/0 : 34[201c0] -> 2[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 01/0 : 2[201c0] -> 34[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 05 : 7[a01d0] -> 0[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 01 : 3[201d0] -> 2[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 06/0 : 60[901c0] -> 28[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 06/0 : 28[901c0] -> 60[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 01/0 : 35[201d0] -> 18[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 06 : 7[a01d0] -> 0[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Channel 05 : 3[201d0] -> 2[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 07/0 : 31[a01d0] -> 46[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 07 : 7[a01d0] -> 0[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 03/0 : 6[a01c0] -> 38[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 03/0 : 38[a01c0] -> 6[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 02 : 61[901d0] -> 60[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 02/0 : 52[901c0] -> 60[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 05/0 : 10[201c0] -> 26[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 02/0 : 21[901d0] -> 12[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Channel 07/0 : 14[a01c0] -> 6[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 01/0 : 19[201d0] -> 10[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Channel 06 : 61[901d0] -> 60[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 05/0 : 58[201c0] -> 26[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 03/0 : 55[a01d0] -> 46[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 05/0 : 26[201c0] -> 58[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 07/0 : 30[a01c0] -> 62[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 07/0 : 62[a01c0] -> 30[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 00 : 57[101d0] -> 56[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 07/0 : 46[a01c0] -> 31[a01d0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 00/0 : 16[101c0] -> 24[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 02/0 : 36[901c0] -> 52[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 01 : 35[201d0] -> 34[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 01/0 : 50[201c0] -> 58[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Channel 05 : 35[201d0] -> 34[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 01 : 63[a01d0] -> 56[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 06/0 : 36[901c0] -> 44[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 02/0 : 20[901c0] -> 37[901d0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Channel 04 : 57[101d0] -> 56[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 02 : 63[a01d0] -> 56[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 02/0 : 37[901d0] -> 20[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 01/0 : 34[201c0] -> 50[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 02/0 : 44[901c0] -> 53[901d0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 03 : 63[a01d0] -> 56[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 02/0 : 36[901c0] -> 52[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 06/0 : 28[901c0] -> 60[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 06/0 : 60[901c0] -> 28[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 02/0 : 37[901d0] -> 20[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 05 : 63[a01d0] -> 56[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 00/0 : 16[101c0] -> 33[101d0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 01 : 59[201d0] -> 58[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 02/0 : 52[901c0] -> 36[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 05/0 : 26[201c0] -> 58[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 05/0 : 58[201c0] -> 26[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 06/0 : 29[901d0] -> 44[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 06 : 63[a01d0] -> 56[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Channel 05 : 59[201d0] -> 58[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 02/0 : 53[901d0] -> 44[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 02 : 37[901d0] -> 36[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 05/0 : 34[201c0] -> 42[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Channel 06 : 37[901d0] -> 36[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 07 : 63[a01d0] -> 56[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 07/0 : 22[a01c0] -> 15[a01d0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 01 : 15[a01d0] -> 8[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 02 : 15[a01d0] -> 8[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 03 : 15[a01d0] -> 8[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 05 : 15[a01d0] -> 8[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 06 : 15[a01d0] -> 8[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 00/0 : 48[101c0] -> 56[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 07 : 15[a01d0] -> 8[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 01/0 : 34[201c0] -> 50[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 04/0 : 0[101c0] -> 8[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 01/0 : 2[201c0] -> 34[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 01/0 : 34[201c0] -> 2[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 04/0 : 32[101c0] -> 40[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 06/0 : 44[901c0] -> 29[901d0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 01/0 : 50[201c0] -> 34[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 04/0 : 32[101c0] -> 40[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 07/0 : 54[a01c0] -> 47[a01d0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 01 : 47[a01d0] -> 40[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 00/0 : 16[101c0] -> 24[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 01/0 : 42[201c0] -> 51[201d0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 02 : 47[a01d0] -> 40[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 03 : 47[a01d0] -> 40[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 05 : 47[a01d0] -> 40[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 06 : 47[a01d0] -> 40[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 04/0 : 0[101c0] -> 8[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 07 : 47[a01d0] -> 40[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 00/0 : 32[101c0] -> 48[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 00/0 : 16[101c0] -> 33[101d0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 04/0 : 8[101c0] -> 24[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 00/0 : 32[101c0] -> 0[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 00/0 : 0[101c0] -> 32[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 03/0 : 23[a01d0] -> 14[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 06/0 : 28[901c0] -> 12[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 00/0 : 40[101c0] -> 49[101d0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 00/0 : 8[101c0] -> 17[101d0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 07/0 : 30[a01c0] -> 14[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 00/0 : 33[101d0] -> 16[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 05/0 : 27[201d0] -> 42[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 03/0 : 39[a01d0] -> 22[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 00/0 : 17[101d0] -> 8[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 01/0 : 51[201d0] -> 42[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 07/0 : 46[a01c0] -> 31[a01d0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 00/0 : 49[101d0] -> 40[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 01 : 39[a01d0] -> 32[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 03/0 : 54[a01c0] -> 38[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 02 : 39[a01d0] -> 32[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 05/0 : 42[201c0] -> 27[201d0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 03 : 39[a01d0] -> 32[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 05/0 : 26[201c0] -> 10[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 03/0 : 55[a01d0] -> 46[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 05 : 39[a01d0] -> 32[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 01/0 : 26[201c0] -> 18[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 06 : 39[a01d0] -> 32[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 03/0 : 62[a01c0] -> 54[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 00/0 : 48[101c0] -> 56[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 07 : 39[a01d0] -> 32[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Channel 07/0 : 46[a01c0] -> 38[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Channel 07/0 : 46[a01c0] -> 38[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 01 : 31[a01d0] -> 24[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 02 : 31[a01d0] -> 24[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 03 : 31[a01d0] -> 24[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 05 : 31[a01d0] -> 24[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 01 : 55[a01d0] -> 48[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 06 : 31[a01d0] -> 24[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 07 : 31[a01d0] -> 24[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 02 : 55[a01d0] -> 48[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 03 : 55[a01d0] -> 48[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 04/0 : 24[101c0] -> 56[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 04/0 : 56[101c0] -> 24[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 05 : 55[a01d0] -> 48[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 06 : 55[a01d0] -> 48[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 07 : 55[a01d0] -> 48[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 07/0 : 30[a01c0] -> 14[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 00/0 : 32[101c0] -> 48[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Channel 03/0 : 30[a01c0] -> 22[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 03/0 : 23[a01d0] -> 14[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 00/0 : 48[101c0] -> 32[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 06/0 : 44[901c0] -> 29[901d0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 02/0 : 4[901c0] -> 36[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 02/0 : 36[901c0] -> 4[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 01 : 23[a01d0] -> 16[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 02/0 : 53[901d0] -> 44[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 02 : 23[a01d0] -> 16[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Channel 03/0 : 62[a01c0] -> 54[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 03 : 23[a01d0] -> 16[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 05 : 23[a01d0] -> 16[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 06 : 23[a01d0] -> 16[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 07 : 23[a01d0] -> 16[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 06/0 : 44[901c0] -> 36[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 02/0 : 28[901c0] -> 20[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 02 : 53[901d0] -> 52[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 02 : 29[901d0] -> 28[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Channel 06 : 53[901d0] -> 52[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Channel 06 : 29[901d0] -> 28[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 05/0 : 26[201c0] -> 10[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 06/0 : 28[901c0] -> 12[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Channel 01/0 : 58[201c0] -> 50[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 02/0 : 60[901c0] -> 52[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 01/0 : 50[201c0] -> 34[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 01/0 : 58[201c0] -> 50[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Channel 05/0 : 42[201c0] -> 34[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Channel 01/0 : 26[201c0] -> 18[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 02/0 : 28[901c0] -> 20[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 01/0 : 19[201d0] -> 10[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 02/0 : 21[901d0] -> 12[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Channel 05/0 : 10[201c0] -> 2[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 04/0 : 8[101c0] -> 24[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Channel 05/0 : 18[201c0] -> 11[201d0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 04/0 : 25[101d0] -> 40[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 06/0 : 20[901c0] -> 13[901d0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 04/0 : 24[101c0] -> 8[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 01 : 28[901c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 00/0 : 33[101d0] -> 16[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 03 : 28[901c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 04/0 : 40[101c0] -> 25[101d0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 05 : 28[901c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 05/0 : 42[201c0] -> 27[201d0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 02 : 21[901d0] -> 20[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Channel 07 : 28[901c0] -> 27[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 04/0 : 40[101c0] -> 25[101d0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 00 : 33[101d0] -> 32[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Channel 04 : 33[101d0] -> 32[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 01/0 : 51[201d0] -> 42[201c0] [receive] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Channel 06 : 21[901d0] -> 20[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 03/0 : 30[a01c0] -> 22[a01c0] [receive] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 02 : 13[901d0] -> 12[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 00/0 : 49[101d0] -> 40[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 01 : 11[201d0] -> 10[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Channel 05 : 11[201d0] -> 10[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 00/0 : 24[101c0] -> 16[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Channel 05/0 : 42[201c0] -> 34[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 01 : 20[901c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 01 : 19[201d0] -> 18[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 01 : 51[201d0] -> 50[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Channel 06 : 13[901d0] -> 12[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 00 : 49[101d0] -> 48[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 03 : 20[901c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Channel 07/0 : 22[a01c0] -> 15[a01d0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Channel 05 : 51[201d0] -> 50[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Channel 05 : 19[201d0] -> 18[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Channel 04 : 49[101d0] -> 48[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 05 : 20[901c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Channel 07 : 20[901c0] -> 19[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 00 : 25[101d0] -> 24[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 00/0 : 0[101c0] -> 32[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Channel 04 : 25[101d0] -> 24[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 00/0 : 32[101c0] -> 0[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 01 : 27[201d0] -> 26[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Channel 05 : 27[201d0] -> 26[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Channel 07/0 : 14[a01c0] -> 6[a01c0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 00/0 : 48[101c0] -> 32[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 00/0 : 56[101c0] -> 48[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 06/0 : 12[901c0] -> 4[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Channel 07/0 : 54[a01c0] -> 47[a01d0] [send] via NET/AWS Libfabric/3/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 02/0 : 52[901c0] -> 36[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 06/0 : 44[901c0] -> 36[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 04/0 : 56[101c0] -> 24[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 04/0 : 24[101c0] -> 56[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Channel 05/0 : 50[201c0] -> 43[201d0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Channel 05/0 : 10[201c0] -> 2[201c0] [send] via NET/AWS Libfabric/1/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 01 : 36[901c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 03 : 36[901c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 01 : 44[901c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 05 : 36[901c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Channel 07 : 36[901c0] -> 35[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 03 : 44[901c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 06/0 : 12[901c0] -> 4[901c0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 05 : 44[901c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Channel 07 : 44[901c0] -> 43[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 01 : 43[201d0] -> 42[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Channel 05 : 43[201d0] -> 42[201c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 01 : 12[901c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 01 : 4[901c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 03 : 12[901c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 03 : 4[901c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 05 : 12[901c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 05 : 4[901c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Channel 07 : 12[901c0] -> 11[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Channel 07 : 4[901c0] -> 3[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Channel 04/0 : 40[101c0] -> 32[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Channel 04/0 : 8[101c0] -> 0[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Channel 04/0 : 40[101c0] -> 32[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 02/0 : 60[901c0] -> 52[901c0] [receive] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 06/0 : 52[901c0] -> 45[901d0] [send] via NET/AWS Libfabric/2/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 03 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 01 : 60[901c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 03 : 60[901c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Channel 07 : 39[a01d0] -> 38[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 05 : 60[901c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Channel 07 : 60[901c0] -> 59[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 01 : 52[901c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 03 : 52[901c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 05 : 52[901c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Channel 07 : 52[901c0] -> 51[201d0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 04/0 : 24[101c0] -> 8[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 02 : 45[901d0] -> 44[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 03 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Channel 06 : 45[901d0] -> 44[901c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Channel 07 : 47[a01d0] -> 46[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Channel 00/0 : 24[101c0] -> 16[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 00/0 : 17[101d0] -> 8[101c0] [receive] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Channel 00/0 : 56[101c0] -> 48[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 00 : 17[101d0] -> 16[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Channel 04 : 17[101d0] -> 16[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 03 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Channel 07 : 31[a01d0] -> 30[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 03 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Channel 07 : 63[a01d0] -> 62[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14129 [6] NCCL INFO comm 0x7f4a68000f60 rank 62 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-15:13994:14135 [7] NCCL INFO comm 0x7fcb68000f60 rank 63 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14131 [5] NCCL INFO comm 0x7fc204000f60 rank 61 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14130 [1] NCCL INFO comm 0x7f0394000f60 rank 57 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14136 [3] NCCL INFO comm 0x7f93b0000f60 rank 59 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14133 [0] NCCL INFO comm 0x7fcb08000f60 rank 56 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14134 [2] NCCL INFO comm 0x7f3c9c000f60 rank 58 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14132 [4] NCCL INFO comm 0x7f5490000f60 rank 60 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14068 [0] NCCL INFO comm 0x7fe294000f60 rank 32 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14067 [4] NCCL INFO comm 0x7efe98000f60 rank 36 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-12:13922:14064 [1] NCCL INFO comm 0x7f5290000f60 rank 33 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14063 [2] NCCL INFO comm 0x7fd830000f60 rank 34 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14066 [6] NCCL INFO comm 0x7ff3e0000f60 rank 38 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14065 [5] NCCL INFO comm 0x7f7304000f60 rank 37 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-12:13928:14061 [7] NCCL INFO comm 0x7fb6ec000f60 rank 39 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-12:13924:14062 [3] NCCL INFO comm 0x7f67d4000f60 rank 35 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Channel 04/0 : 8[101c0] -> 0[101c0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Channel 04/0 : 16[101c0] -> 9[101d0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Channel 04/0 : 48[101c0] -> 41[101d0] [send] via NET/AWS Libfabric/0/GDRDMA | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 03 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Channel 07 : 23[a01d0] -> 22[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 00 : 9[101d0] -> 8[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 03 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Channel 07 : 55[a01d0] -> 54[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Channel 04 : 9[101d0] -> 8[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 00 : 41[101d0] -> 40[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Channel 04 : 41[101d0] -> 40[101c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 03 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13771:13909 [7] NCCL INFO comm 0x7fb184000f60 rank 31 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-11:13765:13910 [1] NCCL INFO comm 0x7f6858000f60 rank 25 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-11:13767:13913 [3] NCCL INFO comm 0x7ff32c000f60 rank 27 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-11:13769:13915 [5] NCCL INFO comm 0x7fcb50000f60 rank 29 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-11:13764:13911 [0] NCCL INFO comm 0x7f66cc000f60 rank 24 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Channel 07 : 7[a01d0] -> 6[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-11:13766:13916 [2] NCCL INFO comm 0x7f6dec000f60 rank 26 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-11:13768:13914 [4] NCCL INFO comm 0x7f271c000f60 rank 28 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-11:13770:13912 [6] NCCL INFO comm 0x7f8de0000f60 rank 30 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14088 [1] NCCL INFO comm 0x7f7298000f60 rank 41 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-13:13951:14092 [7] NCCL INFO comm 0x7faf64000f60 rank 47 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-13:13949:14095 [5] NCCL INFO comm 0x7fc704000f60 rank 45 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14091 [3] NCCL INFO comm 0x7f2400000f60 rank 43 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14094 [0] NCCL INFO comm 0x7f411c000f60 rank 40 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14089 [2] NCCL INFO comm 0x7fcf64000f60 rank 42 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14090 [4] NCCL INFO comm 0x7f4ee4000f60 rank 44 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14093 [6] NCCL INFO comm 0x7fb2f0000f60 rank 46 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 03 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13983:14126 [1] NCCL INFO comm 0x7f6760000f60 rank 17 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14127 [7] NCCL INFO comm 0x7ff7b0000f60 rank 23 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14129 [5] NCCL INFO comm 0x7fd9c8000f60 rank 21 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-10:13982:14128 [0] NCCL INFO comm 0x7faf40000f60 rank 16 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14123 [3] NCCL INFO comm 0x7f0040000f60 rank 19 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Channel 07 : 15[a01d0] -> 14[a01c0] via P2P/IPC/read | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14124 [2] NCCL INFO comm 0x7ff02c000f60 rank 18 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14130 [4] NCCL INFO comm 0x7f6d80000f60 rank 20 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-10:13988:14125 [6] NCCL INFO comm 0x7f4d7c000f60 rank 22 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO Connected all trees | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 8/8/512 | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14124 [1] NCCL INFO comm 0x7fdabc000f60 rank 49 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14122 [7] NCCL INFO comm 0x7fbcc8000f60 rank 55 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-14:13979:14121 [3] NCCL INFO comm 0x7f4870000f60 rank 51 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14126 [5] NCCL INFO comm 0x7f5528000f60 rank 53 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14123 [0] NCCL INFO comm 0x7f761c000f60 rank 48 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14120 [2] NCCL INFO comm 0x7fd0f8000f60 rank 50 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14119 [4] NCCL INFO comm 0x7f663c000f60 rank 52 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14125 [6] NCCL INFO comm 0x7f6f48000f60 rank 54 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14127 [0] NCCL INFO comm 0x7f2bb4000f60 rank 8 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-9:13985:14126 [1] NCCL INFO comm 0x7f841c000f60 rank 9 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-9:13992:14120 [7] NCCL INFO comm 0x7fea58000f60 rank 15 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-9:13989:14125 [5] NCCL INFO comm 0x7f2038000f60 rank 13 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-9:13987:14123 [3] NCCL INFO comm 0x7f3234000f60 rank 11 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-8:13870:14018 [1] NCCL INFO comm 0x7fbc50000f60 rank 1 nranks 64 cudaDev 1 busId 101d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14001 [0] NCCL INFO comm 0x7f5418000f60 rank 0 nranks 64 cudaDev 0 busId 101c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:14022 [5] NCCL INFO comm 0x7fbff8000f60 rank 5 nranks 64 cudaDev 5 busId 901d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14122 [2] NCCL INFO comm 0x7f6564000f60 rank 10 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14021 [2] NCCL INFO comm 0x7fa078000f60 rank 2 nranks 64 cudaDev 2 busId 201c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:14020 [7] NCCL INFO comm 0x7f9168000f60 rank 7 nranks 64 cudaDev 7 busId a01d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-8:13872:14017 [3] NCCL INFO comm 0x7f259c000f60 rank 3 nranks 64 cudaDev 3 busId 201d0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14121 [4] NCCL INFO comm 0x7fa2c4000f60 rank 12 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14124 [6] NCCL INFO comm 0x7f3f08000f60 rank 14 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:14019 [4] NCCL INFO comm 0x7f4d84000f60 rank 4 nranks 64 cudaDev 4 busId 901c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14023 [6] NCCL INFO comm 0x7fdf68000f60 rank 6 nranks 64 cudaDev 6 busId a01c0 - Init COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:13869 [0] NCCL INFO Launch mode Parallel | |
Logging to wandb run nousr_laion/dalle2_diffusion_prior/2d8un1bt-valiant-cherry-75 | |
Saving checkpoint locally | |
Saving to huggingface repo laion/DALLE2-PyTorch | |
Saving prior_config.json checkpoint to local path prior_config.json | |
Saving prior_config.json model to huggingface repo laion/DALLE2-PyTorch | |
[ | |
TRAINING HERE | |
] | |
[E ProcessGroupNCCL.cpp:737] [Rank 49] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807920 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 53] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807922 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 50] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807912 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 55] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807924 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 42] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807943 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 40] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807949 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 41] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807948 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 44] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807949 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 46] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807949 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807935 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 43] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807950 milliseconds before timing out. | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14108 [0] NCCL INFO [Service thread] Connection closed by localRank 6 | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14101 [2] NCCL INFO [Service thread] Connection closed by localRank 6 | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14102 [4] NCCL INFO [Service thread] Connection closed by localRank 6 | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 4 | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14137 [2] NCCL INFO [Service thread] Connection closed by localRank 4 | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14133 [6] NCCL INFO [Service thread] Connection closed by localRank 4 | |
compute-od-gpu-dy-p4d-24xlarge-13:13950:14027 [0] NCCL INFO comm 0x7fb2f0000f60 rank 46 nranks 64 cudaDev 6 busId a01c0 - Abort COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-9:13988:14056 [0] NCCL INFO comm 0x7fa2c4000f60 rank 12 nranks 64 cudaDev 4 busId 901c0 - Abort COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14108 [0] NCCL INFO [Service thread] Connection closed by localRank 3 | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14101 [2] NCCL INFO [Service thread] Connection closed by localRank 3 | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14102 [4] NCCL INFO [Service thread] Connection closed by localRank 3 | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14108 [0] NCCL INFO [Service thread] Connection closed by localRank 2 | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14102 [4] NCCL INFO [Service thread] Connection closed by localRank 2 | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14108 [0] NCCL INFO [Service thread] Connection closed by localRank 1 | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14102 [4] NCCL INFO [Service thread] Connection closed by localRank 1 | |
compute-od-gpu-dy-p4d-24xlarge-13:13944:14108 [0] NCCL INFO [Service thread] Connection closed by localRank 4 | |
libfabric:13945:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f729146b300 | |
compute-od-gpu-dy-p4d-24xlarge-13:13945:14026 [0] NCCL INFO comm 0x7f7298000f60 rank 41 nranks 64 cudaDev 1 busId 101d0 - Abort COMPLETE | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
compute-od-gpu-dy-p4d-24xlarge-13:13946:14030 [0] NCCL INFO comm 0x7fcf64000f60 rank 42 nranks 64 cudaDev 2 busId 201c0 - Abort COMPLETE | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 41] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807948 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007f7193fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f71ad3fd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f71addfe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f71ae7ff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f71c4b35700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f71c5536700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f71c7357700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f7295cbc700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f70b0dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f70b2bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f70b35fe700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f70cf5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f70ccdfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f70cebfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f70e8dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f70e97fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f70eb5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f7104dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f7122bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f71075fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f7106bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f6fb61fc700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f7350819700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f737bfff700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f7436a8c000 (most recent call first): | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/rotary_embedding_torch/rotary_embedding_torch.py", line 47 in apply_rotary_emb | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/rotary_embedding_torch/rotary_embedding_torch.py", line 95 in rotate_queries_or_keys | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 728 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 806 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 944 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1144 in p_losses | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1254 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 969 in _run_ddp_forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 394 in forward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
compute-od-gpu-dy-p4d-24xlarge-13:13947:14031 [0] NCCL INFO comm 0x7f2400000f60 rank 43 nranks 64 cudaDev 3 busId 201d0 - Abort COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-13:13948:14025 [0] NCCL INFO comm 0x7f4ee4000f60 rank 44 nranks 64 cudaDev 4 busId 901c0 - Abort COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14129 [0] NCCL INFO [Service thread] Connection closed by localRank 1 | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14129 [0] NCCL INFO [Service thread] Connection closed by localRank 2 | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14137 [4] NCCL INFO [Service thread] Connection closed by localRank 1 | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14134 [6] NCCL INFO [Service thread] Connection closed by localRank 1 | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14129 [0] NCCL INFO [Service thread] Connection closed by localRank 5 | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14137 [4] NCCL INFO [Service thread] Connection closed by localRank 2 | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14134 [6] NCCL INFO [Service thread] Connection closed by localRank 2 | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14137 [4] NCCL INFO [Service thread] Connection closed by localRank 5 | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14134 [6] NCCL INFO [Service thread] Connection closed by localRank 5 | |
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676b50 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 252, dev: 2, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676b98 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 253, dev: 2, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676c28 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 2, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd836676a88 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 251, dev: 1, size: 0, state: CREATED, direction: SEND } | |
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676a30 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 248, dev: 2, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676b08 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 251, dev: 2, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd8366769b0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 248, dev: 1, size: 0, state: CREATED, direction: SEND } | |
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676be0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 254, dev: 2, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd836676b60 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 254, dev: 1, size: 0, state: CREATED, direction: SEND } | |
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676a78 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 249, dev: 2, size: 0, state: CREATED, direction: SEND } | |
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd836676ba8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 1, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676c50 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 254, dev: 3, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676aa0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 248, dev: 3, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676b30 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 250, dev: 3, size: 0, state: CREATED, direction: SEND } | |
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676ae8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 249, dev: 3, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676b78 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 251, dev: 3, size: 0, state: CREATED, direction: SEND } | |
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676ac0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 250, dev: 2, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676c98 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 3, size: 0, state: CREATED, direction: SEND } | |
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676bc0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 252, dev: 3, size: 0, state: CREATED, direction: SEND } | |
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd836676b18 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 253, dev: 1, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676c08 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 253, dev: 3, size: 0, state: CREATED, direction: SEND } | |
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd836676a40 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 250, dev: 1, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd836676ad0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 252, dev: 1, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14083 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7efe3a676a78 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 249, dev: 2, size: 0, state: CREATED, direction: SEND } | |
libfabric:13925:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd8366769f8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 249, dev: 1, size: 0, state: CREATED, direction: SEND } | |
libfabric:13923:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14085 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7fd836676ba8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 1, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14081 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff3e6676c08 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 253, dev: 3, size: 0, state: CREATED, direction: SEND } | |
libfabric:13927:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-14:13981:14062 [0] NCCL INFO comm 0x7f5528000f60 rank 53 nranks 64 cudaDev 5 busId 901d0 - Abort COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-14:13977:14060 [0] NCCL INFO comm 0x7fdabc000f60 rank 49 nranks 64 cudaDev 1 busId 101d0 - Abort COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-14:13978:14066 [0] NCCL INFO comm 0x7fd0f8000f60 rank 50 nranks 64 cudaDev 2 busId 201c0 - Abort COMPLETE | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 49] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807920 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007fd9b6bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd9b75fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd9b7fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd9d0bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd9d15fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd9d1fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd9ecbff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fdb1b357700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd84bfff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd8497fb700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd864dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd8661fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd8f35fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd90cdfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd8f3fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd90e1fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd90f5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd928dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd90ffff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd92a1fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd92abfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd7dbfff700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fdb64e22700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fdb7bfff700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fdc5f4b7000 (most recent call first): | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/functional.py", line 360 in einsum | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 761 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 806 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 944 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1144 in p_losses | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1254 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 969 in _run_ddp_forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 394 in forward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
Traceback (most recent call last): | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757, in <module> | |
main() | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper | |
return f(*args, **kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130, in __call__ | |
return self.main(*args, **kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055, in main | |
rv = self.invoke(ctx) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404, in invoke | |
return ctx.invoke(self.callback, **ctx.params) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760, in invoke | |
return __callback(*args, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753, in main | |
initialize_training(config_file, accelerator) | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736, in initialize_training | |
train( | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503, in train | |
loss = trainer(text=txt, image_embed=img) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl | |
Traceback (most recent call last): | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757, in <module> | |
return forward_call(*input, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107, in inner | |
out = fn(model, *args, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400, in forward | |
main() | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper | |
self.accelerator.backward(loss) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736, in backward | |
return f(*args, **kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130, in __call__ | |
loss.backward(**kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396, in backward | |
return self.main(*args, **kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055, in main | |
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward | |
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass | |
RuntimeError: NCCL communicator was aborted on rank 46. Original reason for failure was: [Rank 46] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807949 milliseconds before timing out. | |
rv = self.invoke(ctx) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404, in invoke | |
return ctx.invoke(self.callback, **ctx.params) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760, in invoke | |
return __callback(*args, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753, in main | |
initialize_training(config_file, accelerator) | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736, in initialize_training | |
train( | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503, in train | |
loss = trainer(text=txt, image_embed=img) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl | |
return forward_call(*input, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107, in inner | |
out = fn(model, *args, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400, in forward | |
self.accelerator.backward(loss) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736, in backward | |
loss.backward(**kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396, in backward | |
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward | |
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass | |
RuntimeError: NCCL communicator was aborted on rank 12. Original reason for failure was: [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807935 milliseconds before timing out. | |
compute-od-gpu-dy-p4d-24xlarge-14:13976:14129 [0] NCCL INFO [Service thread] Connection closed by localRank 7 | |
compute-od-gpu-dy-p4d-24xlarge-14:13980:14137 [4] NCCL INFO [Service thread] Connection closed by localRank 7 | |
compute-od-gpu-dy-p4d-24xlarge-14:13982:14134 [6] NCCL INFO [Service thread] Connection closed by localRank 7 | |
Traceback (most recent call last): | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757, in <module> | |
main() | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper | |
return f(*args, **kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130, in __call__ | |
return self.main(*args, **kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055, in main | |
rv = self.invoke(ctx) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404, in invoke | |
return ctx.invoke(self.callback, **ctx.params) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760, in invoke | |
return __callback(*args, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753, in main | |
initialize_training(config_file, accelerator) | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736, in initialize_training | |
train( | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503, in train | |
loss = trainer(text=txt, image_embed=img) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl | |
return forward_call(*input, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107, in inner | |
out = fn(model, *args, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400, in forward | |
self.accelerator.backward(loss) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736, in backward | |
loss.backward(**kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396, in backward | |
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward | |
Traceback (most recent call last): | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757, in <module> | |
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass | |
RuntimeError: NCCL communicator was aborted on rank 43. Original reason for failure was: [Rank 43] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807950 milliseconds before timing out. | |
main() | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper | |
return f(*args, **kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130, in __call__ | |
compute-od-gpu-dy-p4d-24xlarge-14:13983:14063 [0] NCCL INFO comm 0x7fbcc8000f60 rank 55 nranks 64 cudaDev 7 busId a01d0 - Abort COMPLETE | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
return self.main(*args, **kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055, in main | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 55] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807924 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007fbbcd5fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbbcdfff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbbe8bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbbe95fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbbe9fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbc051fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbc217fb700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbc23357700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbad3fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbaeffff700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acqui rv = self.invoke(ctx) | |
re | |
File "/fsx File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404, in invoke | |
/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3 return ctx.invoke(self.callback, **ctx.params) | |
.8/threadi File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760, in invoke | |
ng.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbb08dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/po return __callback(*args, **kwargs) | |
ol.py File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753, in main | |
", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbb0bfff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbb097fb700 (most recent call first): | |
File "/ initialize_training(config_file, accelerator) | |
usr/lib64/python3 File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736, in initialize_training | |
.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbb261fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File " train( | |
/usr/ File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503, in train | |
lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbb0abfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbb40dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in wor loss = trainer(text=txt, image_embed=img) | |
ker | |
File "/usr File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl | |
/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbb26bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbb417fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool return forward_call(*input, **kwargs) | |
.py", line 114 in wor File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107, in inner | |
ker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbb421fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.p out = fn(model, *args, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400, in forward | |
y", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbb435fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/libTraceback (most recent call last): | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757, in <module> | |
64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbb5cdfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing self.accelerator.backward(loss) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736, in backward | |
/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fb9f21fc700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line main() | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper | |
558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threadin loss.backward(**kwargs) | |
g.py" File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396, in backward | |
, line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbd41cbc700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/ return f(*args, **kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130, in __call__ | |
lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbdad8e3700 (most recent call first): | |
File "/usr/lib64/python3.8/ torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward | |
selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbe6a55a000 (most recent call first): | |
File "/fsx/nousr/ Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward passreturn self.main(*args, **kwargs)DALLE2-pyt | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055, in main | |
RuntimeErrororch/: NCCL communicator was aborted on rank 42. Original reason for failure was: [Rank 42] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807943 milliseconds before timing out.dalle2_ | |
pytorch/dalle2_pytorch.py", line 546 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/container.py", line 139 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2- rv = self.invoke(ctx) | |
pytorch/dalle2_ File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404, in invoke | |
pytorch/dalle2_pytorch.py", line 709 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 806 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 944 in forward | |
File "/fsx/nousr/dalle2/l return ctx.invoke(self.callback, **ctx.params) | |
ib64/p File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760, in invoke | |
ython3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1144 in p_losses | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1254 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/dalle2/lib64/python3.8/site- return __callback(*args, **kwargs) | |
packa File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753, in main | |
ges/torch/nn/parallel/distributed.py", line 969 in _run_ddp_forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 394 in forward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/traine initialize_training(config_file, accelerator) | |
r.py", line 107 in in File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736, in initialize_training | |
ner | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8 train( | |
/site-pack File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503, in train | |
ages/click/core.py", line 760 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torc loss = trainer(text=txt, image_embed=img) | |
h/dis File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl | |
tributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
return forward_call(*input, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107, in inner | |
out = fn(model, *args, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400, in forward | |
self.accelerator.backward(loss) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736, in backward | |
loss.backward(**kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396, in backward | |
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward | |
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass | |
RuntimeError: NCCL communicator was aborted on rank 44. Original reason for failure was: [Rank 44] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807949 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of '[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
std::runtime_error' | |
what(): [Rank 42] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807943 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007fce4bfff700 (most recent call first): | |
<no Python frame> | |
terminate called after throwing an instance of 'Thread 0x00007fce64bfd700 (most recent call first): | |
std::runtime_error<no Python frame> | |
' | |
Thread 0x00007fce655fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x what(): 00007fce65fff700[Rank 43] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807950 milliseconds before timing out. (most recent call first): | |
<no Python frame> | |
Thread 0x00007fce80bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fce815fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fce81fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fcfdd014700 (most recent call first): | |
<no Python frame> | |
Fatal Python error: Thread 0xAborted00007fcda35fe700 | |
(most recent call first): | |
File "Thread 0x/u00007f23055fe700s (most recent call first): | |
r/<no Python frame> | |
li | |
bThread 0x600007f2305fff7004 (most recent call first): | |
/<no Python frame> | |
p | |
yThread 0xt00007f231d3fd700h (most recent call first): | |
on<no Python frame> | |
3 | |
.Thread 0x8/00007f231ddfe700t (most recent call first): | |
h<no Python frame> | |
r | |
eThread 0xa00007f231e7ff700d (most recent call first): | |
in<no Python frame> | |
g | |
.Thread 0xpy00007f2374bff700" (most recent call first): | |
<no Python frame> | |
, line | |
Thread 0x30200007f23f2f7b700 in (most recent call first): | |
<no Python frame> | |
w | |
aThread 0xit00007f23fd357700 | |
(most recent call first): | |
<no Python frame> | |
File | |
"Thread 0x/00007f22257fb700u (most recent call first): | |
s File r/"li/bu6s4r//plyitbh6o4n/3p.y8t/htohnr3e.a8d/itnhgr.epayd"i, line n433g in .payc"qu, line i302r in ew | |
a File it" | |
/ File f"s/xu/snro/ulsirb/6d4a/lplyet2h/olni3b.684//tphyrtehaodni3n.g8./psyi"t, line e433- in paaccqkuaigrees | |
/ File em"b/efdsdxi/nngo_ursera/ddearl/lpea2r/qluiebt6_4n/upmyptyh_orne3a.d8e/rs.iptye"-p, line a121c in kapgieesc/ee_mgbeendedriantgo_rr | |
e File ad"e/ru/spra/rlqiube6t4_/npuymtphyo_nr3e.a8d/emru.lptyi"p, line ro121c in epsiseicneg_/gpeonoelr.aptyo"r, line | |
388 File in _"g/uuasrrd/eldi_bt6as4k/_pgyetnheorna3t.i8o/nm | |
ul File t"i/pursorc/elsisbi6n4g//ppyotohl.opny3".8, line /388m in u_lgtuiaprrdoecde_stsaisnkg_/gpeonoelr.aptyi"on, line | |
532 in File _"h/aunsdrl/el_itba6s4k/sp | |
y File t"h/ouns3r./8l/imbu6l4t/ippyrtohcoens3s.i8n/gt/hproeoald.ipnyg"., line p532y in "_, line h870a in nrdulne | |
_ File t"a/sukssr | |
/ File li"/bu6s4r//plyitbh6o4n/3p.y8t/htohnr3e.a8d/itnhgr.epayd"i, line n932g in ._pbyo"o, line t870s in trraupn_ | |
i File n"n/eurs | |
r File /"l/iubs6r4//lpiybt6h4o/np3y.t8h/otnh3r.e8a/dtihnrge.apdyi"n, line g932. in p_yb"oo, line t890s in tr_abpo_oitnsnterra | |
p File | |
" | |
/Thread 0xus00007fd04d8e3700r (most recent call first): | |
/l File i"b/6u4s/rp/yltihbo6n43/.p8y/tthhorne3a.d8i/nsge.lpeyc"t, line or890s in ._pbyo"o, line t468s in tsrealpe | |
c | |
tThread 0x | |
00007f24bbfff700 File (most recent call first): | |
" File /"u/surs/rl/ilbi6b46/4p/yptyhtohno3n.38./8a/ssyenlceicot/obrass.ep_ye"v, line e468n in tsse.lpeyc"t | |
, line File 1823 in "_/rusurn/_loinbc6e4 | |
/ File p"y/tuhsorn/3l.i8b/6a4s/ypnyctihoo/nb3a.s8e/_aesvyennctiso./pbya"se, line _1823e in v_ernutns_.opnyc"e, line | |
570 File in "r/uuns_rf/olriebv6e4r/ | |
p File y"t/huosnr3/.l8i/ba6s4y/npcyitoh/obna3s.e8_/etvhernetasd.ipnyg".p, line y570" in , line r870u in nr_ufno | |
r File e"v/eurs | |
r/ File l"i/bu6s4r//plyitbh6o4n/3p.y8t/htohnr3e.a8d/itnhgr.epayd"i, line n932g in ._pbyo"o, line t870s in trruanp | |
_ File i"n/nuesrr | |
/ File l"i/bus6r4//lpiybt6h4o/np3y.t8h/otnh3r.e8a/dtihnrge.apdyi"ng, line .932p in y_"b, line o890o in t_sbtoroatps_tirnanpe | |
r | |
Thread 0x File 00007fd10a697000" (most recent call first): | |
/u File s"r//ulsirb/6l4i/bp6y4t/hpoynt3h.o8n/3t.h8r/etahdrienagd.ipnyg"., line p890y in "_b, line o1027o in t_stwraaipt | |
_ | |
fThread 0xo00007f259f096000r (most recent call first): | |
_ File t"s/tuastre/_lliobc6k4 | |
/ File py"t/huosnr3/.l8i/bt6h4r/epaydtihnogn.3p.y8"/, line t1027h in r_ewaadiitn_gf.opry_"t, line s1011t in atjeo_ilno | |
c File k | |
"/ File u"s/ru/slri/bl6i4b/6p4y/tphyotnh3o.n83/.m8u/ltthirperaodciensgs.ipnyg"/, line p1011o in olj.opiyn" | |
, line File 717" in /_utserr/mliinba6t4e/_ppyotohlo | |
n3 File .8"//muuslrt/ilpirbo6c4e/spsyitnhgo/np3o.o8l/.mpuyl"ti, line p717r in o_cteesrsmiinnga/tuet_iplo.oply | |
" File , line "224/ in u_s_rc/allilb_6_4 | |
/p File y"t/huosnr3/.l8i/bm6u4l/tpiyptrhoocne3s.s8i/nmgu/luttiiplr.opcye"s, line s224i in n_g_/cuatlill_._p | |
y File "", line /300u in s_rr/ulni_bf6i4n/aplyitzheorns3 | |
. File 8"//muuslrt/ilpirbo6c4e/spsyitnhgo/nu3t.i8l/.mpuyl"t, line i300p in ro_creusns_ifnign/aultiizle.rpsy | |
" File , line "334/ in u_serx/ilti_bf6u4n/cptyitohno | |
n3.8/multiprocessing/util.py", line 334 in _exit_function | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 50] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807912 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007fcfecbfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fcfed5fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fcfedfff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd008bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd0095fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd009fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd0253ff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd0f5014700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fcf2bfff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fcf461fc700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fcf44dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fcf997fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fcf475fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fcf47fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fcf60dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fcf621fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fcf7f5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fcf635fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fcf7d7fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fcf7ffff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fcf7e1fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fce12bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd1a39bb700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd1b7fff700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd29b4ed000 (most recent call first): | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 397 in forward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
[E ProcessGroupNCCL.cpp:737] [Rank 22] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808550 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 16] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808546 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 20] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808543 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 13] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808550 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808550 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 5] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808556 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 4] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808539 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808553 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 21] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808543 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 33] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808550 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 37] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808545 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 34] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808553 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 17] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808546 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 18] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808551 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808549 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 62] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808555 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 56] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808556 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 57] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808551 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 61] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808541 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 63] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808556 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 58] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808552 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 9] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808553 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 59] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808551 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 60] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808552 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29489, OpType=ALLREDUCE, Timeout(ms)=1800000) ran for 1808239 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:737] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29489, OpType=ALLREDUCE, Timeout(ms)=1800000) ran for 1808246 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807935 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007fa1b55fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fa1b5fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fa1d0bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fa1d15fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fa1d1fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fa1ecbff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fa321089700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fa321a8a700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fa0f2bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fa3ab685700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fa4660e7000 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 1027 in _wait_for_tstate_lock | |
File "/usr/lib64/python3.8/threading.py", line 1011 in join | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 717 in _terminate_pool | |
File "/usr/lib64/python3.8/multiprocessing/util.py", line 224 in __call__ | |
File "/usr/lib64/python3.8/multiprocessing/util.py", line 300 in _run_finalizers | |
File "/usr/lib64/python3.8/multiprocessing/util.py", line 334 in _exit_function | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14033 [2] NCCL INFO [Service thread] Connection closed by localRank 5 | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14030 [0] NCCL INFO [Service thread] Connection closed by localRank 5 | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:14033 [2] NCCL INFO [Service thread] Connection closed by localRank 4 | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14030 [0] NCCL INFO [Service thread] Connection closed by localRank 4 | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14029 [6] NCCL INFO [Service thread] Connection closed by localRank 5 | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14029 [6] NCCL INFO [Service thread] Connection closed by localRank 4 | |
compute-od-gpu-dy-p4d-24xlarge-8:13874:13956 [0] NCCL INFO comm 0x7fbff8000f60 rank 5 nranks 64 cudaDev 5 busId 901d0 - Abort COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-8:13873:13951 [0] NCCL INFO comm 0x7f4d84000f60 rank 4 nranks 64 cudaDev 4 busId 901c0 - Abort COMPLETE | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 44] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807949 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007f4dfcdfc700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f4dfd7fd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f4dfe1fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f4dfebff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f4e54bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f4e555fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f4e55fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f4f44c62700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f4d057fb700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f4fcdd88700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f508abf1000 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 1027 in _wait_for_tstate_lock | |
File "/usr/lib64/python3.8/threading.py", line 1011 in join | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 717 in _terminate_pool | |
File "/usr/lib64/python3.8/multiprocessing/util.py", line 224 in __call__ | |
File "/usr/lib64/python3.8/multiprocessing/util.py", line 300 in _run_finalizers | |
File "/usr/lib64/python3.8/multiprocessing/util.py", line 334 in _exit_function | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 46] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807949 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007fb20a1fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fb20abff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fb221f55700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fb223fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fb268bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fb269fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fb2eab7c700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fb350bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fb10d7fb700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fb3da1e1700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
py", line 890 in _bootstrap | |
Thread 0x00007fb494a6a000 (most recent call first): | |
File "/usr/lterminate called after throwing an instance of 'ib64/std::runtime_error' | |
pytho what(): [Rank 53] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807922 milliseconds before timing out.n3.8/threading.py", line | |
1027 in _wFatal Python error: Aborted | |
ait_fThread 0x00007f54417fb700 (most recent call first): | |
or_ts<no Python frame> | |
Thread 0xtate_00007f54421fc700 (most recent call first): | |
<no Python frame> | |
lock | |
Thread 0x00007f5442bfd700 (most recent call first): | |
<no Python frame> | |
File "/u | |
Thread 0x00007f54435fe700 (most recent call first): | |
<no Python frame> | |
sr/li | |
Thread 0x00007f5443fff700 (most recent call first): | |
b64/p<no Python frame> | |
Thread 0x00007f545907d700 (most recent call first): | |
ython<no Python frame> | |
Thread 0x00007f545b5fe700 (most recent call first): | |
3.8/t<no Python frame> | |
Thread 0x00007f545bfff700 (most recent call first): | |
hreading.py", line 1011<no Python frame> | |
Thread 0x00007f532b5fe700 in joi (most recent call first): | |
File "n | |
File "/usr/usr/li/lib6b64/python3.8/mu4/python3.ltipr8/multocessiproceing/pssingool.p/pool.y", line 717 in py"_term, line 576 in _handleinate_resu_poollts | |
File | |
File "/"/usr/lusr/ib64/lib64/pythopython3.8/multin3.8/procethreassingding./utilpy", line 870.py", line in run224 in __ | |
File "/ucall_sr/lib64/_ | |
File "pytho/usr/n3.8/lib64threa/pythding.on3.8py", line 932/mult in _bootstrap_iniprocessiner | |
File ng/ut"/usril.py/lib64/p", line 300 in ython_run_3.8/tfinalizhreading.pyers | |
File "/usr", line 890 in /lib6_boot4/pytstraphon3. | |
Thread 0x00007f5344dfa700 (most recent call first): | |
8/mul File "/utiprocessing/sr/lib64/putil.ythonpy", line 3343.8/t in _exhreadit_fuing.pnctioy", line 302 in n | |
wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f53461fc700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f5346bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f53475fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f53621fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f537ffff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f537d7fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f5363fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f53997fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f539a1fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f537cdfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f537f5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f524abfd700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f55d3fff700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f56126e2700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f56cce44000 (most recent call first): | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173 in backward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396 in backward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736 in backward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400 in forward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14030 [0] NCCL INFO [Service thread] Connection closed by localRank 2 | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14029 [6] NCCL INFO [Service thread] Connection closed by localRank 2 | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14145 [0] NCCL INFO [Service thread] Connection closed by localRank 2 | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14145 [0] NCCL INFO [Service thread] Connection closed by localRank 1 | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14145 [0] NCCL INFO [Service thread] Connection closed by localRank 3 | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14142 [4] NCCL INFO [Service thread] Connection closed by localRank 1 | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14146 [6] NCCL INFO [Service thread] Connection closed by localRank 1 | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14142 [4] NCCL INFO [Service thread] Connection closed by localRank 2 | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14146 [6] NCCL INFO [Service thread] Connection closed by localRank 2 | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14146 [6] NCCL INFO [Service thread] Connection closed by localRank 3 | |
compute-od-gpu-dy-p4d-24xlarge-15:13991:14142 [4] NCCL INFO [Service thread] Connection closed by localRank 3 | |
compute-od-gpu-dy-p4d-24xlarge-8:13871:13952 [0] NCCL INFO comm 0x7fa078000f60 rank 2 nranks 64 cudaDev 2 busId 201c0 - Abort COMPLETE | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 5] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808556 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007fbf095fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbf09fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbf253fd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbf25dfe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbf267ff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbf42bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbf43fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbf5df57700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fbe0ffff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbe2a1fc700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbe461fc700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbe2abfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbe297fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbe2b5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbe2bfff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbe457fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbe44dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114compute-od-gpu-dy-p4d-24xlarge-15:13987:14145 [0] NCCL INFO [Service thread] Connection closed by localRank 4 | |
in worker | |
File "/usr/lib64/python3.8/threading.pycompute-od-gpu-dy-p4d-24xlarge-15:13993:14146 [6] NCCL INFO [Service thread] Connection closed by localRank 4 | |
", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbe60dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbe62bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbe63fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbe621fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbd2cdfa700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc0b4b1f700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc0e18e3700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc19e62b000 (most recent call first): | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 397 in forward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 6 | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14137 [2] NCCL INFO [Service thread] Connection closed by localRank 6 | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 5 | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14137 [2] NCCL INFO [Service thread] Connection closed by localRank 5 | |
compute-od-gpu-dy-p4d-24xlarge-15:13988:14074 [0] NCCL INFO comm 0x7f0394000f60 rank 57 nranks 64 cudaDev 1 busId 101d0 - Abort COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-15:13990:14076 [0] NCCL INFO comm 0x7f93b0000f60 rank 59 nranks 64 cudaDev 3 busId 201d0 - Abort COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-15:13989:14075 [0] NCCL INFO comm 0x7f3c9c000f60 rank 58 nranks 64 cudaDev 2 busId 201c0 - Abort COMPLETE | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of 'terminate called after throwing an instance of 'std::runtime_errorstd::runtime_error' | |
' | |
what(): what(): [Rank 57] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808551 milliseconds before timing out.[Rank 58] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808552 milliseconds before timing out. | |
Fatal Python error: Fatal Python error: AbortedAborted | |
Thread 0xThread 0x00007f02a21fc70000007f3bacffd700 (most recent call first): | |
(most recent call first): | |
<no Python frame> | |
<no Python frame> | |
Thread 0xThread 0x00007f3bad9fe70000007f02a2bfd700 (most recent call first): | |
(most recent call first): | |
<no Python frame> | |
<no Python frame> | |
Thread 0xThread 0x00007f3bae3ff70000007f02a35fe700 (most recent call first): | |
(most recent call first): | |
<no Python frame> | |
<no Python frame> | |
Thread 0x | |
Thread 0x00007f3c8ef7b700 (most recent call first): | |
00007f02a3fff700 (most recent call first): | |
<no Python frame> | |
<no Python frame> | |
Thread 0x | |
Thread 0x00007f3c8f97c700 (most recent call first): | |
00007f02bd3fd700 (most recent call first): | |
<no Python frame> | |
<no Python frame> | |
Thread 0xThread 0x00007f3ca0bfd70000007f02bddfe700 (most recent call first): | |
(most recent call first): | |
<no Python frame> | |
<no Python frame> | |
Thread 0xThread 0x00007f3ca1fff70000007f02be7ff700 (most recent call first): | |
(most recent call first): | |
<no Python frame> | |
<no Python frame> | |
Thread 0xThread 0x00007f3cf97ff70000007f02f5f57700 (most recent call first): | |
(most recent call first): | |
<no Python frame> | |
<no Python frame> | |
Thread 0xThread 0x00007f3a975fe70000007f018b5fe700 (most recent call first): | |
(most recent call first): | |
File File ""//uussrr//lliibb6644//ppyytthhoonn33..88//mmuullttiipprroocceessssiinngg//ppooooll..ppyy"", line , line 576576 in in __hhaannddllee__rreessuullttss | |
File File ""//uussrr//lliibb6644//ppyytthhoonn33..88//tthhreraedaidnign.gp.yp"y, line "870 in , line r870u in nr | |
u File n" | |
/ File u"s/ru/slri/bl6i4b/6p4y/tphyotnh3o.n83/.t8h/rtehardeiandgi.npgy."py, line "932, line in 932_ in b_obootosttsrtarpa_pin_nienrn | |
e File r" | |
/ File u"s/ru/slri/bl6i4b/6p4y/tphyotnh3o.n83/.t8h/rtehardeiandgi.npgy."p, line y890" in , line 890_ in bo_obtosottrsatpr | |
a | |
pThread 0x | |
00007f01a4dfa700 | |
(most recent call first): | |
Thread 0x File 00007f3accdfa700 (most recent call first): | |
"/ File u"s/ru/slri/bl6i4b/6p4y/tphyotnh3o.n83/.t8h/rtehardeiandgi.npgy."p, line y302" in , line wa302i in tw | |
a File i"t/ | |
u File sr"//luisbr6/4l/ipby6t4h/opny3t.h8o/nt3h.r8/etahdrienagd.ipnyg"., line p433y in "a, line c433q in uaicrqeu | |
ir File e | |
" File /f"s/xf/snxo/unsoru/sdra/ldlael2l/el2i/bl6i4b/6p4y/tphyotnh3o.n83/.s8i/tsei-tpea-cpkaacgkeasg/eesm/beemdbdeidndgi_nrge_ardeeard/epra/rpqaureqtu_entu_mnpuym_prye_ardeeard.epry."py, line "121, line in 121p in ipeiceec_eg_egneenreartaotro | |
r File | |
" File /"u/surs/rl/ilbi6b46/4p/yptyhtohno3n.38./8m/umlutlitpirporcoecsessisnign/gp/opoolo.lp.yp"y, line "388, line in 388_ in g_ugauradredde_dt_atsaks_kg_egneenreartaitoino | |
n | |
File File ""//uussrr//lliibb6644//ppyytthhoonn33..88//mmuullttiipprroocceessssiinngg//ppooooll..ppyy"", line , line 532532 in in __hhaannddllee__ttaasskkss | |
File File ""//uussrr//lliibb6644//ppyytthhoonn33..88//tthhrreeaaddiinngg..ppyy"", line , line 870870 in in rruunn | |
File " File /"u/surs/rl/ilbi6b46/4p/yptyhtohno3n.38./8t/htrheraedaidnign.gp.yp"y, line "932, line in 932_ in b_obootosttsrtarpa_pi_ninnenre | |
r File | |
" File /"u/surs/rli/bl6i4b/6p4y/tphyotnh3o.n83/.t8h/rtehardeiandgi.npgy."p, line y890" in , line _890b in o_obtosottrsatpr | |
a | |
pThread 0x | |
00007f01a75fe700 | |
(most recent call first): | |
Thread 0x00007f3ab0dfa700 File (most recent call first): | |
" File /u"s/ru/slri/bl6i4b/6p4y/tphyotnh3o.n83/.s8e/lseeclteocrtso.rpsy."p, line y415" in , line s415e in lescetl | |
ec File t" | |
/ File us"r//ulsirb/6l4i/bp6y4t/hpoynt3h.o8n/3m.u8l/tmiuplrtoicpersosciensgs/icnogn/nceocntnieocnt.ipoyn"., line p931y in "w, line a931i in tw | |
a File i"t/ | |
u File s"r//ulsirb/6l4i/bp6y4t/hpoynt3h.o8n/3m.u8l/tmiuplrtoicpersosciensgs/ipnogo/lp.opoyl"., line p499y in "_, line w499a in i_tw_afiotr__fuoprd_autpedsa | |
t File e"s/ | |
u File s"r//ulsirb/6l4i/bp6y4t/hpoynt3h.o8n/3m.u8l/tmiuplrtoicpersosciensgs/ipnogo/lp.opoyl"., line p519y in "_, line h519a in n_dlhea_nwdolrek_ewrosr | |
k File e"r/su | |
s File r"//luisbr6/4l/ipby6t4h/opny3t.h8o/nt3h.r8e/atdhirnega.dpiyn"g, line .870p in yr"u, line n870 | |
in File r"u/nu | |
s File r"//luisbr6/4l/ipby6t4h/opny3t.h8o/nt3h.r8e/atdhirnega.dpiyn"g, line .932p in y_"b, line o932o in t_sbtoroatps_tirnanpe_ri | |
n File n"e/ru | |
sr File /"l/iubs6r4//lpiybt6h4o/np3y.t8h/otnhr3e.a8d/itnhgr.epayd"i, line n890g in ._pbyo"ot, line s890t in r_abpo | |
o | |
tThread 0xs00007f01de1fc700t (most recent call first): | |
ra File p" | |
/ | |
uThread 0xs00007f3ab17fb700r (most recent call first): | |
/ File l"i/bu6s4r//plyitbh6o4n/3p.y8t/hmounl3t.i8p/rmoucletsispirnogc/epsosoiln.gp/yp"o, line o114l in .wpoyr"k, line e114r in | |
w File o"r/kuesrr | |
/l File i"b/6u4s/rp/yltihbo6n43/.p8y/tthhorne3a.d8i/ntgh.rpeya"d, line i870n in gr.upny | |
" File , line "870/ in ursurn/ | |
l File i"b/6u4s/rp/yltihbo6n43/.p8y/tthhorne3a.d8i/ntgh.rpeya"d, line i932n in g_.bpoyo"t, line s932t in r_abpo_oitnsnterra | |
p File _"i/nunserr/ | |
l File i"b/6u4s/rp/yltihbo6n43/.p8y/tthhorne3a.d8i/ntgh.rpeya"d, line i890n in g_.bpoyo"t, line s890t in r_abpo | |
o | |
tThread 0xst00007f01df5fe700r (most recent call first): | |
ap File | |
" | |
/Thread 0xus00007f3ab21fc700r (most recent call first): | |
/l File i"b/6u4s/rp/lyitbh6o4n/3p.y8t/hmounl3t.i8p/rmoucletsispirnogc/epsosoiln.gp/yp"o, line o114l in .wpoyr"k, line e114r in | |
w File o"r/kuesrr | |
/ File l"i/bu6s4r//plyitbh6o4n/3p.y8t/htohnr3e.a8d/itnhgr.epayd"i, line n870g in .rpuyn" | |
, line File 870" in /ruusnr | |
/ File l"i/bu6s4r//plyitbh6o4n/3p.y8t/htohnr3e.a8d/itnhgr.epayd"i, line n932g in ._pbyo"o, line t932s in t_rbaopo_tisntnrearp | |
_ File i"n/nuesrr | |
/ File l"i/bu6s4r//plyithbo6n43/.p8y/tthhorne3a.d8in/gt.hprye"a, line d890i in n_gb.opoyt"s, line t890r in a_pb | |
o | |
oThread 0xt00007f01c35fe700s (most recent call first): | |
t File r"a/pu | |
s | |
rThread 0x/00007f3ab2bfd700l (most recent call first): | |
ib6 File 4"//puystrh/olni3b.684//mpuyltthiopnr3o.c8e/smsuilntgi/pprooocle.psys"i, line n114g in /wpoorokle.rp | |
y File "", line /114u in swro/rlkiebr6 | |
4/ File p"y/tuhsorn/3l.i8b/6t4h/rpeyatdhionng3..p8y/"t, line h870r in eraudni | |
n File g"./puys"r, line /870l in irbu6n4 | |
/ File p"y/tuhsorn/3l.i8b/6t4h/rpeyatdhionng3..p8y/"th, line r932e in a_dbionogt.sptyr"a, line p932_ in i_nbnoeort | |
s File t"r/aups_ri/nlniebr6 | |
4 File /"p/yutshro/nl3i.b86/4t/hpryetahdoinn3g..8p/yt"h, line r890e in a_dbionogts.tprya"p, line | |
890 | |
in Thread 0x_b00007f01dcdfa700o (most recent call first): | |
o File t"s/tursarp/ | |
l | |
iThread 0xb00007f3ab3fff7006 (most recent call first): | |
4/ File p"y/tuhsorn/3l.i8b/6m4u/lptyitphroonc3e.s8s/imnugl/tpioporlo.cpeys"s, line i114n in gw/oprokoelr. | |
p File y""/, line u114s in rw/olrikbe6r4 | |
/ File p"y/tuhsorn/3l.i8b/6t4h/rpeyatdhionng3..p8y/"t, line h870r in eraudni | |
ng File ."/puys"r, line /870l in irbu6n4 | |
/ File p"y/tuhsorn/3l.i8b/6t4h/rpeyatdhionng3..p8y/"th, line r932e in a_dbionogt.sptyr"a, line p932_i in n_nbeoro | |
t File s"t/ruaspr_/ilninbe6r4 | |
/ File p"y/tuhsorn/3l.i8b/6t4h/rpeyatdhionng3..8p/yt"h, line r890e in a_dbionogt.sptyr"a, line p890 | |
in | |
_Thread 0xb00007f01f8dfa700o (most recent call first): | |
ot File s"t/ruaspr | |
/ | |
lThread 0xi00007f3ace1fc700b (most recent call first): | |
6 File 4"//puystrh/olni3b.684//mpuyltthiopnr3o.c8e/smsuilntgi/pprooocle.spsyi"n, line g114/ in pwooorlk.epry | |
", line File 114" in /wuosrrk/elri | |
b File 6"4//upsyrt/hloinb36.48//ptyhtrheoand3i.n8g/.tphyr"e, line a870d in irnugn. | |
p File y""/, line u870s in rr/ulni | |
b File 6"4//upsyrt/hloinb36.48//ptyhtrheoand3i.n8g/.tphyr"ea, line d932i in n_gb.opoyt"st, line r932a in p__bionontesrt | |
r File a"p/_uisnrn/elri | |
b File 6"4//upsyrt/hloinb36.48//ptyhtrheoand3i.n8g/.tphyr"e, line a890d in i_nbgo.optys"tr, line a890p in | |
_ | |
bThread 0xo00007f01fa1fc700o (most recent call first): | |
ts File t"r/aups | |
r | |
/Thread 0xl00007f3acebfd700i (most recent call first): | |
b6 File 4"//puystrh/olni3b.684//mpuyltthiopnr3o.c8e/smsuilntgi/pprooocle.spsyi"n, line g114/ in pwooorlk.epry | |
" File , line "114/ in uwsorr/kleirb | |
6 File 4"//puystrh/olni3b.684//tphyrtehaodni3n.g8./ptyh"r, line e870a in driunng | |
. File p"y/"u, line s870r in /rluinb | |
6 File 4"//pyutshro/nl3i.b86/4t/hpryetahdoinn3g..8p/yt"h, line r932e in a_dbionogt.sptyr"a, line p932_ in i_nbnoeort | |
s File t"r/aups_ri/nlniebr6 | |
4 File /"p/yutshro/nl3i.b86/4t/hpryetahdoinn3g..8p/yt"h, line r890e in a_dbionogt.sptyr"ap, line | |
890 | |
in Thread 0x_00007f01fb5fe700b (most recent call first): | |
oo File t"s/tursarp/ | |
l | |
iThread 0xb00007f3acffff7006 (most recent call first): | |
4/ File p"y/tuhsorn/3l.i8b/6m4u/lptyitphroon3c.e8s/smiunlgt/ipporoolc.epsys"i, line n114g in /wpoorokle.rp | |
y File "", line /114u in swro/rlkiebr6 | |
4 File /"p/yutshro/nl3i.b86/4t/hpryetahdoinn3g..8p/yt"h, line r870e in arduinn | |
g File ."p/yu"s, line r870/ in lriubn6 | |
4 File /"p/yutshro/nl3i.b684//tphyrtehaodni3n.g8./ptyh"r, line e932a in d_ibnogo.tpsyt"r, line a932p in __ibnonoetrs | |
tr File a"p/_uisnrn/elri | |
b File 6"4//upsyrt/hloinb36.48//ptyhtrheoand3i.n8g/.tphyr"e, line a890d in i_nbgo.optys"t, line r890a in p_ | |
b | |
oThread 0xo00007f0214dfa700t (most recent call first): | |
st File r"a/pu | |
s | |
rThread 0x/l00007f3ae8dfa700i (most recent call first): | |
b6 File 4"//puystrh/olni3b.684//mpuyltthiopnr3o.c8e/smsuilntgi/pprooocle.spsyi"n, line g114/ in pwooorlk.epry | |
", line File 114" in /wuosrrk/elri | |
b File 6"4//upsyrt/hloinb36.48//ptyhtrheoand3i.n8g/.tphyr"e, line a870d in riunng | |
. File p"y/"u, line s870r in /rluinb | |
64 File /"p/yutshro/nl3i.b86/4t/hpryetahdoinn3g..8p/yt"h, line r932e in a_dbionogt.sptyr"a, line p932_ in i_nbnoeort | |
s File t"r/aups_ri/nlniebr6 | |
4 File /"p/yutshro/nl3i.b86/4t/hpryetahdoinn3g..8p/yt"h, line r890e in a_dbionogt.sptyr"a, line p890 | |
in | |
_Thread 0xb00007f0232bfd700o (most recent call first): | |
o File t"s/tursarp/ | |
l | |
iThread 0xb600007f3ae97fb7004 (most recent call first): | |
/ File p"y/tuhsorn/3l.i8b/6m4u/lptyitphroonc3e.s8s/imnugl/tpioporlo.cpeys"s, line i114n in gw/oprokoelr. | |
py File "", line /114u in swro/rlkiebr6 | |
4 File /"p/yutshro/nl3i.b86/4t/hpryetahdoinn3g..8p/yt"h, line r870e in arduinn | |
g File ."p/yu"sr, line /870l in irbu6n4 | |
/ File p"y/tuhsorn/3l.i8b/6t4h/rpeyatdhionng3..p8y/"t, line h932r in e_abdoiontgs.tprya"p, line _932i in n_nbeoro | |
t File s"t/ruaspr_/ilninbe6r4 | |
/ File p"y/tuhsorn/3l.i8b/6t4h/rpeyatdhionng3..p8y/"t, line h890r in e_abdoiontgs.tprya"p, line | |
890 | |
in Thread 0x_00007f02175fe700b (most recent call first): | |
oo File t"s/tursarp/ | |
l | |
iThread 0xb00007f3aea1fc7006 (most recent call first): | |
4/p File y"t/huosnr3/.l8i/bm6u4l/tpiyptrhoocne3s.s8i/nmgu/lptoioplr.opcye"s, line s114i in nwgo/rpkoeorl | |
.p File y""/, line u114s in rw/olrikbe6r4 | |
/p File y"t/huosnr3/.l8ib/6t4h/rpeyatdhionng3..p8y/"t, line h870r in eraudni | |
ng File ."p/yu"s, line r870/ in lriubn6 | |
4 File /"p/yutshro/nl3i.b86/4t/hpryetahdoinn3g..8p/yt"hr, line e932a in d_ibnogo.tpsyt"r, line a932p in __ibnonoetrs | |
tr File a"p/_uisnrn/elri | |
b File 6"4//upsyrt/hloinb36.48//ptyhtrheoand3i.n8g/.tpyh"r, line e890a in d_ibnogo.tpsyt"r, line a890p in | |
_ | |
bThread 0xo00007f00abfff700o (most recent call first): | |
ts File t"r/aups | |
r | |
/Thread 0xl00007f39b6bfd700i (most recent call first): | |
b6 File 4"//puystrh/olni3b.684//tphyrtehaodni3n.g8./ptyh"r, line e306a in dwianigt. | |
py File "", line /306u in swra/ilti | |
b6 File 4"//puystrh/olni3b.684//tphyrtehaodni3n.g8./ptyh"re, line a558d in iwnagi.tp | |
y" File , line "558/ in fwsaxi/tn | |
ou File s"r//fdsaxl/lneo2u/slri/bd6a4l/lpey2t/hloinb36.48//psyittheo-np3a.c8k/asgietse/-tpqadcmk/a_gmeosn/ittqodrm./p_ym"o, line n60i in trourn. | |
p File y""/, line u60s in rr/ulni | |
b File 6"4//upsyrt/hloinb36.48//ptyhtrheoand3i.n8g/.tphyr"e, line a932d in i_nbgo.optys"t, line r932a in p__bionontesrt | |
r File a"p/_uisnrn/elri | |
b File 6"4//upsyrt/hloinb36.48//ptyhtrheoand3i.n8g/.tphyr"e, line a890d in i_nbgo.optys"tr, line a890p in | |
_ | |
bThread 0xoo00007f0406f45700t (most recent call first): | |
s File tr"a/pu | |
sr | |
/Thread 0xl00007f3d44f0e700i (most recent call first): | |
b6 File 4/p"y/tuhsorn/3l.i8b/6c4o/npcyutrhroenn3t./8f/uctounrceusr/rtherneta/df.uptuyr"ecompute-od-gpu-dy-p4d-24xlarge-15:13991:14077 [0] NCCL INFO comm 0x7f5490000f60 rank 60 nranks 64 cudaDev 4 busId 901c0 - Abort COMPLETE | |
s, line /78t in h_rweoardk.epry | |
" File , line "78/ in us_rw/olrikbe6r4 | |
/p File y"t/huosnr3/.l8i/bt6h4r/epaydtihnogn.3p.y8"/compute-od-gpu-dy-p4d-24xlarge-10:13982:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 5 | |
, line t870h in rreuand | |
i File n"gcompute-od-gpu-dy-p4d-24xlarge-10:13984:14140 [2] NCCL INFO [Service thread] Connection closed by localRank 5 | |
/.upsyr"/, line l870i in br6u4n/compute-od-gpu-dy-p4d-24xlarge-10:13986:14133 [4] NCCL INFO [Service thread] Connection closed by localRank 5 | |
p File y"t/huosnr3/.l8i/bt6h4r/epaydtihnogn.3p.y8"/, line t932h in r_ebacompute-od-gpu-dy-p4d-24xlarge-10:13988:14141 [6] NCCL INFO [Service thread] Connection closed by localRank 5 | |
odoitnsgt.rpayp"_, line i932n in n_ebro | |
o File t"s/tursarp/_liinbn6e4r/ | |
p File y"t/huosnr3/.l8i/bt6h4r/epaydtihn[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
ogn.3p.y8"/, line t890h in r_ebaodoitnsgt.rpayp" | |
, line | |
890Thread 0x in 00007f047b19d700_ (most recent call first): | |
bo File "o/tusstrr/alpi | |
b | |
6Thread 0x400007f3d826e2700/ (most recent call first): | |
p File yt"h/ousnr3/.l8i/bs6e4l/epcyttohrosn.3p.y8"/, line s468eterminate called after throwing an instance of ' in lseecltestd::runtime_erroroc' | |
rts | |
. File py""/, line u468s in rs/ellieb what(): c6t[Rank 60] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808552 milliseconds before timing out.4 | |
/p | |
File yt"h/ouns3r./8l/iabs6y4n/cpiyot/Fatal Python error: hboaAbortedns | |
3e._8e/vaesnThread 0xytns00007f5378bfd700.c (most recent call first): | |
piy<no Python frame> | |
"o, line | |
/1823Thread 0xb in 00007f53795fe700a_ (most recent call first): | |
sr<no Python frame> | |
eu | |
n_Thread 0x_eov00007f5379fff700ne (most recent call first): | |
cnet<no Python frame> | |
s. File | |
p"Thread 0xy/00007f5394bfd700u (most recent call first): | |
"s<no Python frame> | |
, line | |
r1823Thread 0x/ in 00007f53955fe700l_ (most recent call first): | |
i<no Python frame> | |
rb | |
6uThread 0x4n/_00007f5395fff700opny (most recent call first): | |
ct<no Python frame> | |
he | |
o | |
Thread 0xn File 300007f53acbff700". (most recent call first): | |
/8<no Python frame> | |
u/ | |
saThread 0xrs00007f5405357700/ (most recent call first): | |
ylni<no Python frame> | |
cb | |
i6Thread 0x4o00007f529bfff700// (most recent call first): | |
bp File yats"eh_o/envu3es.nr8t//sa.lspyyin"b, line c5706i in o4r/u/bnpay_stefh_oorenve3ev.net8rs/ | |
.mp File uy"l/t"uis, line p570rr in /orlcuiebsn6s_4f/ionprygetv/heproo | |
on File l"3./.pu8ys/"rt/h, line lri576eb in a6_d4hi/apncompute-od-gpu-dy-p4d-24xlarge-10:13982:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 1 | |
nydgt.hlpoeyn_"3r, line .870e8 in s/urtlhutrnse | |
a File File d"i"/nu/gsu.rscompute-od-gpu-dy-p4d-24xlarge-10:13984:14140 [2] NCCL INFO [Service thread] Connection closed by localRank 1 | |
/prly/"lii, line bb68706compute-od-gpu-dy-p4d-24xlarge-10:13986:14133 [4] NCCL INFO [Service thread] Connection closed by localRank 1 | |
4 in 4/rp/compute-od-gpu-dy-p4d-24xlarge-10:13988:14141 [6] NCCL INFO [Service thread] Connection closed by localRank 1 | |
uyptnyh | |
toh File on"n3/3..u88s//rtt/hhlirrbee6aa4dd/ipinnyggt.p.hypo"yn", line 3932., line in 8870/_ in tbrhouronet | |
sa File tdr"ia/pnug_s.irpn/lniyeb"r6, line 4 | |
932/ in File p_"yb/utoshorots/ntlr3.ia8bp/6_4itn/hnpreyertah | |
do File in"n3g/..up8sy/"rt, line /hlr932ie in ba6_4db/iopotnysgtt.hrpoaypn"_3, line i.890n in n8_e/brt | |
o File ho"rt/eussatdrri/anlgp.i | |
pby | |
6"Thread 0x4, line 00007f0535aef000/ (most recent call first): | |
890p in File y_"/tbfhosooxtn/s3n.to8rua/strp/h | |
rd | |
eaaThread 0xld00007f3e3ce00000lie (most recent call first): | |
2n File /"gl/.ifbps6yx4"//p, line ny890ot in huo_snrb3/o.doa8t/lssltiert2ae/p-l | |
pi | |
abThread 0x6c00007f52b61fc7004ka (most recent call first): | |
/gp File ye"st//uhstoron/r3li.bc86h/4/s/inptynte/h-moopdnauc3lk.ae8gse//st/mhtordeouarldeci.hnp/gyn."np, line /y1198f"u in , line n_302c_ in tgiweotanaiatt | |
lt.r File p_"y_"/ | |
, line u1252 File s in r"d/r/lofpsioxbu/6tn4 | |
o/u File sp"r/fy/stxDhA/oLnnLo3Eu.s28r/-t/pdyhatrloelreac2dh/i/lngid.bap6ly4"l/, line pe433y2 in t_ahpyctoqonru3chi./r8de/a | |
sl File lie"t2e/_-fppsyaxtco/krancgoheu.ssp/ryt/"od, line ar546lc in lhfeo/2rn/nli/bwm6ao4rd/dup | |
ylt File eh"so//fdnsr3xo./p8on/ousuitts.epr-y/pd"aa, line cl58kl in aefg2oe/rlswia/bre6md4b | |
e/ File dp"dy/iftsnhxgo/_nnroeua3sd.re8/rd//saiptlalee-r2pq/aucelktiab_g6ne4s/u/tmpoyrpctyh_h/ronenna/d3meo.r8d./upslyie"tse/, line -m121poad in cukpaligeeecse._/pgtyeo"nr, line ec1130rha in /t_ncnoa/rl | |
mlo File _d"ium/lpuelss | |
r File //m"lo/idfbus6lx4e//.pnpyoyu"ts, line hr1130o/n in d3_ac.la8lll/e_m2iu/mllptili | |
bp6 File r4"/o/cpfeysstxsh/ionnno3gu./s8rp//osDoiAltL.eLp-Ey2p-a"pcy, line kt388oa in g_regcsh/u/tadoarrldclehed/2n__ntp/yamtsookdr_ucglhee/snde/aclroalnteti2a_oipnny | |
etro File r.c"ph.y"py/, line "140u in , line s806fr in o/lrfiowbra6wr4ad/r | |
dp File | |
y" File /t"h/offsnsxx3//.n8no/oumususlrr//tdiadplalrlloeec22/e/lslisbiib6n644g//pp/yyptthhoooonnl33...88p//sysi"tiet, line -e532 in -pp_aahccakknaadggleesse//_ttootrracschkh/sn | |
/nn/n File m/"omo/dduulsuerless///mmlooiddbuu6ll4ee/..pppyyyt"h", line o, line 11301130 in n in _3_c.8c/aatllhllr__iemiapmdpll | |
i | |
File n File "g"/./fpfsysxx"//, line nn870oo in uurssurrn// | |
DD File AA"L/LuLsLr/ElE2i-2bp6-y4pty/oprtyocrthchh/od/ndaa3lll.lee822/__tpphyyrtteooarrdcchih/n/ddagal.lpllye2e"_2_p, line py932ytt in oo_rbrcocohht..spptyyra"p", line _, line 718i in 944nf in noefrrow | |
ra File rw"da/r | |
ud File s" | |
r/ File /f"ls/ifxbs/x6n/4on/oupsuysrrt//hddonaa3ll.lle8e/2/2lt/ihlrbie6ba4d6/i4p/nypgtyh.otpnhy3o".n, line 83890/. in s8_/bisotioett-esp-taracpkpaa | |
cg | |
keThread 0xsa00007f52d0dfa700/g (most recent call first): | |
etos File /r"tco/hruc/snhrn///lmnionbd/6mu4ol/depsuy/ltmehosod/num3lo.d8eu/l.speey."pl, line ye1130"c in , line t1130o_ in rc_sac.lpalyl_"li, line m_p415il in m | |
spe File ll" | |
e/ File cf"t | |
/s File xf/"sn/xou/usnsroru//slriD/bADLA6LL4E/Lp2Ey2-tpy-thopoynrt3oc.hr/8cd/haml/uldleat2li_lpepy2rt_oocrpecyshts/oirndcalhlg/e/d2ac_olpnlytneeo2c_rtpcyicompute-od-gpu-dy-p4d-24xlarge-12:13921:14078 [0] NCCL INFO [Service thread] Connection closed by localRank 5 | |
thoo.nrpcompute-od-gpu-dy-p4d-24xlarge-12:13923:14077 [2] NCCL INFO [Service thread] Connection closed by localRank 5 | |
cy.h".p, line pyy"806" in , line , line 931f1144 in o in rwpaw_ialtro | |
ds File | |
s File compute-od-gpu-dy-p4d-24xlarge-12:13925:14080 [4] NCCL INFO [Service thread] Connection closed by localRank 5 | |
"e"s// | |
fu File ss"xr///fnlsoixubs6/r4/n/dopuaylstlrhe/o2D/compute-od-gpu-dy-p4d-24xlarge-12:13927:14079 [6] NCCL INFO [Service thread] Connection closed by localRank 5 | |
lAinLb3L6.E24-8/p/pymytuotlrthcoinhp/3r.do8c/aeslsliseti2e_-nppgayc/tkpaoogroecls.h//tpdoayrl"cl, line he/4992n in _n_p/wymatoiodtur_lcfehos.r/p_myuo"pd, line du1254al in tefe.ospr | |
yw" File a, line r1130" in d/ | |
_u File cs"ar/l/flls_iixbm/6pn4lo | |
/u File p"ys/trfh/osndx3/a.ln8l/oemu2us/rllt/iibDp6Ar4Lo/LcpEey2tsh-sopiynn3t.go8/r/spciohto/edl-a.plpalyec2"k_, line ap519gye in tso_rh/chat/ndodarllcleeh2_/_wnponyrt/kmeoorrdsuc | |
lhe. File sp"/ym/"oud, line su944rl in /efl.oiprbwy6a"4r, line /dp1130 | |
y in File t_hc"oa/llnf_3si.xm8/pn/lo | |
tu File sh"rr/e/afddsaixlnl/gen.2po/yu"ls, line ir870/bd in 6arl4ul/npe | |
2y/ File tl"hio/bnu634.s/8rp/y/tlshioinbt36e.48-//ppsaycitkthaeo-gpnea3sc./8kt/aotgrhecrshe//andtnio/nmrgc.ohp/dyu"nl, line n932/e in sp/_ambrooadolullteesl.t/pdryia"sp, line t_1130r in ii_nbcuntaelerld | |
_.ip File myp""l, line / | |
969u in File s_r"r//ufnls_ixdb/d6npo4_u/fspro/yrDtwAhaoLrnLd3E | |
.28 File -/p"t/yhftrsoerxc/ahnd/oidunsagrl./pldeya2"_lp, line ly890et in 2o_/rblocohit/bsd6t4ar/lplayep2t_ | |
hp | |
yoThread 0xtn00007f52d17fb700o3 (most recent call first): | |
.r8 File /c"sh/iut.sprey/-"pl, line ai1144cb in k6ap4g_/leposys/ttsheoosnr | |
3 File c."8h///fmnsunxl//tnpioapurrsaorlcle/esDlsA/idLniLgsE/2tp-ropioybtlou.rtpceyhd"/.pd, line ya114"l in , line l1008we in o2f_ropkryetwroa | |
rr File cd"h | |
// File u"sd/arfl/sllexi2/b_n6p4oy/uptsoyrrt/chdho.anp3lyl."e8, line 2/1254/ in ltfihobr6re4wa/adpridyn | |
tg File h."op/nfy3s".x, line 8/870/n in soriuutsne | |
r- File /p"/daaucllsker2a//glleiisbb/66t44o//rppcyyhtt/hhnoonn/n33m..o88d//utslhiertesea/-dmpioandcgku.lapegy.e"sp, line /y932t" in o, line _r1130b in co_hoc/atnlsntl/r_maiopmd_upilln | |
en File se/"rm/ | |
of File sd"x//uunlsoer.u/plsyir"b/, line D6A1130L4 in L_/cEap2l-ylpt_yhitomonprlc3 | |
h./8 File d/a"t/lhlfers2ex_/apndyiontugos.rrpc/yhd/"at, line lrla890ei2n in /e_lrib.bop6o4yt"/spt, line ry394ta in phfo | |
no3 | |
rThread 0x.w8a00007f52d21fc700/r (most recent call first): | |
sd File | |
i"t File /e"-u/psafrsc/xk/lanigboe6u4srs///DptyAotLrhLcoEh2n-/3.pn8ynt//omprucalrht/aidlpalrleolclee/2ds_spiysittnrogi/rbpuoctohel/d..ptpyry"a", line i, line 114n969 in in ew_ror.rpuykn"e_, line rd107 | |
d in File pi"n_n/feuro | |
sr File rw"/a/lrfidsb | |
x6/4 File n/"op/uyfstsrhx//odnnao3lu.ls8er//t2dh/arlleilabde6i24n//gpl.yiptbyh6"4o, line n/8703p. in 8yr/tuhsnio | |
tn File e3"-.p/8aus/crskcompute-od-gpu-dy-p4d-24xlarge-10:13982:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 6 | |
/ialtcompute-od-gpu-dy-p4d-24xlarge-10:13984:14140 [2] NCCL INFO [Service thread] Connection closed by localRank 6 | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14133 [4] NCCL INFO [Service thread] Connection closed by localRank 6 | |
gieeb-s6p/4at/copkryactghhe/onsnn/3t/.om8ro/ctdhhur/lenaedns/i/npmaogr.adpluyll"eel, line ./932dp in iys_"tb, line or1130o in i_tcsbatulrtlea_pdi._pmiyp"nl, line n | |
1008e File in r"f/ | |
orfw File as"rx/d/ | |
un File so"ru//slfris/bxD6A/4Ln/opLuyEstr2/-hdpoanyl3tlo.re8c/2ht//hlrtierb6aa4di/ipnny_tgdh.iopfnyf3"u., line 8s890/i in so_inbt_oeop-tpsratcirkoaarpg.e | |
ps | |
/yt"Thread 0xo, line r00007f52d2bfd700c503 (most recent call first): | |
h in File /n"tn/r/uamsiornd/ | |
ull File ie"bs/6/f4ms/oxpd/yulnteoh.opunsy3r"., line /81130D/ in A_mLcuLaEll2l-_tpiimypptrloo | |
File cre"cs/hsf/itsnrxga//ninpo_oduoislf.rfp/uysD"iA, line oL114nL in _Ewp2or-rpkiyotero.rrp | |
cyh File /""d/a, line u736l in lsiern2i/_tlipaiylbti6oz4re/_cpthyr/ttahrioannii3nn.ge8r | |
/. File t"hp/ryfe"sax, line d/394in in onfugos.rrpw/Dya"ArL, line dL870E | |
in File 2r"-upn/y | |
fts File x"o//rnucohsu/strr/rD/aAliiLnLb_E6d42if-/fppyuytsotrihcohon/n3d_.apr8li/loterh2.r_peypa"ydt, line io753n in rgmc.ahp/iytn"r | |
, line a File 932i" in /nf_esbrxo/.onptyos"uts, line rr107/a in dp_iainlnnlneeer2r/ | |
l File File "i/"bf6/su4xs//nrop/ulysibtr6/h4od/anpl3yl.te8h2//oslnii3tb.6e84-//pptayhctrkheaoagdneis3n./8gc/.lspiiyct"ke, line -/890pc in ao_crbkoaego.etpss/yt"tro, line ar760cp in | |
hi | |
/nnvThread 0xno00007f52effff700/k (most recent call first): | |
meo | |
File d File ""u//lufesssrx///mlnoiodbu6usl4re//p.dypatlyhl"oe, line n113023 in /._l8icba/6l4ml/_upilymtptlih | |
po File nr"3o./c8fe/sssxsi/itnneo-ugsp/rap/cDokAoaLlLgE.e2p-syp/"yct, line lo114ri in ccwhko//rctkroearrie | |
n. File _p"dyi/"fuf, line us1404sr in i/iolnnivbo_6kp4er/ | |
ip File yo"tr/h.fopsnyx3"/., line n8503o/ in uttshrrra/idenaa | |
dl File i"ln/egf2.s/plxiy/"bn, line 6o4870/ in uprsyurtn/h | |
Do File An"L3L/.Eu2s8-/rps/iylttieob-r6pc4ha//cpkytarthgaoeinsn3_/.dci8l/fiftchukrs/eiconao_drpiern.igpo.yr"p, line .y1055p"y in , line m"a, line 932i736 in n in _ | |
ibn File oi"toi/taslftiszerx_a/pnt_roiunasniren/rid | |
nagl File | |
l" File e"/2/uf/ssrlx//inlobiu6sb46r4///DppyyAttLhhoLnoE3n2.3-8./p8sy/itttoherr-cepahadc/iktnargag.iepnsy_/"dci, line lf890if in uc_skbioo/noc_topsrtreri.apopry | |
." | |
, line Thread 0xp113000007f52d35fe700y in (most recent call first): | |
_"_, line File c753"a in l/mlua_sir_n/ | |
l File File i""b/6/f4fss/xpx/yn/tonuhsooru/dnsa3lr.l/8ed/a2ml/ullleit2bi/6pl4ri/obpce6ys4th/sopinyn3t.hg8o//npso3io.tl8e/-.sppiaytc"ke, line a114g- in epswa/occrlkkiecarkg | |
e/ File sc"//oturoserr.c/phly/i"dbi6, line st7604r in /iipbnyvuottehkdeo/n | |
e File 3l".a/8sf/sttxih/crn/eomuausdlrit/ndgia.plpryol"ec, line 2e870/s in slriiunbng | |
6/4e File /r"pry/otuhrsosr//n_l3_.ii8nb/i6ts_4i_/.tppey-ytp"ha, line oc345kn in a3w.g8rea/stp/hpcrleeairdc | |
ki/ File nc"o/gf.rsepxy/."np, line oy932u in "s_, line rb1404/o in DoiAtLnsvLotEkr2ea- | |
pp File _y"i/tnofnrsecxrh/ | |
/n File to"ru/ausirsn/_rdda/illflifeb2u/6sli4ibo/6np4_y/ptprhyoitnho3o.rn83./p.t8yh"/rs, line eiatd757e in i-<npmgaoc.dkpuaylg"ee, line s>890/ | |
in c_libcoko/tcsotrrea.pp | |
y | |
"Thread 0x, line 105500007f52ee1fc700 in (most recent call first): | |
m File a"i/nu | |
s File r"//lfisbx6/4n/opuystrh/odna3l.l8e/2m/ullitbi6p4r/opcyestshionng3/.p8o/osl.iptye"-, line p114a in cwkoargkeesr/ | |
c File l"i/cuks/rc/olrieb.6p4/yp"y, line t1130h in o_n_3c.a8/ltlh_r_e | |
a File din"g./pfys"x/, line n870o in ursurn/ | |
d File a"l/lues2r//lliibb6644//pyptyhtohno3n.38./8s/itther-ecompute-od-gpu-dy-p4d-24xlarge-10:13982:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 2 | |
paadciknagg.epsy/"t, line o932r in c_hb/odoitsstrtirbaupte_di/nenlears | |
t File i"c//ucompute-od-gpu-dy-p4d-24xlarge-10:13986:14133 [4] NCCL INFO [Service thread] Connection closed by localRank 2 | |
msur/lltiibp6r4o/cepsystihnogn/3er.r8o/rtsh/r_e_iandiitn_g_..ppyy"", line , line 345890 in in w_rabpopoetrs | |
t File r"a/pf | |
s | |
xThread 0x/00007f53097fb700n (most recent call first): | |
ou File s"r//uDsArL/LEl2i-bp6y4t/oprycthh/otrna3i.8n/_mudlitfifpursoicoens_spirnigo/rp.opoyl".p, line y757" in , line <114m in owdourlkee>r | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f53261fc700 (most recent call first): | |
File "/usr/lib64compute-od-gpu-dy-p4d-24xlarge-10:13982:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 4 | |
/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f53257fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f52eebfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f51b97fb700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f5543e1d700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threacompute-od-gpu-dy-p4d-24xlarge-12:13921:14078 [0] NCCL INFO [Service thread] Connection closed by localRank 2 | |
ding.py", line 890 in _bootstrap | |
Thread 0x00007f558121c700 (most recent call first): | |
File "/uscompute-od-gpu-dy-p4d-24xlarge-12:13925:14080 [4] NCCL INFO [Service thread] Connection closed by localRank 2 | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14079 [6] NCCL INFO [Service thread] Connection closed by localRank 2 | |
r/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/ascompute-od-gpu-dy-p4d-24xlarge-9:13984:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 1 | |
yncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/pcompute-od-gpu-dy-p4d-24xlarge-9:13986:14137 [2] NCCL INFO [Service thread] Connection closed by localRank 1 | |
ython3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f563796c000 (most recent call first): | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 546 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 710 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 806 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 944 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1144 in p_losses | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1254 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 969 in _run_ddp_forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 394 in forward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
compute-od-gpu-dy-p4d-24xlarge-12:13921:14078 [0] NCCL INFO [Service thread] Connection closed by localRank 1 | |
compute-od-gpu-dy-p4d-24xlarge-12:13925:14080 [4] NCCL INFO [Service thread] Connection closed by localRank 1 | |
compute-od-gpu-dy-p4d-24xlarge-12:13927:14079 [6] NCCL INFO [Service thread] Connection closed by localRank 1 | |
compute-od-gpu-dy-p4d-24xlarge-9:13984:14139 [0] NCCL INFO [Service thread] Connection closed by localRank 3 | |
compute-od-gpu-dy-p4d-24xlarge-9:13986:14137 [2] NCCL INFO [Service thread] Connection closed by localRank 3 | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14145 [0] NCCL INFO [Service thread] Connection closed by localRank 6 | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14145 [0] NCCL INFO [Service thread] Connection closed by localRank 5 | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14145 [0] NCCL INFO [Service thread] Connection closed by localRank 7 | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13976 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13978 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13979 closing signal SIGTERM | |
compute-od-gpu-dy-p4d-24xlarge-15:13992:14072 [0] NCCL INFO comm 0x7fc204000f60 rank 61 nranks 64 cudaDev 5 busId 901d0 - Abort COMPLETE | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13980 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13981 closing signal SIGTERM | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 61] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808541 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007fc1075fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fc107fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fc1213fd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fc121dfe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fc1227ff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fc1e4bff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fc2015fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fc25ef57700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fc00abfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc0275fe700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc05cdfa700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13982 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13983 closing signal SIGTERM | |
wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc026bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc042bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc040dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc0421fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc043fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc05d7fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc05e1fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc05ebfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc07b5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc0797fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fbf297fb700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc297a7b700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc2ed7e0700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc3a40a0000 (most recent call first): | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 752 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 806 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 944 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1144 in p_losses | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1254 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 969 in _run_ddp_forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 394 in forward | |
compute-od-gpu-dy-p4d-24xlarge-15:13993:14071 [0] NCCL INFO comm 0x7f4a68000f60 rank 62 nranks 64 cudaDev 6 busId a01c0 - Abort COMPLETE | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train | |
File "/fsx/nousr/DALL[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
E2-pytorch/train_diffusion_prior.py", line 736 in initialize_training | |
File "/fsx/nousterminate called after throwing an instance of 'r/DALstd::runtime_errorL' | |
E2-pytorch/train_d what(): if[Rank 62] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808555 milliseconds before timing out.fus | |
ion_prior.py", line 753 in main | |
File Fatal Python error: "Aborted/ | |
fsx/nousrThread 0x/da00007f4959fff700l (most recent call first): | |
le<no Python frame> | |
2/ | |
lThread 0xib00007f4974ffd7006 (most recent call first): | |
4/<no Python frame> | |
p | |
yThread 0xth00007f49759fe700o (most recent call first): | |
n<no Python frame> | |
3 | |
.8Thread 0x/00007f49763ff700s (most recent call first): | |
it<no Python frame> | |
e | |
-Thread 0xp00007f4990b38700a (most recent call first): | |
c<no Python frame> | |
k | |
aThread 0xge00007f4991539700s (most recent call first): | |
/<no Python frame> | |
c | |
liThread 0xck00007f4993357700/ (most recent call first): | |
c<no Python frame> | |
o | |
rThread 0xe.00007f4ad9532700p (most recent call first): | |
y<no Python frame> | |
" | |
, line Thread 0x76000007f48b21fc700 in (most recent call first): | |
in File vo"ke | |
/ File u"s/rfs/xl/inbo6u4s/rp/ydtaholnl3e.28//limbu6l4t/ippyrtocheosns3i.n8g//spiotoel-.ppacyk"ag, line es576/ in c_lhiacnkd/lec_orrees.uply"t, line s1404 | |
in File in"v/okues | |
r File /"l/ifbs6x4//npoyutsrh/odna3l.l8e/2t/hlrieb6a4d/ipnytgh.opny3".8, line /870s in itreu-np | |
a File c"/kuasrg/elsi/bc6l4i/cpky/tchoorne3..p8y/"t, line h1055r in emaadiinn | |
g File .p"y/"f, line s932x in /_nboouostrs/dtarlalpe_2i/nlniebr6 | |
4/ File p"y/tuhsorn/3l.i8b/6s4it/ep-pyatchkoange3s.8//ctlhirceka/dcoirneg..ppyy"", line , line 1130890 in in ___bcoaoltls_t_r | |
a File p" | |
/ | |
fThread 0xs00007f487bfff700x (most recent call first): | |
/n File o"u/surs/rd/allilbe624//lpiby6th4o/np3y.t8h/otnh3r.e8a/disnigt.ep-yp"a, line c302k in agewsa/itto | |
rc File h"//duissrt/lriibb6u4t/epdy/tehloans3t.i8c//tmhulrteiadpirnogc.epsys"i, line n433g in /aecrqruoirrse/ | |
_ File _i"n/ifts_x_/.nopuys"r, line /345d in awlrlaep2p/elri | |
b File 6"4//fpystxh/onno3u.s8r//sDiAtLLeE-2p-apcyktaogrecsh//etmrbaeidnd_idnigf_fruesaidoenr_/pprairorqu.epty_"n, line u757m in p<ym_ordeualde>e | |
r.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiproccompute-od-gpu-dy-p4d-24xlarge-15:13994:14073 [0] NCCL INFO comm 0x7fcb68000f60 rank 63 nranks 64 cudaDev 7 busId a01d0 - Abort COMPLETE | |
essing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f48957fb700 (most recent call first): | |
File "/usr/[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/pythonterminate called after throwing an instance of '3.8std::runtime_error/' | |
multiprocessin what(): g[Rank 63] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808556 milliseconds before timing out./c | |
onnection.py", line 931 in Fatal Python error: wAborteda | |
it | |
Thread 0x File 00007fca2f5fe700" (most recent call first): | |
/<no Python frame> | |
us | |
rThread 0x/l00007fca2ffff700i (most recent call first): | |
b<no Python frame> | |
6 | |
4Thread 0x/00007fca48bfd700p (most recent call first): | |
y<no Python frame> | |
t | |
hThread 0xo00007fca495fe700n (most recent call first): | |
3<no Python frame> | |
. | |
8Thread 0x/00007fca49fff700m (most recent call first): | |
u<no Python frame> | |
l | |
tThread 0xi00007fca653ff700p (most recent call first): | |
r<no Python frame> | |
o | |
Thread 0xce00007fca7db56700s (most recent call first): | |
s<no Python frame> | |
iThread 0xn00007fca7e557700g (most recent call first): | |
/<no Python frame> | |
p | |
Thread 0xoo00007fcbd72bb700l (most recent call first): | |
. File py""/, line u499s in r_/wlaiibt6_4f/poyrt_uhpodna3t.e8s/ | |
mu File l"t/uisprr/olciebs6s4i/npgy/tpohooln.3p.y8"/, line mu576l in t_ihparnodclee_srseisnugl/tpso | |
o File l."p/yu"s, line r519/ in l_ihba6n4d/lpey_twhoornk3e.r8s/ | |
t File h"r/euasdri/lnig.bp6y4"/, line p870y in trhuonn | |
3. File 8"//tuhsrr/elaidbi6n4g/.ppyyt"h, line o870n in 3r.u8n/ | |
t File h"r/euadsirn/gl.ipby6"4, line /932p in y_tbhoootns3.t8r/apth_rienandeir | |
n File g"./puys"r/, line l932i in b_6b4o/optytsthroanp3_[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
.i8n/ntehrr | |
e File a"d/iunsgr./plyi"b, line 68904 in /_pbyotohtosnt3r.a8p/ | |
t | |
hThread 0xr00007fcbd7cbc700e (most recent call first): | |
a File d"i/nugs.rp/yl"i, line b8906 in 4/_pbyotohtosnt3r.a8p/ | |
t | |
hThread 0xr00007f48ccdfa700e (most recent call first): | |
File a"d/iunsgr./plyi"b, line 63024 in /wpayitth | |
o File n"3/.u8sr//mluilbt6i4p/rpoyctehsosni3n.g8//pthoroela.dpiyn"g, line .114p in yw"o, line r433k in earcq | |
u File i"r/eu | |
s File r"//lfisbx6/4n/opuystrh/odna3l.l8e/2t/hlreiabd6i4n/gp.yptyh"o, line n8703 in .r8u/ns | |
i File t"e/-upsarc/klaigbe6s4//epmybtehdodni3n.g8_/rtehardeeard/ipnagr.pqyu"e, line t932_ in n_ubmopoyt_srteraadpe_ri.npnye"r, line | |
121 File in "p/iuescre/_lgiebn6e4ra/tpoyrt | |
h File "o/nu3s.r/8/ltihbr6e4a/dpiyntgh.opny3."8, line /890m in u_lbtoioptrsotcresaspi | |
n | |
gThread 0x/00007f487abfd700p (most recent call first): | |
o File o"l/.uspyr"/l, line i388b in 6_4g/upayrtdheod_nt3a.s8k/_mguelnteirpartocieosns | |
i File n"g/u/spro/olli.bp6y4"/p, line y114t in hwoornk3e.8r/ | |
m File u"l/tuisprr/olciebs6s4i/ngp/yptoholo.npy3"., line 8/532t in h_rheaanddilneg_.tpays"ks, line | |
870 File in "r/uuns | |
r File /"l/iubs6r4//lpiybt6h4on/3p.y8t/htohnr3e.a8d/itnhgr.epayd", line i870n in gr.upny | |
" File , line "932/ in u_sbro/oltistbr6a4p/_piyntnheorn | |
3 File ."8//utshrr/elaidbi6n4g/.ppyyt"h, line o932n in 3_.b8o/otthsrteraadpi_nign.pnye"r, line | |
890 File " in /_ubsoro/tlsitbr6a4p/ | |
p | |
yThread 0xt00007f48b0dfa700h (most recent call first): | |
o File n"3/u.s8r//tlhirbe6a4d/ipnygt.hpoyn"3, line .8908 in /_mbuolottistprraopc | |
e | |
sThread 0xs00007fcc04bfd700i (most recent call first): | |
n File g"//puosorl/.lpiyb6"4, line /114p in ywtorhkoenr3 | |
. File 8"//sueslre/cltiobr6s4./pyp"y, line t415h in osne3l.e8c/tt | |
hr File e"a/duisnrg/.lpiyb"6, line 4870/ in pruynt | |
h File o"n/3u.s8r//mluiltbi6p4r/opcyetshsoinn3g./8c/otnhnreecatdiionng..ppyy"", line , line 931932 in in w_abioto | |
ts File t"r/aups_r/ilnibn6e4r/ | |
p File y"t/huosnr3/.l8i/bm6u4l/tpiyptrhoocne3.s8s/itnhgr/epaodoiln.gp.yp"y", line , line 499 in 890_ in w_abioto_tfsotrr_aupp | |
d | |
aThread 0xt00007f48e8dfa700e (most recent call first): | |
s File | |
" File /"u/surs/rl/ilbi6b46/4p/yptythhoonn33..88//mmuullttiipprroocceessssiinngg//ppooooll..ppyy"", line , line 519114 in in _whoarnkdelre | |
_ File w"o/rukserr/sl | |
i File b"6/4u/spry/tlhiobn634/.p8y/tthhorne3a.d8i/tnhgr.epadyi"n, line g.870p in yr"u, line n | |
870 File in r"u/nu | |
s File r"//luibs6r4//lpiybt6h4o/np3y.t8h/otnh3r.e8a/dtihnrge.adpiyn"g, line .932p in y_"b, line o932o in t_sbtoroatps_tirnanepr_ | |
i File n"n/eurs | |
r File /"l/iubs64r//lpiybt6h4on/3p.y8t/htohnr3e.a8d/itnhrge.apdyi"n, line g890. in p_yb"oo, line t890s in t_rbaopo | |
t | |
sThread 0xt00007f48975fe700r (most recent call first): | |
ap File | |
" | |
/Thread 0xus00007fcc055fe700r (most recent call first): | |
/l File i"b/6us4r//plyitbh6o4n/3p.y8t/hmounl3t.i8p/rmoucletsispirnogc/epsosoiln.gp/yp", line o114o in lw.opryk"e, line r114 | |
in File w"o/rukserr/ | |
l File i"b6/4u/srp/yltihbo6n43/.p8y/tthhorne3a.d8i/ntgh.rpye"ad, line i870n in gr.upny | |
" File , line "870/ in ursunr/ | |
l File i"b/6u4s/rp/yltihbo64n/3p.yt8h/otnh3r.8e/atdhirnega.dpinyg".p, line y932" in , line _932b in o_obtosottrsatpr_aipn_nienrn | |
er File " | |
/ File u"s/ru/srl/ilbi6b46/4p/yptyhtohno3n.38./8t/htrheraedaidningg..ppyy"", line , line 890890 in in __bboooottssttrraapp | |
Thread 0xThread 0x00007fcc05fff70000007f48961fc700 (most recent call first): | |
(most recent call first): | |
File File ""//uussrr//lliibb646/4p/yptyhtohno3n.38.8/m/umlutlitpirporcoecsessisnign/gp/opoolo.lp.yp"y, line "114, line in 114w in owrokrekre | |
r File | |
"/ File u"s/ru/slri/bl6i4b/6p4y/tphyotnh3o.n83/.t8h/rtehardeiandgi.npgy."p, line y870" in , line r870u in n | |
r File u"n/ | |
u File s"r//ulsirb/6l4i/bp6y4t/hopny3t.h8o/nt3h.r8e/atdhirnega.dpyi"n, line g932. in p_yb"o, line o932t in s_tbroapo_tisntnrearp | |
_ File i"n/nuesrr/ | |
l File i"b/6u4s/rp/ytlhiobn36.48//ptyhtrehaodni3n.g8./ptyh"r, line e890a in d_ibnogo.tpsyt"r, line a890p in | |
_ | |
bThread 0xo00007fcc0eb11700o (most recent call first): | |
t File "s/truaspr | |
/ | |
lThread 0xi00007f4896bfd700b (most recent call first): | |
6 File 4"//upsyrt/hloinb36.48//pmyutlthiopnr3o.c8e/smsuilntgi/pprooocle.spsyi"n, line g114/ in pwooorlk.erp | |
y File "", line /114u in swro/rlkibe6r4 | |
/ File p"y/tuhsorn3/.l8i/bt6h4r/epaydtihnogn.3p.y"8, line /870t in hrruen | |
a File d"i/nugs.rp/yl"ib, line 68704 in /rpuynth | |
o File n"3/.u8s/rth/rleiabd6i4n/gp.yptyh"o, line n9323 in ._8b/otohtrsteraadpi_nignn.epry | |
File "", line /932u in s_rb/oloitbs6t4r/apypt_hionnn3e.r8 | |
/ File t"h/ruesard/ilnigb.6p4y/"p, line terminate called after throwing an instance of 'std::runtime_error' | |
y890t in h_ what(): [Rank 4] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808539 milliseconds before timing out. | |
obno3o.Fatal Python error: Aborted | |
t8s/tThread 0x00007f4c9d3fd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f4c9ddfe700 (most recent call first): | |
<no Python frame> | |
trharp | |
Thread 0x00007f4c9e7ff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f4cb4dfc700 (most recent call first): | |
<no Python frame> | |
Thread 0x | |
e | |
aThread 0xd00007f4cb57fd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f4cb7fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f4cfcbfd700 (most recent call first): | |
<no Python frame> | |
00007fcc0f512700i (most recent call first): | |
n File gThread 0x00007f4d81357700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f4c2d7fb700 (most recent call first): | |
File "/us."p/yu"sr/lib64/python3, line r890/.8/multiprocessin in l_ibbo6o4/tpsyttrhaog/pool.py", line 576 in _hpn | |
3 | |
andle_results | |
File .Thread 0x800007f48cffff700"/usr/lib64/pyth/ (most recent call first): | |
m File uon3.8/threadi"l/tng.py", line 870 in run | |
uisprr File "/usr/lib64/p/olciebython3.8/threads6s4ing.py", line 932 in _booti/npgystrap_inner | |
/tpho File "/usoonl3r/lib6..p84/pyty/"m, line u114l in twiohon3.8prrkoe/threrc | |
e File ading.s"s/ipy", line 890unsgr/ in _boop/oltstrapoilb | |
Thread 0x00007f4bbe1fc700 (most recent call first): | |
.6p4/y File "/usp"y, line t114hr/lib6 in owno4/pyt3r.khon3.e8r/8/thre | |
t File h"r/euasading.dri/nlgpy", line 302.ipby in wait"6, line 4870 | |
File "/u/ in prsr/lib6uynt | |
h4/pyth File o"n/on3.83u.sr/thre8//ladingitbh.py", line 6r4e/433 in acqapdyuire | |
tihnogn.3p.y8" File "/f/, line t870h in sx/norreuandusr/d | |
i File nalle2"g/.upsyr"/, line 932l in i/lib6_bb6o4o/t4/pytpsytthon3.rhaopn8/site-packages/embedding_reader/parquet_numpy_reader.p_3i.ny", line 121 in piece_gen8e/rnerator | |
File "/ust | |
h File r"r/lib64/pythone/audsi3.8/multiprocern/gl.ssing/pool.py"ipby6, line 388 in _guarded"4, line /_task_generati932p in y_on | |
File "/usr/ltbhooib64/python3.8/ont3s.t8rmultiprocess/atph_ing/pool.py", line 532 in rien_handle_tasksandeir | |
File "/usr/lib64/n | |
g File ."ppython3.8/th/yu"sreading.py", line 870 in , line r890/ in l_run | |
File "/usribbo6o4t/lib64/sptyrtahpo | |
/pytho | |
nThread 0x300007fcc0ff13700. (most recent call first): | |
n3.8/8 File /"t/huthrearserding.pa/dliy", line 932 in inbg6_boot.4p/yp"strapy, line t890h in o_inne_nb3o.o8tr | |
File "//smturlusr/liatpi | |
b64/pp | |
rThread 0xoython00007f48e97fb700c (most recent call first): | |
es File s"i/3.8/tnugs/rp/olohreadili.bp6ng.pyy4"/, line ", line 890 in _p114y in tbootswhoornk3e.r8 | |
/ File m"trap | |
u/luThread 0x00007f4bdbfff700 (most recent call first): | |
File stri/"/usrplrib/lib6o6c4e4/pyt/spshon3.8yitnh/seleogn/3p.o8ctors.o/ltpy", line 415.hprye in sele"a, line d114ct | |
File i in wn"/usrogr.kpeyr"/lib64 | |
, line File 870/pyth" in /ruuon3.8/ns | |
r File /"multil/iuprocebs6rssing//4l/ipby6connet4h/ctionpoynt3h.py", line .o8n/931 in wai3t.h8t | |
File "/r/etausr/lidhirneb64/pga.dpython3iyng"..8/mup, line y870ltipr" in , line r932uocessi in n_ | |
bng/po File o"ot/sutsrra/ol.py"pl_iibn, line 499 in _w6ne4r/ | |
ait_fp File y"tor_upd/huosnrates | |
3/.l8i/bt6h4 File "/usr/epayr/libdtihnogn.3p.y864/pyt"/, line t932hhon3. in r_ebaodo8/multitnsgt.raiprocppy_essing"i, line n890n in /pool_ebr.py", line o | |
o519 in _ha File t"s/ndle_wtursorkerarp/ | |
l | |
s | |
File "/uThread 0xib00007fcc12abd7006sr/li (most recent call first): | |
4 File /"/b64/ppuystython3rh/olni3b.68.8/th4//tphyreadinrtheoag.py"nd3i.n8, line 870 in rung/.mpuy | |
File "/ul"t, line isr/li890p in r_b64/pobcoython3eost.8/thsstireadinrnagp/g.py" | |
p | |
oThread 0xo00007f48b2bfd700l, line 932 in _b. (most recent call first): | |
py File ""ootstr, line /114u in swap_inro/rlner | |
File kiebr6"/usr/ | |
4 File /"plib64/yut/pythoshro/nn3.8/3l.i8b/6m4u/lpthreadtyiting.pphroy", line 890 in onc3e._boots8s/itstrap | |
nhgr/ | |
Thread 0x00007f4bf7fff700 (most recent call first): | |
epao File "/usdoiln.gr/libp.yp"64/pyty, line "114, line in 870w in orruknhon3.e | |
r File 8/mult | |
" File /iproc"u/suressings/rl/ilb/pooli6b46.py", line /4p/yp114 in worytthhoker | |
File onn"/usr/33..88//tthhlib64rreeaa/pythoddiinnggn3.8/..ppythready"", line , line 932870 in in _ing.prbuony", line 870 in ro | |
t File s"t/ruaun | |
File "spr_/usr//ilnilib64/nbe6r4pytho | |
/ File pn3.8/"y/tuhthreadsorning.p/3l.i8b/6t4h/ry", line 932 in peyatdh_bootsionntrap_3g..inner | |
8p/yt" File "/ushr, line e932a in r/libd_ibnogo.t64/pytpsython3.8"r, line a890/threp in __iading.bnonpy", line 890oetrs | |
t File r" in _boo/aups | |
tstrap | |
rThread 0x/00007fcc134be700l | |
Thread 0x00007f4c13fff700 (most recent call first): | |
(most recent call first): | |
i File b"6/4u/spry/t File "/uslhiobr/libn634.64/pyt8//pthon3.8yhtrhe/multoand3i.n8giproce./pmyussing/"l, line tpool.pi890p in r_y", line 114 in obcoworkeeostr | |
File "/sstirusr/linagp/b64/p | |
p | |
oThread 0xoython300007f477ffff700l (most recent call first): | |
. File p"y/"u.8/th, line s114r in /wloireadinrbk6e4g.py", line r/ | |
p870 in runy File t"h/ | |
File "/uouns3sr/lir./8l/ib64/pytbh6thon3.r4e/ap8/thrdyitneadinghgo..py", line np3y.932 in _bo"8, line /306t in hwraeiotstraatd | |
ip_inn File ng".er | |
File "p/y/usr/lu"s, line r870ib64/p/ in lriythonubn6 | |
43.8/t File /"p/yhreadituhsng.py"orn/3l.i8b/6t4h/r, line 890 in _bpeyatdhionngootstr3..p8y/ap | |
Thread 0x"th, line re55800007f4c117fb700 (most recent call first): | |
File "/a in dwusr/lainigt.ib64/p | |
py File ython"", line /932f in s_3.8/mxb/onoultiprotussocesstrr/adping/poa_linlnee2r/ | |
ol.pyl File i"b", line 114 in w/6us4orker | |
r//ply File "/usitbh6or/libn43/.p8y/tsh64/pytiotne3-hon3..p8a8/thre/cthading.kraegpy", line 870aedsi in run | |
/ntgq.pdy File "/usrm"/, line /lib64/python3.8/threading.py", line 932 in _bootstrap_inner_890m in | |
File "/usr/lib64o_nbi/python3.8/threotootrading.py", line 890 in _bos.tprotstrap | |
Thread 0x00007f4bbebfd700 (most recent call first): | |
File ya"p"/usr/lib64/pyth, line | |
60 | |
in Thread 0xon3.8/multiprocr00007fcbc95fe700u (most recent call first): | |
n File | |
" File essing/pool.py", line /"u/su114 in worker | |
File "/usrs/rl/ir/lib64/python3.8lb6i/threading.py4b/6p4y/tphyo", line 870 in run | |
File "/usr/tnh3o.lib64/pythonn83/.m3.8/threading.p8u/lttihprreoacdy", line 932 in _bootstreisnsap_inner | |
File "/usrign.gp/lib64/python3/yp"o, line ol932. in py_.8/thre"b, line o114o in twsading.otrrkapy", line 890epr_ | |
i in _boot File n"n/estrapurs | |
r File | |
Thread 0x00007f4bbffff700 (most recent call first): | |
"//lu File "/usisbr6r/lib/4l/ipby6t64/pyt4h/ophon3.ny3t.h8/mult8o/ntiproc3h.r8eessinga/dt/poolihnrge.py", line 114.apd in workyi"n, line g870.er | |
File " in pryu"n/usr/l, line | |
890 File in ib64/"_/buopythonsort/sltir3.8/tba6phreadi4 | |
/ | |
ng.pypThread 0xy00007f4b0dfff700", line 870 in rt (most recent call first): | |
h File on3"./un | |
File "8u/st/usr/lrh/rib64/leiapythonbd6i3.8/t4n/ghreadip.yptyh"o, line ng.py932n in 3", line 932 in __.b8o/bootstoctosntrcaupr_rrap_iiennntnner | |
File e/rf | |
"/usru File t"/lib6/uurs4/pythers/on3.8/ltih/threbr6e4a/ading.dp.ytphpy", line 890yo"n in _boot3, line .788strap in /_t | |
Thread 0x00007f4bd97fb700 (most recent call first): | |
hwro File "/userakder/libir | |
n64/pytg File ."hon3.p/yu"s, line r890/ in l_8/mulibbo6o4ts/tpryatiproctph | |
essino | |
nThread 0xg/pool300007fcbc8bfd700. (most recent call first): | |
8.py", line File /"t/114 in woruhsrre/ker | |
File aldi"/usribn6g4/lib64./ppyy"/pytht, line h870o in on3.8/nr3u.threan8 | |
/ding.pm File u"y", line 870 in l/tuirun | |
File psrro"/usr/c/ellib64isbs6i4n/pytho/gp/ypn3.8/tohooln.3p.y8"/, line t114h in rweothrearakdeding.ri | |
npy", line 932 File g". in _boot/puys"rstrap, line /932l_innei in b_6br | |
File "/4o/optusr/liystthrb64/poanp3_ython3.i8n/n.8/thtehrr | |
e File adreadi"i/nng.py"ugs., line 890 in _brp/yl"iootstr, line b8706 in 4rap | |
Thread 0x/upn00007f4bf4dfa700 (most recent call first): | |
File "/y | |
t File husr/l"o/nu3ib64/ps.r8ython//ltihbr6e4a3.8/mu/dpiynltiprtgh.oocesspn3y."8ing/p, line /890t in ool.pyh_rbeoa", line 114 in wodtiorkernsgt.r | |
File "/upayp" | |
sr/li, line | |
932Thread 0x in 00007f4b53685700_ (most recent call first): | |
bob64/py File o"t/sthon3utsr.8/thrra/pleadin_iibng.py", line 6n4e870 in run/rp | |
File "/u File y"t/huosnsr/libr/3l.64/pyi8b/6s4thon3.e/lpeyctthoor8/thrns3..8py/"t, line heading468r in esa.py", line deilne932 in _bogc.tp | |
otstray File "", line /p_inn890u in _sbro/er | |
File "/olitbusr/l6s4t/ib64/prpayptython | |
h | |
oThread 0xn00007fcb4dfff7003.8/t3 (most recent call first): | |
. File 8hreadi"//aung.pyssyrn/", line 890 in _cliibootstob/6rap | |
4b/aThread 0x00007f4bf61fc700 (most recent call first): | |
File psyet"/usr_heo/lib6vne3n4/pyth.t8son3.8./pmy/multiu"l, line tproce1823i in pssing/_rroucne_sopool.snicnepy", line 114g | |
/ File p"o/o in workeuls.r | |
File "/pry/"lusr/l, line i114b in 6wib64/p4o/rpythonkyetr3.8/th | |
o File nhread"3/.u8s/ra/ing.pyslyinbc6i4", line 870 in r/op/ybun | |
File "tahsoe/usr/ln_3e.ib64/v8e/pythonntthsr3.8/t.epayhreadd"i, line n570g in .rpuyning.py"_, line f870o in ", line 932 in _rreuvbootsne | |
r File trap_i | |
" File /"nner | |
u/surs/ File "/usrl/ilbr/lib6i6b46/4/pyt4p/ypthon3.8yhtohno3/thren.38./ading8t/htrherae.py", line daid890 in _bonign.gp.yp"yotstr, line "932, line in 870_ in brouap | |
Thread 0xont | |
s File 00007f4bdb5fe700 (most recent call first): | |
File "/t"r/ausr/lups_rib64/pi/nlnieythonbr | |
6 File 4"3.8/mu//pultiprsyrt/ocessihloing/ponb36ol.py".48//p, line 114 in wotyhtrheoandrker | |
i3n. File "/us8g/.tr/libhpry64/pyte"a, line d932ihon3. in n_g8/threb.opoyading"t, line s.py", line 890t in r_abpo_870 in runoitn | |
File "/usntesr/librra | |
p64/py File | |
" | |
/Thread 0xthon3.u00007fc88abfd700s (most recent call first): | |
r8/thr/ File l"ieading/bu6s4r//.py", line plyi932 in _botbh6o4n/3p.y8t/htohnr3e.otstra8d/itnhgr.ap_innepayd"er | |
File ", line i890n in /usr/g_.bpolib64/yo"t, line spytho306t in rwan3.8/tapi | |
t | |
Thread 0x File hread00007f4c0df6b000" (most recent call first): | |
/ing.p File us"y", line 890 in r//flsixb/6n4_bootso/uptrap | |
syrt/ | |
Thread 0x00007f4bf75fe700 (most recent call first): | |
File hdoa"/usrnl3l/lib6.e82/4/pytt/hlhon3.8irbea/mult6d4i/iprocengp.yssingptyh"o, line /pooln5583 in ..py", line w8a/isti | |
t File e"-114 in wor/pafcker | |
File skxa"/usr/gneo/lib64su/srro/pyth/tdaaon3.8/rlylthrea_e2eding./mlbiebpy", line 870d6d4i/ in run | |
File pnyg"/usrt_ht/lib6oorn4/pythc3h./8r/oson3.8tiatre/threay-_pding.eamcpy", line 932bkead in _bootgdeisnstrap/gt_qtdomr/c_inne_hm.opr | |
File "/nyi"usr/lt, line o38r. in pryo"t, line a60t in reu_nh | |
aib64/p File l"f/ | |
uythonsr/li3.8/thb64 File /"preadiy/tfhsong.py"xn/3.n8o/ut, line 890 in _bshrr/ootstedaadlrap | |
Thread 0xilneg00007f4aa61fc700 (most recent call first): | |
File "/2./plyi"usr/lb, line 69324 in ib64//_pbyopythontohtos3.8/tnt3r.a8hreadip/_siinng.pytnee-r", line 306 in wp | |
ac File ait | |
File k"a/guessr"/usr//lr/lib6iobt6a4/pytr4y/p_yhon3.8teh/thremobne3d.ading.d8i/py", line 558ntgh in wait_rteoa | |
File "/frdcihnsx/no/gr.optya"r, line usr/day890_ in elle2/_mbblib64/oeodtdsitnrgpytho_atpo | |
rn3.8/ | |
cThread 0xh00007fcc13fff700. (most recent call first): | |
site-p File y"packag, line "47/es/tqu in sardm/_mop/pllnitoriyb_6r.py", line 604o/tpayr in run | |
tyh_oenm3b. | |
8 File /"c/ofns File "/uscxu/rnrr/liboeunst64/pyt/rf/udhon3.atlul8/threree2s//ladingitbh6.py", line r4e/apd932 in _booy.tphtstrayo"n, line 378. in 8_/p_innewsoirtr | |
File "/kee-rusr/lp | |
a File cib64/k"a/guespythonrs//l3.8/triobt6hreadi4a/rng.pypyy_t", line 890 in _heomnbbootste3d.d8rap | |
i/ntgh_rteoardcThread 0x00007f4dfffff700 (most recent call first): | |
File ih/nrg"/usro.tpa/lib6yr"y, line 4/pyth_870e in mron3.8buend | |
/concu File d"irrent/nugs_/futurt/olrires/thcbh6.read.4p/yp"y, line tpy", line 7895h in orno3t.8 in _wora/tteker | |
File h_qru"/usreear/lib6diine4/pythgs._poyon3.8r"_, line k/threa932e in y_sbding. | |
o File opy", line 870"t/s in run | |
ftsrxa/ File "/usrpn_oiunsnre//lib6rD | |
A File 4/pyth"L/LuEon3.8s2r-//threplyiading.tbo6py", line 932r4c/h in _boop/ydtatstrahlolne3p_inne2._8p/r | |
File "/ytthorrecausr/lhd/idnaib64/pgl.lpythoney2"_3.8/t, line p890y in hreadit_obrong.pycoht.", line 890 in _sptyr"a, line p762bootst | |
in f | |
rap | |
Thread 0xo00007fcc521e1700r (most recent call first): | |
wThread 0x00007f4e6b685700 (most recent call first): | |
File a File r"d/"/usr | |
u File s/lib64r"//lfisbx6/pyth/4n/opuyson3.8/trh/odnseleca3l.ltors.8e/2spy", line 468/ell in seleeicbt6o4rct | |
File "/sp.yp/usr/tyho"n, line 3468. in 8s/lib64/eslietctepytho | |
- File p"an3.8/a/cuksarsynci/gleo/baseisb/6t_even4o/rpcyht/hnts.py"onn/3m.o8d/ualseys, line 1823 in _/nmcorun_oidou/lbnce | |
File ea.spe"/usry_"ev, line e1130n in t/lib64_sc./pythaplyon3.8/l"_, line i1823m in asyncp_lr | |
uio/ba File n"_/se_eveofnscxnts.pe/ | |
n File y", line 570 in o"u/surrun_fs/rdorever/all | |
File "/uilbe624/sr/li/lpiyb64/pbt6hython34o/np.8/th3y.t8hreadino/na3s.g.py"y8n/cs, line 870 in ruiiotn | |
File "/e/-bpausr/lasce_kib64/peavgeenst/s.ythontpoyr3.8/th"ch, line readi570/ in nrnng.pyu/nm_", line 932 in _ofdobootsurleevtrap_se/rc | |
o File inner | |
n"t/ File "/usauisr/libnre/rl.ipyb"64/pyt6, line 4139/hon3.p in yfth8/threoornadingw3a..py", line 8r/d890 in _boot | |
h File rtstra"e/ap | |
Thread 0x00007f4f25ef8000fdisn (most recent call first): | |
File "/fxg/.pnyo"ussx/no, line r870/ in usr/dadraulnl | |
lle2/e File 2"/lib64//luisbpythor6/4l/n3.8/sipby6tite-p4h/opnackagey3t.h8os/tor/ns3ich/aut.t8e/-ogradtpharce/__inikaadt__.pgiensy", line 173 in g/.tpoybackwr"c, line h932/ard | |
File in n_nb"/fsx/omoot/nousdsutlr/dallrease2/lip/_miondnub64/pelre | |
.ython3 File p"y/"u.8/si, line s1130rte-pac in /_lckagesiabl6l4_/ipmypt/torchlh | |
o File /_tenn"3/sor.pyf.s8x//t", line 396 in bnhoruackwaesard/ird | |
File "DnAgL/fsx/L.Ep2nousr/y-"p, line ydallet890o in r_cbho2/lib/odta64/pytsltlreahon3.2p_ | |
8/sit | |
pThread 0xy00007fcd0ca49000t (most recent call first): | |
e-packo File r"cages//hf/sdxa/lnloeaccel2u_spry/tdoarlclhe.2p/erate/yl"i, line baccel67104 in erator/fpo.py", line yrtwha736 in bacornd3kward | |
. File 8"/ File "/f/sfistsx/nouxe/-sr/DApnaocukLLE2-psarg/ytorcdeaslh/dall/lteo2r/clhi/be2_py6n4n//torchpmyot/traihdouner.pnl3ey", line 400 in .s8//forwalsiird | |
File "nteea-r/fsx/p.apcnousryk"a, line g114e in sf//DALLEotrowr2-pytacrhd/orch/d | |
n File n"/alle2/mfos_pytoxd/ulneosu/srch/trmrod/uainerdlael.py", line .lpe107 in inny2"/, line l1130er | |
File "i in b_6c4a/lpl/fsx/y_tihmonousrnp3l. | |
/dall8 File /"s/ie2/libftse64/pyx-/pthon3.naocuksa8/sitrg/ee-packDsA/tLages/oLrEc2h-/pyntnotorch//rmcohnn/mod/udladuleselsl//modulem2o_e.py"dpuylt, line 1130 in _ceo.rpcyhall_i"/, line d1130a in mpl | |
File l_lc"/fsxea2l_lp_yitm/nousoprl | |
r/DALLc File h."p/yE2-pyf"s, line x806torch/ in nfoo/trairuwsarr/dD | |
n_diffA File L"L/usionEf2s-x_priorp/yntoo.py", line ursc503 in trarh//in | |
File ddaa"/fsxllllee22/_lpi/nousrybt6/DALLo4r/cE2-pyphy/torch/tdhaotrainlnl3e_diff.28_usion_/psyipriorttoer-cpha.py", line .cpkya"g, line e718s in /f736 in initoorrtialiwcahrd/ze_tr | |
n File n/aining"m/ofds | |
File "/uxl/enfsx/nso/umsord/udlaelousr/Dl.ep2y/lALLE2"i, line b-pytor11306 in 4_/ch/trcpaylain_dtlh_oiiffusinm3p.l8 | |
on_pr File /"sior.p/ifty", line 753 in sex-main | |
p/ancoku File "/fasgresx/no/sD/usr/daAtLoLrEc2h-/lle2/pnynt/molib64/ordcuhpythol/edsan3.8/s/lmloedite-p2u_lackagpey.es/clitpoyck/cor"c, line re.py"h1130/ in d_, line 760 in iaclanvoke | |
llle_2i_m File "/fsppylt | |
ox/nou File r"csr/dal/hf.sle2/lpxy/"nib64/po, line u944ythons in rf/oD3.8/srAwLaite-pLrEd2 | |
-ackage File p"y/ts/clifosrck/corxch//ne.py"doaulslre2, line 1404 in in/_dpavoke | |
yltlo File "/fser2c/x/nouhl/idbsr/dala6l4l/le2/lep2y_tpib64/phyotython3no3rc.8/si.h8./te-pacpsyi"t, line e806- in pfakagescokrawgaers/clickd/ | |
t File o"/core/rfcshx.py", line //nn1055 in maionu/smrn | |
File "/o/dduafsx/nllelousr/se/2dalle2m/oldi/lib6ubl6e4./p4/pythpyy"t, line on3.8h1130o in n_3/sitec.a8-packal/ls_ges/ciitmlick/ep-lp | |
acore.pc File k"y", line 1130 in a/gfse__calxs//ntoorucsl__ | |
File hr//nD"/fsxnA/LmLo/nousEd2ur/dal-lpeyle2/list/omrb64/pocdhu/ython3ldea..8/silplyte-pace"2, line kages_1130p in y_tcoar/torchlclh_//distidmapributelll | |
ed/ela2 File _"pstic//yftsmultipxo/rncoroceshu.ssing/erp/yD"rrorsA, line L1144L in E/__inip2_-plyott__.psosrecshy", line 345 in | |
/ File d"a/lwrappflsex2/_nper | |
File "yotu/fsx/osrrcnousr//hD/AdLaLlEDALLEl2e2-2-pyto_ppytyotrrch/tocrhc/rain_dhd.apliffuslye"2_, line ion_prp944y in tfior.poorrcy", line 757 in wha/rd<moduda | |
ll File e"2/_fple> | |
syxt/onrocushr./pdya"l, line le12542 in /floirbw6a4r/pdy | |
t File h"o/nf3s.x8//nsoiustre/-dpaalclkea2g/elsi/bt6o4r/cphy/tnhno/nm3o.d8u/lseist/emo-dpualcek.apgye"s, line /1130t in o_rcaclhl/_ninm/pmlo | |
d File u"l/efss/xm/ondouulser./pDyA"L, line L1130E in 2_-cpayltlo_ricmhp/ld | |
a File l"l/fes2x_/pnyotuosrrc/hd/adlallel2e/2l_ipby6t4o/rpcyht.hpyo"n, line 31144. in 8p/_sliotses-epsa | |
c File k"a/gfessx//tnoorucshr//nDAnL/LpEa2r-aplylteolr/cdhi/sdtarlilbe2u_tpeydt.opryc"h, line /969d in al_ler2u_np_ytdodrpc_hf.oprywa"r, line d1254 | |
in f File o"r/wfasrxd | |
/ File no"u/fssrx//dnoaulslre/2d/allilbe624//lpyitbh6o4n/3p.y8t/hsoint3e.-8p/ascitkea-gpeasc/ktaogersc/ht/onrnc/hp/anrna/lmloedull/edsi/smtordiubleu.tpeyd"., line p1130y in "_, line c1008a in lflo_irmwpalr | |
d File | |
" File /"f/sfxs/xn/onuosurs/rd/adlallel2e/2l/ilbi6b46/4p/ypytthhoon3n.38./8s/istiet-ep-apcakcakgaegse/tso/rtcohr/cnhn//npna/rmaoldluelle/sd/imsotdruilbeu.tepdy."p, line y1130" in , line _969c in a_lrlu_ni_mdpdlp | |
_ File fo"r/wfasrxd/ | |
n File o"u/sfrs/xD/nAoLuLsEr2/-dpayltloer2c/hl/idba6l4l/pey2t_hpoynt3o.r8c/hs/ittrea-ipnaecrk.apgye"s/, line t394o in rfcohr/wnanr/dp | |
ar File a"l/lfeslx//dniosutsrri/bDAuLtLeEd2.-ppyy"t, line o1008r in cfho/rdwaallred2 | |
_ File p"y/tfosrxc/hno/utsrra/idnaelrl.ep2y/"l, line i107b in 6i4n/npeyrt | |
h File o"n/3fs.x8//nsoiutser-/pdaaclklaeg2e/sl/itbo6r4c/hpy/tnhno/n3m.o8d/uslietse/-mpoadcuklaeg.epsy/"t, line o1130r in c_hc/anlnl/_immodpull | |
e File s"//mfosdxu/lneo.upsyr"/, line D1130A in L_LEc2a-lpylt_oirmcphl/ | |
d File al"l/ef2s_xp/yntoourscrh//DtArLaLinEe2r-.ppyyt"o, line r394c in hf/otrrwaairnd_ | |
d File i"f/ffussxi/onno_upsrri/oDrA.LpLyE"2-, line p503y in ttorracihn/ | |
da File l"le/2f_sxp/yntooursrc/hD/AtLrLaEi2n-epry.tpoyr"c, line h107/ in tirnaniern | |
_ File di"f/ffussxi/onno_upsrri/odr[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
a.lplye"2, line /736l in iinbi6t4i/aplyitzeh_otnr3a.i8n/isnigt | |
File terminate called after throwing an instance of 'std::runtime_errore"-/pf' | |
what(): [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808553 milliseconds before timing out. | |
ascxk/aFatal Python error: Aborted | |
Thread 0xngoe00007f9f0d5fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f9f0dfff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f9f28bfd700 (most recent call first): | |
<no Python frame> | |
uss/rt | |
Thread 0x00007f9f295fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f9f29fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x/or00007f9f41141700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f9f43357700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fa01397c700 (most recent call first): | |
<no Python frame> | |
DcAhL/Thread 0x00007f9e2ebfd700 (most recent call first): | |
File "/usr/liLnEn2/-mopdyutb64/python3.8/mloerscultiprocessinh//mtordg/pool.py", line 576 in _auilne_.dpiyhandle_resultsf"f, line | |
File "/usr/lib64u1130s in i/python3.8/thr_ocna_leading.py", line 870plr_ in run | |
File "/usr/libiiomrp64/python3.8/t.l | |
p File yhreading.py", line ""/f, line s932 in _bootstrap_inn753x in /er | |
File "/usr/lib64/mnaoipython3.8/threaduns | |
r File ing.py"/"D/AfLsLxE/2n-, line 890 in _bopuysootsttro/rrap | |
Thread 0xdcah00007f9e497fb700 (most recent call first): | |
File "l/lte/usr/lr2a/ilnib64/i_bd6ipython4f/f3.8/thpuystihreadioonn_3p.r8ng.py"i/osr, line 302 in wi.tpeait | |
File y-"p, line a503"/usr/c in ktalib64rgaeisn/pytho | |
/ File c"li/cfks/xn3.8/tc/onhreadroeuing.pys.rp", line 433 in /yD"A, line LacquiL760E in 2i-re | |
File npvyo"/fsx/tkoer | |
ch File nousr/"tr//dallefasi2/libxn/_ndoiufsfru/s64/pdiaolythonnl_e3.8/sip2r/iloite-parb.6p4yckages/"p, line y/embet736h in dding_oinnreade3i.t8r/parqi/asluet_niitzumpy_ree_-tpraaicneaderkiangge.py" | |
s File /"c/, line 121 in pilfiscece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line xk//n388 in _guarded_task_gcooursereneration | |
File "/./pDyusr/lib64/pythonA"L, line 3.8/multiprocesL1404E in 2i-nsing/pool.py", line 532pvyotko in _handle_tasker | |
cs | |
File "/usr/lib64/pyth File /"t/hon3.8/threarfasixnding.py", line 870 in run/_ndo | |
File "/usr/lib64iufsfr/python3.8/threadu/siding.py", line 932 in _boanl_lpootstrap_innerer2i/o | |
File "/usr/lib64/lri.bppython3.8/threading6y4"/.py", line 890 in _bootst, line p753y in trap | |
Thread 0x00007f9e4bfff700 (most recent call first): | |
File "/usmhaoinr/lib64/pyn3 | |
. File 8thon3."//sfi8/selestxe/-nctorspoau.py", line cskra/gdeasl/415 in sellcel2ect | |
File i/clk"/usri/bc6o4r/ep/lib6.ypyt"h4/pyth, line o1055n in 3m.a8on3.8i/ns | |
i File /multit"e/-fprocepsaxc/kssing/naogueconnessr/ction./cdlapy", line ilclke931 in wait/2c/ol | |
File "/urieb.6p4y/psr/li"y, line t760hb64/py in oinn3thon3v.o8k.8/mul/es | |
i File tiproct"eessing/-fpsax/pool.c/knagoepy", line 499uss/rc/ldia in _waiclklt_for_e/2c/olrieupdatb.6p4yes | |
File "/"p, line y1130t in h/usr/_o_nc3a.l8lib64//ls_pythoi_t | |
e File n3.8/m-"pa/cultipfksaxrocessg/ening/posu/sool.pycrl/idc", line 519 in _akl/lcehandlo2r/el.ipby6"e_wor4, line /1404p in kers | |
File yitn"/usrvhoo/lib64kne3 | |
./pyth File 8"/on3.8//sfithreastxe/nding.p-opuascrk/daaglelse/2y", line 870 in t/olrrun | |
File icbh"/usr6/4d//lib64ipsy/pythtthron3.8/iobnuthrea3t.eding.p8d//sy", line 932 in eiltaes_boot-tpiacc/kmastrap_ugletsinneri/pcrloi | |
File "/uccke/scsosr/libirneg64/py./peyrthon3."r, line o1055r in 8/thrmsa/eadingi_n_ | |
i File n"i.py", line /tf_s_890 in _box./pnyootstrau"s, line r345/p | |
Thread 0x00007f9e821fc700 in dwa (most recent call first): | |
File "/rlalusr/lipep2b64/p/elrib | |
ython36 File 4"//pf.8/muystxhltipro/onno3u.s8rcessi//sDiAtng/pooLeL-Epl.py"a2c-kpa, line 114 in wygteosorker | |
r/cchl/i File "/usctkr/r/libacior64/pytne._phon3.dyi"f, line f1130u in s_8/thre_icoanlading_lp_r_.py", line | |
i File or870 in run | |
"./p File "/usfys"x/, line n757o in u<smrr/libo/ddu64/pytallel>hon3.e | |
2/l8/threib64/adingpytho.py", line n3.8/932 in _boosite-tstrapackap_inneges/tr | |
File "/orch/dusr/liistributed/eb64/plastiython3c/mul.8/thtiprocreadinessing.py"g/err, line 890 in _boors/_otstr_initap | |
Thread 0x__.py00007f9e661fc700 (most recent call first): | |
File "/", line 345 in usr/lwrappib64/per | |
File "/fsx/noythonusr/D3.8/muALLE2ltipr-pytorocessich/trng/poain_dol.py"iffus, line 114 in woion_prker | |
rior. File "/usrpy", line 757/lib6 in <mod4/pythule> | |
on3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f9e66bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f9e9cdfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f9e675fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f9e80dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f9e67fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f9e9e1fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f9e82bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f9ebb5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f9d335fe700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fa0b61c7700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fa10521c700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fa1bb8b7000 (most recent call first): | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 397 in forward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13944 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13946 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13947 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13948 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13949 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13950 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13951 closing signal SIGTERM | |
compute-od-gpu-dy-p4d-24xlarge-9:13991:14066 [0] NCCL INFO comm 0x7f3f08000f60 rank 14 nranks 64 cudaDev 6 busId a01c0 - Abort COMPLETE | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808550 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007f3e095fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f3e09fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f3e253fd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f3e25dfe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f3e267ff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f3e40dfc700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f3e417fd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f3e5df55700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f3d46bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f3d2bfff700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f3d457fb700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f3d475fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f3d461fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f3d60dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f3d47fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f3d62bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f3d7d7fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f3dd3fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f3d63fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f3d635fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f3d7cdfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f3c2cdfa700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f3fa0a59700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f3fc0e2a700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f40a8583000 (most recent call first): | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 546 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/container.py", line 139 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 709 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 806 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 944 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1144 in p_losses | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1254 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 969 in _run_ddp_forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 394 in forward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13984 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13985 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13986 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13987 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13989 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13991 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13992 closing signal SIGTERM | |
Traceback (most recent call last): | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757, in <module> | |
main() | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper | |
return f(*args, **kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130, in __call__ | |
return self.main(*args, **kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055, in main | |
rv = self.invoke(ctx) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404, in invoke | |
return ctx.invoke(self.callback, **ctx.params) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760, in invoke | |
return __callback(*args, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753, in main | |
initialize_training(config_file, accelerator) | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736, in initialize_training | |
train( | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503, in train | |
loss = trainer(text=txt, image_embed=img) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl | |
return forward_call(*input, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107, in inner | |
out = fn(model, *args, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400, in forward | |
self.accelerator.backward(loss) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736, in backward | |
loss.backward(**kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396, in backward | |
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward | |
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass | |
RuntimeError: NCCL communicator was aborted on rank 59. Original reason for failure was: [Rank 59] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808551 milliseconds before timing out. | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 59] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808551 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007f92b3fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f92ccbfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f92cd5fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f92cdfff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f92e8bff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f9306bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f93075fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f9307fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f91d35fe700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f94958e3700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f9552761000 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 1027 in _wait_for_tstate_lock | |
File "/usr/lib64/python3.8/threading.py", line 1011 in join | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 717 in _terminate_pool | |
File "/usr/lib64/python3.8/multiprocessing/util.py", line 224 in __call__ | |
File "/usr/lib64/python3.8/multiprocessing/util.py", line 300 in _run_finalizers | |
File "/usr/lib64/python3.8/multiprocessing/util.py", line 334 in _exit_function | |
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea06090 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 252, dev: 3, size: 0, state: CREATED, direction: SEND } | |
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea06120 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 254, dev: 3, size: 0, state: CREATED, direction: SEND } | |
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea06000 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 250, dev: 3, size: 0, state: CREATED, direction: SEND } | |
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea05f70 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 248, dev: 3, size: 0, state: CREATED, direction: SEND } | |
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea060d8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 253, dev: 3, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea06048 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 251, dev: 3, size: 0, state: CREATED, direction: SEND } | |
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea05fb8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 249, dev: 3, size: 0, state: CREATED, direction: SEND } | |
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea06168 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 3, size: 0, state: CREATED, direction: SEND } | |
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 1 (pid: 13945) of binary: /fsx/nousr/dalle2/bin/python3.8 | |
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa07158 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 253, dev: 1, size: 0, state: CREATED, direction: SEND } | |
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa07080 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 250, dev: 1, size: 0, state: CREATED, direction: SEND } | |
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa07038 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 249, dev: 1, size: 0, state: CREATED, direction: SEND } | |
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa071a0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 254, dev: 1, size: 0, state: CREATED, direction: SEND } | |
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa070c8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 251, dev: 1, size: 0, state: CREATED, direction: SEND } | |
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa07110 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 252, dev: 1, size: 0, state: CREATED, direction: SEND } | |
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa06ff0 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 248, dev: 1, size: 0, state: CREATED, direction: SEND } | |
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa071e8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 1, size: 0, state: CREATED, direction: SEND } | |
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 1 (pid: 13977) of binary: /fsx/nousr/dalle2/bin/python3.8 | |
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 4 (pid: 13988) of binary: /fsx/nousr/dalle2/bin/python3.8 | |
libfabric:13989:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13989:14149 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7ff74ea06168 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 3, size: 0, state: CREATED, direction: SEND } | |
compute-od-gpu-dy-p4d-24xlarge-8:13869:14030 [0] NCCL INFO [Service thread] Connection closed by localRank 7 | |
compute-od-gpu-dy-p4d-24xlarge-8:13875:14029 [6] NCCL INFO [Service thread] Connection closed by localRank 7 | |
compute-od-gpu-dy-p4d-24xlarge-8:13876:13957 [0] NCCL INFO comm 0x7f9168000f60 rank 7 nranks 64 cudaDev 7 busId a01d0 - Abort COMPLETE | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29489, OpType=ALLREDUCE, Timeout(ms)=1800000) ran for 1808239 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007f9033fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f91643f9700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/fsspec/asyn.py", line 54 in sync | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/fsspec/asyn.py", line 86 in wrapper | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/s3fs/core.py", line 2173 in _fetch_range | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/s3fs/core.py", line 2030 in _fetch_range | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/fsspec/caching.py", line 377 in _fetch | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/fsspec/spec.py", line 1578 in read | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/pyarrow/parquet.py", line 1766 in __init__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/pyarrow/parquet.py", line 1960 in read_table | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 126 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f9164dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f91657fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f91661fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f9166bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f91675fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f9167fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f91e193b700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f91e233c700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f91e2d3d700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f92216fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f9223131700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f9069fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f9084bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f90855fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f9085fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f90bcb53700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f90bebfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f90bffff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f90ecbff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f8f521fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f8f37fff700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f8f50dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f8f361fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f8fa7fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f8fa6bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f8fc0dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f8fc17fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f8fc21fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f8fde1fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f8ffa1fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f8fdcdfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f8fdf5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f8e8ebfd700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f91d7f89700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f924e1e1700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f930896a000 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 856 in next | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 149 in __call__ | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dataloaders/prior_loader.py", line 75 in get_sample | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dataloaders/prior_loader.py", line 63 in __next__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 39 in fetch | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/utils/data/dataloader.py", line 692 in _next_data | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/utils/data/dataloader.py", line 652 in __next__ | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 222 in report_validation_loss | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 450 in eval_model | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 552 in train | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fcaa946d700 | |
compute-od-gpu-dy-p4d-24xlarge-15:13987:14070 [0] NCCL INFO comm 0x7fcb08000f60 rank 56 nranks 64 cudaDev 0 busId 101c0 - Abort COMPLETE | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 56] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29487, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808556 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007fc9a5fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fc9c13fd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fc9c3fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fca10bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fca115fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fca11fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fcaa0c38700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fcaa1639700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fc8febfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc8fffff700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc9197fb700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc91b5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc936bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc91bfff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc934dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc9357fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc937fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc950dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc9521fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc952bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc9535fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fc7c97fb700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fcb1bfff700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fcb9521c700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fcc4baa7000 (most recent call first): | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 546 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 712 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 806 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 944 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1144 in p_losses | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/dalle2_pytorch.py", line 1254 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 969 in _run_ddp_forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008 in forward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 394 in forward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
libfabric:13985:efa:cq:rxr_cq_write_tx_error():246<warn> rxr_cq_write_tx_error: err: 15, prov_err: Unknown error -15 (15) | |
compute-od-gpu-dy-p4d-24xlarge-10:13985:14145 [0] ofi_process_cq:1040 NCCL WARN NET/OFI Request 0x7f003aa071e8 completed with error. RC: 15. Error: unknown error. Completed length: 0, Request: { buffer_index: 255, dev: 1, size: 0, state: CREATED, direction: SEND } | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13869 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13870 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13872 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13875 closing signal SIGTERM | |
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 3 (pid: 13990) of binary: /fsx/nousr/dalle2/bin/python3.8 | |
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 2 (pid: 13871) of binary: /fsx/nousr/dalle2/bin/python3.8 | |
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d3b40 | |
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d2740 | |
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d29c0 | |
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d42c0 | |
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d38c0 | |
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d3dc0 | |
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d3640 | |
libfabric:13926:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f730a4d4540 | |
compute-od-gpu-dy-p4d-24xlarge-12:13926:14011 [0] NCCL INFO comm 0x7f7304000f60 rank 37 nranks 64 cudaDev 5 busId 901d0 - Abort COMPLETE | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 37] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808545 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007f71af5fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f71affff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f71c8bff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f71e5083700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f71e6bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f71e75fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f7290bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f7291fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f70cffff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f70eb5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f7104dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f70e97fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f70e8dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f71057fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f7107fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f70ea1fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f7120dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f71217fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f70ebfff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f70eabfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f713f5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f6fd17fb700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f734bfff700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f738d19d700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f7449e47000 (most recent call first): | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173 in backward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396 in backward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736 in backward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400 in forward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546a8c0 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546c940 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd8354676c0 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546a140 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd835468fc0 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546cbc0 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546b540 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd8354685c0 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd835467940 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd835468840 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546b2c0 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546a640 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546ba40 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546c1c0 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd835468ac0 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546c6c0 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546b040 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd835469240 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd8354699c0 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd8354694c0 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546adc0 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd83546a3c0 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd835469ec0 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd835469c40 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd83667c800 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd83667d100 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd83667b300 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd83667c500 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd836679200 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd83667ce00 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd83667a400 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd836678000 | |
libfabric:13923:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fd83667a100 | |
compute-od-gpu-dy-p4d-24xlarge-12:13923:14009 [0] NCCL INFO comm 0x7fd830000f60 rank 34 nranks 64 cudaDev 2 busId 201c0 - Abort COMPLETE | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 34] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808553 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007fd6b9fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd6d4bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd6d55fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd6d5fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd6f35fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd6f3fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd708dfa700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd7b1357700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd516bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd4fbfff700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd514dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd5175fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd5161fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd530dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd517fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd5321fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd532bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd5317fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd533fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd54cdfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd54d7fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd4f97fb700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd84f9bf700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd88ce2a700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd9745a4000 (most recent call first): | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173 in backward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396 in backward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736 in backward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400 in forward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1098 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 360 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():628<warn> Closing ep with unmatched unexpected tagged rx_entry: 0x7fefcd46b540 pkt_entry 0x7fefcd976fc0 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd4685c0 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd4676c0 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd469240 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd469740 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd468ac0 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd469c40 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd46b2c0 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd467e40 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd46ab40 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd468340 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd468d40 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd46bcc0 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd4699c0 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd46ba40 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd468840 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fefcd46a3c0 | |
libfabric:13984:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7fefce67cb80 | |
compute-od-gpu-dy-p4d-24xlarge-10:13984:14065 [0] NCCL INFO comm 0x7ff02c000f60 rank 18 nranks 64 cudaDev 2 busId 201c0 - Abort COMPLETE | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 18] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808551 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007fee9ffff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007feeb8bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007feeb95fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007feeb9fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007feed4bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007feed55fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007feed5fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007feef3357700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fecfb5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fefc797c700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007ff02bfff700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fefc6f7b700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007ff030bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fefd12bb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fed157fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007ff07ac78700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007ff076e22700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007ff031fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fed175fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fed317fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fed321fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fecf8dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007ff07b7f9700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007ff0b97e0700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007ff17005a000 (most recent call first): | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173 in backward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396 in backward | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736 in backward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400 in forward | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107 in inner | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130 in _call_impl | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503 in train | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736 in initialize_training | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404 in invoke | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055 in main | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130 in __call__ | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d29c0 | |
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d2ec0 | |
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d3b40 | |
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d3140 | |
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d2c40 | |
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d33c0 | |
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d42c0 | |
libfabric:13987:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7fd9ce4d4540 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():628<warn> Closing ep with unmatched unexpected tagged rx_entry: 0x7f6d85469740 pkt_entry 0x7f6d85984340 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d854694c0 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d85469c40 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d85468fc0 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d8546a640 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d8546b7c0 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d8546b040 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d85469240 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d854680c0 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d8546c6c0 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d85467bc0 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d8546c440 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d854676c0 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d85468ac0 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d854685c0 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d8546adc0 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f6d8546ab40 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d86678680 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d8667b080 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d8667aa80 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d8667c880 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d86679280 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d86678c80 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d86678080 | |
libfabric:13986:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f6d86679b80 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():628<warn> Closing ep with unmatched unexpected tagged rx_entry: 0x7f4d81466680 pkt_entry 0x7f4d8197c680 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81467a80 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d8146b180 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81469100 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81467f80 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81466400 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d8146a780 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81466e00 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d8146b680 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81467800 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d8146ac80 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81469600 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d8146af00 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81467d00 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81469b00 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81467080 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():704<warn> Closing ep with unreleased rx_entry: 0x7f4d81468700 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d8267cc40 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d82677540 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d8267c940 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d8267c340 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d82679c40 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d8267c040 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d82678d40 | |
libfabric:13988:efa:ep_ctrl:rxr_ep_free_res():713<warn> Closing ep with unreleased tx_entry: 0x7f4d8267b440 | |
compute-od-gpu-dy-p4d-24xlarge-10:13987:14066 [0] NCCL INFO comm 0x7fd9c8000f60 rank 21 nranks 64 cudaDev 5 busId 901d0 - Abort COMPLETE | |
compute-od-gpu-dy-p4d-24xlarge-10:13986:14071 [0] NCCL INFO comm 0x7f6d80000f60 rank 20 nranks 64 cudaDev 4 busId 901c0 - Abort COMPLETE | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 21] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808543 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007fd86abfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd86b5fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd86bfff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd948bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd9495fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd949fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd95f97c700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd9dcbff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007fd773fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in compute-od-gpu-dy-p4d-24xlarge-10:13988:14068 [0] NCCL INFO comm 0x7f4d7c000f60 rank 22 nranks 64 cudaDev 6 busId a01c0 - Abort COMPLETE | |
_bootstrap | |
Thread 0x00007fd7717fb700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd7c57fb700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd7abfff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd7ab5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd7e35fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd7a8dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd7c4dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd7e17fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd7a97fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd7c61fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd7e0dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd8357fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fd6917fb700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fda14a24700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fda281e1700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007fdb0c81f000 (most recent call first): | |
File "/usr/lib64/python3.8/tokenize.py", line 392 in open | |
File "/usr/lib64/python3.8/linecache.py", line 136 in updatecache | |
File "/usr/lib64/python3.8/linecache.py", line 47 in getlines | |
File "/usr/lib64/python3.8/linecache.py", line 16 in getline | |
File "/usr/lib64/python3.8/traceback.py", line 288 in line | |
File "/usr/lib64/python3.8/traceback.py", line 366 in extract | |
File "/usr/lib64/python3.8/traceback.py", line 509 in __init__ | |
File "/usr/lib64/python3.8/traceback.py", line 120 in format_exception | |
File "/usr/lib64/python3.8/traceback.py", line 167 in format_exc | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/error_handler.py", line 75 in record_exception | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 360 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
[E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): [Rank 20] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808543 milliseconds before timing out. | |
Fatal Python error: Aborted | |
Thread 0x00007f6c755fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f6c75fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f6c90bfd700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f6c915fe700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f6c91fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f6cad3ff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f6cc7fff700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f6cf5357700 (most recent call first): | |
<no Python frame> | |
Thread 0x00007f6bb35fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 576 in _handle_results | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f6be8dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 302 in wait | |
File "/usr/lib64/python3.8/threading.py", line 433 in acquire | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/embedding_reader/parquet_numpy_reader.py", line 121 in piece_generator | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 388 in _guarded_task_generation | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 532 in _handle_tasks | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f6bcd7fb700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 415 in select | |
File "/usr/lib64/python3.8/multiprocessing/connection.py", line 931 in wait | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 499 in _wait_for_updates | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 519 in _handle_workers | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f6beabfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f6b975fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f6bcebfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f6bb2bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f6c217fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f6c061fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f6bcffff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f6bb3fff700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f6be97fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f6bea1fc700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f6a9b5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in [E ProcessGroupNCCL.cpp:414] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down. | |
wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File terminate called after throwing an instance of '"/ustd::runtime_errors' | |
r/lib64/python3 what(): .[Rank 22] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808550 milliseconds before timing out.8/ | |
threading.py", line 932 in _booFatal Python error: tAborteds | |
trap_inneThread 0xr | |
File 00007f4c6dfff700" (most recent call first): | |
/<no Python frame> | |
us | |
rThread 0x/00007f4c88bfd700l (most recent call first): | |
i<no Python frame> | |
b | |
6Thread 0x400007f4c895fe700/ (most recent call first): | |
p<no Python frame> | |
y | |
tThread 0xh00007f4c89fff700o (most recent call first): | |
n<no Python frame> | |
3 | |
.Thread 0x800007f4cbcb4d700 (most recent call first): | |
/<no Python frame> | |
t | |
hThread 0xr00007f4cbf357700ea (most recent call first): | |
d<no Python frame> | |
i | |
nThread 0xg00007f4cecbfd700. (most recent call first): | |
p<no Python frame> | |
yThread 0x"00007f4cedfff700, line (most recent call first): | |
890<no Python frame> | |
in | |
_Thread 0xb00007f4b8f5fe700o (most recent call first): | |
o File ts"t/ruaspr | |
/ | |
Thread 0xlib00007f6e395377006 (most recent call first): | |
4 File /p"y/tuhsorn3/.l8i/bm6u4l/tpiyptrhoocne3s.s8i/ncgo/npcooulr.rpeyn"t, line /f576u in t_uhraensd/lteh_rreesaudlt.sp | |
y File "", line /78u in s_rw/olrikbe6r4 | |
/ File p"y/thuons3r./8/ltihbr6e4a/dpiyntgh.opny3"., line 8870/ in trhurne | |
a File d"/iunsgr./plyi"b, line 68704 in /rpuynt | |
h File o"n/3u.s8r/t/hlrieba6d4i/npg.yptyh"o, line n9323 in ._8b/otohtrsteraadp_iinngn.epry | |
" File , line "932/ in u_sbro/oltisbt6r4a/pp_ytihnonne3r. | |
8/ File t"h/ruesard/ilnigb.6p4y/"p, line y890t in h_obno3o.t8s/ttrharpe | |
a | |
dThread 0xi00007f4be21fc700n (most recent call first): | |
g File ."p/yu"sr, line /890l in i_bb6o4ot/sptyrtahpo | |
n | |
3Thread 0x.00007f6e3ffff7008 (most recent call first): | |
/ File th"r/euadsirn/gl.ipby6"4, line 302/ in pwyatihto | |
n File 3"./u8s/r/sleilbe6c4t/poyrtsh.opny3"., line 8468/ in tsherleeacdti | |
n File g"./puy"s, line r433/ in laicqbu6i4re/ | |
p File yt"h/ofns3x./8n/oausysnrc/idoal/lbea2s/eli_be6v4e/pnyttsh.opny3."8, line /1823s in it_er-upna_conkacgee | |
s/ File e"m/buesdrd/ilnibg6_4r/eapdyetrh/opna3r.q8ue/ta_snyunmcpiyo_/rbeaadseer_.epvye"n, line t121s in .ppyi"ec, line e570_ in greunenr_afotroerv | |
e File r" | |
/u File s"r//ulsirb6/4l/ipby6t4h/opny3t.h8o/nmu3l.t8i/ptrhorceeasdsiinngg./ppyo"ol, line .870p in yr"u, line n388 | |
in File _"g/uuasrrd/eldi_bt6a4s/kp_ygtehnoenr3.a8ti/otnh | |
r File e"a/duisnrg/.lpiyb"6, line 4932/ in p_ybtohootns3t.r8a/pm_uilntnieprr | |
o File c"e/sussirn/gl/ipboo6l4./pyp"y, line t532h in on_3ha.n8d/lteh_rteaasdkisn | |
g. File p"y/"u, line s890r in /_lbiobo64t/sptyrthaopn | |
3 | |
.Thread 0x800007f6f22fda000/t (most recent call first): | |
hr File ea"d/inugs.rp/yl"i, line b870 in 6r4u/np | |
y File t"h/ouns3r./8l/itbo6k4e/npiyzteh.opny3"., line 8321/ in trheraeda_doirn_gs.toppy | |
" File , line "932 in /_ubsoort/sltirba6p_4in/npeyrt | |
h File o"n/3u.s8r//tloikbe6n4i/pzyet.hpyo"n, line 3363. in 8d/tehtreecatd_ienngc.opdyi"n, line g890 | |
in _ File b"o/outssrt/rlaipb | |
6 | |
4Thread 0x/00007f4b8e1fc700p (most recent call first): | |
yt File ho"/nu3s.r8//ltiobk6e4n/piyzteh.opny3"., line 8394/ in soepleecnt | |
o File rs"./puys"r, line /415l in isbe6l4e/cpty | |
t File h"o/nu3s.r8//lliib6n4e/cpaycthhoen.3p.y8"/, line m136u in ltuippdraotceecsascihneg | |
/ File c"o/nunserc/tlioinb.6p4y/", line p931y in twhaoint3 | |
. File 8"//luisnre/lciabc64h/ep.yptyh"o, line n347. in 8g/emtullitniepsr | |
o File c"e/sussirn/gl/ipbo6ol4./ppyy"t, line h499o in n_3w.a8i/tl_ifonre_cuapcdhaet.esp | |
y" File , line "16/ in ugsert/lliinbe6 | |
4/ File p"y/tuhsorn/3l.8i/bm6ul4t/ippryotcheosns3i.n8g//ptoroalc.epbya"c, line k519. in p_yh"a, line nd288l in el_wionrek | |
e File rs" | |
/ File u"s/ru/slri/bl6i4b6/4p/yptyhtohno3n.38./8t/rtahcreebadaicnkg..ppyy"", line , line 366870 in in erxutnr | |
a File c"t/ | |
u File s"r//luibs6r4//lpiybt6h4o/np3y.t8h/otnhr3e.ad8i/ntgra.cpeyb"a, line 932c in k_.bpoyo"ts, line t509r in a_p__iinnnietr_ | |
_ File | |
"/ File u"s/ru/slri/bl64i/bpy6t4h/opny3.t8h/otnh3r.e8a/dtirnagc.epbya"c, line k890. in p_ybo"o, line t120s in trfapo | |
r | |
mThread 0x00007f4baa1fc700a (most recent call first): | |
t File _"e/xucserp/tliiobn6 | |
4 File /p"/yuthsorn/3l.i8b/6m4u/lptyiptrhocoens3s.i8n/gt/rpaoocle.bpayc"k, line .114p in yw"o, line r167k in efro | |
r File ma"t/_uesxrc/l | |
i File b6"4//fpsyxt/hnono3u.s8r//tdharlelaed2i/nlgi.bp6y4"/, line p870y in trhuonn | |
3 File ."8/u/ssri/tlei-bp64a/cpkyatgheosn/3t.o8r/tchhr/edaidsitnrgi.bpuyt"e, line d932/ in e_lbaosottisctr/ampu_litninperro | |
c File e"/sussirn/gli/be6r4r/poyrtsh/oenr3r.8o/rth_rheaanddilnegr..ppyy"", line , line 890 in 75_ in broeoctostrrda_pe | |
x | |
cThread 0xe00007f4be0dfa700p (most recent call first): | |
t File i"o/nu | |
sr File /l"i/bf6s4x//pnyotuhsorn3/.d8a/mlullet2i/plriobc6e4s/spiyntgh/opnoo3l..8p/ys"i, line t114 in ew-oprakcekra | |
File g"e/su/srt/olricbh64//dpiystthroinb3u.t8e/tdh/reealdainsgt.ipcy/"m, line u870l in triupnr | |
File o"c/eusssri/linbg6/4/epryrtohrosn3/._8_/itnhiret_a_d.inpgy."py, line "360, line in 932w in r_abpopoetrs | |
t File ra"p/_fisnnxe/rn | |
o File u"s/ru/sDr/AlLiLb6E42/-ppyytthoonr3c.h8//ttrhariena_ddiinfgf.upsyi"o, line n890_ in p_rbiooorts.tprya"p, line | |
757 | |
in Thread 0x<00007f4be35fe700m (most recent call first): | |
o File d"u/lues>r | |
/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f4be17fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f4ba8dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f4bfd7fb700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f4c18dfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f4bff5fe700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f4bfcdfa700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f4c36bfd700 (most recent call first): | |
File "/usr/lib64/python3.8/multiprocessing/pool.py", line 114 in worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f4a935fe700 (most recent call first): | |
File "/usr/lib64/python3.8/threading.py", line 306 in wait | |
File "/usr/lib64/python3.8/threading.py", line 558 in wait | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/tqdm/_monitor.py", line 60 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f4e355d9700 (most recent call first): | |
File "/usr/lib64/python3.8/concurrent/futures/thread.py", line 78 in _worker | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f4e3bfff700 (most recent call first): | |
File "/usr/lib64/python3.8/selectors.py", line 468 in select | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 1823 in _run_once | |
File "/usr/lib64/python3.8/asyncio/base_events.py", line 570 in run_forever | |
File "/usr/lib64/python3.8/threading.py", line 870 in run | |
File "/usr/lib64/python3.8/threading.py", line 932 in _bootstrap_inner | |
File "/usr/lib64/python3.8/threading.py", line 890 in _bootstrap | |
Thread 0x00007f4f1eecc000 (most recent call first): | |
File "/usr/lib64/python3.8/tokenize.py", line 321 in read_or_stop | |
File "/usr/lib64/python3.8/tokenize.py", line 363 in detect_encoding | |
File "/usr/lib64/python3.8/tokenize.py", line 394 in open | |
File "/usr/lib64/python3.8/linecache.py", line 136 in updatecache | |
File "/usr/lib64/python3.8/linecache.py", line 47 in getlines | |
File "/usr/lib64/python3.8/linecache.py", line 16 in getline | |
File "/usr/lib64/python3.8/traceback.py", line 288 in line | |
File "/usr/lib64/python3.8/traceback.py", line 366 in extract | |
File "/usr/lib64/python3.8/traceback.py", line 509 in __init__ | |
File "/usr/lib64/python3.8/traceback.py", line 120 in format_exception | |
File "/usr/lib64/python3.8/traceback.py", line 167 in format_exc | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/error_handler.py", line 75 in record_exception | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 360 in wrapper | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 757 in <module> | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13921 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13922 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13924 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13925 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13927 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13928 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13982 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13983 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13985 closing signal SIGTERM | |
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 13989 closing signal SIGTERM | |
/usr/lib64/python3.8/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 6 leaked semaphore objects to clean up at shutdown | |
warnings.warn('resource_tracker: There appear to be %d ' | |
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 2 (pid: 13984) of binary: /fsx/nousr/dalle2/bin/python3.8 | |
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 2 (pid: 13923) of binary: /fsx/nousr/dalle2/bin/python3.8 | |
ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 311.0644772052765 seconds | |
Traceback (most recent call last): | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier | |
store_util.barrier( | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier | |
synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize | |
agent_data = get_all(store, rank, key_prefix, world_size) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all | |
data = store.get(f"{prefix}{idx}") | |
RuntimeError: Socket Timeout | |
ERROR:torch.distributed.elastic.multiprocessing.errors.error_handler:no error file defined for parent, to copy child error file (/tmp/torchelastic_h37v8tb9/none_qmhc8b_7/attempt_0/4/error.json) | |
Traceback (most recent call last): | |
File "/fsx/nousr/dalle2/bin/torchrun", line 8, in <module> | |
sys.exit(main()) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper | |
return f(*args, **kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/run.py", line 761, in main | |
run(args) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/run.py", line 752, in run | |
elastic_launch( | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ | |
return launch_agent(self._config, self._entrypoint, list(args)) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent | |
raise ChildFailedError( | |
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: | |
============================================================ | |
/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py FAILED | |
------------------------------------------------------------ | |
Failures: | |
<NO_OTHER_FAILURES> | |
------------------------------------------------------------ | |
Root Cause (first observed failure): | |
[0]: | |
time : 2022-07-16_00:26:36 | |
host : compute-od-gpu-dy-p4d-24xlarge-9.hpc-1click-production2.pcluster. | |
rank : 12 (local_rank: 4) | |
exitcode : -6 (pid: 13988) | |
error_file: /tmp/torchelastic_h37v8tb9/none_qmhc8b_7/attempt_0/4/error.json | |
traceback : Traceback (most recent call last): | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper | |
return f(*args, **kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1130, in __call__ | |
return self.main(*args, **kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1055, in main | |
rv = self.invoke(ctx) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 1404, in invoke | |
return ctx.invoke(self.callback, **ctx.params) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/click/core.py", line 760, in invoke | |
return __callback(*args, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 753, in main | |
initialize_training(config_file, accelerator) | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 736, in initialize_training | |
train( | |
File "/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py", line 503, in train | |
loss = trainer(text=txt, image_embed=img) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl | |
return forward_call(*input, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 107, in inner | |
out = fn(model, *args, **kwargs) | |
File "/fsx/nousr/DALLE2-pytorch/dalle2_pytorch/trainer.py", line 400, in forward | |
self.accelerator.backward(loss) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/accelerator.py", line 736, in backward | |
loss.backward(**kwargs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/_tensor.py", line 396, in backward | |
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward | |
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass | |
RuntimeError: NCCL communicator was aborted on rank 12. Original reason for failure was: [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=29488, OpType=BROADCAST, Timeout(ms)=1800000) ran for 1807935 milliseconds before timing out. | |
============================================================ | |
Traceback (most recent call last): | |
File "/fsx/nousr/dalle2/bin/accelerate", line 8, in <module> | |
sys.exit(main()) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/commands/accelerate_cli.py", line 43, in main | |
args.func(args) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/commands/launch.py", line 562, in launch_command | |
multi_gpu_launcher(args) | |
File "/fsx/nousr/dalle2/lib64/python3.8/site-packages/accelerate/commands/launch.py", line 306, in multi_gpu_launcher | |
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd) | |
subprocess.CalledProcessError: Command '['torchrun', '--nproc_per_node', '8', '--nnodes', '8', '--node_rank', '1', '--master_addr', 'compute-od-gpu-dy-p4d-24xlarge-8', '--master_port', '12802', '/fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py', '--config_file', '/fsx/nousr/DALLE2-pytorch/configs/prior.json']' returned non-zero exit status 1. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment