Created
April 26, 2019 19:20
-
-
Save cswinter/3c8a245fe35d8808c466282a161974b6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
root@managed-worker-sqfj:/# for nrings in 1 2 4 8 16 32; do mpirun --allow-run-as-root -H 10.73.0.19:8,10.73.0.95:8 -np 16 -mca btl_tcp_if_include ens12 -x NCCL_IB_DISABLE=1 -x LD_LIBRARY_PATH -x NCCL_SOCKET_IFNAME=ens12 -x NCCL_DEBUG=INFO -x NCCL_MIN_NRINGS=$nrings -x NCCL_MAX_NRINGS=$nrings /nccl-tests/build/all_reduce_perf -b 16M -e 1G -f 2 -g 1 -c 0; done | |
# nThread 1 nGpus 1 minBytes 16777216 maxBytes 1073741824 step: 2(factor) warmup iters: 5 iters: 20 validation: 0 | |
# | |
# Using devices | |
# Rank 0 Pid 3071 on managed-worker-sqfj device 0 [0x00] Tesla V100-SXM2-16GB | |
# Rank 1 Pid 3072 on managed-worker-sqfj device 1 [0x00] Tesla V100-SXM2-16GB | |
# Rank 2 Pid 3073 on managed-worker-sqfj device 2 [0x00] Tesla V100-SXM2-16GB | |
# Rank 3 Pid 3074 on managed-worker-sqfj device 3 [0x00] Tesla V100-SXM2-16GB | |
# Rank 4 Pid 3076 on managed-worker-sqfj device 4 [0x00] Tesla V100-SXM2-16GB | |
# Rank 5 Pid 3077 on managed-worker-sqfj device 5 [0x00] Tesla V100-SXM2-16GB | |
# Rank 6 Pid 3080 on managed-worker-sqfj device 6 [0x00] Tesla V100-SXM2-16GB | |
# Rank 7 Pid 3082 on managed-worker-sqfj device 7 [0x00] Tesla V100-SXM2-16GB | |
# Rank 8 Pid 1852 on managed-worker-rnls device 0 [0x00] Tesla V100-SXM2-16GB | |
# Rank 9 Pid 1853 on managed-worker-rnls device 1 [0x00] Tesla V100-SXM2-16GB | |
# Rank 10 Pid 1854 on managed-worker-rnls device 2 [0x00] Tesla V100-SXM2-16GB | |
# Rank 11 Pid 1855 on managed-worker-rnls device 3 [0x00] Tesla V100-SXM2-16GB | |
# Rank 12 Pid 1856 on managed-worker-rnls device 4 [0x00] Tesla V100-SXM2-16GB | |
# Rank 13 Pid 1861 on managed-worker-rnls device 5 [0x00] Tesla V100-SXM2-16GB | |
# Rank 14 Pid 1865 on managed-worker-rnls device 6 [0x00] Tesla V100-SXM2-16GB | |
# Rank 15 Pid 1867 on managed-worker-rnls device 7 [0x00] Tesla V100-SXM2-16GB | |
managed-worker-sqfj:3071:3071 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3071:3071 [0] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3071:3071 [0] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3071:3071 [0] NCCL INFO NET/Socket : 1 interfaces found | |
NCCL version 2.3.7+cuda10.0 | |
managed-worker-sqfj:3071:3071 [0] NCCL INFO rank 0 nranks 16 | |
managed-worker-sqfj:3077:3077 [5] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3077:3077 [5] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3077:3077 [5] NCCL INFO rank 5 nranks 16 | |
managed-worker-rnls:1861:1861 [5] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1861:1861 [5] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1861:1861 [5] NCCL INFO rank 13 nranks 16 | |
managed-worker-sqfj:3073:3073 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3073:3073 [2] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3073:3073 [2] NCCL INFO rank 2 nranks 16 | |
managed-worker-sqfj:3082:3082 [7] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3082:3082 [7] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3082:3082 [7] NCCL INFO rank 7 nranks 16 | |
managed-worker-sqfj:3074:3074 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3074:3074 [3] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3074:3074 [3] NCCL INFO rank 3 nranks 16 | |
managed-worker-sqfj:3076:3076 [4] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3076:3076 [4] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3076:3076 [4] NCCL INFO rank 4 nranks 16 | |
managed-worker-sqfj:3080:3080 [6] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3080:3080 [6] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3080:3080 [6] NCCL INFO rank 6 nranks 16 | |
managed-worker-sqfj:3072:3072 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3072:3072 [1] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3072:3072 [1] NCCL INFO rank 1 nranks 16 | |
managed-worker-rnls:1856:1856 [4] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1856:1856 [4] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1856:1856 [4] NCCL INFO rank 12 nranks 16 | |
managed-worker-rnls:1852:1852 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1852:1852 [0] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1852:1852 [0] NCCL INFO rank 8 nranks 16 | |
managed-worker-rnls:1867:1867 [7] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1867:1867 [7] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1867:1867 [7] NCCL INFO rank 15 nranks 16 | |
managed-worker-rnls:1854:1854 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1854:1854 [2] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1854:1854 [2] NCCL INFO rank 10 nranks 16 | |
managed-worker-rnls:1865:1865 [6] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1865:1865 [6] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1865:1865 [6] NCCL INFO rank 14 nranks 16 | |
managed-worker-rnls:1853:1853 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1853:1853 [1] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1853:1853 [1] NCCL INFO rank 9 nranks 16 | |
managed-worker-rnls:1855:1855 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1855:1855 [3] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1855:1855 [3] NCCL INFO rank 11 nranks 16 | |
managed-worker-sqfj:3071:3112 [0] NCCL INFO comm 0x7f10c00566a0 rank 0 nranks 16 | |
managed-worker-rnls:1861:1892 [5] NCCL INFO comm 0x7f82380566a0 rank 13 nranks 16 | |
managed-worker-rnls:1861:1892 [5] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1861:1892 [5] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3077:3113 [5] NCCL INFO comm 0x7f7c500566a0 rank 5 nranks 16 | |
managed-worker-sqfj:3077:3113 [5] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3077:3113 [5] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3073:3114 [2] NCCL INFO comm 0x7f404c0566a0 rank 2 nranks 16 | |
managed-worker-sqfj:3073:3114 [2] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3073:3114 [2] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3082:3115 [7] NCCL INFO comm 0x7f7c140566a0 rank 7 nranks 16 | |
managed-worker-sqfj:3082:3115 [7] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3082:3115 [7] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3074:3116 [3] NCCL INFO comm 0x7f88000566a0 rank 3 nranks 16 | |
managed-worker-sqfj:3074:3116 [3] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3074:3116 [3] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1867:1893 [7] NCCL INFO comm 0x7f49700566a0 rank 15 nranks 16 | |
managed-worker-rnls:1867:1893 [7] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1867:1893 [7] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3080:3117 [6] NCCL INFO comm 0x7ff6180566a0 rank 6 nranks 16 | |
managed-worker-sqfj:3080:3117 [6] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3080:3117 [6] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3072:3119 [1] NCCL INFO comm 0x7f90a00566a0 rank 1 nranks 16 | |
managed-worker-sqfj:3072:3119 [1] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3072:3119 [1] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3076:3118 [4] NCCL INFO comm 0x7f52a00566a0 rank 4 nranks 16 | |
managed-worker-sqfj:3076:3118 [4] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3076:3118 [4] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1853:1899 [1] NCCL INFO comm 0x7f617c0566a0 rank 9 nranks 16 | |
managed-worker-rnls:1853:1899 [1] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1853:1899 [1] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1852:1896 [0] NCCL INFO comm 0x7f83b00566a0 rank 8 nranks 16 | |
managed-worker-rnls:1852:1896 [0] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1852:1896 [0] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1855:1894 [3] NCCL INFO comm 0x7fa1e40566a0 rank 11 nranks 16 | |
managed-worker-rnls:1855:1894 [3] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1855:1894 [3] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1856:1895 [4] NCCL INFO comm 0x7f59700566a0 rank 12 nranks 16 | |
managed-worker-rnls:1865:1898 [6] NCCL INFO comm 0x7f98340566a0 rank 14 nranks 16 | |
managed-worker-rnls:1856:1895 [4] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1856:1895 [4] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1865:1898 [6] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1865:1898 [6] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1854:1897 [2] NCCL INFO comm 0x7f0c800566a0 rank 10 nranks 16 | |
managed-worker-rnls:1854:1897 [2] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1854:1897 [2] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3073:3114 [2] NCCL INFO CUDA Dev 2, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3074:3116 [3] NCCL INFO CUDA Dev 3, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3072:3119 [1] NCCL INFO CUDA Dev 1, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3076:3118 [4] NCCL INFO CUDA Dev 4, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3080:3117 [6] NCCL INFO CUDA Dev 6, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3082:3115 [7] NCCL INFO CUDA Dev 7, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3077:3113 [5] NCCL INFO CUDA Dev 5, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1852:1896 [0] NCCL INFO CUDA Dev 0, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1854:1897 [2] NCCL INFO CUDA Dev 2, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1853:1899 [1] NCCL INFO CUDA Dev 1, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1855:1894 [3] NCCL INFO CUDA Dev 3, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1856:1895 [4] NCCL INFO CUDA Dev 4, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1861:1892 [5] NCCL INFO CUDA Dev 5, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1865:1898 [6] NCCL INFO CUDA Dev 6, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3071:3112 [0] NCCL INFO CUDA Dev 0, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1867:1893 [7] NCCL INFO CUDA Dev 7, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3073:3114 [2] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3073:3114 [2] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-sqfj:3073:3114 [2] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-sqfj:3077:3113 [5] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3077:3113 [5] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-sqfj:3077:3113 [5] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-sqfj:3072:3119 [1] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3072:3119 [1] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-sqfj:3072:3119 [1] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-sqfj:3080:3117 [6] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3080:3117 [6] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-sqfj:3080:3117 [6] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-sqfj:3076:3118 [4] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3076:3118 [4] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-sqfj:3076:3118 [4] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-sqfj:3074:3116 [3] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3074:3116 [3] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-sqfj:3074:3116 [3] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-sqfj:3082:3115 [7] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3082:3115 [7] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-sqfj:3082:3115 [7] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-sqfj:3071:3112 [0] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3071:3112 [0] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-sqfj:3071:3112 [0] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-sqfj:3071:3112 [0] NCCL INFO Limiting to 1 rings per user request. | |
managed-worker-rnls:1856:1895 [4] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1856:1895 [4] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-rnls:1856:1895 [4] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-rnls:1865:1898 [6] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1865:1898 [6] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-rnls:1865:1898 [6] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-rnls:1861:1892 [5] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1861:1892 [5] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-rnls:1861:1892 [5] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-rnls:1867:1893 [7] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1867:1893 [7] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-rnls:1867:1893 [7] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-rnls:1853:1899 [1] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1853:1899 [1] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-rnls:1853:1899 [1] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-rnls:1852:1896 [0] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1852:1896 [0] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-rnls:1852:1896 [0] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-rnls:1855:1894 [3] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1855:1894 [3] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-rnls:1855:1894 [3] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-rnls:1854:1897 [2] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1854:1897 [2] NCCL INFO NCCL_MAX_NRINGS set by environment to 1. | |
managed-worker-rnls:1854:1897 [2] NCCL INFO NCCL_MIN_NRINGS set by environment to 1. | |
managed-worker-sqfj:3071:3112 [0] NCCL INFO Using 256 threads | |
managed-worker-sqfj:3071:3112 [0] NCCL INFO Min Comp Cap 7 | |
managed-worker-sqfj:3071:3112 [0] NCCL INFO Ring 00 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3071:3112 [0] NCCL INFO Ring 00 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:1852:1896 [0] NCCL INFO Ring 00 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3074:3116 [3] NCCL INFO Ring 00 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-sqfj:3076:3118 [4] NCCL INFO Ring 00 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-sqfj:3071:3112 [0] NCCL INFO Ring 00 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:1855:1894 [3] NCCL INFO Ring 00 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-rnls:1856:1895 [4] NCCL INFO Ring 00 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-rnls:1852:1896 [0] NCCL INFO Ring 00 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3073:3114 [2] NCCL INFO Ring 00 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3072:3119 [1] NCCL INFO Ring 00 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3080:3117 [6] NCCL INFO Ring 00 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3077:3113 [5] NCCL INFO Ring 00 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:1865:1898 [6] NCCL INFO Ring 00 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:1854:1897 [2] NCCL INFO Ring 00 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:1853:1899 [1] NCCL INFO Ring 00 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:1861:1892 [5] NCCL INFO Ring 00 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3071:3112 [0] NCCL INFO comm 0x7f10c00566a0 rank 0 nranks 16 - COMPLETE | |
managed-worker-sqfj:3073:3114 [2] NCCL INFO comm 0x7f404c0566a0 rank 2 nranks 16 - COMPLETE | |
managed-worker-sqfj:3072:3119 [1] NCCL INFO comm 0x7f90a00566a0 rank 1 nranks 16 - COMPLETE | |
managed-worker-sqfj:3080:3117 [6] NCCL INFO comm 0x7ff6180566a0 rank 6 nranks 16 - COMPLETE | |
# | |
# out-of-place in-place | |
# size count type redop time algbw busbw error time algbw busbw error | |
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) | |
managed-worker-sqfj:3071:3071 [0] NCCL INFO Launch mode Parallel | |
managed-worker-sqfj:3074:3116 [3] NCCL INFO comm 0x7f88000566a0 rank 3 nranks 16 - COMPLETE | |
managed-worker-sqfj:3076:3118 [4] NCCL INFO comm 0x7f52a00566a0 rank 4 nranks 16 - COMPLETE | |
managed-worker-sqfj:3082:3115 [7] NCCL INFO comm 0x7f7c140566a0 rank 7 nranks 16 - COMPLETE | |
managed-worker-sqfj:3077:3113 [5] NCCL INFO comm 0x7f7c500566a0 rank 5 nranks 16 - COMPLETE | |
managed-worker-rnls:1861:1892 [5] NCCL INFO comm 0x7f82380566a0 rank 13 nranks 16 - COMPLETE | |
managed-worker-rnls:1865:1898 [6] NCCL INFO comm 0x7f98340566a0 rank 14 nranks 16 - COMPLETE | |
managed-worker-rnls:1867:1893 [7] NCCL INFO comm 0x7f49700566a0 rank 15 nranks 16 - COMPLETE | |
managed-worker-rnls:1856:1895 [4] NCCL INFO comm 0x7f59700566a0 rank 12 nranks 16 - COMPLETE | |
managed-worker-rnls:1853:1899 [1] NCCL INFO comm 0x7f617c0566a0 rank 9 nranks 16 - COMPLETE | |
managed-worker-rnls:1854:1897 [2] NCCL INFO comm 0x7f0c800566a0 rank 10 nranks 16 - COMPLETE | |
managed-worker-rnls:1852:1896 [0] NCCL INFO comm 0x7f83b00566a0 rank 8 nranks 16 - COMPLETE | |
managed-worker-rnls:1855:1894 [3] NCCL INFO comm 0x7fa1e40566a0 rank 11 nranks 16 - COMPLETE | |
16777216 4194304 float sum 15319 1.10 2.05 N/A 15415 1.09 2.04 N/A | |
33554432 8388608 float sum 29881 1.12 2.11 N/A 29253 1.15 2.15 N/A | |
67108864 16777216 float sum 48441 1.39 2.60 N/A 47139 1.42 2.67 N/A | |
134217728 33554432 float sum 93931 1.43 2.68 N/A 95310 1.41 2.64 N/A | |
268435456 67108864 float sum 208563 1.29 2.41 N/A 233352 1.15 2.16 N/A | |
536870912 134217728 float sum 425113 1.26 2.37 N/A 417028 1.29 2.41 N/A | |
1073741824 268435456 float sum 819039 1.31 2.46 N/A 903537 1.19 2.23 N/A | |
# Out of bounds values : 0 OK | |
# Avg bus bandwidth : 2.35536 | |
# | |
# nThread 1 nGpus 1 minBytes 16777216 maxBytes 1073741824 step: 2(factor) warmup iters: 5 iters: 20 validation: 0 | |
# | |
# Using devices | |
# Rank 0 Pid 3128 on managed-worker-sqfj device 0 [0x00] Tesla V100-SXM2-16GB | |
# Rank 1 Pid 3129 on managed-worker-sqfj device 1 [0x00] Tesla V100-SXM2-16GB | |
# Rank 2 Pid 3130 on managed-worker-sqfj device 2 [0x00] Tesla V100-SXM2-16GB | |
# Rank 3 Pid 3131 on managed-worker-sqfj device 3 [0x00] Tesla V100-SXM2-16GB | |
# Rank 4 Pid 3132 on managed-worker-sqfj device 4 [0x00] Tesla V100-SXM2-16GB | |
# Rank 5 Pid 3133 on managed-worker-sqfj device 5 [0x00] Tesla V100-SXM2-16GB | |
# Rank 6 Pid 3137 on managed-worker-sqfj device 6 [0x00] Tesla V100-SXM2-16GB | |
# Rank 7 Pid 3142 on managed-worker-sqfj device 7 [0x00] Tesla V100-SXM2-16GB | |
# Rank 8 Pid 1922 on managed-worker-rnls device 0 [0x00] Tesla V100-SXM2-16GB | |
# Rank 9 Pid 1923 on managed-worker-rnls device 1 [0x00] Tesla V100-SXM2-16GB | |
# Rank 10 Pid 1924 on managed-worker-rnls device 2 [0x00] Tesla V100-SXM2-16GB | |
# Rank 11 Pid 1925 on managed-worker-rnls device 3 [0x00] Tesla V100-SXM2-16GB | |
# Rank 12 Pid 1926 on managed-worker-rnls device 4 [0x00] Tesla V100-SXM2-16GB | |
# Rank 13 Pid 1931 on managed-worker-rnls device 5 [0x00] Tesla V100-SXM2-16GB | |
# Rank 14 Pid 1935 on managed-worker-rnls device 6 [0x00] Tesla V100-SXM2-16GB | |
# Rank 15 Pid 1937 on managed-worker-rnls device 7 [0x00] Tesla V100-SXM2-16GB | |
managed-worker-sqfj:3128:3128 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3128:3128 [0] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3128:3128 [0] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3128:3128 [0] NCCL INFO NET/Socket : 1 interfaces found | |
NCCL version 2.3.7+cuda10.0 | |
managed-worker-sqfj:3128:3128 [0] NCCL INFO rank 0 nranks 16 | |
managed-worker-rnls:1935:1935 [6] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1935:1935 [6] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1935:1935 [6] NCCL INFO rank 14 nranks 16 | |
managed-worker-rnls:1925:1925 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1925:1925 [3] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1925:1925 [3] NCCL INFO rank 11 nranks 16 | |
managed-worker-rnls:1923:1923 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1923:1923 [1] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1923:1923 [1] NCCL INFO rank 9 nranks 16 | |
managed-worker-rnls:1924:1924 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1924:1924 [2] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1924:1924 [2] NCCL INFO rank 10 nranks 16 | |
managed-worker-rnls:1931:1931 [5] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1931:1931 [5] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1931:1931 [5] NCCL INFO rank 13 nranks 16 | |
managed-worker-rnls:1922:1922 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1922:1922 [0] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1922:1922 [0] NCCL INFO rank 8 nranks 16 | |
managed-worker-rnls:1937:1937 [7] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1937:1937 [7] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1937:1937 [7] NCCL INFO rank 15 nranks 16 | |
managed-worker-rnls:1926:1926 [4] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1926:1926 [4] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1926:1926 [4] NCCL INFO rank 12 nranks 16 | |
managed-worker-sqfj:3131:3131 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3131:3131 [3] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3131:3131 [3] NCCL INFO rank 3 nranks 16 | |
managed-worker-sqfj:3129:3129 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3129:3129 [1] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3129:3129 [1] NCCL INFO rank 1 nranks 16 | |
managed-worker-sqfj:3132:3132 [4] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3132:3132 [4] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3132:3132 [4] NCCL INFO rank 4 nranks 16 | |
managed-worker-sqfj:3133:3133 [5] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3133:3133 [5] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3133:3133 [5] NCCL INFO rank 5 nranks 16 | |
managed-worker-sqfj:3137:3137 [6] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3137:3137 [6] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3137:3137 [6] NCCL INFO rank 6 nranks 16 | |
managed-worker-sqfj:3130:3130 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3130:3130 [2] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3130:3130 [2] NCCL INFO rank 2 nranks 16 | |
managed-worker-sqfj:3142:3142 [7] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3142:3142 [7] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3142:3142 [7] NCCL INFO rank 7 nranks 16 | |
managed-worker-sqfj:3128:3169 [0] NCCL INFO comm 0x7fa3c00566a0 rank 0 nranks 16 | |
managed-worker-rnls:1935:1962 [6] NCCL INFO comm 0x7f1ad40566a0 rank 14 nranks 16 | |
managed-worker-rnls:1935:1962 [6] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1935:1962 [6] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3129:3170 [1] NCCL INFO comm 0x7fdb4c0566a0 rank 1 nranks 16 | |
managed-worker-sqfj:3129:3170 [1] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3129:3170 [1] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3132:3171 [4] NCCL INFO comm 0x7fdc240566a0 rank 4 nranks 16 | |
managed-worker-sqfj:3132:3171 [4] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3132:3171 [4] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3131:3172 [3] NCCL INFO comm 0x7fc21c0566a0 rank 3 nranks 16 | |
managed-worker-rnls:1922:1965 [0] NCCL INFO comm 0x7f32f80566a0 rank 8 nranks 16 | |
managed-worker-rnls:1922:1965 [0] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1922:1965 [0] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3131:3172 [3] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3131:3172 [3] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1925:1964 [3] NCCL INFO comm 0x7fb3b80566a0 rank 11 nranks 16 | |
managed-worker-rnls:1925:1964 [3] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1925:1964 [3] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1937:1966 [7] NCCL INFO comm 0x7fe7ec0566a0 rank 15 nranks 16 | |
managed-worker-rnls:1937:1966 [7] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1937:1966 [7] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3133:3174 [5] NCCL INFO comm 0x7f42080566a0 rank 5 nranks 16 | |
managed-worker-rnls:1923:1963 [1] NCCL INFO comm 0x7ff7600566a0 rank 9 nranks 16 | |
managed-worker-rnls:1923:1963 [1] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1923:1963 [1] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3133:3174 [5] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3133:3174 [5] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3142:3173 [7] NCCL INFO comm 0x7f7ca40566a0 rank 7 nranks 16 | |
managed-worker-sqfj:3142:3173 [7] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3142:3173 [7] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3137:3175 [6] NCCL INFO comm 0x7f87300566a0 rank 6 nranks 16 | |
managed-worker-sqfj:3137:3175 [6] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3137:3175 [6] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3130:3176 [2] NCCL INFO comm 0x7f81580566a0 rank 2 nranks 16 | |
managed-worker-sqfj:3130:3176 [2] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3130:3176 [2] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1924:1967 [2] NCCL INFO comm 0x7ff63c0566a0 rank 10 nranks 16 | |
managed-worker-rnls:1926:1969 [4] NCCL INFO comm 0x7f83c80566a0 rank 12 nranks 16 | |
managed-worker-rnls:1924:1967 [2] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1924:1967 [2] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1926:1969 [4] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1926:1969 [4] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1931:1968 [5] NCCL INFO comm 0x7fbd1c0566a0 rank 13 nranks 16 | |
managed-worker-rnls:1931:1968 [5] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1931:1968 [5] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3130:3176 [2] NCCL INFO CUDA Dev 2, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3137:3175 [6] NCCL INFO CUDA Dev 6, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3133:3174 [5] NCCL INFO CUDA Dev 5, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3132:3171 [4] NCCL INFO CUDA Dev 4, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3129:3170 [1] NCCL INFO CUDA Dev 1, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3142:3173 [7] NCCL INFO CUDA Dev 7, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3131:3172 [3] NCCL INFO CUDA Dev 3, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1923:1963 [1] NCCL INFO CUDA Dev 1, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1922:1965 [0] NCCL INFO CUDA Dev 0, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1924:1967 [2] NCCL INFO CUDA Dev 2, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1925:1964 [3] NCCL INFO CUDA Dev 3, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1926:1969 [4] NCCL INFO CUDA Dev 4, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1931:1968 [5] NCCL INFO CUDA Dev 5, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1935:1962 [6] NCCL INFO CUDA Dev 6, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3128:3169 [0] NCCL INFO CUDA Dev 0, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1937:1966 [7] NCCL INFO CUDA Dev 7, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3137:3175 [6] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3137:3175 [6] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-sqfj:3137:3175 [6] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-sqfj:3142:3173 [7] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3142:3173 [7] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-sqfj:3142:3173 [7] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-rnls:1925:1964 [3] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1925:1964 [3] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-rnls:1925:1964 [3] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-rnls:1923:1963 [1] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1923:1963 [1] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-rnls:1923:1963 [1] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-rnls:1922:1965 [0] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1922:1965 [0] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-rnls:1922:1965 [0] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-rnls:1924:1967 [2] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1924:1967 [2] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-rnls:1924:1967 [2] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-rnls:1926:1969 [4] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1926:1969 [4] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-rnls:1926:1969 [4] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-rnls:1931:1968 [5] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1931:1968 [5] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-rnls:1931:1968 [5] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-rnls:1937:1966 [7] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1937:1966 [7] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-rnls:1937:1966 [7] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-sqfj:3133:3174 [5] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3133:3174 [5] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-sqfj:3133:3174 [5] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-sqfj:3129:3170 [1] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3129:3170 [1] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-sqfj:3129:3170 [1] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-sqfj:3128:3169 [0] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3128:3169 [0] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-sqfj:3128:3169 [0] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-sqfj:3128:3169 [0] NCCL INFO Duplicating rings to 2 per user request. | |
managed-worker-sqfj:3132:3171 [4] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3132:3171 [4] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-sqfj:3132:3171 [4] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-sqfj:3130:3176 [2] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3130:3176 [2] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-sqfj:3130:3176 [2] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-rnls:1935:1962 [6] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1935:1962 [6] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-rnls:1935:1962 [6] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-sqfj:3131:3172 [3] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3131:3172 [3] NCCL INFO NCCL_MAX_NRINGS set by environment to 2. | |
managed-worker-sqfj:3131:3172 [3] NCCL INFO NCCL_MIN_NRINGS set by environment to 2. | |
managed-worker-sqfj:3128:3169 [0] NCCL INFO Using 256 threads | |
managed-worker-sqfj:3128:3169 [0] NCCL INFO Min Comp Cap 7 | |
managed-worker-sqfj:3128:3169 [0] NCCL INFO Ring 00 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3128:3169 [0] NCCL INFO Ring 01 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3128:3169 [0] NCCL INFO Ring 00 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:1922:1965 [0] NCCL INFO Ring 00 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3131:3172 [3] NCCL INFO Ring 00 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-sqfj:3132:3171 [4] NCCL INFO Ring 00 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-sqfj:3128:3169 [0] NCCL INFO Ring 00 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:1925:1964 [3] NCCL INFO Ring 00 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-rnls:1922:1965 [0] NCCL INFO Ring 00 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:1926:1969 [4] NCCL INFO Ring 00 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3137:3175 [6] NCCL INFO Ring 00 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3130:3176 [2] NCCL INFO Ring 00 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3133:3174 [5] NCCL INFO Ring 00 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-sqfj:3129:3170 [1] NCCL INFO Ring 00 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-rnls:1924:1967 [2] NCCL INFO Ring 00 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:1935:1962 [6] NCCL INFO Ring 00 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:1931:1968 [5] NCCL INFO Ring 00 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-rnls:1923:1963 [1] NCCL INFO Ring 00 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-sqfj:3128:3169 [0] NCCL INFO Ring 01 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:1922:1965 [0] NCCL INFO Ring 01 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3128:3169 [0] NCCL INFO Ring 01 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:1922:1965 [0] NCCL INFO Ring 01 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3131:3172 [3] NCCL INFO Ring 01 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:1925:1964 [3] NCCL INFO Ring 01 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3132:3171 [4] NCCL INFO Ring 01 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:1926:1969 [4] NCCL INFO Ring 01 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3130:3176 [2] NCCL INFO Ring 01 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3137:3175 [6] NCCL INFO Ring 01 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3129:3170 [1] NCCL INFO Ring 01 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-rnls:1924:1967 [2] NCCL INFO Ring 01 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:1935:1962 [6] NCCL INFO Ring 01 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:1923:1963 [1] NCCL INFO Ring 01 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-sqfj:3133:3174 [5] NCCL INFO Ring 01 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:1931:1968 [5] NCCL INFO Ring 01 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-rnls:1926:1969 [4] NCCL INFO comm 0x7f83c80566a0 rank 12 nranks 16 - COMPLETE | |
managed-worker-rnls:1935:1962 [6] NCCL INFO comm 0x7f1ad40566a0 rank 14 nranks 16 - COMPLETE | |
managed-worker-rnls:1924:1967 [2] NCCL INFO comm 0x7ff63c0566a0 rank 10 nranks 16 - COMPLETE | |
managed-worker-rnls:1922:1965 [0] NCCL INFO comm 0x7f32f80566a0 rank 8 nranks 16 - COMPLETE | |
managed-worker-rnls:1923:1963 [1] NCCL INFO comm 0x7ff7600566a0 rank 9 nranks 16 - COMPLETE | |
managed-worker-rnls:1931:1968 [5] NCCL INFO comm 0x7fbd1c0566a0 rank 13 nranks 16 - COMPLETE | |
managed-worker-sqfj:3128:3169 [0] NCCL INFO comm 0x7fa3c00566a0 rank 0 nranks 16 - COMPLETE | |
managed-worker-rnls:1937:1966 [7] NCCL INFO comm 0x7fe7ec0566a0 rank 15 nranks 16 - COMPLETE | |
managed-worker-rnls:1925:1964 [3] NCCL INFO comm 0x7fb3b80566a0 rank 11 nranks 16 - COMPLETE | |
managed-worker-sqfj:3129:3170 [1] NCCL INFO comm 0x7fdb4c0566a0 rank 1 nranks 16 - COMPLETE | |
managed-worker-sqfj:3131:3172 [3] NCCL INFO comm 0x7fc21c0566a0 rank 3 nranks 16 - COMPLETE | |
managed-worker-sqfj:3130:3176 [2] NCCL INFO comm 0x7f81580566a0 rank 2 nranks 16 - COMPLETE | |
managed-worker-sqfj:3132:3171 [4] NCCL INFO comm 0x7fdc240566a0 rank 4 nranks 16 - COMPLETE | |
managed-worker-sqfj:3137:3175 [6] NCCL INFO comm 0x7f87300566a0 rank 6 nranks 16 - COMPLETE | |
managed-worker-sqfj:3133:3174 [5] NCCL INFO comm 0x7f42080566a0 rank 5 nranks 16 - COMPLETE | |
managed-worker-sqfj:3142:3173 [7] NCCL INFO comm 0x7f7ca40566a0 rank 7 nranks 16 - COMPLETE | |
# | |
# out-of-place in-place | |
# size count type redop time algbw busbw error time algbw busbw error | |
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) | |
managed-worker-sqfj:3128:3128 [0] NCCL INFO Launch mode Parallel | |
16777216 4194304 float sum 11701 1.43 2.69 N/A 11722 1.43 2.68 N/A | |
33554432 8388608 float sum 22987 1.46 2.74 N/A 22960 1.46 2.74 N/A | |
67108864 16777216 float sum 44431 1.51 2.83 N/A 44558 1.51 2.82 N/A | |
134217728 33554432 float sum 88164 1.52 2.85 N/A 88419 1.52 2.85 N/A | |
268435456 67108864 float sum 177297 1.51 2.84 N/A 177427 1.51 2.84 N/A | |
536870912 134217728 float sum 355817 1.51 2.83 N/A 354394 1.51 2.84 N/A | |
1073741824 268435456 float sum 710322 1.51 2.83 N/A 709347 1.51 2.84 N/A | |
# Out of bounds values : 0 OK | |
# Avg bus bandwidth : 2.80166 | |
# | |
# nThread 1 nGpus 1 minBytes 16777216 maxBytes 1073741824 step: 2(factor) warmup iters: 5 iters: 20 validation: 0 | |
# | |
# Using devices | |
# Rank 0 Pid 3187 on managed-worker-sqfj device 0 [0x00] Tesla V100-SXM2-16GB | |
# Rank 1 Pid 3188 on managed-worker-sqfj device 1 [0x00] Tesla V100-SXM2-16GB | |
# Rank 2 Pid 3189 on managed-worker-sqfj device 2 [0x00] Tesla V100-SXM2-16GB | |
# Rank 3 Pid 3190 on managed-worker-sqfj device 3 [0x00] Tesla V100-SXM2-16GB | |
# Rank 4 Pid 3191 on managed-worker-sqfj device 4 [0x00] Tesla V100-SXM2-16GB | |
# Rank 5 Pid 3193 on managed-worker-sqfj device 5 [0x00] Tesla V100-SXM2-16GB | |
# Rank 6 Pid 3195 on managed-worker-sqfj device 6 [0x00] Tesla V100-SXM2-16GB | |
# Rank 7 Pid 3197 on managed-worker-sqfj device 7 [0x00] Tesla V100-SXM2-16GB | |
# Rank 8 Pid 1994 on managed-worker-rnls device 0 [0x00] Tesla V100-SXM2-16GB | |
# Rank 9 Pid 1995 on managed-worker-rnls device 1 [0x00] Tesla V100-SXM2-16GB | |
# Rank 10 Pid 1996 on managed-worker-rnls device 2 [0x00] Tesla V100-SXM2-16GB | |
# Rank 11 Pid 1997 on managed-worker-rnls device 3 [0x00] Tesla V100-SXM2-16GB | |
# Rank 12 Pid 1998 on managed-worker-rnls device 4 [0x00] Tesla V100-SXM2-16GB | |
# Rank 13 Pid 2000 on managed-worker-rnls device 5 [0x00] Tesla V100-SXM2-16GB | |
# Rank 14 Pid 2003 on managed-worker-rnls device 6 [0x00] Tesla V100-SXM2-16GB | |
# Rank 15 Pid 2006 on managed-worker-rnls device 7 [0x00] Tesla V100-SXM2-16GB | |
managed-worker-sqfj:3187:3187 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3187:3187 [0] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3187:3187 [0] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3187:3187 [0] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1998:1998 [4] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1998:1998 [4] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1998:1998 [4] NCCL INFO rank 12 nranks 16 | |
managed-worker-rnls:1994:1994 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1994:1994 [0] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1994:1994 [0] NCCL INFO rank 8 nranks 16 | |
managed-worker-rnls:2003:2003 [6] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2003:2003 [6] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2003:2003 [6] NCCL INFO rank 14 nranks 16 | |
NCCL version 2.3.7+cuda10.0 | |
managed-worker-sqfj:3187:3187 [0] NCCL INFO rank 0 nranks 16 | |
managed-worker-rnls:1997:1997 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1997:1997 [3] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1997:1997 [3] NCCL INFO rank 11 nranks 16 | |
managed-worker-rnls:1995:1995 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1995:1995 [1] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1995:1995 [1] NCCL INFO rank 9 nranks 16 | |
managed-worker-rnls:2006:2006 [7] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2006:2006 [7] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2006:2006 [7] NCCL INFO rank 15 nranks 16 | |
managed-worker-rnls:1996:1996 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:1996:1996 [2] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:1996:1996 [2] NCCL INFO rank 10 nranks 16 | |
managed-worker-rnls:2000:2000 [5] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2000:2000 [5] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2000:2000 [5] NCCL INFO rank 13 nranks 16 | |
managed-worker-sqfj:3193:3193 [5] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3193:3193 [5] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3193:3193 [5] NCCL INFO rank 5 nranks 16 | |
managed-worker-sqfj:3189:3189 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3189:3189 [2] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3189:3189 [2] NCCL INFO rank 2 nranks 16 | |
managed-worker-sqfj:3191:3191 [4] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3191:3191 [4] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3191:3191 [4] NCCL INFO rank 4 nranks 16 | |
managed-worker-sqfj:3188:3188 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3188:3188 [1] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3188:3188 [1] NCCL INFO rank 1 nranks 16 | |
managed-worker-sqfj:3190:3190 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3190:3190 [3] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3190:3190 [3] NCCL INFO rank 3 nranks 16 | |
managed-worker-sqfj:3197:3197 [7] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3197:3197 [7] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3197:3197 [7] NCCL INFO rank 7 nranks 16 | |
managed-worker-sqfj:3195:3195 [6] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3195:3195 [6] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3195:3195 [6] NCCL INFO rank 6 nranks 16 | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO comm 0x7f92680566a0 rank 0 nranks 16 | |
managed-worker-rnls:1994:2035 [0] NCCL INFO comm 0x7f50e00566a0 rank 8 nranks 16 | |
managed-worker-rnls:1994:2035 [0] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1994:2035 [0] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1998:2034 [4] NCCL INFO comm 0x7f6c180566a0 rank 12 nranks 16 | |
managed-worker-rnls:1998:2034 [4] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1998:2034 [4] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2003:2036 [6] NCCL INFO comm 0x7fcbfc0566a0 rank 14 nranks 16 | |
managed-worker-rnls:2003:2036 [6] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2003:2036 [6] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3193:3229 [5] NCCL INFO comm 0x7f427c0566a0 rank 5 nranks 16 | |
managed-worker-sqfj:3193:3229 [5] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3193:3229 [5] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1995:2038 [1] NCCL INFO comm 0x7efd8c0566a0 rank 9 nranks 16 | |
managed-worker-rnls:1995:2038 [1] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1995:2038 [1] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1997:2037 [3] NCCL INFO comm 0x7f34680566a0 rank 11 nranks 16 | |
managed-worker-rnls:1997:2037 [3] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1997:2037 [3] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3189:3232 [2] NCCL INFO comm 0x7fa4740566a0 rank 2 nranks 16 | |
managed-worker-sqfj:3189:3232 [2] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3189:3232 [2] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:1996:2039 [2] NCCL INFO comm 0x7fba840566a0 rank 10 nranks 16 | |
managed-worker-rnls:1996:2039 [2] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:1996:2039 [2] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2006:2040 [7] NCCL INFO comm 0x7fedc00566a0 rank 15 nranks 16 | |
managed-worker-rnls:2006:2040 [7] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2006:2040 [7] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2000:2041 [5] NCCL INFO comm 0x7f43640566a0 rank 13 nranks 16 | |
managed-worker-rnls:2000:2041 [5] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2000:2041 [5] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3188:3230 [1] NCCL INFO comm 0x7f94140566a0 rank 1 nranks 16 | |
managed-worker-sqfj:3188:3230 [1] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3188:3230 [1] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3191:3231 [4] NCCL INFO comm 0x7f119c0566a0 rank 4 nranks 16 | |
managed-worker-sqfj:3191:3231 [4] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3191:3231 [4] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3190:3233 [3] NCCL INFO comm 0x7fc9100566a0 rank 3 nranks 16 | |
managed-worker-sqfj:3190:3233 [3] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3190:3233 [3] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3197:3234 [7] NCCL INFO comm 0x7f9d2c0566a0 rank 7 nranks 16 | |
managed-worker-sqfj:3197:3234 [7] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3197:3234 [7] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3195:3235 [6] NCCL INFO comm 0x7efbbc0566a0 rank 6 nranks 16 | |
managed-worker-sqfj:3195:3235 [6] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3195:3235 [6] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3188:3230 [1] NCCL INFO CUDA Dev 1, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3189:3232 [2] NCCL INFO CUDA Dev 2, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3190:3233 [3] NCCL INFO CUDA Dev 3, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3191:3231 [4] NCCL INFO CUDA Dev 4, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3195:3235 [6] NCCL INFO CUDA Dev 6, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3193:3229 [5] NCCL INFO CUDA Dev 5, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3197:3234 [7] NCCL INFO CUDA Dev 7, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1994:2035 [0] NCCL INFO CUDA Dev 0, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1995:2038 [1] NCCL INFO CUDA Dev 1, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1996:2039 [2] NCCL INFO CUDA Dev 2, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1997:2037 [3] NCCL INFO CUDA Dev 3, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:1998:2034 [4] NCCL INFO CUDA Dev 4, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2000:2041 [5] NCCL INFO CUDA Dev 5, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2003:2036 [6] NCCL INFO CUDA Dev 6, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO CUDA Dev 0, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2006:2040 [7] NCCL INFO CUDA Dev 7, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3197:3234 [7] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3197:3234 [7] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-sqfj:3197:3234 [7] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-sqfj:3195:3235 [6] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3195:3235 [6] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-sqfj:3195:3235 [6] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-sqfj:3191:3231 [4] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3191:3231 [4] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-sqfj:3191:3231 [4] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-sqfj:3187:3228 [0] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO Duplicating rings to 4 per user request. | |
managed-worker-sqfj:3189:3232 [2] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3189:3232 [2] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-sqfj:3189:3232 [2] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-sqfj:3190:3233 [3] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3190:3233 [3] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-sqfj:3190:3233 [3] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-sqfj:3188:3230 [1] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3188:3230 [1] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-sqfj:3188:3230 [1] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-rnls:1998:2034 [4] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1998:2034 [4] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-rnls:1998:2034 [4] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-rnls:1994:2035 [0] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1994:2035 [0] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-rnls:1994:2035 [0] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-sqfj:3193:3229 [5] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3193:3229 [5] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-sqfj:3193:3229 [5] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-rnls:1995:2038 [1] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1995:2038 [1] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-rnls:1995:2038 [1] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-rnls:1997:2037 [3] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1997:2037 [3] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-rnls:1997:2037 [3] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-rnls:1996:2039 [2] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:1996:2039 [2] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-rnls:1996:2039 [2] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-rnls:2003:2036 [6] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2003:2036 [6] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-rnls:2003:2036 [6] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-rnls:2000:2041 [5] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2000:2041 [5] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-rnls:2000:2041 [5] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-rnls:2006:2040 [7] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2006:2040 [7] NCCL INFO NCCL_MAX_NRINGS set by environment to 4. | |
managed-worker-rnls:2006:2040 [7] NCCL INFO NCCL_MIN_NRINGS set by environment to 4. | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO Using 256 threads | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO Min Comp Cap 7 | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO Ring 00 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO Ring 01 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO Ring 02 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO Ring 03 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO Ring 00 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:1994:2035 [0] NCCL INFO Ring 00 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3190:3233 [3] NCCL INFO Ring 00 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-sqfj:3191:3231 [4] NCCL INFO Ring 00 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:1997:2037 [3] NCCL INFO Ring 00 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-rnls:1998:2034 [4] NCCL INFO Ring 00 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO Ring 00 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:1994:2035 [0] NCCL INFO Ring 00 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3189:3232 [2] NCCL INFO Ring 00 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3195:3235 [6] NCCL INFO Ring 00 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3193:3229 [5] NCCL INFO Ring 00 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:1996:2039 [2] NCCL INFO Ring 00 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-sqfj:3188:3230 [1] NCCL INFO Ring 00 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-rnls:2003:2036 [6] NCCL INFO Ring 00 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2000:2041 [5] NCCL INFO Ring 00 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-rnls:1995:2038 [1] NCCL INFO Ring 00 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:1994:2035 [0] NCCL INFO Ring 01 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO Ring 01 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:1994:2035 [0] NCCL INFO Ring 01 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO Ring 01 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:1997:2037 [3] NCCL INFO Ring 01 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3190:3233 [3] NCCL INFO Ring 01 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:1998:2034 [4] NCCL INFO Ring 01 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3191:3231 [4] NCCL INFO Ring 01 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-sqfj:3189:3232 [2] NCCL INFO Ring 01 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-rnls:1996:2039 [2] NCCL INFO Ring 01 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2003:2036 [6] NCCL INFO Ring 01 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-sqfj:3195:3235 [6] NCCL INFO Ring 01 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3188:3230 [1] NCCL INFO Ring 01 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-rnls:1995:2038 [1] NCCL INFO Ring 01 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2000:2041 [5] NCCL INFO Ring 01 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3193:3229 [5] NCCL INFO Ring 01 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:1994:2035 [0] NCCL INFO Ring 02 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO Ring 02 : 15 -> 0 via NET/Socket/0 | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO Ring 02 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3190:3233 [3] NCCL INFO Ring 02 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:1994:2035 [0] NCCL INFO Ring 02 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:1997:2037 [3] NCCL INFO Ring 02 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3191:3231 [4] NCCL INFO Ring 02 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:1998:2034 [4] NCCL INFO Ring 02 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3189:3232 [2] NCCL INFO Ring 02 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3195:3235 [6] NCCL INFO Ring 02 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3188:3230 [1] NCCL INFO Ring 02 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3193:3229 [5] NCCL INFO Ring 02 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:1996:2039 [2] NCCL INFO Ring 02 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2003:2036 [6] NCCL INFO Ring 02 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:1995:2038 [1] NCCL INFO Ring 02 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2000:2041 [5] NCCL INFO Ring 02 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-rnls:1994:2035 [0] NCCL INFO Ring 03 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO Ring 03 : 15 -> 0 via NET/Socket/0 | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO Ring 03 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:1994:2035 [0] NCCL INFO Ring 03 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3190:3233 [3] NCCL INFO Ring 03 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:1997:2037 [3] NCCL INFO Ring 03 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3191:3231 [4] NCCL INFO Ring 03 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:1998:2034 [4] NCCL INFO Ring 03 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3189:3232 [2] NCCL INFO Ring 03 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3195:3235 [6] NCCL INFO Ring 03 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3188:3230 [1] NCCL INFO Ring 03 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3193:3229 [5] NCCL INFO Ring 03 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:1996:2039 [2] NCCL INFO Ring 03 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2003:2036 [6] NCCL INFO Ring 03 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:1995:2038 [1] NCCL INFO Ring 03 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2000:2041 [5] NCCL INFO Ring 03 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3187:3228 [0] NCCL INFO comm 0x7f92680566a0 rank 0 nranks 16 - COMPLETE | |
managed-worker-sqfj:3191:3231 [4] NCCL INFO comm 0x7f119c0566a0 rank 4 nranks 16 - COMPLETE | |
managed-worker-sqfj:3190:3233 [3] NCCL INFO comm 0x7fc9100566a0 rank 3 nranks 16 - COMPLETE | |
managed-worker-sqfj:3195:3235 [6] NCCL INFO comm 0x7efbbc0566a0 rank 6 nranks 16 - COMPLETE | |
managed-worker-sqfj:3197:3234 [7] NCCL INFO comm 0x7f9d2c0566a0 rank 7 nranks 16 - COMPLETE | |
managed-worker-sqfj:3189:3232 [2] NCCL INFO comm 0x7fa4740566a0 rank 2 nranks 16 - COMPLETE | |
# | |
# out-of-place in-place | |
# size count type redop time algbw busbw error time algbw busbw error | |
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) | |
managed-worker-sqfj:3187:3187 [0] NCCL INFO Launch mode Parallel | |
managed-worker-sqfj:3188:3230 [1] NCCL INFO comm 0x7f94140566a0 rank 1 nranks 16 - COMPLETE | |
managed-worker-sqfj:3193:3229 [5] NCCL INFO comm 0x7f427c0566a0 rank 5 nranks 16 - COMPLETE | |
managed-worker-rnls:1998:2034 [4] NCCL INFO comm 0x7f6c180566a0 rank 12 nranks 16 - COMPLETE | |
managed-worker-rnls:2003:2036 [6] NCCL INFO comm 0x7fcbfc0566a0 rank 14 nranks 16 - COMPLETE | |
managed-worker-rnls:2000:2041 [5] NCCL INFO comm 0x7f43640566a0 rank 13 nranks 16 - COMPLETE | |
managed-worker-rnls:1995:2038 [1] NCCL INFO comm 0x7efd8c0566a0 rank 9 nranks 16 - COMPLETE | |
managed-worker-rnls:1994:2035 [0] NCCL INFO comm 0x7f50e00566a0 rank 8 nranks 16 - COMPLETE | |
managed-worker-rnls:1996:2039 [2] NCCL INFO comm 0x7fba840566a0 rank 10 nranks 16 - COMPLETE | |
managed-worker-rnls:1997:2037 [3] NCCL INFO comm 0x7f34680566a0 rank 11 nranks 16 - COMPLETE | |
managed-worker-rnls:2006:2040 [7] NCCL INFO comm 0x7fedc00566a0 rank 15 nranks 16 - COMPLETE | |
16777216 4194304 float sum 11254 1.49 2.80 N/A 11332 1.48 2.78 N/A | |
33554432 8388608 float sum 22233 1.51 2.83 N/A 22216 1.51 2.83 N/A | |
67108864 16777216 float sum 43292 1.55 2.91 N/A 43497 1.54 2.89 N/A | |
134217728 33554432 float sum 90321 1.49 2.79 N/A 87696 1.53 2.87 N/A | |
268435456 67108864 float sum 177498 1.51 2.84 N/A 176429 1.52 2.85 N/A | |
536870912 134217728 float sum 353075 1.52 2.85 N/A 355253 1.51 2.83 N/A | |
1073741824 268435456 float sum 705388 1.52 2.85 N/A 695853 1.54 2.89 N/A | |
# Out of bounds values : 0 OK | |
# Avg bus bandwidth : 2.84348 | |
# | |
# nThread 1 nGpus 1 minBytes 16777216 maxBytes 1073741824 step: 2(factor) warmup iters: 5 iters: 20 validation: 0 | |
# | |
# Using devices | |
# Rank 0 Pid 3250 on managed-worker-sqfj device 0 [0x00] Tesla V100-SXM2-16GB | |
# Rank 1 Pid 3251 on managed-worker-sqfj device 1 [0x00] Tesla V100-SXM2-16GB | |
# Rank 2 Pid 3252 on managed-worker-sqfj device 2 [0x00] Tesla V100-SXM2-16GB | |
# Rank 3 Pid 3253 on managed-worker-sqfj device 3 [0x00] Tesla V100-SXM2-16GB | |
# Rank 4 Pid 3254 on managed-worker-sqfj device 4 [0x00] Tesla V100-SXM2-16GB | |
# Rank 5 Pid 3257 on managed-worker-sqfj device 5 [0x00] Tesla V100-SXM2-16GB | |
# Rank 6 Pid 3258 on managed-worker-sqfj device 6 [0x00] Tesla V100-SXM2-16GB | |
# Rank 7 Pid 3263 on managed-worker-sqfj device 7 [0x00] Tesla V100-SXM2-16GB | |
# Rank 8 Pid 2070 on managed-worker-rnls device 0 [0x00] Tesla V100-SXM2-16GB | |
# Rank 9 Pid 2071 on managed-worker-rnls device 1 [0x00] Tesla V100-SXM2-16GB | |
# Rank 10 Pid 2072 on managed-worker-rnls device 2 [0x00] Tesla V100-SXM2-16GB | |
# Rank 11 Pid 2073 on managed-worker-rnls device 3 [0x00] Tesla V100-SXM2-16GB | |
# Rank 12 Pid 2074 on managed-worker-rnls device 4 [0x00] Tesla V100-SXM2-16GB | |
# Rank 13 Pid 2076 on managed-worker-rnls device 5 [0x00] Tesla V100-SXM2-16GB | |
# Rank 14 Pid 2079 on managed-worker-rnls device 6 [0x00] Tesla V100-SXM2-16GB | |
# Rank 15 Pid 2082 on managed-worker-rnls device 7 [0x00] Tesla V100-SXM2-16GB | |
managed-worker-sqfj:3250:3250 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3250:3250 [0] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3250:3250 [0] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3250:3250 [0] NCCL INFO NET/Socket : 1 interfaces found | |
NCCL version 2.3.7+cuda10.0 | |
managed-worker-sqfj:3250:3250 [0] NCCL INFO rank 0 nranks 16 | |
managed-worker-rnls:2074:2074 [4] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2074:2074 [4] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2074:2074 [4] NCCL INFO rank 12 nranks 16 | |
managed-worker-rnls:2070:2070 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2070:2070 [0] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2070:2070 [0] NCCL INFO rank 8 nranks 16 | |
managed-worker-rnls:2079:2079 [6] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2079:2079 [6] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2079:2079 [6] NCCL INFO rank 14 nranks 16 | |
managed-worker-rnls:2076:2076 [5] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2076:2076 [5] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2076:2076 [5] NCCL INFO rank 13 nranks 16 | |
managed-worker-rnls:2071:2071 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2071:2071 [1] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2071:2071 [1] NCCL INFO rank 9 nranks 16 | |
managed-worker-rnls:2072:2072 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2072:2072 [2] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2072:2072 [2] NCCL INFO rank 10 nranks 16 | |
managed-worker-rnls:2073:2073 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2073:2073 [3] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2073:2073 [3] NCCL INFO rank 11 nranks 16 | |
managed-worker-rnls:2082:2082 [7] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2082:2082 [7] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2082:2082 [7] NCCL INFO rank 15 nranks 16 | |
managed-worker-sqfj:3253:3253 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3253:3253 [3] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3253:3253 [3] NCCL INFO rank 3 nranks 16 | |
managed-worker-sqfj:3252:3252 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3252:3252 [2] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3252:3252 [2] NCCL INFO rank 2 nranks 16 | |
managed-worker-sqfj:3257:3257 [5] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3257:3257 [5] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3257:3257 [5] NCCL INFO rank 5 nranks 16 | |
managed-worker-sqfj:3251:3251 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3251:3251 [1] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3251:3251 [1] NCCL INFO rank 1 nranks 16 | |
managed-worker-sqfj:3263:3263 [7] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3263:3263 [7] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3263:3263 [7] NCCL INFO rank 7 nranks 16 | |
managed-worker-sqfj:3254:3254 [4] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3254:3254 [4] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3254:3254 [4] NCCL INFO rank 4 nranks 16 | |
managed-worker-sqfj:3258:3258 [6] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3258:3258 [6] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3258:3258 [6] NCCL INFO rank 6 nranks 16 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO comm 0x7f38b80566a0 rank 0 nranks 16 | |
managed-worker-rnls:2070:2111 [0] NCCL INFO comm 0x7ff6900566a0 rank 8 nranks 16 | |
managed-worker-rnls:2070:2111 [0] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2070:2111 [0] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3252:3293 [2] NCCL INFO comm 0x7fbcdc0566a0 rank 2 nranks 16 | |
managed-worker-sqfj:3252:3293 [2] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3252:3293 [2] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2074:2110 [4] NCCL INFO comm 0x7f80b40566a0 rank 12 nranks 16 | |
managed-worker-rnls:2074:2110 [4] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2074:2110 [4] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2079:2112 [6] NCCL INFO comm 0x7f7fd40566a0 rank 14 nranks 16 | |
managed-worker-rnls:2079:2112 [6] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2079:2112 [6] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3253:3292 [3] NCCL INFO comm 0x7feba80566a0 rank 3 nranks 16 | |
managed-worker-sqfj:3253:3292 [3] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3253:3292 [3] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3263:3294 [7] NCCL INFO comm 0x7f3d2c0566a0 rank 7 nranks 16 | |
managed-worker-sqfj:3263:3294 [7] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3263:3294 [7] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3257:3295 [5] NCCL INFO comm 0x7f1afc0566a0 rank 5 nranks 16 | |
managed-worker-sqfj:3257:3295 [5] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3257:3295 [5] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2076:2113 [5] NCCL INFO comm 0x7fb3c40566a0 rank 13 nranks 16 | |
managed-worker-rnls:2073:2114 [3] NCCL INFO comm 0x7f8d4c0566a0 rank 11 nranks 16 | |
managed-worker-rnls:2076:2113 [5] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2076:2113 [5] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2073:2114 [3] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2073:2114 [3] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3254:3297 [4] NCCL INFO comm 0x7fcf980566a0 rank 4 nranks 16 | |
managed-worker-sqfj:3254:3297 [4] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3254:3297 [4] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2082:2117 [7] NCCL INFO comm 0x7f19e00566a0 rank 15 nranks 16 | |
managed-worker-rnls:2082:2117 [7] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2082:2117 [7] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3251:3296 [1] NCCL INFO comm 0x7eff380566a0 rank 1 nranks 16 | |
managed-worker-sqfj:3258:3298 [6] NCCL INFO comm 0x7f5f7c0566a0 rank 6 nranks 16 | |
managed-worker-rnls:2071:2115 [1] NCCL INFO comm 0x7f17880566a0 rank 9 nranks 16 | |
managed-worker-rnls:2071:2115 [1] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2071:2115 [1] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3251:3296 [1] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3251:3296 [1] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3258:3298 [6] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3258:3298 [6] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2072:2116 [2] NCCL INFO comm 0x7f089c0566a0 rank 10 nranks 16 | |
managed-worker-rnls:2072:2116 [2] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2072:2116 [2] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3251:3296 [1] NCCL INFO CUDA Dev 1, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3252:3293 [2] NCCL INFO CUDA Dev 2, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3257:3295 [5] NCCL INFO CUDA Dev 5, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3258:3298 [6] NCCL INFO CUDA Dev 6, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3254:3297 [4] NCCL INFO CUDA Dev 4, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3263:3294 [7] NCCL INFO CUDA Dev 7, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3253:3292 [3] NCCL INFO CUDA Dev 3, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2071:2115 [1] NCCL INFO CUDA Dev 1, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2072:2116 [2] NCCL INFO CUDA Dev 2, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2070:2111 [0] NCCL INFO CUDA Dev 0, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2073:2114 [3] NCCL INFO CUDA Dev 3, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2074:2110 [4] NCCL INFO CUDA Dev 4, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO CUDA Dev 0, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2079:2112 [6] NCCL INFO CUDA Dev 6, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2082:2117 [7] NCCL INFO CUDA Dev 7, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2076:2113 [5] NCCL INFO CUDA Dev 5, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3254:3297 [4] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3254:3297 [4] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-sqfj:3254:3297 [4] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-sqfj:3251:3296 [1] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3251:3296 [1] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-sqfj:3251:3296 [1] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-sqfj:3253:3292 [3] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3253:3292 [3] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-sqfj:3253:3292 [3] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-sqfj:3252:3293 [2] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3252:3293 [2] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-sqfj:3252:3293 [2] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-sqfj:3258:3298 [6] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3258:3298 [6] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-sqfj:3258:3298 [6] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-sqfj:3250:3291 [0] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Duplicating rings to 8 per user request. | |
managed-worker-rnls:2074:2110 [4] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2074:2110 [4] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-rnls:2074:2110 [4] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-rnls:2079:2112 [6] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2079:2112 [6] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-rnls:2079:2112 [6] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-rnls:2082:2117 [7] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2082:2117 [7] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-rnls:2082:2117 [7] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-rnls:2076:2113 [5] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2076:2113 [5] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-rnls:2076:2113 [5] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-sqfj:3257:3295 [5] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3257:3295 [5] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-sqfj:3257:3295 [5] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-sqfj:3263:3294 [7] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3263:3294 [7] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-sqfj:3263:3294 [7] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-rnls:2072:2116 [2] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2072:2116 [2] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-rnls:2072:2116 [2] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-rnls:2071:2115 [1] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2071:2115 [1] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-rnls:2071:2115 [1] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-rnls:2070:2111 [0] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2070:2111 [0] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-rnls:2070:2111 [0] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-rnls:2073:2114 [3] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2073:2114 [3] NCCL INFO NCCL_MAX_NRINGS set by environment to 8. | |
managed-worker-rnls:2073:2114 [3] NCCL INFO NCCL_MIN_NRINGS set by environment to 8. | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Using 256 threads | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Min Comp Cap 7 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 00 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 01 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 02 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 03 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 04 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 05 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 06 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 07 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 00 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 00 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3253:3292 [3] NCCL INFO Ring 00 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 00 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3254:3297 [4] NCCL INFO Ring 00 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2073:2114 [3] NCCL INFO Ring 00 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-rnls:2074:2110 [4] NCCL INFO Ring 00 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3252:3293 [2] NCCL INFO Ring 00 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 00 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3258:3298 [6] NCCL INFO Ring 00 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3251:3296 [1] NCCL INFO Ring 00 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3257:3295 [5] NCCL INFO Ring 00 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2072:2116 [2] NCCL INFO Ring 00 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2079:2112 [6] NCCL INFO Ring 00 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2071:2115 [1] NCCL INFO Ring 00 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2076:2113 [5] NCCL INFO Ring 00 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 01 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 01 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 01 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3253:3292 [3] NCCL INFO Ring 01 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-sqfj:3254:3297 [4] NCCL INFO Ring 01 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 01 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2073:2114 [3] NCCL INFO Ring 01 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-rnls:2074:2110 [4] NCCL INFO Ring 01 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3258:3298 [6] NCCL INFO Ring 01 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3252:3293 [2] NCCL INFO Ring 01 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3251:3296 [1] NCCL INFO Ring 01 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3257:3295 [5] NCCL INFO Ring 01 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2072:2116 [2] NCCL INFO Ring 01 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2079:2112 [6] NCCL INFO Ring 01 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2071:2115 [1] NCCL INFO Ring 01 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2076:2113 [5] NCCL INFO Ring 01 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 02 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 02 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 02 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3253:3292 [3] NCCL INFO Ring 02 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 02 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2073:2114 [3] NCCL INFO Ring 02 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3254:3297 [4] NCCL INFO Ring 02 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2074:2110 [4] NCCL INFO Ring 02 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3251:3296 [1] NCCL INFO Ring 02 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3252:3293 [2] NCCL INFO Ring 02 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3258:3298 [6] NCCL INFO Ring 02 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3257:3295 [5] NCCL INFO Ring 02 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2072:2116 [2] NCCL INFO Ring 02 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2079:2112 [6] NCCL INFO Ring 02 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2071:2115 [1] NCCL INFO Ring 02 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2076:2113 [5] NCCL INFO Ring 02 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 03 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 03 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 03 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3253:3292 [3] NCCL INFO Ring 03 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 03 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3254:3297 [4] NCCL INFO Ring 03 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2073:2114 [3] NCCL INFO Ring 03 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-rnls:2074:2110 [4] NCCL INFO Ring 03 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3252:3293 [2] NCCL INFO Ring 03 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3251:3296 [1] NCCL INFO Ring 03 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3258:3298 [6] NCCL INFO Ring 03 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3257:3295 [5] NCCL INFO Ring 03 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2072:2116 [2] NCCL INFO Ring 03 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2079:2112 [6] NCCL INFO Ring 03 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2071:2115 [1] NCCL INFO Ring 03 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2076:2113 [5] NCCL INFO Ring 03 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 04 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 04 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 04 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3253:3292 [3] NCCL INFO Ring 04 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-sqfj:3254:3297 [4] NCCL INFO Ring 04 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 04 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2073:2114 [3] NCCL INFO Ring 04 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-rnls:2074:2110 [4] NCCL INFO Ring 04 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3252:3293 [2] NCCL INFO Ring 04 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3251:3296 [1] NCCL INFO Ring 04 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3258:3298 [6] NCCL INFO Ring 04 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3257:3295 [5] NCCL INFO Ring 04 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2072:2116 [2] NCCL INFO Ring 04 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2079:2112 [6] NCCL INFO Ring 04 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2071:2115 [1] NCCL INFO Ring 04 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2076:2113 [5] NCCL INFO Ring 04 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 05 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 05 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 05 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3253:3292 [3] NCCL INFO Ring 05 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 05 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3254:3297 [4] NCCL INFO Ring 05 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2073:2114 [3] NCCL INFO Ring 05 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-rnls:2074:2110 [4] NCCL INFO Ring 05 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3252:3293 [2] NCCL INFO Ring 05 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3251:3296 [1] NCCL INFO Ring 05 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3258:3298 [6] NCCL INFO Ring 05 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3257:3295 [5] NCCL INFO Ring 05 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2072:2116 [2] NCCL INFO Ring 05 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2079:2112 [6] NCCL INFO Ring 05 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2071:2115 [1] NCCL INFO Ring 05 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2076:2113 [5] NCCL INFO Ring 05 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 06 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 06 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 06 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3253:3292 [3] NCCL INFO Ring 06 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-sqfj:3254:3297 [4] NCCL INFO Ring 06 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 06 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2073:2114 [3] NCCL INFO Ring 06 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-rnls:2074:2110 [4] NCCL INFO Ring 06 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3251:3296 [1] NCCL INFO Ring 06 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3252:3293 [2] NCCL INFO Ring 06 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3258:3298 [6] NCCL INFO Ring 06 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3257:3295 [5] NCCL INFO Ring 06 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2072:2116 [2] NCCL INFO Ring 06 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2079:2112 [6] NCCL INFO Ring 06 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2071:2115 [1] NCCL INFO Ring 06 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2076:2113 [5] NCCL INFO Ring 06 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 07 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 07 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO Ring 07 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3253:3292 [3] NCCL INFO Ring 07 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2070:2111 [0] NCCL INFO Ring 07 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2073:2114 [3] NCCL INFO Ring 07 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3254:3297 [4] NCCL INFO Ring 07 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2074:2110 [4] NCCL INFO Ring 07 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3252:3293 [2] NCCL INFO Ring 07 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3251:3296 [1] NCCL INFO Ring 07 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3258:3298 [6] NCCL INFO Ring 07 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3257:3295 [5] NCCL INFO Ring 07 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2072:2116 [2] NCCL INFO Ring 07 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2079:2112 [6] NCCL INFO Ring 07 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2071:2115 [1] NCCL INFO Ring 07 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2076:2113 [5] NCCL INFO Ring 07 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3253:3292 [3] NCCL INFO comm 0x7feba80566a0 rank 3 nranks 16 - COMPLETE | |
managed-worker-sqfj:3250:3291 [0] NCCL INFO comm 0x7f38b80566a0 rank 0 nranks 16 - COMPLETE | |
managed-worker-sqfj:3254:3297 [4] NCCL INFO comm 0x7fcf980566a0 rank 4 nranks 16 - COMPLETE | |
managed-worker-rnls:2074:2110 [4] NCCL INFO comm 0x7f80b40566a0 rank 12 nranks 16 - COMPLETE | |
managed-worker-sqfj:3257:3295 [5] NCCL INFO comm 0x7f1afc0566a0 rank 5 nranks 16 - COMPLETE | |
managed-worker-sqfj:3252:3293 [2] NCCL INFO comm 0x7fbcdc0566a0 rank 2 nranks 16 - COMPLETE | |
managed-worker-sqfj:3258:3298 [6] NCCL INFO comm 0x7f5f7c0566a0 rank 6 nranks 16 - COMPLETE | |
managed-worker-sqfj:3251:3296 [1] NCCL INFO comm 0x7eff380566a0 rank 1 nranks 16 - COMPLETE | |
managed-worker-sqfj:3263:3294 [7] NCCL INFO comm 0x7f3d2c0566a0 rank 7 nranks 16 - COMPLETE | |
# | |
# out-of-place in-place | |
# size count type redop time algbw busbw error time algbw busbw error | |
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) | |
managed-worker-sqfj:3250:3250 [0] NCCL INFO Launch mode Parallel | |
managed-worker-rnls:2079:2112 [6] NCCL INFO comm 0x7f7fd40566a0 rank 14 nranks 16 - COMPLETE | |
managed-worker-rnls:2076:2113 [5] NCCL INFO comm 0x7fb3c40566a0 rank 13 nranks 16 - COMPLETE | |
managed-worker-rnls:2082:2117 [7] NCCL INFO comm 0x7f19e00566a0 rank 15 nranks 16 - COMPLETE | |
managed-worker-rnls:2071:2115 [1] NCCL INFO comm 0x7f17880566a0 rank 9 nranks 16 - COMPLETE | |
managed-worker-rnls:2070:2111 [0] NCCL INFO comm 0x7ff6900566a0 rank 8 nranks 16 - COMPLETE | |
managed-worker-rnls:2073:2114 [3] NCCL INFO comm 0x7f8d4c0566a0 rank 11 nranks 16 - COMPLETE | |
managed-worker-rnls:2072:2116 [2] NCCL INFO comm 0x7f089c0566a0 rank 10 nranks 16 - COMPLETE | |
16777216 4194304 float sum 11221 1.50 2.80 N/A 11485 1.46 2.74 N/A | |
33554432 8388608 float sum 21967 1.53 2.86 N/A 21940 1.53 2.87 N/A | |
67108864 16777216 float sum 42484 1.58 2.96 N/A 42274 1.59 2.98 N/A | |
134217728 33554432 float sum 86458 1.55 2.91 N/A 86598 1.55 2.91 N/A | |
268435456 67108864 float sum 179299 1.50 2.81 N/A 183256 1.46 2.75 N/A | |
536870912 134217728 float sum 361021 1.49 2.79 N/A 359748 1.49 2.80 N/A | |
1073741824 268435456 float sum 718069 1.50 2.80 N/A 717100 1.50 2.81 N/A | |
# Out of bounds values : 0 OK | |
# Avg bus bandwidth : 2.84146 | |
# | |
# nThread 1 nGpus 1 minBytes 16777216 maxBytes 1073741824 step: 2(factor) warmup iters: 5 iters: 20 validation: 0 | |
# | |
# Using devices | |
# Rank 0 Pid 3321 on managed-worker-sqfj device 0 [0x00] Tesla V100-SXM2-16GB | |
# Rank 1 Pid 3322 on managed-worker-sqfj device 1 [0x00] Tesla V100-SXM2-16GB | |
# Rank 2 Pid 3323 on managed-worker-sqfj device 2 [0x00] Tesla V100-SXM2-16GB | |
# Rank 3 Pid 3324 on managed-worker-sqfj device 3 [0x00] Tesla V100-SXM2-16GB | |
# Rank 4 Pid 3325 on managed-worker-sqfj device 4 [0x00] Tesla V100-SXM2-16GB | |
# Rank 5 Pid 3328 on managed-worker-sqfj device 5 [0x00] Tesla V100-SXM2-16GB | |
# Rank 6 Pid 3330 on managed-worker-sqfj device 6 [0x00] Tesla V100-SXM2-16GB | |
# Rank 7 Pid 3334 on managed-worker-sqfj device 7 [0x00] Tesla V100-SXM2-16GB | |
# Rank 8 Pid 2154 on managed-worker-rnls device 0 [0x00] Tesla V100-SXM2-16GB | |
# Rank 9 Pid 2155 on managed-worker-rnls device 1 [0x00] Tesla V100-SXM2-16GB | |
# Rank 10 Pid 2156 on managed-worker-rnls device 2 [0x00] Tesla V100-SXM2-16GB | |
# Rank 11 Pid 2157 on managed-worker-rnls device 3 [0x00] Tesla V100-SXM2-16GB | |
# Rank 12 Pid 2158 on managed-worker-rnls device 4 [0x00] Tesla V100-SXM2-16GB | |
# Rank 13 Pid 2160 on managed-worker-rnls device 5 [0x00] Tesla V100-SXM2-16GB | |
# Rank 14 Pid 2162 on managed-worker-rnls device 6 [0x00] Tesla V100-SXM2-16GB | |
# Rank 15 Pid 2167 on managed-worker-rnls device 7 [0x00] Tesla V100-SXM2-16GB | |
managed-worker-sqfj:3321:3321 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3321:3321 [0] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3321:3321 [0] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3321:3321 [0] NCCL INFO NET/Socket : 1 interfaces found | |
NCCL version 2.3.7+cuda10.0 | |
managed-worker-sqfj:3321:3321 [0] NCCL INFO rank 0 nranks 16 | |
managed-worker-sqfj:3325:3325 [4] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3325:3325 [4] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3325:3325 [4] NCCL INFO rank 4 nranks 16 | |
managed-worker-sqfj:3322:3322 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3322:3322 [1] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3322:3322 [1] NCCL INFO rank 1 nranks 16 | |
managed-worker-rnls:2158:2158 [4] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2158:2158 [4] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2158:2158 [4] NCCL INFO rank 12 nranks 16 | |
managed-worker-sqfj:3323:3323 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3323:3323 [2] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3323:3323 [2] NCCL INFO rank 2 nranks 16 | |
managed-worker-sqfj:3334:3334 [7] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3334:3334 [7] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3334:3334 [7] NCCL INFO rank 7 nranks 16 | |
managed-worker-sqfj:3324:3324 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3324:3324 [3] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3324:3324 [3] NCCL INFO rank 3 nranks 16 | |
managed-worker-rnls:2160:2160 [5] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2160:2160 [5] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2160:2160 [5] NCCL INFO rank 13 nranks 16 | |
managed-worker-sqfj:3328:3328 [5] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3328:3328 [5] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3328:3328 [5] NCCL INFO rank 5 nranks 16 | |
managed-worker-sqfj:3330:3330 [6] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3330:3330 [6] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3330:3330 [6] NCCL INFO rank 6 nranks 16 | |
managed-worker-rnls:2155:2155 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2155:2155 [1] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2155:2155 [1] NCCL INFO rank 9 nranks 16 | |
managed-worker-rnls:2162:2162 [6] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2162:2162 [6] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2162:2162 [6] NCCL INFO rank 14 nranks 16 | |
managed-worker-rnls:2156:2156 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2156:2156 [2] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2156:2156 [2] NCCL INFO rank 10 nranks 16 | |
managed-worker-rnls:2167:2167 [7] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2167:2167 [7] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2167:2167 [7] NCCL INFO rank 15 nranks 16 | |
managed-worker-rnls:2154:2154 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2154:2154 [0] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2154:2154 [0] NCCL INFO rank 8 nranks 16 | |
managed-worker-rnls:2157:2157 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2157:2157 [3] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2157:2157 [3] NCCL INFO rank 11 nranks 16 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO comm 0x7efb6c0566a0 rank 0 nranks 16 | |
managed-worker-rnls:2158:2194 [4] NCCL INFO comm 0x7f37800566a0 rank 12 nranks 16 | |
managed-worker-rnls:2158:2194 [4] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2158:2194 [4] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO comm 0x7ff8ac0566a0 rank 1 nranks 16 | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2160:2195 [5] NCCL INFO comm 0x7fd5dc0566a0 rank 13 nranks 16 | |
managed-worker-rnls:2160:2195 [5] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2160:2195 [5] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2162:2197 [6] NCCL INFO comm 0x7f4e640566a0 rank 14 nranks 16 | |
managed-worker-rnls:2162:2197 [6] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2162:2197 [6] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2156:2196 [2] NCCL INFO comm 0x7fb9340566a0 rank 10 nranks 16 | |
managed-worker-rnls:2156:2196 [2] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2156:2196 [2] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2157:2200 [3] NCCL INFO comm 0x7fe4a00566a0 rank 11 nranks 16 | |
managed-worker-rnls:2157:2200 [3] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2157:2200 [3] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2167:2198 [7] NCCL INFO comm 0x7f2b440566a0 rank 15 nranks 16 | |
managed-worker-rnls:2167:2198 [7] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2167:2198 [7] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2155:2199 [1] NCCL INFO comm 0x7fd09c0566a0 rank 9 nranks 16 | |
managed-worker-rnls:2155:2199 [1] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2155:2199 [1] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2154:2201 [0] NCCL INFO comm 0x7f5b280566a0 rank 8 nranks 16 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2154:2201 [0] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO comm 0x7fa9940566a0 rank 3 nranks 16 | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO comm 0x7f8e540566a0 rank 4 nranks 16 | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3334:3365 [7] NCCL INFO comm 0x7f3f440566a0 rank 7 nranks 16 | |
managed-worker-sqfj:3334:3365 [7] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3334:3365 [7] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO comm 0x7f73ac0566a0 rank 5 nranks 16 | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO comm 0x7fea940566a0 rank 2 nranks 16 | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO comm 0x7fde300566a0 rank 6 nranks 16 | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO CUDA Dev 3, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO CUDA Dev 4, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO CUDA Dev 2, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO CUDA Dev 6, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO CUDA Dev 5, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO CUDA Dev 1, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3334:3365 [7] NCCL INFO CUDA Dev 7, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2155:2199 [1] NCCL INFO CUDA Dev 1, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2154:2201 [0] NCCL INFO CUDA Dev 0, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2156:2196 [2] NCCL INFO CUDA Dev 2, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2158:2194 [4] NCCL INFO CUDA Dev 4, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2157:2200 [3] NCCL INFO CUDA Dev 3, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2162:2197 [6] NCCL INFO CUDA Dev 6, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2160:2195 [5] NCCL INFO CUDA Dev 5, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO CUDA Dev 0, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2167:2198 [7] NCCL INFO CUDA Dev 7, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3328:3368 [5] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-sqfj:3330:3369 [6] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-sqfj:3334:3365 [7] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3334:3365 [7] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-sqfj:3334:3365 [7] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-sqfj:3323:3367 [2] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3321:3362 [0] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Duplicating rings to 16 per user request. | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-sqfj:3322:3363 [1] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-sqfj:3324:3366 [3] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-sqfj:3325:3364 [4] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-rnls:2158:2194 [4] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2158:2194 [4] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-rnls:2158:2194 [4] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-rnls:2162:2197 [6] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2162:2197 [6] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-rnls:2162:2197 [6] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-rnls:2160:2195 [5] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2160:2195 [5] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-rnls:2160:2195 [5] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-rnls:2167:2198 [7] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2167:2198 [7] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-rnls:2167:2198 [7] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-rnls:2156:2196 [2] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2156:2196 [2] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-rnls:2156:2196 [2] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-rnls:2154:2201 [0] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2154:2201 [0] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-rnls:2154:2201 [0] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-rnls:2155:2199 [1] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2155:2199 [1] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-rnls:2155:2199 [1] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-rnls:2157:2200 [3] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2157:2200 [3] NCCL INFO NCCL_MAX_NRINGS set by environment to 16. | |
managed-worker-rnls:2157:2200 [3] NCCL INFO NCCL_MIN_NRINGS set by environment to 16. | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Using 256 threads | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Min Comp Cap 7 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 00 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 01 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 02 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 03 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 04 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 05 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 06 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 07 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 08 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 09 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 10 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 11 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 12 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 13 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 14 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 15 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 00 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 00 : 15 -> 0 via NET/Socket/0 | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO Ring 00 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2157:2200 [3] NCCL INFO Ring 00 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO Ring 00 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2158:2194 [4] NCCL INFO Ring 00 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 00 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 00 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2156:2196 [2] NCCL INFO Ring 00 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2162:2197 [6] NCCL INFO Ring 00 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO Ring 00 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO Ring 00 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] NCCL INFO Ring 00 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-rnls:2155:2199 [1] NCCL INFO Ring 00 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO Ring 00 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO Ring 00 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 01 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 01 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 01 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 01 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO Ring 01 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO Ring 01 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2157:2200 [3] NCCL INFO Ring 01 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-rnls:2158:2194 [4] NCCL INFO Ring 01 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO Ring 01 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO Ring 01 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO Ring 01 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO Ring 01 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2156:2196 [2] NCCL INFO Ring 01 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2162:2197 [6] NCCL INFO Ring 01 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2155:2199 [1] NCCL INFO Ring 01 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] NCCL INFO Ring 01 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 02 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 02 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 02 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 02 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO Ring 02 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2157:2200 [3] NCCL INFO Ring 02 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO Ring 02 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2158:2194 [4] NCCL INFO Ring 02 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO Ring 02 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO Ring 02 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO Ring 02 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-rnls:2156:2196 [2] NCCL INFO Ring 02 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO Ring 02 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2155:2199 [1] NCCL INFO Ring 02 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2162:2197 [6] NCCL INFO Ring 02 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] NCCL INFO Ring 02 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 03 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 03 : 15 -> 0 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 03 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 03 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO Ring 03 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2157:2200 [3] NCCL INFO Ring 03 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO Ring 03 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2158:2194 [4] NCCL INFO Ring 03 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO Ring 03 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO Ring 03 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO Ring 03 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-rnls:2155:2199 [1] NCCL INFO Ring 03 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2156:2196 [2] NCCL INFO Ring 03 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO Ring 03 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2162:2197 [6] NCCL INFO Ring 03 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] NCCL INFO Ring 03 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 04 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 04 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 04 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 04 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO Ring 04 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2157:2200 [3] NCCL INFO Ring 04 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO Ring 04 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2158:2194 [4] NCCL INFO Ring 04 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO Ring 04 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO Ring 04 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO Ring 04 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO Ring 04 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2156:2196 [2] NCCL INFO Ring 04 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2162:2197 [6] NCCL INFO Ring 04 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2155:2199 [1] NCCL INFO Ring 04 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] NCCL INFO Ring 04 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 05 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 05 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 05 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 05 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2157:2200 [3] NCCL INFO Ring 05 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO Ring 05 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO Ring 05 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2158:2194 [4] NCCL INFO Ring 05 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO Ring 05 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO Ring 05 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO Ring 05 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-rnls:2156:2196 [2] NCCL INFO Ring 05 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO Ring 05 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2162:2197 [6] NCCL INFO Ring 05 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2155:2199 [1] NCCL INFO Ring 05 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] NCCL INFO Ring 05 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 06 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 06 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 06 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO Ring 06 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 06 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2157:2200 [3] NCCL INFO Ring 06 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO Ring 06 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2158:2194 [4] NCCL INFO Ring 06 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO Ring 06 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-rnls:2156:2196 [2] NCCL INFO Ring 06 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2162:2197 [6] NCCL INFO Ring 06 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO Ring 06 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO Ring 06 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-rnls:2155:2199 [1] NCCL INFO Ring 06 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO Ring 06 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] NCCL INFO Ring 06 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 07 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 07 : 15 -> 0 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 07 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO Ring 07 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 07 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2157:2200 [3] NCCL INFO Ring 07 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO Ring 07 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2158:2194 [4] NCCL INFO Ring 07 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO Ring 07 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO Ring 07 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO Ring 07 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO Ring 07 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2156:2196 [2] NCCL INFO Ring 07 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2162:2197 [6] NCCL INFO Ring 07 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2155:2199 [1] NCCL INFO Ring 07 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] NCCL INFO Ring 07 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 08 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 08 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 08 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 08 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO Ring 08 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2157:2200 [3] NCCL INFO Ring 08 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO Ring 08 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2158:2194 [4] NCCL INFO Ring 08 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO Ring 08 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO Ring 08 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO Ring 08 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO Ring 08 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2156:2196 [2] NCCL INFO Ring 08 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2162:2197 [6] NCCL INFO Ring 08 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2155:2199 [1] NCCL INFO Ring 08 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] NCCL INFO Ring 08 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 09 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 09 : 15 -> 0 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 09 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 09 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO Ring 09 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2157:2200 [3] NCCL INFO Ring 09 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO Ring 09 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2158:2194 [4] NCCL INFO Ring 09 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO Ring 09 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO Ring 09 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO Ring 09 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO Ring 09 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2155:2199 [1] NCCL INFO Ring 09 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2156:2196 [2] NCCL INFO Ring 09 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2162:2197 [6] NCCL INFO Ring 09 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] NCCL INFO Ring 09 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 10 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 10 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 10 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 10 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO Ring 10 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2157:2200 [3] NCCL INFO Ring 10 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO Ring 10 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2158:2194 [4] NCCL INFO Ring 10 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO Ring 10 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO Ring 10 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO Ring 10 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO Ring 10 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2156:2196 [2] NCCL INFO Ring 10 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2162:2197 [6] NCCL INFO Ring 10 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2155:2199 [1] NCCL INFO Ring 10 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] NCCL INFO Ring 10 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 11 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 11 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 11 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 11 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO Ring 11 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2157:2200 [3] NCCL INFO Ring 11 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO Ring 11 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2158:2194 [4] NCCL INFO Ring 11 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO Ring 11 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO Ring 11 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO Ring 11 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO Ring 11 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2156:2196 [2] NCCL INFO Ring 11 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2162:2197 [6] NCCL INFO Ring 11 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2155:2199 [1] NCCL INFO Ring 11 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] NCCL INFO Ring 11 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 12 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 12 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 12 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 12 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO Ring 12 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2157:2200 [3] NCCL INFO Ring 12 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO Ring 12 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2158:2194 [4] NCCL INFO Ring 12 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO Ring 12 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO Ring 12 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-rnls:2156:2196 [2] NCCL INFO Ring 12 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2155:2199 [1] NCCL INFO Ring 12 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO Ring 12 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-rnls:2162:2197 [6] NCCL INFO Ring 12 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO Ring 12 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] NCCL INFO Ring 12 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 13 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 13 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 13 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2157:2200 [3] NCCL INFO Ring 13 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-rnls:2158:2194 [4] NCCL INFO Ring 13 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 13 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO Ring 13 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO Ring 13 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2156:2196 [2] NCCL INFO Ring 13 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2155:2199 [1] NCCL INFO Ring 13 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2162:2197 [6] NCCL INFO Ring 13 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] NCCL INFO Ring 13 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO Ring 13 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO Ring 13 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO Ring 13 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO Ring 13 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 14 : 7 -> 8 via NET/Socket/0 | |
managed-worker-rnls:2158:2194 [4] include/shm.h:26 NCCL WARN Unable to allocate shared memory (4460544 bytes) : No space left on device | |
managed-worker-rnls:2158:2194 [4] NCCL INFO transport/shm.cu:193 -> 2 | |
managed-worker-rnls:2158:2194 [4] NCCL INFO init.cu:236 -> 2 | |
managed-worker-rnls:2158:2194 [4] NCCL INFO init.cu:263 -> 2 | |
managed-worker-rnls:2158:2194 [4] NCCL INFO init.cu:515 -> 2 | |
managed-worker-rnls:2158:2194 [4] NCCL INFO init.cu:593 -> 2 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 14 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2158:2194 [4] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-rnls: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-sqfj:3325:3364 [4] include/shm.h:26 NCCL WARN Unable to allocate shared memory (4460544 bytes) : No space left on device | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO transport/shm.cu:193 -> 2 | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO init.cu:236 -> 2 | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO init.cu:263 -> 2 | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO init.cu:515 -> 2 | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO init.cu:593 -> 2 | |
managed-worker-sqfj:3325:3364 [4] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-sqfj: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO Ring 14 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2154:2201 [0] NCCL INFO Ring 14 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3324:3366 [3] NCCL INFO Ring 14 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2157:2200 [3] NCCL INFO Ring 14 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO Ring 14 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3323:3367 [2] NCCL INFO Ring 14 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO Ring 14 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-rnls:2156:2196 [2] NCCL INFO Ring 14 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO Ring 14 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-sqfj:3328:3368 [5] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO init.cu:518 -> 2 | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO init.cu:593 -> 2 | |
managed-worker-sqfj:3328:3368 [5] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-sqfj: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-rnls:2155:2199 [1] NCCL INFO Ring 14 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2162:2197 [6] NCCL INFO Ring 14 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] NCCL INFO Ring 14 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-rnls:2160:2195 [5] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-rnls:2160:2195 [5] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-rnls:2160:2195 [5] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-rnls:2160:2195 [5] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-rnls:2160:2195 [5] NCCL INFO init.cu:518 -> 2 | |
managed-worker-rnls:2160:2195 [5] NCCL INFO init.cu:593 -> 2 | |
managed-worker-rnls:2160:2195 [5] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-rnls: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-sqfj:3330:3369 [6] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO init.cu:518 -> 2 | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO init.cu:593 -> 2 | |
managed-worker-sqfj:3330:3369 [6] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-sqfj: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-rnls:2155:2199 [1] include/socket.h:398 NCCL WARN Call to write failed : Connection reset by peer | |
managed-worker-rnls:2155:2199 [1] NCCL INFO transport/net_socket.cu:177 -> 2 | |
managed-worker-rnls:2155:2199 [1] NCCL INFO bootstrap.cu:29 -> 2 | |
managed-worker-rnls:2155:2199 [1] NCCL INFO bootstrap.cu:231 -> 2 | |
managed-worker-rnls:2155:2199 [1] NCCL INFO init.cu:518 -> 2 | |
managed-worker-rnls:2155:2199 [1] NCCL INFO init.cu:593 -> 2 | |
managed-worker-rnls:2155:2199 [1] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-rnls:2162:2197 [6] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-rnls:2162:2197 [6] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-rnls:2162:2197 [6] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-rnls:2162:2197 [6] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-rnls:2162:2197 [6] NCCL INFO init.cu:518 -> 2 | |
managed-worker-rnls:2162:2197 [6] NCCL INFO init.cu:593 -> 2 | |
managed-worker-rnls:2162:2197 [6] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-rnls: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-rnls: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-sqfj:3322:3363 [1] include/socket.h:398 NCCL WARN Call to write failed : Connection reset by peer | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO transport/net_socket.cu:177 -> 2 | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO bootstrap.cu:29 -> 2 | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO bootstrap.cu:231 -> 2 | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO init.cu:518 -> 2 | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO init.cu:593 -> 2 | |
managed-worker-sqfj:3322:3363 [1] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-sqfj: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-sqfj:3334:3365 [7] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-sqfj:3334:3365 [7] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-sqfj:3334:3365 [7] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-sqfj:3334:3365 [7] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-sqfj:3334:3365 [7] NCCL INFO init.cu:518 -> 2 | |
managed-worker-sqfj:3334:3365 [7] NCCL INFO init.cu:593 -> 2 | |
managed-worker-sqfj:3334:3365 [7] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-sqfj: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-rnls:2167:2198 [7] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-rnls:2167:2198 [7] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-rnls:2167:2198 [7] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-rnls:2167:2198 [7] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-rnls:2167:2198 [7] NCCL INFO init.cu:518 -> 2 | |
managed-worker-rnls:2167:2198 [7] NCCL INFO init.cu:593 -> 2 | |
managed-worker-rnls:2167:2198 [7] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-rnls: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-rnls:2154:2201 [0] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-rnls:2154:2201 [0] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO init.cu:518 -> 2 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO init.cu:593 -> 2 | |
managed-worker-rnls:2154:2201 [0] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-rnls: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-sqfj:3321:3362 [0] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO init.cu:518 -> 2 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO init.cu:593 -> 2 | |
managed-worker-sqfj:3321:3362 [0] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-sqfj: Test NCCL failure common.cu:782 'unhandled system error' | |
-------------------------------------------------------------------------- | |
Primary job terminated normally, but 1 process returned | |
a non-zero exit code. Per user-direction, the job has been aborted. | |
-------------------------------------------------------------------------- | |
-------------------------------------------------------------------------- | |
mpirun detected that one or more processes exited with non-zero status, thus causing | |
the job to be terminated. The first process to do so was: | |
Process name: [[5168,1],4] | |
Exit code: 3 | |
-------------------------------------------------------------------------- | |
# nThread 1 nGpus 1 minBytes 16777216 maxBytes 1073741824 step: 2(factor) warmup iters: 5 iters: 20 validation: 0 | |
# | |
# Using devices | |
# Rank 0 Pid 3406 on managed-worker-sqfj device 0 [0x00] Tesla V100-SXM2-16GB | |
# Rank 1 Pid 3407 on managed-worker-sqfj device 1 [0x00] Tesla V100-SXM2-16GB | |
# Rank 2 Pid 3408 on managed-worker-sqfj device 2 [0x00] Tesla V100-SXM2-16GB | |
# Rank 3 Pid 3409 on managed-worker-sqfj device 3 [0x00] Tesla V100-SXM2-16GB | |
# Rank 4 Pid 3410 on managed-worker-sqfj device 4 [0x00] Tesla V100-SXM2-16GB | |
# Rank 5 Pid 3415 on managed-worker-sqfj device 5 [0x00] Tesla V100-SXM2-16GB | |
# Rank 6 Pid 3419 on managed-worker-sqfj device 6 [0x00] Tesla V100-SXM2-16GB | |
# Rank 7 Pid 3421 on managed-worker-sqfj device 7 [0x00] Tesla V100-SXM2-16GB | |
# Rank 8 Pid 2252 on managed-worker-rnls device 0 [0x00] Tesla V100-SXM2-16GB | |
# Rank 9 Pid 2253 on managed-worker-rnls device 1 [0x00] Tesla V100-SXM2-16GB | |
# Rank 10 Pid 2254 on managed-worker-rnls device 2 [0x00] Tesla V100-SXM2-16GB | |
# Rank 11 Pid 2255 on managed-worker-rnls device 3 [0x00] Tesla V100-SXM2-16GB | |
# Rank 12 Pid 2256 on managed-worker-rnls device 4 [0x00] Tesla V100-SXM2-16GB | |
# Rank 13 Pid 2259 on managed-worker-rnls device 5 [0x00] Tesla V100-SXM2-16GB | |
# Rank 14 Pid 2262 on managed-worker-rnls device 6 [0x00] Tesla V100-SXM2-16GB | |
# Rank 15 Pid 2266 on managed-worker-rnls device 7 [0x00] Tesla V100-SXM2-16GB | |
managed-worker-sqfj:3406:3406 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3406:3406 [0] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3406:3406 [0] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3406:3406 [0] NCCL INFO NET/Socket : 1 interfaces found | |
NCCL version 2.3.7+cuda10.0 | |
managed-worker-sqfj:3406:3406 [0] NCCL INFO rank 0 nranks 16 | |
managed-worker-rnls:2252:2252 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2252:2252 [0] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2252:2252 [0] NCCL INFO rank 8 nranks 16 | |
managed-worker-rnls:2266:2266 [7] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2266:2266 [7] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2266:2266 [7] NCCL INFO rank 15 nranks 16 | |
managed-worker-rnls:2259:2259 [5] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2259:2259 [5] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2259:2259 [5] NCCL INFO rank 13 nranks 16 | |
managed-worker-rnls:2256:2256 [4] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2256:2256 [4] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2256:2256 [4] NCCL INFO rank 12 nranks 16 | |
managed-worker-rnls:2254:2254 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2254:2254 [2] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2254:2254 [2] NCCL INFO rank 10 nranks 16 | |
managed-worker-rnls:2253:2253 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2253:2253 [1] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2253:2253 [1] NCCL INFO rank 9 nranks 16 | |
managed-worker-rnls:2255:2255 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2255:2255 [3] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2255:2255 [3] NCCL INFO rank 11 nranks 16 | |
managed-worker-rnls:2262:2262 [6] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-rnls:2262:2262 [6] NCCL INFO Using internal Network Socket | |
managed-worker-rnls:2262:2262 [6] NCCL INFO rank 14 nranks 16 | |
managed-worker-sqfj:3415:3415 [5] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3415:3415 [5] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3415:3415 [5] NCCL INFO rank 5 nranks 16 | |
managed-worker-sqfj:3409:3409 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3409:3409 [3] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3409:3409 [3] NCCL INFO rank 3 nranks 16 | |
managed-worker-sqfj:3408:3408 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3408:3408 [2] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3408:3408 [2] NCCL INFO rank 2 nranks 16 | |
managed-worker-sqfj:3421:3421 [7] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3421:3421 [7] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3421:3421 [7] NCCL INFO rank 7 nranks 16 | |
managed-worker-sqfj:3419:3419 [6] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3419:3419 [6] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3419:3419 [6] NCCL INFO rank 6 nranks 16 | |
managed-worker-sqfj:3407:3407 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3407:3407 [1] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3407:3407 [1] NCCL INFO rank 1 nranks 16 | |
managed-worker-sqfj:3410:3410 [4] NCCL INFO NCCL_IB_DISABLE set by environment to 1. | |
managed-worker-sqfj:3410:3410 [4] NCCL INFO Using internal Network Socket | |
managed-worker-sqfj:3410:3410 [4] NCCL INFO rank 4 nranks 16 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO comm 0x7f79940566a0 rank 0 nranks 16 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO comm 0x7f084c0566a0 rank 8 nranks 16 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2252:2292 [0] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2259:2294 [5] NCCL INFO comm 0x7f19400566a0 rank 13 nranks 16 | |
managed-worker-rnls:2259:2294 [5] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2259:2294 [5] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO comm 0x7fc7400566a0 rank 5 nranks 16 | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO comm 0x7fcc7c0566a0 rank 2 nranks 16 | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2266:2293 [7] NCCL INFO comm 0x7f84ec0566a0 rank 15 nranks 16 | |
managed-worker-rnls:2266:2293 [7] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2266:2293 [7] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2253:2295 [1] NCCL INFO comm 0x7f815c0566a0 rank 9 nranks 16 | |
managed-worker-rnls:2253:2295 [1] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2253:2295 [1] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3421:3450 [7] NCCL INFO comm 0x7fd3680566a0 rank 7 nranks 16 | |
managed-worker-sqfj:3421:3450 [7] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3421:3450 [7] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2255:2297 [3] NCCL INFO comm 0x7fb10c0566a0 rank 11 nranks 16 | |
managed-worker-rnls:2255:2297 [3] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2255:2297 [3] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2256:2296 [4] NCCL INFO comm 0x7fd6300566a0 rank 12 nranks 16 | |
managed-worker-rnls:2254:2298 [2] NCCL INFO comm 0x7f05500566a0 rank 10 nranks 16 | |
managed-worker-rnls:2256:2296 [4] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2256:2296 [4] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2254:2298 [2] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2254:2298 [2] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-rnls:2262:2299 [6] NCCL INFO comm 0x7f20b40566a0 rank 14 nranks 16 | |
managed-worker-rnls:2262:2299 [6] NCCL INFO NET : Using interface ens12:10.73.0.19<0> | |
managed-worker-rnls:2262:2299 [6] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO comm 0x7fe1600566a0 rank 4 nranks 16 | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO comm 0x7f32d40566a0 rank 6 nranks 16 | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO comm 0x7faf440566a0 rank 3 nranks 16 | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO comm 0x7f4f980566a0 rank 1 nranks 16 | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO NET : Using interface ens12:10.73.0.95<0> | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO NET/Socket : 1 interfaces found | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO CUDA Dev 1, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO CUDA Dev 2, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO CUDA Dev 4, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO CUDA Dev 5, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2252:2292 [0] NCCL INFO CUDA Dev 0, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO CUDA Dev 3, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO CUDA Dev 6, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3421:3450 [7] NCCL INFO CUDA Dev 7, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2253:2295 [1] NCCL INFO CUDA Dev 1, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2256:2296 [4] NCCL INFO CUDA Dev 4, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2255:2297 [3] NCCL INFO CUDA Dev 3, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2254:2298 [2] NCCL INFO CUDA Dev 2, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2262:2299 [6] NCCL INFO CUDA Dev 6, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2266:2293 [7] NCCL INFO CUDA Dev 7, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2259:2294 [5] NCCL INFO CUDA Dev 5, IP Interfaces : ens12(PHB) | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO CUDA Dev 0, IP Interfaces : ens12(PHB) | |
managed-worker-rnls:2262:2299 [6] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2262:2299 [6] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-rnls:2262:2299 [6] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-rnls:2266:2293 [7] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2266:2293 [7] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-rnls:2266:2293 [7] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-sqfj:3408:3448 [2] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-sqfj:3415:3449 [5] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-sqfj:3406:3447 [0] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-sqfj:3406:3447 [0] misc/rings.cu:332 NCCL WARN NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (16), limiting it to 16 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Duplicating rings to 16 per user request. | |
managed-worker-sqfj:3409:3451 [3] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-sqfj:3407:3454 [1] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-sqfj:3410:3453 [4] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-sqfj:3421:3450 [7] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3421:3450 [7] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-sqfj:3421:3450 [7] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-sqfj:3419:3452 [6] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-rnls:2253:2295 [1] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2253:2295 [1] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-rnls:2253:2295 [1] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-rnls:2256:2296 [4] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2256:2296 [4] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-rnls:2256:2296 [4] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-rnls:2254:2298 [2] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2254:2298 [2] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-rnls:2254:2298 [2] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-rnls:2252:2292 [0] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2252:2292 [0] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-rnls:2252:2292 [0] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-rnls:2255:2297 [3] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2255:2297 [3] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-rnls:2255:2297 [3] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-rnls:2259:2294 [5] misc/rings.cu:319 NCCL WARN Could not create rings, falling back on simple ring | |
managed-worker-rnls:2259:2294 [5] NCCL INFO NCCL_MAX_NRINGS set by environment to 32. | |
managed-worker-rnls:2259:2294 [5] NCCL INFO NCCL_MIN_NRINGS set by environment to 32. | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Using 256 threads | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Min Comp Cap 7 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 00 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 01 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 02 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 03 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 04 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 05 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 06 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 07 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 08 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 09 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 10 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 11 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 12 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 13 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 14 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 15 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 00 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 00 : 15 -> 0 via NET/Socket/0 | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO Ring 00 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2255:2297 [3] NCCL INFO Ring 00 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO Ring 00 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2256:2296 [4] NCCL INFO Ring 00 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 00 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 00 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO Ring 00 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO Ring 00 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-rnls:2254:2298 [2] NCCL INFO Ring 00 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO Ring 00 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO Ring 00 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-rnls:2262:2299 [6] NCCL INFO Ring 00 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2253:2295 [1] NCCL INFO Ring 00 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] NCCL INFO Ring 00 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 01 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 01 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 01 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 01 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO Ring 01 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO Ring 01 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2255:2297 [3] NCCL INFO Ring 01 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-rnls:2256:2296 [4] NCCL INFO Ring 01 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO Ring 01 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO Ring 01 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO Ring 01 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO Ring 01 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2254:2298 [2] NCCL INFO Ring 01 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2262:2299 [6] NCCL INFO Ring 01 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2253:2295 [1] NCCL INFO Ring 01 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] NCCL INFO Ring 01 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 02 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 02 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 02 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 02 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO Ring 02 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2255:2297 [3] NCCL INFO Ring 02 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-rnls:2256:2296 [4] NCCL INFO Ring 02 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO Ring 02 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO Ring 02 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-rnls:2254:2298 [2] NCCL INFO Ring 02 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO Ring 02 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-rnls:2262:2299 [6] NCCL INFO Ring 02 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2253:2295 [1] NCCL INFO Ring 02 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO Ring 02 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] NCCL INFO Ring 02 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO Ring 02 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 03 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 03 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 03 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO Ring 03 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 03 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2255:2297 [3] NCCL INFO Ring 03 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO Ring 03 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2256:2296 [4] NCCL INFO Ring 03 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO Ring 03 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO Ring 03 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO Ring 03 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO Ring 03 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2254:2298 [2] NCCL INFO Ring 03 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2262:2299 [6] NCCL INFO Ring 03 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2253:2295 [1] NCCL INFO Ring 03 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] NCCL INFO Ring 03 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 04 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 04 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 04 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 04 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2255:2297 [3] NCCL INFO Ring 04 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO Ring 04 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2256:2296 [4] NCCL INFO Ring 04 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO Ring 04 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO Ring 04 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO Ring 04 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO Ring 04 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO Ring 04 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2254:2298 [2] NCCL INFO Ring 04 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2262:2299 [6] NCCL INFO Ring 04 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2253:2295 [1] NCCL INFO Ring 04 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] NCCL INFO Ring 04 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 05 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 05 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 05 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 05 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2255:2297 [3] NCCL INFO Ring 05 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO Ring 05 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2256:2296 [4] NCCL INFO Ring 05 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO Ring 05 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2254:2298 [2] NCCL INFO Ring 05 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2253:2295 [1] NCCL INFO Ring 05 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2262:2299 [6] NCCL INFO Ring 05 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] NCCL INFO Ring 05 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO Ring 05 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO Ring 05 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO Ring 05 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO Ring 05 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 06 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 06 : 7 -> 8 via NET/Socket/0 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 06 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 06 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-rnls:2255:2297 [3] NCCL INFO Ring 06 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO Ring 06 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2256:2296 [4] NCCL INFO Ring 06 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO Ring 06 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2254:2298 [2] NCCL INFO Ring 06 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2262:2299 [6] NCCL INFO Ring 06 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2253:2295 [1] NCCL INFO Ring 06 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] NCCL INFO Ring 06 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO Ring 06 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO Ring 06 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO Ring 06 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO Ring 06 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 07 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 07 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 07 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2255:2297 [3] NCCL INFO Ring 07 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 07 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO Ring 07 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2256:2296 [4] NCCL INFO Ring 07 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO Ring 07 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2254:2298 [2] NCCL INFO Ring 07 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2262:2299 [6] NCCL INFO Ring 07 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2253:2295 [1] NCCL INFO Ring 07 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO Ring 07 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] NCCL INFO Ring 07 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO Ring 07 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO Ring 07 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO Ring 07 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 08 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 08 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 08 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 08 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO Ring 08 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2255:2297 [3] NCCL INFO Ring 08 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-rnls:2256:2296 [4] NCCL INFO Ring 08 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO Ring 08 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2253:2295 [1] NCCL INFO Ring 08 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2254:2298 [2] NCCL INFO Ring 08 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2262:2299 [6] NCCL INFO Ring 08 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] NCCL INFO Ring 08 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO Ring 08 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO Ring 08 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO Ring 08 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO Ring 08 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 09 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 09 : 15 -> 0 via NET/Socket/0 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 09 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO Ring 09 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 09 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2255:2297 [3] NCCL INFO Ring 09 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO Ring 09 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2256:2296 [4] NCCL INFO Ring 09 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO Ring 09 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO Ring 09 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO Ring 09 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO Ring 09 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2254:2298 [2] NCCL INFO Ring 09 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2262:2299 [6] NCCL INFO Ring 09 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2253:2295 [1] NCCL INFO Ring 09 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] NCCL INFO Ring 09 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 10 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 10 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 10 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO Ring 10 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 10 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2255:2297 [3] NCCL INFO Ring 10 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO Ring 10 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2256:2296 [4] NCCL INFO Ring 10 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO Ring 10 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO Ring 10 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO Ring 10 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO Ring 10 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2254:2298 [2] NCCL INFO Ring 10 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2262:2299 [6] NCCL INFO Ring 10 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2253:2295 [1] NCCL INFO Ring 10 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] NCCL INFO Ring 10 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 11 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 11 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 11 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO Ring 11 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 11 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2255:2297 [3] NCCL INFO Ring 11 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO Ring 11 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2256:2296 [4] NCCL INFO Ring 11 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO Ring 11 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO Ring 11 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO Ring 11 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO Ring 11 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2253:2295 [1] NCCL INFO Ring 11 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2254:2298 [2] NCCL INFO Ring 11 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2262:2299 [6] NCCL INFO Ring 11 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] NCCL INFO Ring 11 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 12 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 12 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 12 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO Ring 12 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 12 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO Ring 12 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2255:2297 [3] NCCL INFO Ring 12 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-rnls:2256:2296 [4] NCCL INFO Ring 12 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO Ring 12 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO Ring 12 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO Ring 12 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO Ring 12 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2254:2298 [2] NCCL INFO Ring 12 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2253:2295 [1] NCCL INFO Ring 12 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2262:2299 [6] NCCL INFO Ring 12 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] NCCL INFO Ring 12 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 13 : 15 -> 0 via NET/Socket/0 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 13 : 7 -> 8 via NET/Socket/0 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 13 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO Ring 13 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 13 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2255:2297 [3] NCCL INFO Ring 13 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO Ring 13 : 4[4] -> 5[5] via P2P/IPC | |
managed-worker-rnls:2256:2296 [4] NCCL INFO Ring 13 : 12[4] -> 13[5] via P2P/IPC | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO Ring 13 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO Ring 13 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO Ring 13 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO Ring 13 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-rnls:2254:2298 [2] NCCL INFO Ring 13 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2253:2295 [1] NCCL INFO Ring 13 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2262:2299 [6] NCCL INFO Ring 13 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] NCCL INFO Ring 13 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 14 : 15 -> 0 via NET/Socket/0 | |
managed-worker-sqfj:3410:3453 [4] include/shm.h:26 NCCL WARN Unable to allocate shared memory (4460544 bytes) : No space left on device | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO transport/shm.cu:193 -> 2 | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO init.cu:236 -> 2 | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO init.cu:263 -> 2 | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO init.cu:515 -> 2 | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO init.cu:593 -> 2 | |
managed-worker-sqfj:3410:3453 [4] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-sqfj: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 14 : 7 -> 8 via NET/Socket/0 | |
managed-worker-rnls:2256:2296 [4] include/shm.h:26 NCCL WARN Unable to allocate shared memory (4460544 bytes) : No space left on device | |
managed-worker-rnls:2256:2296 [4] NCCL INFO transport/shm.cu:193 -> 2 | |
managed-worker-rnls:2256:2296 [4] NCCL INFO init.cu:236 -> 2 | |
managed-worker-rnls:2256:2296 [4] NCCL INFO init.cu:263 -> 2 | |
managed-worker-rnls:2256:2296 [4] NCCL INFO init.cu:515 -> 2 | |
managed-worker-rnls:2256:2296 [4] NCCL INFO init.cu:593 -> 2 | |
managed-worker-rnls:2256:2296 [4] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-rnls: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO Ring 14 : 0[0] -> 1[1] via P2P/IPC | |
managed-worker-sqfj:3409:3451 [3] NCCL INFO Ring 14 : 3[3] -> 4[4] via direct shared memory | |
managed-worker-rnls:2252:2292 [0] NCCL INFO Ring 14 : 8[0] -> 9[1] via P2P/IPC | |
managed-worker-rnls:2255:2297 [3] NCCL INFO Ring 14 : 11[3] -> 12[4] via direct shared memory | |
managed-worker-sqfj:3408:3448 [2] NCCL INFO Ring 14 : 2[2] -> 3[3] via P2P/IPC | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO Ring 14 : 1[1] -> 2[2] via P2P/IPC | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO Ring 14 : 6[6] -> 7[7] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO Ring 14 : 5[5] -> 6[6] via P2P/IPC | |
managed-worker-sqfj:3415:3449 [5] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO init.cu:518 -> 2 | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO init.cu:593 -> 2 | |
managed-worker-sqfj:3415:3449 [5] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-sqfj: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-rnls:2254:2298 [2] NCCL INFO Ring 14 : 10[2] -> 11[3] via P2P/IPC | |
managed-worker-rnls:2262:2299 [6] NCCL INFO Ring 14 : 14[6] -> 15[7] via P2P/IPC | |
managed-worker-rnls:2253:2295 [1] NCCL INFO Ring 14 : 9[1] -> 10[2] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] NCCL INFO Ring 14 : 13[5] -> 14[6] via P2P/IPC | |
managed-worker-rnls:2259:2294 [5] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-rnls:2259:2294 [5] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-rnls:2259:2294 [5] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-rnls:2259:2294 [5] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-rnls:2259:2294 [5] NCCL INFO init.cu:518 -> 2 | |
managed-worker-rnls:2259:2294 [5] NCCL INFO init.cu:593 -> 2 | |
managed-worker-rnls:2259:2294 [5] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-rnls: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-sqfj:3419:3452 [6] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO init.cu:518 -> 2 | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO init.cu:593 -> 2 | |
managed-worker-sqfj:3419:3452 [6] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-sqfj: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-rnls:2253:2295 [1] include/socket.h:398 NCCL WARN Call to write failed : Connection reset by peer | |
managed-worker-rnls:2253:2295 [1] NCCL INFO transport/net_socket.cu:177 -> 2 | |
managed-worker-rnls:2253:2295 [1] NCCL INFO bootstrap.cu:29 -> 2 | |
managed-worker-rnls:2253:2295 [1] NCCL INFO bootstrap.cu:231 -> 2 | |
managed-worker-rnls:2253:2295 [1] NCCL INFO init.cu:518 -> 2 | |
managed-worker-rnls:2253:2295 [1] NCCL INFO init.cu:593 -> 2 | |
managed-worker-rnls:2253:2295 [1] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-rnls: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-rnls:2262:2299 [6] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-rnls:2262:2299 [6] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-rnls:2262:2299 [6] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-rnls:2262:2299 [6] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-rnls:2262:2299 [6] NCCL INFO init.cu:518 -> 2 | |
managed-worker-rnls:2262:2299 [6] NCCL INFO init.cu:593 -> 2 | |
managed-worker-rnls:2262:2299 [6] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-rnls: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-sqfj:3407:3454 [1] include/socket.h:398 NCCL WARN Call to write failed : Connection reset by peer | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO transport/net_socket.cu:177 -> 2 | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO bootstrap.cu:29 -> 2 | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO bootstrap.cu:231 -> 2 | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO init.cu:518 -> 2 | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO init.cu:593 -> 2 | |
managed-worker-sqfj:3407:3454 [1] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-sqfj: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-sqfj:3421:3450 [7] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-sqfj:3421:3450 [7] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-sqfj:3421:3450 [7] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-sqfj:3421:3450 [7] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-sqfj:3421:3450 [7] NCCL INFO init.cu:518 -> 2 | |
managed-worker-sqfj:3421:3450 [7] NCCL INFO init.cu:593 -> 2 | |
managed-worker-sqfj:3421:3450 [7] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-sqfj: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-rnls:2266:2293 [7] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-rnls:2266:2293 [7] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-rnls:2266:2293 [7] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-rnls:2266:2293 [7] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-rnls:2266:2293 [7] NCCL INFO init.cu:518 -> 2 | |
managed-worker-rnls:2266:2293 [7] NCCL INFO init.cu:593 -> 2 | |
managed-worker-rnls:2266:2293 [7] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-rnls: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-rnls:2252:2292 [0] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-rnls:2252:2292 [0] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO init.cu:518 -> 2 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO init.cu:593 -> 2 | |
managed-worker-rnls:2252:2292 [0] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-rnls: Test NCCL failure common.cu:782 'unhandled system error' | |
managed-worker-sqfj:3406:3447 [0] include/socket.h:380 NCCL WARN Net : Connection closed by remote peer | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO transport/net_socket.cu:186 -> 2 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO bootstrap.cu:36 -> 2 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO bootstrap.cu:233 -> 2 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO init.cu:518 -> 2 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO init.cu:593 -> 2 | |
managed-worker-sqfj:3406:3447 [0] NCCL INFO misc/group.cu:69 -> 2 [Async thread] | |
managed-worker-sqfj: Test NCCL failure common.cu:782 'unhandled system error' | |
-------------------------------------------------------------------------- | |
Primary job terminated normally, but 1 process returned | |
a non-zero exit code. Per user-direction, the job has been aborted. | |
-------------------------------------------------------------------------- | |
-------------------------------------------------------------------------- | |
mpirun detected that one or more processes exited with non-zero status, thus causing | |
the job to be terminated. The first process to do so was: | |
Process name: [[5515,1],4] | |
Exit code: 3 | |
-------------------------------------------------------------------------- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment