Created
May 30, 2023 14:05
-
-
Save surak/5f3f236616e5db48f19d31df457b4350 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
+ '[' -z '' ']' | |
+ case "$-" in | |
+ __lmod_vx=x | |
+ '[' -n x ']' | |
+ set +x | |
Shell debugging temporarily silenced: export LMOD_SH_DBG_ON=1 for this output (/p/software/juwelsbooster/lmod/8.7.12/init/bash) | |
Shell debugging restarted | |
+ unset __lmod_vx | |
+ export SRUN_CPUS_PER_TASK=48 | |
+ SRUN_CPUS_PER_TASK=48 | |
++ scontrol show hostnames 'jwb[0038,0061]' | |
++ head -n 1 | |
+ MASTER_ADDR=jwb0038 | |
+ MASTER_ADDR=jwb0038i | |
++ nslookup jwb0038i | |
++ grep -oP '(?<=Address: ).*' | |
+ export MASTER_ADDR=10.13.23.40 | |
+ MASTER_ADDR=10.13.23.40 | |
+ export MASTER_PORT=7010 | |
+ MASTER_PORT=7010 | |
+ export GPUS_PER_NODE=4 | |
+ GPUS_PER_NODE=4 | |
+ export NNODES=2 | |
+ NNODES=2 | |
+ export CUDA_LAUNCH_BLOCKING=1 | |
+ CUDA_LAUNCH_BLOCKING=1 | |
+ export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json | |
+ TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json | |
+ export NCCL_ASYNC_ERROR_HANDLING=1 | |
+ NCCL_ASYNC_ERROR_HANDLING=1 | |
+ export NCCL_IB_TIMEOUT=20 | |
+ NCCL_IB_TIMEOUT=20 | |
+ cd /p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src | |
+ source sc_venv_template/activate.sh | |
++ SOURCE_PATH=sc_venv_template/activate.sh | |
+++ dirname sc_venv_template/activate.sh | |
++ RELATIVE_PATH=sc_venv_template | |
+++ realpath sc_venv_template | |
++ ABSOLUTE_PATH=/p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template | |
++ [[ /var/spool/parastation/jobs/7791752 != \s\c\_\v\e\n\v\_\t\e\m\p\l\a\t\e\/\a\c\t\i\v\a\t\e\.\s\h ]] | |
++ echo 'The activation script must be sourced, otherwise the virtual environment will not work.' | |
++ source /p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template/config.sh | |
+++ SOURCE_PATH=/p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template/config.sh | |
+++ [[ /var/spool/parastation/jobs/7791752 != \/\p\/\p\r\o\j\e\c\t\/\c\c\s\t\a\o\/\c\s\t\a\o\0\5\/\2\0\2\3\-\m\a\y\-\i\n\t\r\o\-\t\o\-\s\u\p\e\r\c\o\m\p\t\i\n\g\-\j\s\c\/\s\r\c\/\s\c\_\v\e\n\v\_\t\e\m\p\l\a\t\e\/\c\o\n\f\i\g\.\s\h ]] | |
+++ echo 'Setting vars' | |
++++ dirname /p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template/config.sh | |
+++ RELATIVE_PATH=/p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template | |
++++ realpath /p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template | |
+++ ABSOLUTE_PATH=/p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template | |
++++ basename /p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template | |
+++ export ENV_NAME=sc_venv_template | |
+++ ENV_NAME=sc_venv_template | |
+++ export ENV_DIR=/p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template/venv | |
+++ ENV_DIR=/p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template/venv | |
++ source /p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template/modules.sh | |
+++ module purge | |
+++ '[' -z '' ']' | |
+++ case "$-" in | |
+++ __lmod_sh_dbg=x | |
+++ '[' -n x ']' | |
+++ set +x | |
Shell debugging temporarily silenced: export LMOD_SH_DBG_ON=1 for Lmod's output | |
The following modules were not unloaded: | |
(Use "module --force purge" to unload all): | |
1) Stages/2023 | |
Shell debugging restarted | |
+++ unset __lmod_sh_dbg | |
+++ return 0 | |
+++ module load Stages/2023 | |
+++ '[' -z '' ']' | |
+++ case "$-" in | |
+++ __lmod_sh_dbg=x | |
+++ '[' -n x ']' | |
+++ set +x | |
Shell debugging temporarily silenced: export LMOD_SH_DBG_ON=1 for Lmod's output | |
Shell debugging restarted | |
+++ unset __lmod_sh_dbg | |
+++ return 0 | |
+++ module load GCC OpenMPI | |
+++ '[' -z '' ']' | |
+++ case "$-" in | |
+++ __lmod_sh_dbg=x | |
+++ '[' -n x ']' | |
+++ set +x | |
Shell debugging temporarily silenced: export LMOD_SH_DBG_ON=1 for Lmod's output | |
Shell debugging restarted | |
+++ unset __lmod_sh_dbg | |
+++ return 0 | |
+++ module load mpi4py numba tqdm OpenCV matplotlib IPython SciPy-Stack bokeh git | |
+++ '[' -z '' ']' | |
+++ case "$-" in | |
+++ __lmod_sh_dbg=x | |
+++ '[' -n x ']' | |
+++ set +x | |
Shell debugging temporarily silenced: export LMOD_SH_DBG_ON=1 for Lmod's output | |
Shell debugging restarted | |
+++ unset __lmod_sh_dbg | |
+++ return 0 | |
+++ module load Flask Seaborn | |
+++ '[' -z '' ']' | |
+++ case "$-" in | |
+++ __lmod_sh_dbg=x | |
+++ '[' -n x ']' | |
+++ set +x | |
Shell debugging temporarily silenced: export LMOD_SH_DBG_ON=1 for Lmod's output | |
Shell debugging restarted | |
+++ unset __lmod_sh_dbg | |
+++ return 0 | |
+++ module load PyQuil | |
+++ '[' -z '' ']' | |
+++ case "$-" in | |
+++ __lmod_sh_dbg=x | |
+++ '[' -n x ']' | |
+++ set +x | |
Shell debugging temporarily silenced: export LMOD_SH_DBG_ON=1 for Lmod's output | |
Shell debugging restarted | |
+++ unset __lmod_sh_dbg | |
+++ return 0 | |
+++ module load PyTorch scikit-learn torchvision PyTorch-Lightning | |
+++ '[' -z '' ']' | |
+++ case "$-" in | |
+++ __lmod_sh_dbg=x | |
+++ '[' -n x ']' | |
+++ set +x | |
Shell debugging temporarily silenced: export LMOD_SH_DBG_ON=1 for Lmod's output | |
Shell debugging restarted | |
+++ unset __lmod_sh_dbg | |
+++ return 0 | |
+++ echo /p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template/venv/lib/python3.10/site-packages | |
++ export PYTHONPATH=/p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template/venv/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/PyTorch-Lightning/1.8.2-foss-2022a-CUDA-11.7/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/tensorboard/2.11.2-foss-2022a/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/torchvision/0.13.1-foss-2022a-CUDA-11.7/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/scikit-learn/1.1.2-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/PyTorch/1.12.0-foss-2022a-CUDA-11.7/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/expecttest/0.1.3-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/protobuf-python/3.19.4-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/PyQuil/3.3.3-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/networkx/2.8.4-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/Seaborn/0.12.1-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/Flask/2.2.2-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/bokeh/2.4.2-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/xarray/2022.9.0-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/dask/2022.12.0-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/typing-extensions/4.3.0-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/PyYAML/6.0-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/h5py/3.7.0-GCCcore-11.3.0-serial/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/netcdf4-python/1.6.1-GCCcore-11.3.0-serial/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/sympy/1.11.1-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/gmpy2/2.1.2-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/IPython/8.5.0-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/BeautifulSoup/4.10.0-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/lxml/4.9.1-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/matplotlib/3.5.2-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/Pillow-SIMD/9.2.0-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/Tkinter/3.10.4-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/OpenCV/4.7.0-gcccoremkl-11.3.0-2022.1.0-CUDA-11.7-contrib/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/tqdm/4.64.0-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/numba/0.56.4-foss-2022a-CUDA-11.7/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/SciPy-bundle/2022.05-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/pybind11/2.9.2-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/mpi4py/3.1.4-gompi-2022a/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/Python/3.10.4-GCCcore-11.3.0/easybuild/python | |
++ PYTHONPATH=/p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template/venv/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/PyTorch-Lightning/1.8.2-foss-2022a-CUDA-11.7/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/tensorboard/2.11.2-foss-2022a/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/torchvision/0.13.1-foss-2022a-CUDA-11.7/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/scikit-learn/1.1.2-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/PyTorch/1.12.0-foss-2022a-CUDA-11.7/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/expecttest/0.1.3-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/protobuf-python/3.19.4-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/PyQuil/3.3.3-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/networkx/2.8.4-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/Seaborn/0.12.1-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/Flask/2.2.2-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/bokeh/2.4.2-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/xarray/2022.9.0-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/dask/2022.12.0-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/typing-extensions/4.3.0-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/PyYAML/6.0-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/h5py/3.7.0-GCCcore-11.3.0-serial/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/netcdf4-python/1.6.1-GCCcore-11.3.0-serial/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/sympy/1.11.1-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/gmpy2/2.1.2-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/IPython/8.5.0-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/BeautifulSoup/4.10.0-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/lxml/4.9.1-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/matplotlib/3.5.2-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/Pillow-SIMD/9.2.0-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/Tkinter/3.10.4-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/OpenCV/4.7.0-gcccoremkl-11.3.0-2022.1.0-CUDA-11.7-contrib/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/tqdm/4.64.0-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/numba/0.56.4-foss-2022a-CUDA-11.7/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/SciPy-bundle/2022.05-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/pybind11/2.9.2-GCCcore-11.3.0/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/mpi4py/3.1.4-gompi-2022a/lib/python3.10/site-packages:/p/software/juwelsbooster/stages/2023/software/Python/3.10.4-GCCcore-11.3.0/easybuild/python | |
++ source /p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template/venv/bin/activate | |
+++ deactivate nondestructive | |
+++ '[' -n '' ']' | |
+++ '[' -n '' ']' | |
+++ '[' -n /bin/bash -o -n '' ']' | |
+++ hash -r | |
+++ '[' -n '' ']' | |
+++ unset VIRTUAL_ENV | |
+++ unset VIRTUAL_ENV_PROMPT | |
+++ '[' '!' nondestructive = nondestructive ']' | |
+++ VIRTUAL_ENV=/p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template/venv | |
+++ export VIRTUAL_ENV | |
+++ _OLD_VIRTUAL_PATH=/p/software/juwelsbooster/stages/2023/software/tensorboard/2.11.2-foss-2022a/bin:/p/software/juwelsbooster/stages/2023/software/PyTorch/1.12.0-foss-2022a-CUDA-11.7/bin:/p/software/juwelsbooster/stages/2023/software/Ninja/1.10.2-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/PyQuil/3.3.3-gcccoremkl-11.3.0-2022.1.0/bin:/p/software/juwelsbooster/stages/2023/software/Flask/2.2.2-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/git/2.36.0-GCCcore-11.3.0-nodocs/bin:/p/software/juwelsbooster/stages/2023/software/BioPerl/1.7.8-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/Perl/5.34.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/DB/18.1.40-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/bokeh/2.4.2-gcccoremkl-11.3.0-2022.1.0/bin:/p/software/juwelsbooster/stages/2023/software/dask/2022.12.0-gcccoremkl-11.3.0-2022.1.0/bin:/p/software/juwelsbooster/stages/2023/software/netcdf4-python/1.6.1-GCCcore-11.3.0-serial/bin:/p/software/juwelsbooster/stages/2023/software/netCDF/4.9.0-GCCcore-11.3.0-serial/bin:/p/software/juwelsbooster/stages/2023/software/cURL/7.83.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/sympy/1.11.1-gcccoremkl-11.3.0-2022.1.0/bin:/p/software/juwelsbooster/stages/2023/software/IPython/8.5.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libxslt/1.1.34-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/ZeroMQ/4.3.4-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/matplotlib/3.5.2-gcccoremkl-11.3.0-2022.1.0/bin:/p/software/juwelsbooster/stages/2023/software/Qhull/2020.2-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/Tk/8.6.12-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/OpenCV/4.7.0-gcccoremkl-11.3.0-2022.1.0-CUDA-11.7-contrib/bin:/p/software/juwelsbooster/stages/2023/software/protobuf/3.19.4-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/HDF5/1.12.2-GCCcore-11.3.0-serial/bin:/p/software/juwelsbooster/stages/2023/software/ant/1.10.13-Java-11:/p/software/juwelsbooster/stages/2023/software/ant/1.10.13-Java-11/bin:/p/software/juwelsbooster/stages/2023/software/Java/11.0.16:/p/software/juwelsbooster/stages/2023/software/Java/11.0.16/bin:/p/software/juwelsbooster/stages/2023/software/JasPer/2.0.33-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/OpenEXR/3.1.5-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libwebp/1.2.4-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/giflib/5.2.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/OpenJPEG/2.5.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/GStreamer/1.20.2-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/elfutils/0.187-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libarchive/3.6.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/GTK+/3.24.34-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/librsvg/2.55.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/GObject-Introspection/1.72.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/OpenGL/2022a-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/nettle/3.8-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/Pango/1.50.7-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/HarfBuzz/4.2.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/ICU/71.1-GCCcore-11.3.0/sbin:/p/software/juwelsbooster/stages/2023/software/ICU/71.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/cairo/1.17.4-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/Gdk-Pixbuf/2.42.8-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/LibTIFF/4.3.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libdeflate/1.10-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/zstd/1.5.2-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/lz4/1.9.3-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/gzip/1.12-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/jbigkit/2.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libjpeg-turbo/2.1.3-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/DBus/1.14.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/GLib/2.72.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/PCRE/8.45-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/gettext/0.21-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/GSL/2.7-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/FFmpeg/4.4.2-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/FriBidi/1.0.12-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/X11/20220504-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/fontconfig/2.14.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/util-linux/2.38-GCCcore-11.3.0/sbin:/p/software/juwelsbooster/stages/2023/software/util-linux/2.38-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/freetype/2.12.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/Brotli/1.0.9-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libpng/1.6.37-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/expat/2.4.8-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libvpx/1.12.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/x265/3.5-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/LAME/3.100-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/x264/20220620-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/NASM/2.15.05-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/tqdm/4.64.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/numba/0.56.4-foss-2022a-CUDA-11.7/bin:/p/software/juwelsbooster/stages/2023/software/LLVM/14.0.3-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/SciPy-bundle/2022.05-gcccoremkl-11.3.0-2022.1.0/bin:/p/software/juwelsbooster/stages/2023/software/pybind11/2.9.2-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/FFTW/3.3.10-gompi-2022a/bin:/p/software/juwelsbooster/stages/2023/software/FlexiBLAS/3.2.0-GCC-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/Python/3.10.4-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/SQLite/3.38.3-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/Tcl/8.6.12-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/ncurses/6.3-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/bzip2/1.0.8-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/UCC/default-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/PMIx/3.2.3-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libevent/2.1.12-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/OpenSSL/1.1/bin:/p/software/juwelsbooster/stages/2023/software/UCX/default-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/CUDA/11.7/nvvm/bin:/p/software/juwelsbooster/stages/2023/software/CUDA/11.7/bin:/p/software/juwelsbooster/stages/2023/software/nvidia-driver/default:/p/software/juwelsbooster/stages/2023/software/nvidia-driver/default/bin:/p/software/juwelsbooster/stages/2023/software/hwloc/2.7.1-GCCcore-11.3.0/sbin:/p/software/juwelsbooster/stages/2023/software/hwloc/2.7.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libxml2/2.9.13-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/XZ/5.2.5-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/numactl/2.0.15-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/binutils/2.38-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/GCCcore/11.3.0/bin:/p/project/ccstao/cstao05/bin:/p/project/ccstao/cstao05/.local/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/ddn/ime/bin:/opt/jsc/bin:/usr/local/jsc/bin:/opt/parastation/bin:/p/software/juwelsbooster/bin | |
+++ PATH=/p/project/ccstao/cstao05/2023-may-intro-to-supercompting-jsc/src/sc_venv_template/venv/bin:/p/software/juwelsbooster/stages/2023/software/tensorboard/2.11.2-foss-2022a/bin:/p/software/juwelsbooster/stages/2023/software/PyTorch/1.12.0-foss-2022a-CUDA-11.7/bin:/p/software/juwelsbooster/stages/2023/software/Ninja/1.10.2-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/PyQuil/3.3.3-gcccoremkl-11.3.0-2022.1.0/bin:/p/software/juwelsbooster/stages/2023/software/Flask/2.2.2-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/git/2.36.0-GCCcore-11.3.0-nodocs/bin:/p/software/juwelsbooster/stages/2023/software/BioPerl/1.7.8-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/Perl/5.34.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/DB/18.1.40-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/bokeh/2.4.2-gcccoremkl-11.3.0-2022.1.0/bin:/p/software/juwelsbooster/stages/2023/software/dask/2022.12.0-gcccoremkl-11.3.0-2022.1.0/bin:/p/software/juwelsbooster/stages/2023/software/netcdf4-python/1.6.1-GCCcore-11.3.0-serial/bin:/p/software/juwelsbooster/stages/2023/software/netCDF/4.9.0-GCCcore-11.3.0-serial/bin:/p/software/juwelsbooster/stages/2023/software/cURL/7.83.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/sympy/1.11.1-gcccoremkl-11.3.0-2022.1.0/bin:/p/software/juwelsbooster/stages/2023/software/IPython/8.5.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libxslt/1.1.34-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/ZeroMQ/4.3.4-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/matplotlib/3.5.2-gcccoremkl-11.3.0-2022.1.0/bin:/p/software/juwelsbooster/stages/2023/software/Qhull/2020.2-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/Tk/8.6.12-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/OpenCV/4.7.0-gcccoremkl-11.3.0-2022.1.0-CUDA-11.7-contrib/bin:/p/software/juwelsbooster/stages/2023/software/protobuf/3.19.4-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/HDF5/1.12.2-GCCcore-11.3.0-serial/bin:/p/software/juwelsbooster/stages/2023/software/ant/1.10.13-Java-11:/p/software/juwelsbooster/stages/2023/software/ant/1.10.13-Java-11/bin:/p/software/juwelsbooster/stages/2023/software/Java/11.0.16:/p/software/juwelsbooster/stages/2023/software/Java/11.0.16/bin:/p/software/juwelsbooster/stages/2023/software/JasPer/2.0.33-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/OpenEXR/3.1.5-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libwebp/1.2.4-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/giflib/5.2.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/OpenJPEG/2.5.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/GStreamer/1.20.2-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/elfutils/0.187-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libarchive/3.6.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/GTK+/3.24.34-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/librsvg/2.55.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/GObject-Introspection/1.72.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/OpenGL/2022a-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/nettle/3.8-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/Pango/1.50.7-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/HarfBuzz/4.2.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/ICU/71.1-GCCcore-11.3.0/sbin:/p/software/juwelsbooster/stages/2023/software/ICU/71.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/cairo/1.17.4-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/Gdk-Pixbuf/2.42.8-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/LibTIFF/4.3.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libdeflate/1.10-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/zstd/1.5.2-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/lz4/1.9.3-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/gzip/1.12-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/jbigkit/2.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libjpeg-turbo/2.1.3-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/DBus/1.14.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/GLib/2.72.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/PCRE/8.45-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/gettext/0.21-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/GSL/2.7-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/FFmpeg/4.4.2-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/FriBidi/1.0.12-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/X11/20220504-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/fontconfig/2.14.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/util-linux/2.38-GCCcore-11.3.0/sbin:/p/software/juwelsbooster/stages/2023/software/util-linux/2.38-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/freetype/2.12.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/Brotli/1.0.9-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libpng/1.6.37-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/expat/2.4.8-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libvpx/1.12.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/x265/3.5-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/LAME/3.100-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/x264/20220620-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/NASM/2.15.05-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/tqdm/4.64.0-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/numba/0.56.4-foss-2022a-CUDA-11.7/bin:/p/software/juwelsbooster/stages/2023/software/LLVM/14.0.3-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/SciPy-bundle/2022.05-gcccoremkl-11.3.0-2022.1.0/bin:/p/software/juwelsbooster/stages/2023/software/pybind11/2.9.2-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/FFTW/3.3.10-gompi-2022a/bin:/p/software/juwelsbooster/stages/2023/software/FlexiBLAS/3.2.0-GCC-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/Python/3.10.4-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/SQLite/3.38.3-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/Tcl/8.6.12-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/ncurses/6.3-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/bzip2/1.0.8-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/UCC/default-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/PMIx/3.2.3-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libevent/2.1.12-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/OpenSSL/1.1/bin:/p/software/juwelsbooster/stages/2023/software/UCX/default-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/CUDA/11.7/nvvm/bin:/p/software/juwelsbooster/stages/2023/software/CUDA/11.7/bin:/p/software/juwelsbooster/stages/2023/software/nvidia-driver/default:/p/software/juwelsbooster/stages/2023/software/nvidia-driver/default/bin:/p/software/juwelsbooster/stages/2023/software/hwloc/2.7.1-GCCcore-11.3.0/sbin:/p/software/juwelsbooster/stages/2023/software/hwloc/2.7.1-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/libxml2/2.9.13-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/XZ/5.2.5-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/numactl/2.0.15-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/binutils/2.38-GCCcore-11.3.0/bin:/p/software/juwelsbooster/stages/2023/software/GCCcore/11.3.0/bin:/p/project/ccstao/cstao05/bin:/p/project/ccstao/cstao05/.local/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/ddn/ime/bin:/opt/jsc/bin:/usr/local/jsc/bin:/opt/parastation/bin:/p/software/juwelsbooster/bin | |
+++ export PATH | |
+++ '[' -n '' ']' | |
+++ '[' -z '' ']' | |
+++ _OLD_VIRTUAL_PS1= | |
+++ PS1='(sc_venv_template) ' | |
+++ export PS1 | |
+++ VIRTUAL_ENV_PROMPT='(sc_venv_template) ' | |
+++ export VIRTUAL_ENV_PROMPT | |
+++ '[' -n /bin/bash -o -n '' ']' | |
+++ hash -r | |
+ export NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=ALL | |
+ NCCL_DEBUG=INFO | |
+ NCCL_DEBUG_SUBSYS=ALL | |
+ export LOGLEVEL=INFO | |
+ LOGLEVEL=INFO | |
+ srun bash -c 'accelerate launch \ | |
--main_process_ip $MASTER_ADDR \ | |
--main_process_port $MASTER_PORT \ | |
--multi_gpu \ | |
--mixed_precision=no \ | |
--num_processes=$(($NNODES * 4)) \ | |
--dynamo_backend=no \ | |
--num_machines=$NNODES \ | |
--machine_rank=$SLURM_PROCID \ | |
distrib.py' | |
INFO:torch.distributed.launcher.api:Starting elastic_operator with launch configs: | |
entrypoint : distrib.py | |
min_nodes : 2 | |
max_nodes : 2 | |
nproc_per_node : 4 | |
run_id : none | |
rdzv_backend : static | |
rdzv_endpoint : 10.13.23.40:7010 | |
rdzv_configs : {'rank': 1, 'timeout': 900} | |
max_restarts : 0 | |
monitor_interval : 5 | |
log_dir : None | |
metrics_cfg : {} | |
INFO:torch.distributed.launcher.api:Starting elastic_operator with launch configs: | |
entrypoint : distrib.py | |
min_nodes : 2 | |
max_nodes : 2 | |
nproc_per_node : 4 | |
run_id : none | |
rdzv_backend : static | |
rdzv_endpoint : 10.13.23.40:7010 | |
rdzv_configs : {'rank': 0, 'timeout': 900} | |
max_restarts : 0 | |
monitor_interval : 5 | |
log_dir : None | |
metrics_cfg : {} | |
INFO:torch.distributed.elastic.agent.server.local_elastic_agent:log directory set to: /tmp/torchelastic_twsenhke/none_3_6rk4ef | |
INFO:torch.distributed.elastic.agent.server.api:[default] starting workers for entrypoint: python | |
INFO:torch.distributed.elastic.agent.server.api:[default] Rendezvous'ing worker group | |
[W socket.cpp:401] [c10d] The server socket cannot be initialized on [::]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
INFO:torch.distributed.elastic.agent.server.api:[default] Rendezvous complete for workers. Result: | |
restart_count=0 | |
master_addr=10.13.23.40 | |
master_port=7010 | |
group_rank=0 | |
group_world_size=2 | |
local_ranks=[0, 1, 2, 3] | |
role_ranks=[0, 1, 2, 3] | |
global_ranks=[0, 1, 2, 3] | |
role_world_sizes=[8, 8, 8, 8] | |
global_world_sizes=[8, 8, 8, 8] | |
INFO:torch.distributed.elastic.agent.server.api:[default] Starting worker group | |
INFO:torch.distributed.elastic.multiprocessing:Setting worker0 reply file to: /tmp/torchelastic_twsenhke/none_3_6rk4ef/attempt_0/0/error.json | |
INFO:torch.distributed.elastic.multiprocessing:Setting worker1 reply file to: /tmp/torchelastic_twsenhke/none_3_6rk4ef/attempt_0/1/error.json | |
INFO:torch.distributed.elastic.multiprocessing:Setting worker2 reply file to: /tmp/torchelastic_twsenhke/none_3_6rk4ef/attempt_0/2/error.json | |
INFO:torch.distributed.elastic.multiprocessing:Setting worker3 reply file to: /tmp/torchelastic_twsenhke/none_3_6rk4ef/attempt_0/3/error.json | |
INFO:torch.distributed.elastic.agent.server.local_elastic_agent:log directory set to: /tmp/torchelastic_h_dj1caf/none__czscsxo | |
INFO:torch.distributed.elastic.agent.server.api:[default] starting workers for entrypoint: python | |
INFO:torch.distributed.elastic.agent.server.api:[default] Rendezvous'ing worker group | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
INFO:torch.distributed.elastic.agent.server.api:[default] Rendezvous complete for workers. Result: | |
restart_count=0 | |
master_addr=10.13.23.40 | |
master_port=7010 | |
group_rank=1 | |
group_world_size=2 | |
local_ranks=[0, 1, 2, 3] | |
role_ranks=[4, 5, 6, 7] | |
global_ranks=[4, 5, 6, 7] | |
role_world_sizes=[8, 8, 8, 8] | |
global_world_sizes=[8, 8, 8, 8] | |
INFO:torch.distributed.elastic.agent.server.api:[default] Starting worker group | |
INFO:torch.distributed.elastic.multiprocessing:Setting worker0 reply file to: /tmp/torchelastic_h_dj1caf/none__czscsxo/attempt_0/0/error.json | |
INFO:torch.distributed.elastic.multiprocessing:Setting worker1 reply file to: /tmp/torchelastic_h_dj1caf/none__czscsxo/attempt_0/1/error.json | |
INFO:torch.distributed.elastic.multiprocessing:Setting worker2 reply file to: /tmp/torchelastic_h_dj1caf/none__czscsxo/attempt_0/2/error.json | |
INFO:torch.distributed.elastic.multiprocessing:Setting worker3 reply file to: /tmp/torchelastic_h_dj1caf/none__czscsxo/attempt_0/3/error.json | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). | |
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [jwb0038i.juwels]:7010 (errno: 97 - Address family not supported by protocol). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
The activation script must be sourced, otherwise the virtual environment will not work. | |
Setting vars | |
jwb0038:16067:16067 [0] NCCL INFO Bootstrap : Using ib0:10.13.23.40<0> | |
jwb0038:16067:16067 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation | |
jwb0038:16067:16067 [0] NCCL INFO cudaDriverVersion 12000 | |
NCCL version 2.15.1+cuda11.7 | |
jwb0038:16067:16067 [0] NCCL INFO init.cc:1147 Cuda Host Alloc Size 4 pointer 0x151a74600000 | |
jwb0038:16070:16070 [3] NCCL INFO cudaDriverVersion 12000 | |
jwb0038:16068:16068 [1] NCCL INFO cudaDriverVersion 12000 | |
jwb0038:16069:16069 [2] NCCL INFO cudaDriverVersion 12000 | |
jwb0061:16371:16371 [3] NCCL INFO cudaDriverVersion 12000 | |
jwb0061:16369:16369 [1] NCCL INFO cudaDriverVersion 12000 | |
jwb0061:16370:16370 [2] NCCL INFO cudaDriverVersion 12000 | |
jwb0061:16368:16368 [0] NCCL INFO cudaDriverVersion 12000 | |
jwb0038:16067:16132 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ib0:10.13.23.40<0> | |
jwb0038:16067:16132 [0] NCCL INFO Using network IB | |
jwb0038:16070:16070 [3] NCCL INFO Bootstrap : Using ib0:10.13.23.40<0> | |
jwb0038:16070:16070 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation | |
jwb0038:16070:16070 [3] NCCL INFO init.cc:1147 Cuda Host Alloc Size 4 pointer 0x148ae6600000 | |
jwb0038:16068:16068 [1] NCCL INFO Bootstrap : Using ib0:10.13.23.40<0> | |
jwb0038:16069:16069 [2] NCCL INFO Bootstrap : Using ib0:10.13.23.40<0> | |
jwb0038:16068:16068 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation | |
jwb0038:16069:16069 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation | |
jwb0038:16068:16068 [1] NCCL INFO init.cc:1147 Cuda Host Alloc Size 4 pointer 0x14f22e600000 | |
jwb0038:16069:16069 [2] NCCL INFO init.cc:1147 Cuda Host Alloc Size 4 pointer 0x14660c600000 | |
jwb0038:16068:16138 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ib0:10.13.23.40<0> | |
jwb0038:16068:16138 [1] NCCL INFO Using network IB | |
jwb0038:16069:16139 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ib0:10.13.23.40<0> | |
jwb0038:16069:16139 [2] NCCL INFO Using network IB | |
jwb0061:16368:16368 [0] NCCL INFO Bootstrap : Using ib0:10.13.23.55<0> | |
jwb0061:16371:16371 [3] NCCL INFO Bootstrap : Using ib0:10.13.23.55<0> | |
jwb0061:16370:16370 [2] NCCL INFO Bootstrap : Using ib0:10.13.23.55<0> | |
jwb0061:16369:16369 [1] NCCL INFO Bootstrap : Using ib0:10.13.23.55<0> | |
jwb0061:16368:16368 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation | |
jwb0061:16368:16368 [0] NCCL INFO init.cc:1147 Cuda Host Alloc Size 4 pointer 0x145c42600000 | |
jwb0061:16371:16371 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation | |
jwb0061:16370:16370 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation | |
jwb0061:16369:16369 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation | |
jwb0061:16371:16371 [3] NCCL INFO init.cc:1147 Cuda Host Alloc Size 4 pointer 0x15320e600000 | |
jwb0061:16370:16370 [2] NCCL INFO init.cc:1147 Cuda Host Alloc Size 4 pointer 0x14ca60600000 | |
jwb0061:16369:16369 [1] NCCL INFO init.cc:1147 Cuda Host Alloc Size 4 pointer 0x14ce1c600000 | |
jwb0038:16070:16137 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ib0:10.13.23.40<0> | |
jwb0038:16070:16137 [3] NCCL INFO Using network IB | |
jwb0061:16371:16434 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ib0:10.13.23.55<0> | |
jwb0061:16371:16434 [3] NCCL INFO Using network IB | |
jwb0061:16369:16436 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ib0:10.13.23.55<0> | |
jwb0061:16369:16436 [1] NCCL INFO Using network IB | |
jwb0061:16368:16432 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ib0:10.13.23.55<0> | |
jwb0061:16368:16432 [0] NCCL INFO Using network IB | |
jwb0061:16370:16435 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ib0:10.13.23.55<0> | |
jwb0061:16370:16435 [2] NCCL INFO Using network IB | |
jwb0061:16368:16432 [0] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 0 'mlx5_0' | |
jwb0061:16368:16432 [0] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 1 'mlx5_1' | |
jwb0061:16368:16432 [0] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 2 'mlx5_2' | |
jwb0061:16368:16432 [0] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 3 'mlx5_3' | |
jwb0038:16067:16132 [0] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 0 'mlx5_0' | |
jwb0038:16067:16132 [0] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 1 'mlx5_1' | |
jwb0038:16067:16132 [0] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 2 'mlx5_2' | |
jwb0038:16067:16132 [0] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 3 'mlx5_3' | |
jwb0038:16068:16138 [1] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 0 'mlx5_0' | |
jwb0038:16068:16138 [1] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 1 'mlx5_1' | |
jwb0038:16068:16138 [1] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 2 'mlx5_2' | |
jwb0038:16068:16138 [1] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 3 'mlx5_3' | |
jwb0061:16368:16432 [0] NCCL INFO transport/p2p.cc:151 Cuda Alloc Size 2097152 pointer 0x145c43000000 | |
jwb0038:16067:16132 [0] NCCL INFO transport/p2p.cc:151 Cuda Alloc Size 2097152 pointer 0x151a75000000 | |
jwb0061:16371:16434 [3] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 0 'mlx5_0' | |
jwb0061:16371:16434 [3] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 1 'mlx5_1' | |
jwb0038:16068:16138 [1] NCCL INFO transport/p2p.cc:151 Cuda Alloc Size 2097152 pointer 0x14f22f000000 | |
jwb0038:16069:16139 [2] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 0 'mlx5_0' | |
jwb0061:16371:16434 [3] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 2 'mlx5_2' | |
jwb0038:16069:16139 [2] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 1 'mlx5_1' | |
jwb0061:16371:16434 [3] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 3 'mlx5_3' | |
jwb0038:16069:16139 [2] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 2 'mlx5_2' | |
jwb0038:16069:16139 [2] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 3 'mlx5_3' | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 0 'mlx5_0' | |
jwb0038:16070:16137 [3] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 1 'mlx5_1' | |
jwb0038:16070:16137 [3] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 2 'mlx5_2' | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 3 'mlx5_3' | |
jwb0038:16068:16138 [1] NCCL INFO === System : maxBw 24.0 totalBw 264.0 === | |
jwb0038:16068:16138 [1] NCCL INFO CPU/3 (1/2/-1) | |
jwb0038:16068:16138 [1] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0038:16068:16138 [1] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0038:16068:16138 [1] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0038:16068:16138 [1] NCCL INFO + PCI[24.0] - PCI/1000 (1000c0101000100b) | |
jwb0038:16068:16138 [1] NCCL INFO + PCI[24.0] - GPU/3000 (0) | |
jwb0038:16068:16138 [1] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0038:16068:16138 [1] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0038:16068:16138 [1] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0038:16068:16138 [1] NCCL INFO + PCI[24.0] - NIC/4000 | |
jwb0038:16068:16138 [1] NCCL INFO + NET[25.0] - NET/0 (dcf8bf0100380008/1/25.000000) | |
jwb0038:16068:16138 [1] NCCL INFO CPU/1 (1/2/-1) | |
jwb0038:16068:16138 [1] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0038:16068:16138 [1] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0038:16068:16138 [1] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0038:16068:16138 [1] NCCL INFO + PCI[24.0] - PCI/41000 (1000c0101000100b) | |
jwb0038:16068:16138 [1] NCCL INFO + PCI[24.0] - GPU/44000 (1) | |
jwb0038:16068:16138 [1] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0038:16068:16138 [1] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0038:16068:16138 [1] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0038:16068:16138 [1] NCCL INFO + PCI[24.0] - NIC/43000 | |
jwb0038:16068:16138 [1] NCCL INFO + NET[25.0] - NET/1 (d8f8bf0100380008/1/25.000000) | |
jwb0038:16068:16138 [1] NCCL INFO CPU/7 (1/2/-1) | |
jwb0038:16068:16138 [1] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0038:16068:16138 [1] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0038:16068:16138 [1] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0038:16068:16138 [1] NCCL INFO + PCI[24.0] - PCI/81000 (1000c0101000100b) | |
jwb0038:16068:16138 [1] NCCL INFO + PCI[24.0] - GPU/84000 (2) | |
jwb0038:16068:16138 [1] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0038:16068:16138 [1] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0038:16068:16138 [1] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0038:16068:16138 [1] NCCL INFO + PCI[24.0] - NIC/83000 | |
jwb0038:16068:16138 [1] NCCL INFO + NET[25.0] - NET/2 (d0f8bf0100380008/1/25.000000) | |
jwb0038:16068:16138 [1] NCCL INFO CPU/5 (1/2/-1) | |
jwb0038:16068:16138 [1] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0038:16068:16138 [1] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0038:16068:16138 [1] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0038:16068:16138 [1] NCCL INFO + PCI[24.0] - PCI/C1000 (1000c0101000100b) | |
jwb0038:16068:16138 [1] NCCL INFO + PCI[24.0] - GPU/C4000 (3) | |
jwb0038:16068:16138 [1] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0038:16068:16138 [1] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0038:16068:16138 [1] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0038:16068:16138 [1] NCCL INFO + PCI[24.0] - NIC/C3000 | |
jwb0038:16068:16138 [1] NCCL INFO + NET[25.0] - NET/3 (d4f8bf0100380008/1/25.000000) | |
jwb0038:16068:16138 [1] NCCL INFO ========================================== | |
jwb0038:16068:16138 [1] NCCL INFO GPU/3000 :GPU/3000 (0/5000.000000/LOC) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (2/24.000000/PHB) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (3/24.000000/PIX) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0038:16068:16138 [1] NCCL INFO GPU/44000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (0/5000.000000/LOC) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (2/24.000000/PHB) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (3/24.000000/PIX) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0038:16068:16138 [1] NCCL INFO GPU/84000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (0/5000.000000/LOC) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (2/24.000000/PHB) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (3/24.000000/PIX) NET/3 (4/24.000000/PXN) | |
jwb0038:16068:16138 [1] NCCL INFO GPU/C4000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (0/5000.000000/LOC) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (2/24.000000/PHB) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (3/24.000000/PIX) | |
jwb0038:16068:16138 [1] NCCL INFO NET/0 :GPU/3000 (3/24.000000/PIX) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (3/24.000000/PHB) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (0/5000.000000/LOC) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0038:16068:16138 [1] NCCL INFO NET/1 :GPU/3000 (6/24.000000/SYS) GPU/44000 (3/24.000000/PIX) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (3/24.000000/PHB) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (0/5000.000000/LOC) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0038:16068:16138 [1] NCCL INFO NET/2 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (3/24.000000/PIX) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (3/24.000000/PHB) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (0/5000.000000/LOC) NET/3 (7/24.000000/SYS) | |
jwb0038:16068:16138 [1] NCCL INFO NET/3 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (3/24.000000/PIX) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (3/24.000000/PHB) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (0/5000.000000/LOC) | |
jwb0038:16068:16138 [1] NCCL INFO Setting affinity for GPU 1 to 0fc00000,00000fc0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO transport/p2p.cc:151 Cuda Alloc Size 2097152 pointer 0x14660d000000 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO === System : maxBw 24.0 totalBw 264.0 === | |
jwb0061:16368:16432 [0] NCCL INFO CPU/3 (1/2/-1) | |
jwb0061:16368:16432 [0] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0061:16368:16432 [0] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0061:16368:16432 [0] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0061:16368:16432 [0] NCCL INFO + PCI[24.0] - PCI/1000 (1000c0101000100b) | |
jwb0061:16368:16432 [0] NCCL INFO + PCI[24.0] - GPU/3000 (4) | |
jwb0061:16368:16432 [0] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0061:16368:16432 [0] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0061:16368:16432 [0] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0061:16368:16432 [0] NCCL INFO + PCI[24.0] - NIC/4000 | |
jwb0061:16368:16432 [0] NCCL INFO + NET[25.0] - NET/0 (c4fabf0100380008/1/25.000000) | |
jwb0061:16368:16432 [0] NCCL INFO CPU/1 (1/2/-1) | |
jwb0061:16368:16432 [0] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0061:16368:16432 [0] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0061:16368:16432 [0] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0061:16368:16432 [0] NCCL INFO + PCI[24.0] - PCI/41000 (1000c0101000100b) | |
jwb0061:16368:16432 [0] NCCL INFO + PCI[24.0] - GPU/44000 (5) | |
jwb0061:16368:16432 [0] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0061:16368:16432 [0] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0061:16368:16432 [0] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0061:16368:16432 [0] NCCL INFO + PCI[24.0] - NIC/43000 | |
jwb0061:16368:16432 [0] NCCL INFO + NET[25.0] - NET/1 (c0fabf0100380008/1/25.000000) | |
jwb0061:16368:16432 [0] NCCL INFO CPU/7 (1/2/-1) | |
jwb0061:16368:16432 [0] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0061:16368:16432 [0] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0061:16368:16432 [0] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0061:16368:16432 [0] NCCL INFO + PCI[24.0] - PCI/81000 (1000c0101000100b) | |
jwb0061:16368:16432 [0] NCCL INFO + PCI[24.0] - GPU/84000 (6) | |
jwb0061:16368:16432 [0] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0061:16368:16432 [0] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0061:16368:16432 [0] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0061:16368:16432 [0] NCCL INFO + PCI[24.0] - NIC/83000 | |
jwb0061:16368:16432 [0] NCCL INFO + NET[25.0] - NET/2 (2092c00100380008/1/25.000000) | |
jwb0061:16368:16432 [0] NCCL INFO CPU/5 (1/2/-1) | |
jwb0061:16368:16432 [0] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0061:16368:16432 [0] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0061:16368:16432 [0] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO + PCI[24.0] - PCI/C1000 (1000c0101000100b) | |
jwb0061:16368:16432 [0] NCCL INFO + PCI[24.0] - GPU/C4000 (7) | |
jwb0061:16368:16432 [0] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0061:16368:16432 [0] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0061:16368:16432 [0] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0061:16368:16432 [0] NCCL INFO + PCI[24.0] - NIC/C3000 | |
jwb0061:16368:16432 [0] NCCL INFO + NET[25.0] - NET/3 (2492c00100380008/1/25.000000) | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO ========================================== | |
jwb0061:16368:16432 [0] NCCL INFO GPU/3000 :GPU/3000 (0/5000.000000/LOC) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (2/24.000000/PHB) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (3/24.000000/PIX) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0061:16368:16432 [0] NCCL INFO GPU/44000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (0/5000.000000/LOC) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (2/24.000000/PHB) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (3/24.000000/PIX) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU/84000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (0/5000.000000/LOC) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (2/24.000000/PHB) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (3/24.000000/PIX) NET/3 (4/24.000000/PXN) | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU/C4000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (0/5000.000000/LOC) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (2/24.000000/PHB) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (3/24.000000/PIX) | |
jwb0061:16368:16432 [0] NCCL INFO NET/0 :GPU/3000 (3/24.000000/PIX) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (3/24.000000/PHB) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (0/5000.000000/LOC) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0061:16368:16432 [0] NCCL INFO NET/1 :GPU/3000 (6/24.000000/SYS) GPU/44000 (3/24.000000/PIX) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (3/24.000000/PHB) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (0/5000.000000/LOC) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0061:16368:16432 [0] NCCL INFO NET/2 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (3/24.000000/PIX) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (3/24.000000/PHB) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (0/5000.000000/LOC) NET/3 (7/24.000000/SYS) | |
jwb0061:16368:16432 [0] NCCL INFO NET/3 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (3/24.000000/PIX) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (3/24.000000/PHB) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (0/5000.000000/LOC) | |
jwb0061:16368:16432 [0] NCCL INFO Setting affinity for GPU 0 to fc,00000000,00fc0000 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 0 'mlx5_0' | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16069:16139 [2] NCCL INFO === System : maxBw 24.0 totalBw 264.0 === | |
jwb0038:16069:16139 [2] NCCL INFO CPU/3 (1/2/-1) | |
jwb0038:16069:16139 [2] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0038:16069:16139 [2] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0038:16069:16139 [2] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0038:16069:16139 [2] NCCL INFO + PCI[24.0] - PCI/1000 (1000c0101000100b) | |
jwb0038:16069:16139 [2] NCCL INFO + PCI[24.0] - GPU/3000 (0) | |
jwb0038:16069:16139 [2] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0038:16069:16139 [2] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0038:16069:16139 [2] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0038:16069:16139 [2] NCCL INFO + PCI[24.0] - NIC/4000 | |
jwb0038:16069:16139 [2] NCCL INFO + NET[25.0] - NET/0 (dcf8bf0100380008/1/25.000000) | |
jwb0038:16069:16139 [2] NCCL INFO CPU/1 (1/2/-1) | |
jwb0038:16069:16139 [2] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0038:16069:16139 [2] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0038:16069:16139 [2] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0038:16069:16139 [2] NCCL INFO + PCI[24.0] - PCI/41000 (1000c0101000100b) | |
jwb0038:16069:16139 [2] NCCL INFO + PCI[24.0] - GPU/44000 (1) | |
jwb0038:16069:16139 [2] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0038:16069:16139 [2] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0038:16069:16139 [2] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0038:16069:16139 [2] NCCL INFO + PCI[24.0] - NIC/43000 | |
jwb0038:16069:16139 [2] NCCL INFO + NET[25.0] - NET/1 (d8f8bf0100380008/1/25.000000) | |
jwb0038:16069:16139 [2] NCCL INFO CPU/7 (1/2/-1) | |
jwb0038:16069:16139 [2] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0038:16069:16139 [2] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0038:16069:16139 [2] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0038:16069:16139 [2] NCCL INFO + PCI[24.0] - PCI/81000 (1000c0101000100b) | |
jwb0038:16069:16139 [2] NCCL INFO + PCI[24.0] - GPU/84000 (2) | |
jwb0038:16069:16139 [2] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0038:16069:16139 [2] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0038:16069:16139 [2] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0038:16069:16139 [2] NCCL INFO + PCI[24.0] - NIC/83000 | |
jwb0038:16069:16139 [2] NCCL INFO + NET[25.0] - NET/2 (d0f8bf0100380008/1/25.000000) | |
jwb0038:16069:16139 [2] NCCL INFO CPU/5 (1/2/-1) | |
jwb0038:16069:16139 [2] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0038:16069:16139 [2] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0038:16069:16139 [2] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0038:16069:16139 [2] NCCL INFO + PCI[24.0] - PCI/C1000 (1000c0101000100b) | |
jwb0061:16370:16435 [2] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 1 'mlx5_1' | |
jwb0038:16069:16139 [2] NCCL INFO + PCI[24.0] - GPU/C4000 (3) | |
jwb0038:16069:16139 [2] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0038:16069:16139 [2] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0038:16069:16139 [2] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0038:16069:16139 [2] NCCL INFO + PCI[24.0] - NIC/C3000 | |
jwb0038:16069:16139 [2] NCCL INFO + NET[25.0] - NET/3 (d4f8bf0100380008/1/25.000000) | |
jwb0038:16069:16139 [2] NCCL INFO ========================================== | |
jwb0038:16069:16139 [2] NCCL INFO GPU/3000 :GPU/3000 (0/5000.000000/LOC) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (2/24.000000/PHB) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (3/24.000000/PIX) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0038:16069:16139 [2] NCCL INFO GPU/44000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (0/5000.000000/LOC) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (2/24.000000/PHB) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (3/24.000000/PIX) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0038:16069:16139 [2] NCCL INFO GPU/84000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (0/5000.000000/LOC) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (2/24.000000/PHB) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (3/24.000000/PIX) NET/3 (4/24.000000/PXN) | |
jwb0038:16069:16139 [2] NCCL INFO GPU/C4000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (0/5000.000000/LOC) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (2/24.000000/PHB) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (3/24.000000/PIX) | |
jwb0038:16069:16139 [2] NCCL INFO NET/0 :GPU/3000 (3/24.000000/PIX) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (3/24.000000/PHB) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (0/5000.000000/LOC) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0038:16069:16139 [2] NCCL INFO NET/1 :GPU/3000 (6/24.000000/SYS) GPU/44000 (3/24.000000/PIX) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (3/24.000000/PHB) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (0/5000.000000/LOC) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0038:16069:16139 [2] NCCL INFO NET/2 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (3/24.000000/PIX) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (3/24.000000/PHB) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (0/5000.000000/LOC) NET/3 (7/24.000000/SYS) | |
jwb0038:16069:16139 [2] NCCL INFO NET/3 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (3/24.000000/PIX) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (3/24.000000/PHB) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (0/5000.000000/LOC) | |
jwb0038:16069:16139 [2] NCCL INFO Setting affinity for GPU 2 to fc000000,0000fc00,00000000 | |
jwb0061:16370:16435 [2] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 2 'mlx5_2' | |
jwb0038:16070:16137 [3] NCCL INFO transport/p2p.cc:151 Cuda Alloc Size 2097152 pointer 0x148ae7000000 | |
jwb0061:16370:16435 [2] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 3 'mlx5_3' | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16070:16137 [3] NCCL INFO === System : maxBw 24.0 totalBw 264.0 === | |
jwb0038:16070:16137 [3] NCCL INFO CPU/3 (1/2/-1) | |
jwb0038:16070:16137 [3] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0038:16070:16137 [3] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0038:16070:16137 [3] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0038:16070:16137 [3] NCCL INFO + PCI[24.0] - PCI/1000 (1000c0101000100b) | |
jwb0038:16070:16137 [3] NCCL INFO + PCI[24.0] - GPU/3000 (0) | |
jwb0038:16070:16137 [3] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0038:16070:16137 [3] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0038:16070:16137 [3] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0038:16070:16137 [3] NCCL INFO + PCI[24.0] - NIC/4000 | |
jwb0038:16070:16137 [3] NCCL INFO + NET[25.0] - NET/0 (dcf8bf0100380008/1/25.000000) | |
jwb0038:16070:16137 [3] NCCL INFO CPU/1 (1/2/-1) | |
jwb0038:16070:16137 [3] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0038:16070:16137 [3] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0038:16070:16137 [3] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0038:16070:16137 [3] NCCL INFO + PCI[24.0] - PCI/41000 (1000c0101000100b) | |
jwb0038:16070:16137 [3] NCCL INFO + PCI[24.0] - GPU/44000 (1) | |
jwb0038:16070:16137 [3] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0038:16070:16137 [3] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0038:16070:16137 [3] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0038:16070:16137 [3] NCCL INFO + PCI[24.0] - NIC/43000 | |
jwb0038:16070:16137 [3] NCCL INFO + NET[25.0] - NET/1 (d8f8bf0100380008/1/25.000000) | |
jwb0038:16070:16137 [3] NCCL INFO CPU/7 (1/2/-1) | |
jwb0038:16070:16137 [3] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0038:16070:16137 [3] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0038:16070:16137 [3] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0038:16070:16137 [3] NCCL INFO + PCI[24.0] - PCI/81000 (1000c0101000100b) | |
jwb0038:16070:16137 [3] NCCL INFO + PCI[24.0] - GPU/84000 (2) | |
jwb0038:16070:16137 [3] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0038:16070:16137 [3] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0038:16070:16137 [3] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0038:16070:16137 [3] NCCL INFO + PCI[24.0] - NIC/83000 | |
jwb0038:16070:16137 [3] NCCL INFO + NET[25.0] - NET/2 (d0f8bf0100380008/1/25.000000) | |
jwb0038:16070:16137 [3] NCCL INFO CPU/5 (1/2/-1) | |
jwb0038:16070:16137 [3] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0038:16070:16137 [3] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0038:16070:16137 [3] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0038:16070:16137 [3] NCCL INFO + PCI[24.0] - PCI/C1000 (1000c0101000100b) | |
jwb0038:16070:16137 [3] NCCL INFO + PCI[24.0] - GPU/C4000 (3) | |
jwb0038:16070:16137 [3] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0038:16070:16137 [3] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0038:16070:16137 [3] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0038:16070:16137 [3] NCCL INFO + PCI[24.0] - NIC/C3000 | |
jwb0038:16070:16137 [3] NCCL INFO + NET[25.0] - NET/3 (d4f8bf0100380008/1/25.000000) | |
jwb0038:16070:16137 [3] NCCL INFO ========================================== | |
jwb0038:16070:16137 [3] NCCL INFO GPU/3000 :GPU/3000 (0/5000.000000/LOC) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (2/24.000000/PHB) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (3/24.000000/PIX) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0038:16070:16137 [3] NCCL INFO GPU/44000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (0/5000.000000/LOC) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (2/24.000000/PHB) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (3/24.000000/PIX) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0038:16070:16137 [3] NCCL INFO GPU/84000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (0/5000.000000/LOC) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (2/24.000000/PHB) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (3/24.000000/PIX) NET/3 (4/24.000000/PXN) | |
jwb0038:16070:16137 [3] NCCL INFO GPU/C4000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (0/5000.000000/LOC) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (2/24.000000/PHB) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (3/24.000000/PIX) | |
jwb0038:16070:16137 [3] NCCL INFO NET/0 :GPU/3000 (3/24.000000/PIX) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (3/24.000000/PHB) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (0/5000.000000/LOC) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0038:16070:16137 [3] NCCL INFO NET/1 :GPU/3000 (6/24.000000/SYS) GPU/44000 (3/24.000000/PIX) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (3/24.000000/PHB) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (0/5000.000000/LOC) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0038:16070:16137 [3] NCCL INFO NET/2 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (3/24.000000/PIX) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (3/24.000000/PHB) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (0/5000.000000/LOC) NET/3 (7/24.000000/SYS) | |
jwb0038:16070:16137 [3] NCCL INFO NET/3 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (3/24.000000/PIX) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (3/24.000000/PHB) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (0/5000.000000/LOC) | |
jwb0038:16070:16137 [3] NCCL INFO Setting affinity for GPU 3 to 0fc000,0000000f,c0000000 | |
jwb0061:16371:16434 [3] NCCL INFO transport/p2p.cc:151 Cuda Alloc Size 2097152 pointer 0x15320f000000 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO === System : maxBw 24.0 totalBw 264.0 === | |
jwb0038:16067:16132 [0] NCCL INFO CPU/3 (1/2/-1) | |
jwb0038:16067:16132 [0] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0038:16067:16132 [0] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0038:16067:16132 [0] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0038:16067:16132 [0] NCCL INFO + PCI[24.0] - PCI/1000 (1000c0101000100b) | |
jwb0038:16067:16132 [0] NCCL INFO + PCI[24.0] - GPU/3000 (0) | |
jwb0038:16067:16132 [0] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0038:16067:16132 [0] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0038:16067:16132 [0] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0038:16067:16132 [0] NCCL INFO + PCI[24.0] - NIC/4000 | |
jwb0038:16067:16132 [0] NCCL INFO + NET[25.0] - NET/0 (dcf8bf0100380008/1/25.000000) | |
jwb0038:16067:16132 [0] NCCL INFO CPU/1 (1/2/-1) | |
jwb0038:16067:16132 [0] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0038:16067:16132 [0] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0038:16067:16132 [0] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0038:16067:16132 [0] NCCL INFO + PCI[24.0] - PCI/41000 (1000c0101000100b) | |
jwb0038:16067:16132 [0] NCCL INFO + PCI[24.0] - GPU/44000 (1) | |
jwb0038:16067:16132 [0] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0038:16067:16132 [0] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0038:16067:16132 [0] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0038:16067:16132 [0] NCCL INFO + PCI[24.0] - NIC/43000 | |
jwb0038:16067:16132 [0] NCCL INFO + NET[25.0] - NET/1 (d8f8bf0100380008/1/25.000000) | |
jwb0038:16067:16132 [0] NCCL INFO CPU/7 (1/2/-1) | |
jwb0038:16067:16132 [0] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0038:16067:16132 [0] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0038:16067:16132 [0] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0038:16067:16132 [0] NCCL INFO + PCI[24.0] - PCI/81000 (1000c0101000100b) | |
jwb0038:16067:16132 [0] NCCL INFO + PCI[24.0] - GPU/84000 (2) | |
jwb0038:16067:16132 [0] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0038:16067:16132 [0] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0038:16067:16132 [0] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0038:16067:16132 [0] NCCL INFO + PCI[24.0] - NIC/83000 | |
jwb0038:16067:16132 [0] NCCL INFO + NET[25.0] - NET/2 (d0f8bf0100380008/1/25.000000) | |
jwb0038:16067:16132 [0] NCCL INFO CPU/5 (1/2/-1) | |
jwb0038:16067:16132 [0] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0038:16067:16132 [0] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0038:16067:16132 [0] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0038:16067:16132 [0] NCCL INFO + PCI[24.0] - PCI/C1000 (1000c0101000100b) | |
jwb0038:16067:16132 [0] NCCL INFO + PCI[24.0] - GPU/C4000 (3) | |
jwb0038:16067:16132 [0] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0038:16067:16132 [0] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0038:16067:16132 [0] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0038:16067:16132 [0] NCCL INFO + PCI[24.0] - NIC/C3000 | |
jwb0038:16067:16132 [0] NCCL INFO + NET[25.0] - NET/3 (d4f8bf0100380008/1/25.000000) | |
jwb0038:16067:16132 [0] NCCL INFO ========================================== | |
jwb0038:16067:16132 [0] NCCL INFO GPU/3000 :GPU/3000 (0/5000.000000/LOC) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (2/24.000000/PHB) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (3/24.000000/PIX) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0038:16067:16132 [0] NCCL INFO GPU/44000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (0/5000.000000/LOC) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (2/24.000000/PHB) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (3/24.000000/PIX) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0038:16067:16132 [0] NCCL INFO GPU/84000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (0/5000.000000/LOC) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (2/24.000000/PHB) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (3/24.000000/PIX) NET/3 (4/24.000000/PXN) | |
jwb0038:16067:16132 [0] NCCL INFO GPU/C4000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (0/5000.000000/LOC) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (2/24.000000/PHB) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (3/24.000000/PIX) | |
jwb0038:16067:16132 [0] NCCL INFO NET/0 :GPU/3000 (3/24.000000/PIX) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (3/24.000000/PHB) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (0/5000.000000/LOC) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0038:16067:16132 [0] NCCL INFO NET/1 :GPU/3000 (6/24.000000/SYS) GPU/44000 (3/24.000000/PIX) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (3/24.000000/PHB) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (0/5000.000000/LOC) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0038:16067:16132 [0] NCCL INFO NET/2 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (3/24.000000/PIX) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (3/24.000000/PHB) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (0/5000.000000/LOC) NET/3 (7/24.000000/SYS) | |
jwb0038:16067:16132 [0] NCCL INFO NET/3 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (3/24.000000/PIX) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (3/24.000000/PHB) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (0/5000.000000/LOC) | |
jwb0038:16067:16132 [0] NCCL INFO Setting affinity for GPU 0 to fc,00000000,00fc0000 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO transport/p2p.cc:151 Cuda Alloc Size 2097152 pointer 0x14ca61000000 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO === System : maxBw 24.0 totalBw 264.0 === | |
jwb0061:16371:16434 [3] NCCL INFO CPU/3 (1/2/-1) | |
jwb0061:16371:16434 [3] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0061:16371:16434 [3] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0061:16371:16434 [3] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0061:16371:16434 [3] NCCL INFO + PCI[24.0] - PCI/1000 (1000c0101000100b) | |
jwb0061:16371:16434 [3] NCCL INFO + PCI[24.0] - GPU/3000 (4) | |
jwb0061:16371:16434 [3] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0061:16371:16434 [3] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0061:16371:16434 [3] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0061:16371:16434 [3] NCCL INFO + PCI[24.0] - NIC/4000 | |
jwb0061:16371:16434 [3] NCCL INFO + NET[25.0] - NET/0 (c4fabf0100380008/1/25.000000) | |
jwb0061:16371:16434 [3] NCCL INFO CPU/1 (1/2/-1) | |
jwb0061:16371:16434 [3] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0061:16371:16434 [3] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0061:16371:16434 [3] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0061:16371:16434 [3] NCCL INFO + PCI[24.0] - PCI/41000 (1000c0101000100b) | |
jwb0061:16371:16434 [3] NCCL INFO + PCI[24.0] - GPU/44000 (5) | |
jwb0061:16371:16434 [3] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0061:16371:16434 [3] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0061:16371:16434 [3] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0061:16371:16434 [3] NCCL INFO + PCI[24.0] - NIC/43000 | |
jwb0061:16371:16434 [3] NCCL INFO + NET[25.0] - NET/1 (c0fabf0100380008/1/25.000000) | |
jwb0061:16371:16434 [3] NCCL INFO CPU/7 (1/2/-1) | |
jwb0061:16371:16434 [3] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0061:16371:16434 [3] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0061:16371:16434 [3] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0061:16371:16434 [3] NCCL INFO + PCI[24.0] - PCI/81000 (1000c0101000100b) | |
jwb0061:16371:16434 [3] NCCL INFO + PCI[24.0] - GPU/84000 (6) | |
jwb0061:16371:16434 [3] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0061:16371:16434 [3] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0061:16371:16434 [3] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0061:16371:16434 [3] NCCL INFO + PCI[24.0] - NIC/83000 | |
jwb0061:16371:16434 [3] NCCL INFO + NET[25.0] - NET/2 (2092c00100380008/1/25.000000) | |
jwb0061:16371:16434 [3] NCCL INFO CPU/5 (1/2/-1) | |
jwb0061:16371:16434 [3] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0061:16371:16434 [3] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0061:16371:16434 [3] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0061:16371:16434 [3] NCCL INFO + PCI[24.0] - PCI/C1000 (1000c0101000100b) | |
jwb0061:16371:16434 [3] NCCL INFO + PCI[24.0] - GPU/C4000 (7) | |
jwb0061:16371:16434 [3] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0061:16371:16434 [3] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0061:16371:16434 [3] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0061:16371:16434 [3] NCCL INFO + PCI[24.0] - NIC/C3000 | |
jwb0061:16371:16434 [3] NCCL INFO + NET[25.0] - NET/3 (2492c00100380008/1/25.000000) | |
jwb0061:16371:16434 [3] NCCL INFO ========================================== | |
jwb0061:16371:16434 [3] NCCL INFO GPU/3000 :GPU/3000 (0/5000.000000/LOC) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (2/24.000000/PHB) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (3/24.000000/PIX) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0061:16371:16434 [3] NCCL INFO GPU/44000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (0/5000.000000/LOC) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (2/24.000000/PHB) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (3/24.000000/PIX) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0061:16371:16434 [3] NCCL INFO GPU/84000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (0/5000.000000/LOC) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (2/24.000000/PHB) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (3/24.000000/PIX) NET/3 (4/24.000000/PXN) | |
jwb0061:16371:16434 [3] NCCL INFO GPU/C4000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (0/5000.000000/LOC) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (2/24.000000/PHB) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (3/24.000000/PIX) | |
jwb0061:16371:16434 [3] NCCL INFO NET/0 :GPU/3000 (3/24.000000/PIX) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (3/24.000000/PHB) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (0/5000.000000/LOC) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0061:16371:16434 [3] NCCL INFO NET/1 :GPU/3000 (6/24.000000/SYS) GPU/44000 (3/24.000000/PIX) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (3/24.000000/PHB) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (0/5000.000000/LOC) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0061:16371:16434 [3] NCCL INFO NET/2 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (3/24.000000/PIX) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (3/24.000000/PHB) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (0/5000.000000/LOC) NET/3 (7/24.000000/SYS) | |
jwb0061:16371:16434 [3] NCCL INFO NET/3 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (3/24.000000/PIX) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (3/24.000000/PHB) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (0/5000.000000/LOC) | |
jwb0061:16371:16434 [3] NCCL INFO Setting affinity for GPU 3 to 0fc000,0000000f,c0000000 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16370:16435 [2] NCCL INFO === System : maxBw 24.0 totalBw 264.0 === | |
jwb0061:16370:16435 [2] NCCL INFO CPU/3 (1/2/-1) | |
jwb0061:16370:16435 [2] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0061:16370:16435 [2] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0061:16370:16435 [2] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0061:16370:16435 [2] NCCL INFO + PCI[24.0] - PCI/1000 (1000c0101000100b) | |
jwb0061:16370:16435 [2] NCCL INFO + PCI[24.0] - GPU/3000 (4) | |
jwb0061:16370:16435 [2] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0061:16370:16435 [2] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0061:16370:16435 [2] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0061:16370:16435 [2] NCCL INFO + PCI[24.0] - NIC/4000 | |
jwb0061:16370:16435 [2] NCCL INFO + NET[25.0] - NET/0 (c4fabf0100380008/1/25.000000) | |
jwb0061:16370:16435 [2] NCCL INFO CPU/1 (1/2/-1) | |
jwb0061:16370:16435 [2] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0061:16370:16435 [2] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0061:16370:16435 [2] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0061:16370:16435 [2] NCCL INFO + PCI[24.0] - PCI/41000 (1000c0101000100b) | |
jwb0061:16370:16435 [2] NCCL INFO + PCI[24.0] - GPU/44000 (5) | |
jwb0061:16370:16435 [2] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0061:16370:16435 [2] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0061:16370:16435 [2] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0061:16370:16435 [2] NCCL INFO + PCI[24.0] - NIC/43000 | |
jwb0061:16370:16435 [2] NCCL INFO + NET[25.0] - NET/1 (c0fabf0100380008/1/25.000000) | |
jwb0061:16370:16435 [2] NCCL INFO CPU/7 (1/2/-1) | |
jwb0061:16370:16435 [2] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0061:16370:16435 [2] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0061:16370:16435 [2] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0061:16370:16435 [2] NCCL INFO + PCI[24.0] - PCI/81000 (1000c0101000100b) | |
jwb0061:16370:16435 [2] NCCL INFO + PCI[24.0] - GPU/84000 (6) | |
jwb0061:16370:16435 [2] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0061:16370:16435 [2] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0061:16370:16435 [2] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0061:16370:16435 [2] NCCL INFO + PCI[24.0] - NIC/83000 | |
jwb0061:16370:16435 [2] NCCL INFO + NET[25.0] - NET/2 (2092c00100380008/1/25.000000) | |
jwb0061:16370:16435 [2] NCCL INFO CPU/5 (1/2/-1) | |
jwb0061:16370:16435 [2] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0061:16370:16435 [2] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0061:16370:16435 [2] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0061:16370:16435 [2] NCCL INFO + PCI[24.0] - PCI/C1000 (1000c0101000100b) | |
jwb0061:16370:16435 [2] NCCL INFO + PCI[24.0] - GPU/C4000 (7) | |
jwb0061:16370:16435 [2] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0061:16370:16435 [2] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0061:16370:16435 [2] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0061:16370:16435 [2] NCCL INFO + PCI[24.0] - NIC/C3000 | |
jwb0061:16370:16435 [2] NCCL INFO + NET[25.0] - NET/3 (2492c00100380008/1/25.000000) | |
jwb0061:16370:16435 [2] NCCL INFO ========================================== | |
jwb0061:16370:16435 [2] NCCL INFO GPU/3000 :GPU/3000 (0/5000.000000/LOC) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (2/24.000000/PHB) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (3/24.000000/PIX) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0061:16370:16435 [2] NCCL INFO GPU/44000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (0/5000.000000/LOC) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (2/24.000000/PHB) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (3/24.000000/PIX) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0061:16370:16435 [2] NCCL INFO GPU/84000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (0/5000.000000/LOC) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (2/24.000000/PHB) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (3/24.000000/PIX) NET/3 (4/24.000000/PXN) | |
jwb0061:16370:16435 [2] NCCL INFO GPU/C4000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (0/5000.000000/LOC) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (2/24.000000/PHB) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (3/24.000000/PIX) | |
jwb0061:16370:16435 [2] NCCL INFO NET/0 :GPU/3000 (3/24.000000/PIX) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (3/24.000000/PHB) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (0/5000.000000/LOC) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0061:16370:16435 [2] NCCL INFO NET/1 :GPU/3000 (6/24.000000/SYS) GPU/44000 (3/24.000000/PIX) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (3/24.000000/PHB) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (0/5000.000000/LOC) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0061:16370:16435 [2] NCCL INFO NET/2 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (3/24.000000/PIX) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (3/24.000000/PHB) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (0/5000.000000/LOC) NET/3 (7/24.000000/SYS) | |
jwb0061:16370:16435 [2] NCCL INFO NET/3 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (3/24.000000/PIX) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (3/24.000000/PHB) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (0/5000.000000/LOC) | |
jwb0061:16370:16435 [2] NCCL INFO Setting affinity for GPU 2 to fc000000,0000fc00,00000000 | |
jwb0061:16369:16436 [1] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 0 'mlx5_0' | |
jwb0061:16369:16436 [1] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 1 'mlx5_1' | |
jwb0061:16369:16436 [1] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 2 'mlx5_2' | |
jwb0061:16369:16436 [1] NCCL INFO NET/IB : GPU Direct RDMA Enabled for HCA 3 'mlx5_3' | |
jwb0061:16369:16436 [1] NCCL INFO transport/p2p.cc:151 Cuda Alloc Size 2097152 pointer 0x14ce1d000000 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO === System : maxBw 24.0 totalBw 264.0 === | |
jwb0061:16369:16436 [1] NCCL INFO CPU/3 (1/2/-1) | |
jwb0061:16369:16436 [1] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0061:16369:16436 [1] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0061:16369:16436 [1] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0061:16369:16436 [1] NCCL INFO + PCI[24.0] - PCI/1000 (1000c0101000100b) | |
jwb0061:16369:16436 [1] NCCL INFO + PCI[24.0] - GPU/3000 (4) | |
jwb0061:16369:16436 [1] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0061:16369:16436 [1] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0061:16369:16436 [1] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0061:16369:16436 [1] NCCL INFO + PCI[24.0] - NIC/4000 | |
jwb0061:16369:16436 [1] NCCL INFO + NET[25.0] - NET/0 (c4fabf0100380008/1/25.000000) | |
jwb0061:16369:16436 [1] NCCL INFO CPU/1 (1/2/-1) | |
jwb0061:16369:16436 [1] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0061:16369:16436 [1] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0061:16369:16436 [1] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0061:16369:16436 [1] NCCL INFO + PCI[24.0] - PCI/41000 (1000c0101000100b) | |
jwb0061:16369:16436 [1] NCCL INFO + PCI[24.0] - GPU/44000 (5) | |
jwb0061:16369:16436 [1] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0061:16369:16436 [1] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0061:16369:16436 [1] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0061:16369:16436 [1] NCCL INFO + PCI[24.0] - NIC/43000 | |
jwb0061:16369:16436 [1] NCCL INFO + NET[25.0] - NET/1 (c0fabf0100380008/1/25.000000) | |
jwb0061:16369:16436 [1] NCCL INFO CPU/7 (1/2/-1) | |
jwb0061:16369:16436 [1] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0061:16369:16436 [1] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0061:16369:16436 [1] NCCL INFO + SYS[5000.0] - CPU/5 | |
jwb0061:16369:16436 [1] NCCL INFO + PCI[24.0] - PCI/81000 (1000c0101000100b) | |
jwb0061:16369:16436 [1] NCCL INFO + PCI[24.0] - GPU/84000 (6) | |
jwb0061:16369:16436 [1] NCCL INFO + NVL[88.0] - GPU/C4000 | |
jwb0061:16369:16436 [1] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0061:16369:16436 [1] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0061:16369:16436 [1] NCCL INFO + PCI[24.0] - NIC/83000 | |
jwb0061:16369:16436 [1] NCCL INFO + NET[25.0] - NET/2 (2092c00100380008/1/25.000000) | |
jwb0061:16369:16436 [1] NCCL INFO CPU/5 (1/2/-1) | |
jwb0061:16369:16436 [1] NCCL INFO + SYS[5000.0] - CPU/3 | |
jwb0061:16369:16436 [1] NCCL INFO + SYS[5000.0] - CPU/1 | |
jwb0061:16369:16436 [1] NCCL INFO + SYS[5000.0] - CPU/7 | |
jwb0061:16369:16436 [1] NCCL INFO + PCI[24.0] - PCI/C1000 (1000c0101000100b) | |
jwb0061:16369:16436 [1] NCCL INFO + PCI[24.0] - GPU/C4000 (7) | |
jwb0061:16369:16436 [1] NCCL INFO + NVL[88.0] - GPU/84000 | |
jwb0061:16369:16436 [1] NCCL INFO + NVL[88.0] - GPU/44000 | |
jwb0061:16369:16436 [1] NCCL INFO + NVL[88.0] - GPU/3000 | |
jwb0061:16369:16436 [1] NCCL INFO + PCI[24.0] - NIC/C3000 | |
jwb0061:16369:16436 [1] NCCL INFO + NET[25.0] - NET/3 (2492c00100380008/1/25.000000) | |
jwb0061:16369:16436 [1] NCCL INFO ========================================== | |
jwb0061:16369:16436 [1] NCCL INFO GPU/3000 :GPU/3000 (0/5000.000000/LOC) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (2/24.000000/PHB) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (3/24.000000/PIX) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0061:16369:16436 [1] NCCL INFO GPU/44000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (0/5000.000000/LOC) GPU/84000 (1/88.000000/NVL) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (2/24.000000/PHB) CPU/7 (3/24.000000/SYS) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (3/24.000000/PIX) NET/2 (4/24.000000/PXN) NET/3 (4/24.000000/PXN) | |
jwb0061:16369:16436 [1] NCCL INFO GPU/84000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (0/5000.000000/LOC) GPU/C4000 (1/88.000000/NVL) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (2/24.000000/PHB) CPU/5 (3/24.000000/SYS) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (3/24.000000/PIX) NET/3 (4/24.000000/PXN) | |
jwb0061:16369:16436 [1] NCCL INFO GPU/C4000 :GPU/3000 (1/88.000000/NVL) GPU/44000 (1/88.000000/NVL) GPU/84000 (1/88.000000/NVL) GPU/C4000 (0/5000.000000/LOC) CPU/3 (3/24.000000/SYS) CPU/1 (3/24.000000/SYS) CPU/7 (3/24.000000/SYS) CPU/5 (2/24.000000/PHB) NET/0 (4/24.000000/PXN) NET/1 (4/24.000000/PXN) NET/2 (4/24.000000/PXN) NET/3 (3/24.000000/PIX) | |
jwb0061:16369:16436 [1] NCCL INFO NET/0 :GPU/3000 (3/24.000000/PIX) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (3/24.000000/PHB) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (0/5000.000000/LOC) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0061:16369:16436 [1] NCCL INFO NET/1 :GPU/3000 (6/24.000000/SYS) GPU/44000 (3/24.000000/PIX) GPU/84000 (6/24.000000/SYS) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (3/24.000000/PHB) CPU/7 (4/24.000000/SYS) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (0/5000.000000/LOC) NET/2 (7/24.000000/SYS) NET/3 (7/24.000000/SYS) | |
jwb0061:16369:16436 [1] NCCL INFO NET/2 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (3/24.000000/PIX) GPU/C4000 (6/24.000000/SYS) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (3/24.000000/PHB) CPU/5 (4/24.000000/SYS) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (0/5000.000000/LOC) NET/3 (7/24.000000/SYS) | |
jwb0061:16369:16436 [1] NCCL INFO NET/3 :GPU/3000 (6/24.000000/SYS) GPU/44000 (6/24.000000/SYS) GPU/84000 (6/24.000000/SYS) GPU/C4000 (3/24.000000/PIX) CPU/3 (4/24.000000/SYS) CPU/1 (4/24.000000/SYS) CPU/7 (4/24.000000/SYS) CPU/5 (3/24.000000/PHB) NET/0 (7/24.000000/SYS) NET/1 (7/24.000000/SYS) NET/2 (7/24.000000/SYS) NET/3 (0/5000.000000/LOC) | |
jwb0061:16369:16436 [1] NCCL INFO Setting affinity for GPU 1 to 0fc00000,00000fc0 | |
jwb0061:16368:16432 [0] NCCL INFO Pattern 4, crossNic 0, nChannels 4, bw 24.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0061:16368:16432 [0] NCCL INFO 0 : NET/0 GPU/4 GPU/5 GPU/6 GPU/7 NET/0 | |
jwb0061:16368:16432 [0] NCCL INFO 1 : NET/1 GPU/5 GPU/4 GPU/7 GPU/6 NET/1 | |
jwb0061:16368:16432 [0] NCCL INFO 2 : NET/2 GPU/6 GPU/5 GPU/4 GPU/7 NET/2 | |
jwb0061:16368:16432 [0] NCCL INFO 3 : NET/3 GPU/7 GPU/6 GPU/5 GPU/4 NET/3 | |
jwb0061:16368:16432 [0] NCCL INFO Pattern 1, crossNic 0, nChannels 4, bw 48.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0061:16368:16432 [0] NCCL INFO 0 : NET/0 GPU/4 GPU/5 GPU/6 GPU/7 NET/0 | |
jwb0061:16368:16432 [0] NCCL INFO 1 : NET/1 GPU/5 GPU/7 GPU/4 GPU/6 NET/1 | |
jwb0061:16368:16432 [0] NCCL INFO 2 : NET/2 GPU/6 GPU/4 GPU/7 GPU/5 NET/2 | |
jwb0061:16368:16432 [0] NCCL INFO 3 : NET/3 GPU/7 GPU/6 GPU/5 GPU/4 NET/3 | |
jwb0061:16368:16432 [0] NCCL INFO Pattern 3, crossNic 0, nChannels 0, bw 0.000000/0.000000, type NVL/PIX, sameChannels 1 | |
jwb0038:16069:16139 [2] NCCL INFO Pattern 4, crossNic 0, nChannels 4, bw 24.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0038:16069:16139 [2] NCCL INFO 0 : NET/0 GPU/0 GPU/1 GPU/2 GPU/3 NET/0 | |
jwb0038:16069:16139 [2] NCCL INFO 1 : NET/1 GPU/1 GPU/0 GPU/3 GPU/2 NET/1 | |
jwb0038:16069:16139 [2] NCCL INFO 2 : NET/2 GPU/2 GPU/1 GPU/0 GPU/3 NET/2 | |
jwb0038:16069:16139 [2] NCCL INFO 3 : NET/3 GPU/3 GPU/2 GPU/1 GPU/0 NET/3 | |
jwb0038:16069:16139 [2] NCCL INFO Pattern 1, crossNic 0, nChannels 4, bw 48.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0038:16069:16139 [2] NCCL INFO 0 : NET/0 GPU/0 GPU/1 GPU/2 GPU/3 NET/0 | |
jwb0038:16069:16139 [2] NCCL INFO 1 : NET/1 GPU/1 GPU/3 GPU/0 GPU/2 NET/1 | |
jwb0038:16069:16139 [2] NCCL INFO 2 : NET/2 GPU/2 GPU/0 GPU/3 GPU/1 NET/2 | |
jwb0038:16069:16139 [2] NCCL INFO 3 : NET/3 GPU/3 GPU/2 GPU/1 GPU/0 NET/3 | |
jwb0038:16067:16132 [0] NCCL INFO Pattern 4, crossNic 0, nChannels 4, bw 24.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0038:16067:16132 [0] NCCL INFO 0 : NET/0 GPU/0 GPU/1 GPU/2 GPU/3 NET/0 | |
jwb0038:16067:16132 [0] NCCL INFO 1 : NET/1 GPU/1 GPU/0 GPU/3 GPU/2 NET/1 | |
jwb0038:16067:16132 [0] NCCL INFO 2 : NET/2 GPU/2 GPU/1 GPU/0 GPU/3 NET/2 | |
jwb0038:16067:16132 [0] NCCL INFO 3 : NET/3 GPU/3 GPU/2 GPU/1 GPU/0 NET/3 | |
jwb0038:16069:16139 [2] NCCL INFO Pattern 3, crossNic 0, nChannels 0, bw 0.000000/0.000000, type NVL/PIX, sameChannels 1 | |
jwb0038:16067:16132 [0] NCCL INFO Pattern 1, crossNic 0, nChannels 4, bw 48.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0038:16067:16132 [0] NCCL INFO 0 : NET/0 GPU/0 GPU/1 GPU/2 GPU/3 NET/0 | |
jwb0038:16067:16132 [0] NCCL INFO 1 : NET/1 GPU/1 GPU/3 GPU/0 GPU/2 NET/1 | |
jwb0038:16067:16132 [0] NCCL INFO 2 : NET/2 GPU/2 GPU/0 GPU/3 GPU/1 NET/2 | |
jwb0038:16067:16132 [0] NCCL INFO 3 : NET/3 GPU/3 GPU/2 GPU/1 GPU/0 NET/3 | |
jwb0038:16070:16137 [3] NCCL INFO Pattern 4, crossNic 0, nChannels 4, bw 24.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0038:16070:16137 [3] NCCL INFO 0 : NET/0 GPU/0 GPU/1 GPU/2 GPU/3 NET/0 | |
jwb0038:16070:16137 [3] NCCL INFO 1 : NET/1 GPU/1 GPU/0 GPU/3 GPU/2 NET/1 | |
jwb0038:16070:16137 [3] NCCL INFO 2 : NET/2 GPU/2 GPU/1 GPU/0 GPU/3 NET/2 | |
jwb0038:16070:16137 [3] NCCL INFO 3 : NET/3 GPU/3 GPU/2 GPU/1 GPU/0 NET/3 | |
jwb0038:16067:16132 [0] NCCL INFO Pattern 3, crossNic 0, nChannels 0, bw 0.000000/0.000000, type NVL/PIX, sameChannels 1 | |
jwb0038:16070:16137 [3] NCCL INFO Pattern 1, crossNic 0, nChannels 4, bw 48.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0038:16070:16137 [3] NCCL INFO 0 : NET/0 GPU/0 GPU/1 GPU/2 GPU/3 NET/0 | |
jwb0038:16070:16137 [3] NCCL INFO 1 : NET/1 GPU/1 GPU/3 GPU/0 GPU/2 NET/1 | |
jwb0038:16070:16137 [3] NCCL INFO 2 : NET/2 GPU/2 GPU/0 GPU/3 GPU/1 NET/2 | |
jwb0038:16070:16137 [3] NCCL INFO 3 : NET/3 GPU/3 GPU/2 GPU/1 GPU/0 NET/3 | |
jwb0061:16371:16434 [3] NCCL INFO Pattern 4, crossNic 0, nChannels 4, bw 24.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0061:16371:16434 [3] NCCL INFO 0 : NET/0 GPU/4 GPU/5 GPU/6 GPU/7 NET/0 | |
jwb0061:16371:16434 [3] NCCL INFO 1 : NET/1 GPU/5 GPU/4 GPU/7 GPU/6 NET/1 | |
jwb0061:16371:16434 [3] NCCL INFO 2 : NET/2 GPU/6 GPU/5 GPU/4 GPU/7 NET/2 | |
jwb0061:16371:16434 [3] NCCL INFO 3 : NET/3 GPU/7 GPU/6 GPU/5 GPU/4 NET/3 | |
jwb0061:16371:16434 [3] NCCL INFO Pattern 1, crossNic 0, nChannels 4, bw 48.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0061:16371:16434 [3] NCCL INFO 0 : NET/0 GPU/4 GPU/5 GPU/6 GPU/7 NET/0 | |
jwb0061:16371:16434 [3] NCCL INFO 1 : NET/1 GPU/5 GPU/7 GPU/4 GPU/6 NET/1 | |
jwb0061:16371:16434 [3] NCCL INFO 2 : NET/2 GPU/6 GPU/4 GPU/7 GPU/5 NET/2 | |
jwb0061:16371:16434 [3] NCCL INFO 3 : NET/3 GPU/7 GPU/6 GPU/5 GPU/4 NET/3 | |
jwb0038:16070:16137 [3] NCCL INFO Pattern 3, crossNic 0, nChannels 0, bw 0.000000/0.000000, type NVL/PIX, sameChannels 1 | |
jwb0061:16371:16434 [3] NCCL INFO Pattern 3, crossNic 0, nChannels 0, bw 0.000000/0.000000, type NVL/PIX, sameChannels 1 | |
jwb0061:16370:16435 [2] NCCL INFO Pattern 4, crossNic 0, nChannels 4, bw 24.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0061:16370:16435 [2] NCCL INFO 0 : NET/0 GPU/4 GPU/5 GPU/6 GPU/7 NET/0 | |
jwb0061:16370:16435 [2] NCCL INFO 1 : NET/1 GPU/5 GPU/4 GPU/7 GPU/6 NET/1 | |
jwb0061:16370:16435 [2] NCCL INFO 2 : NET/2 GPU/6 GPU/5 GPU/4 GPU/7 NET/2 | |
jwb0061:16370:16435 [2] NCCL INFO 3 : NET/3 GPU/7 GPU/6 GPU/5 GPU/4 NET/3 | |
jwb0061:16370:16435 [2] NCCL INFO Pattern 1, crossNic 0, nChannels 4, bw 48.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0061:16370:16435 [2] NCCL INFO 0 : NET/0 GPU/4 GPU/5 GPU/6 GPU/7 NET/0 | |
jwb0061:16370:16435 [2] NCCL INFO 1 : NET/1 GPU/5 GPU/7 GPU/4 GPU/6 NET/1 | |
jwb0061:16370:16435 [2] NCCL INFO 2 : NET/2 GPU/6 GPU/4 GPU/7 GPU/5 NET/2 | |
jwb0061:16370:16435 [2] NCCL INFO 3 : NET/3 GPU/7 GPU/6 GPU/5 GPU/4 NET/3 | |
jwb0038:16068:16138 [1] NCCL INFO Pattern 4, crossNic 0, nChannels 4, bw 24.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0038:16068:16138 [1] NCCL INFO 0 : NET/0 GPU/0 GPU/1 GPU/2 GPU/3 NET/0 | |
jwb0038:16068:16138 [1] NCCL INFO 1 : NET/1 GPU/1 GPU/0 GPU/3 GPU/2 NET/1 | |
jwb0038:16068:16138 [1] NCCL INFO 2 : NET/2 GPU/2 GPU/1 GPU/0 GPU/3 NET/2 | |
jwb0038:16068:16138 [1] NCCL INFO 3 : NET/3 GPU/3 GPU/2 GPU/1 GPU/0 NET/3 | |
jwb0038:16068:16138 [1] NCCL INFO Pattern 1, crossNic 0, nChannels 4, bw 48.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0038:16068:16138 [1] NCCL INFO 0 : NET/0 GPU/0 GPU/1 GPU/2 GPU/3 NET/0 | |
jwb0038:16068:16138 [1] NCCL INFO 1 : NET/1 GPU/1 GPU/3 GPU/0 GPU/2 NET/1 | |
jwb0038:16068:16138 [1] NCCL INFO 2 : NET/2 GPU/2 GPU/0 GPU/3 GPU/1 NET/2 | |
jwb0038:16068:16138 [1] NCCL INFO 3 : NET/3 GPU/3 GPU/2 GPU/1 GPU/0 NET/3 | |
jwb0061:16370:16435 [2] NCCL INFO Pattern 3, crossNic 0, nChannels 0, bw 0.000000/0.000000, type NVL/PIX, sameChannels 1 | |
jwb0038:16068:16138 [1] NCCL INFO Pattern 3, crossNic 0, nChannels 0, bw 0.000000/0.000000, type NVL/PIX, sameChannels 1 | |
jwb0061:16369:16436 [1] NCCL INFO Pattern 4, crossNic 0, nChannels 4, bw 24.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0061:16369:16436 [1] NCCL INFO 0 : NET/0 GPU/4 GPU/5 GPU/6 GPU/7 NET/0 | |
jwb0061:16369:16436 [1] NCCL INFO 1 : NET/1 GPU/5 GPU/4 GPU/7 GPU/6 NET/1 | |
jwb0061:16369:16436 [1] NCCL INFO 2 : NET/2 GPU/6 GPU/5 GPU/4 GPU/7 NET/2 | |
jwb0061:16369:16436 [1] NCCL INFO 3 : NET/3 GPU/7 GPU/6 GPU/5 GPU/4 NET/3 | |
jwb0061:16369:16436 [1] NCCL INFO Pattern 1, crossNic 0, nChannels 4, bw 48.000000/24.000000, type NVL/PXN, sameChannels 0 | |
jwb0061:16369:16436 [1] NCCL INFO 0 : NET/0 GPU/4 GPU/5 GPU/6 GPU/7 NET/0 | |
jwb0061:16369:16436 [1] NCCL INFO 1 : NET/1 GPU/5 GPU/7 GPU/4 GPU/6 NET/1 | |
jwb0061:16369:16436 [1] NCCL INFO 2 : NET/2 GPU/6 GPU/4 GPU/7 GPU/5 NET/2 | |
jwb0061:16369:16436 [1] NCCL INFO 3 : NET/3 GPU/7 GPU/6 GPU/5 GPU/4 NET/3 | |
jwb0061:16369:16436 [1] NCCL INFO Pattern 3, crossNic 0, nChannels 0, bw 0.000000/0.000000, type NVL/PIX, sameChannels 1 | |
jwb0061:16369:16436 [1] NCCL INFO Tree 0 : 4 -> 5 -> 6/-1/-1 | |
jwb0061:16369:16436 [1] NCCL INFO Tree 4 : 4 -> 5 -> 6/-1/-1 | |
jwb0061:16369:16436 [1] NCCL INFO Tree 1 : 1 -> 5 -> 7/-1/-1 | |
jwb0061:16369:16436 [1] NCCL INFO Tree 5 : -1 -> 5 -> 7/1/-1 | |
jwb0061:16370:16435 [2] NCCL INFO Tree 2 : 2 -> 6 -> 4/-1/-1 | |
jwb0061:16370:16435 [2] NCCL INFO Tree 6 : -1 -> 6 -> 4/2/-1 | |
jwb0061:16370:16435 [2] NCCL INFO Tree 3 : 7 -> 6 -> 5/-1/-1 | |
jwb0061:16370:16435 [2] NCCL INFO Tree 7 : 7 -> 6 -> 5/-1/-1 | |
jwb0061:16371:16434 [3] NCCL INFO Tree 1 : 5 -> 7 -> 4/-1/-1 | |
jwb0061:16369:16436 [1] NCCL INFO Ring 00 : 4 -> 5 -> 6 | |
jwb0061:16371:16434 [3] NCCL INFO Tree 5 : 5 -> 7 -> 4/-1/-1 | |
jwb0061:16369:16436 [1] NCCL INFO Ring 01 : 2 -> 5 -> 4 | |
jwb0061:16371:16434 [3] NCCL INFO Tree 3 : 3 -> 7 -> 6/-1/-1 | |
jwb0061:16369:16436 [1] NCCL INFO Ring 02 : 6 -> 5 -> 4 | |
jwb0061:16371:16434 [3] NCCL INFO Tree 7 : -1 -> 7 -> 6/3/-1 | |
jwb0061:16370:16435 [2] NCCL INFO Ring 00 : 5 -> 6 -> 7 | |
jwb0061:16369:16436 [1] NCCL INFO Ring 03 : 6 -> 5 -> 4 | |
jwb0038:16068:16138 [1] NCCL INFO Tree 0 : 0 -> 1 -> 2/-1/-1 | |
jwb0038:16067:16132 [0] NCCL INFO Tree 0 : -1 -> 0 -> 1/4/-1 | |
jwb0038:16068:16138 [1] NCCL INFO Tree 4 : 0 -> 1 -> 2/-1/-1 | |
jwb0038:16067:16132 [0] NCCL INFO Tree 4 : 4 -> 0 -> 1/-1/-1 | |
jwb0038:16068:16138 [1] NCCL INFO Tree 1 : -1 -> 1 -> 3/5/-1 | |
jwb0038:16067:16132 [0] NCCL INFO Tree 2 : 2 -> 0 -> 3/-1/-1 | |
jwb0038:16070:16137 [3] NCCL INFO Tree 1 : 1 -> 3 -> 0/-1/-1 | |
jwb0038:16068:16138 [1] NCCL INFO Tree 5 : 5 -> 1 -> 3/-1/-1 | |
jwb0038:16067:16132 [0] NCCL INFO Tree 6 : 2 -> 0 -> 3/-1/-1 | |
jwb0038:16070:16137 [3] NCCL INFO Tree 5 : 1 -> 3 -> 0/-1/-1 | |
jwb0038:16069:16139 [2] NCCL INFO Tree 2 : -1 -> 2 -> 0/6/-1 | |
jwb0038:16070:16137 [3] NCCL INFO Tree 3 : -1 -> 3 -> 2/7/-1 | |
jwb0038:16069:16139 [2] NCCL INFO Tree 6 : 6 -> 2 -> 0/-1/-1 | |
jwb0038:16070:16137 [3] NCCL INFO Tree 7 : 7 -> 3 -> 2/-1/-1 | |
jwb0038:16069:16139 [2] NCCL INFO Tree 3 : 3 -> 2 -> 1/-1/-1 | |
jwb0038:16069:16139 [2] NCCL INFO Tree 7 : 3 -> 2 -> 1/-1/-1 | |
jwb0038:16068:16138 [1] NCCL INFO Ring 00 : 0 -> 1 -> 2 | |
jwb0038:16068:16138 [1] NCCL INFO Ring 01 : 6 -> 1 -> 0 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 00/08 : 0 1 2 3 4 5 6 7 | |
jwb0038:16068:16138 [1] NCCL INFO Ring 02 : 2 -> 1 -> 0 | |
jwb0038:16070:16137 [3] NCCL INFO Ring 00 : 2 -> 3 -> 4 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 01/08 : 0 3 2 5 4 7 6 1 | |
jwb0038:16068:16138 [1] NCCL INFO Ring 03 : 2 -> 1 -> 0 | |
jwb0038:16070:16137 [3] NCCL INFO Ring 01 : 0 -> 3 -> 2 | |
jwb0038:16069:16139 [2] NCCL INFO Ring 00 : 1 -> 2 -> 3 | |
jwb0038:16068:16138 [1] NCCL INFO Ring 04 : 0 -> 1 -> 2 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 02/08 : 0 3 6 5 4 7 2 1 | |
jwb0038:16070:16137 [3] NCCL INFO Ring 02 : 0 -> 3 -> 6 | |
jwb0038:16069:16139 [2] NCCL INFO Ring 01 : 3 -> 2 -> 5 | |
jwb0038:16068:16138 [1] NCCL INFO Ring 05 : 6 -> 1 -> 0 | |
jwb0038:16070:16137 [3] NCCL INFO Ring 03 : 4 -> 3 -> 2 | |
jwb0038:16069:16139 [2] NCCL INFO Ring 02 : 7 -> 2 -> 1 | |
jwb0061:16370:16435 [2] NCCL INFO Ring 01 : 7 -> 6 -> 1 | |
jwb0061:16369:16436 [1] NCCL INFO Ring 04 : 4 -> 5 -> 6 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 03/08 : 0 7 6 5 4 3 2 1 | |
jwb0061:16370:16435 [2] NCCL INFO Ring 02 : 3 -> 6 -> 5 | |
jwb0061:16369:16436 [1] NCCL INFO Ring 05 : 2 -> 5 -> 4 | |
jwb0061:16371:16434 [3] NCCL INFO Ring 00 : 6 -> 7 -> 0 | |
jwb0061:16370:16435 [2] NCCL INFO Ring 03 : 7 -> 6 -> 5 | |
jwb0061:16369:16436 [1] NCCL INFO Ring 06 : 6 -> 5 -> 4 | |
jwb0061:16371:16434 [3] NCCL INFO Ring 01 : 4 -> 7 -> 6 | |
jwb0061:16370:16435 [2] NCCL INFO Ring 04 : 5 -> 6 -> 7 | |
jwb0061:16369:16436 [1] NCCL INFO Ring 07 : 6 -> 5 -> 4 | |
jwb0061:16371:16434 [3] NCCL INFO Ring 02 : 4 -> 7 -> 2 | |
jwb0061:16370:16435 [2] NCCL INFO Ring 05 : 7 -> 6 -> 1 | |
jwb0061:16368:16432 [0] NCCL INFO Tree 0 : 0 -> 4 -> 5/-1/-1 | |
jwb0061:16369:16436 [1] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 7/-1/-1->5->1 [2] -1/-1/-1->5->7 [3] 4/-1/-1->5->6 [4] 6/-1/-1->5->4 [5] 7/1/-1->5->-1 [6] -1/-1/-1->5->7 [7] 4/-1/-1->5->6 | |
jwb0038:16068:16138 [1] NCCL INFO Ring 06 : 2 -> 1 -> 0 | |
jwb0038:16070:16137 [3] NCCL INFO Ring 04 : 2 -> 3 -> 4 | |
jwb0038:16069:16139 [2] NCCL INFO Ring 03 : 3 -> 2 -> 1 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 04/08 : 0 1 2 3 4 5 6 7 | |
jwb0038:16070:16137 [3] NCCL INFO Ring 05 : 0 -> 3 -> 2 | |
jwb0038:16068:16138 [1] NCCL INFO Ring 07 : 2 -> 1 -> 0 | |
jwb0038:16069:16139 [2] NCCL INFO Ring 04 : 1 -> 2 -> 3 | |
jwb0038:16068:16138 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 3/5/-1->1->-1 [2] -1/-1/-1->1->3 [3] 0/-1/-1->1->2 [4] 2/-1/-1->1->0 [5] 3/-1/-1->1->5 [6] -1/-1/-1->1->3 [7] 0/-1/-1->1->2 | |
jwb0038:16070:16137 [3] NCCL INFO Ring 06 : 0 -> 3 -> 6 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 05/08 : 0 3 2 5 4 7 6 1 | |
jwb0061:16371:16434 [3] NCCL INFO Ring 03 : 0 -> 7 -> 6 | |
jwb0061:16370:16435 [2] NCCL INFO Ring 06 : 3 -> 6 -> 5 | |
jwb0061:16368:16432 [0] NCCL INFO Tree 4 : -1 -> 4 -> 5/0/-1 | |
jwb0061:16371:16434 [3] NCCL INFO Ring 04 : 6 -> 7 -> 0 | |
jwb0061:16370:16435 [2] NCCL INFO Ring 07 : 7 -> 6 -> 5 | |
jwb0038:16070:16137 [3] NCCL INFO Ring 07 : 4 -> 3 -> 2 | |
jwb0038:16069:16139 [2] NCCL INFO Ring 05 : 3 -> 2 -> 5 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 06/08 : 0 3 6 5 4 7 2 1 | |
jwb0038:16070:16137 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] 0/-1/-1->3->1 [2] 1/-1/-1->3->0 [3] 2/7/-1->3->-1 [4] -1/-1/-1->3->2 [5] 0/-1/-1->3->1 [6] 1/-1/-1->3->0 [7] 2/-1/-1->3->7 | |
jwb0038:16069:16139 [2] NCCL INFO Ring 06 : 7 -> 2 -> 1 | |
jwb0061:16370:16435 [2] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] -1/-1/-1->6->4 [2] 4/-1/-1->6->2 [3] 5/-1/-1->6->7 [4] 7/-1/-1->6->5 [5] -1/-1/-1->6->4 [6] 4/2/-1->6->-1 [7] 5/-1/-1->6->7 | |
jwb0061:16368:16432 [0] NCCL INFO Tree 2 : 6 -> 4 -> 7/-1/-1 | |
jwb0061:16371:16434 [3] NCCL INFO Ring 05 : 4 -> 7 -> 6 | |
jwb0061:16368:16432 [0] NCCL INFO Tree 6 : 6 -> 4 -> 7/-1/-1 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 07/08 : 0 7 6 5 4 3 2 1 | |
jwb0038:16069:16139 [2] NCCL INFO Ring 07 : 3 -> 2 -> 1 | |
jwb0038:16069:16139 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] -1/-1/-1->2->0 [2] 0/6/-1->2->-1 [3] 1/-1/-1->2->3 [4] 3/-1/-1->2->1 [5] -1/-1/-1->2->0 [6] 0/-1/-1->2->6 [7] 1/-1/-1->2->3 | |
jwb0038:16067:16132 [0] NCCL INFO Ring 00 : 7 -> 0 -> 1 | |
jwb0061:16371:16434 [3] NCCL INFO Ring 06 : 4 -> 7 -> 2 | |
jwb0061:16371:16434 [3] NCCL INFO Ring 07 : 0 -> 7 -> 6 | |
jwb0061:16371:16434 [3] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] 4/-1/-1->7->5 [2] 5/-1/-1->7->4 [3] 6/-1/-1->7->3 [4] -1/-1/-1->7->6 [5] 4/-1/-1->7->5 [6] 5/-1/-1->7->4 [7] 6/3/-1->7->-1 | |
jwb0038:16067:16132 [0] NCCL INFO Ring 01 : 1 -> 0 -> 3 | |
jwb0038:16068:16138 [1] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) | |
jwb0061:16369:16436 [1] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) | |
jwb0061:16368:16432 [0] NCCL INFO Ring 00 : 3 -> 4 -> 5 | |
jwb0061:16370:16435 [2] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) | |
jwb0061:16371:16434 [3] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) | |
jwb0061:16368:16432 [0] NCCL INFO Ring 01 : 5 -> 4 -> 7 | |
jwb0038:16067:16132 [0] NCCL INFO Ring 02 : 1 -> 0 -> 3 | |
jwb0038:16070:16137 [3] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) | |
jwb0038:16067:16132 [0] NCCL INFO Ring 03 : 1 -> 0 -> 7 | |
jwb0038:16069:16139 [2] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) | |
jwb0061:16368:16432 [0] NCCL INFO Ring 02 : 5 -> 4 -> 7 | |
jwb0061:16368:16432 [0] NCCL INFO Ring 03 : 5 -> 4 -> 3 | |
jwb0061:16368:16432 [0] NCCL INFO Ring 04 : 3 -> 4 -> 5 | |
jwb0038:16067:16132 [0] NCCL INFO Ring 04 : 7 -> 0 -> 1 | |
jwb0038:16067:16132 [0] NCCL INFO Ring 05 : 1 -> 0 -> 3 | |
jwb0061:16368:16432 [0] NCCL INFO Ring 05 : 5 -> 4 -> 7 | |
jwb0061:16368:16432 [0] NCCL INFO Ring 06 : 5 -> 4 -> 7 | |
jwb0038:16067:16132 [0] NCCL INFO Ring 06 : 1 -> 0 -> 3 | |
jwb0038:16067:16132 [0] NCCL INFO Ring 07 : 1 -> 0 -> 7 | |
jwb0038:16067:16132 [0] NCCL INFO Trees [0] 1/4/-1->0->-1 [1] 2/-1/-1->0->3 [2] 3/-1/-1->0->2 [3] -1/-1/-1->0->1 [4] 1/-1/-1->0->4 [5] 2/-1/-1->0->3 [6] 3/-1/-1->0->2 [7] -1/-1/-1->0->1 | |
jwb0061:16368:16432 [0] NCCL INFO Ring 07 : 5 -> 4 -> 3 | |
jwb0061:16368:16432 [0] NCCL INFO Trees [0] 5/-1/-1->4->0 [1] 6/-1/-1->4->7 [2] 7/-1/-1->4->6 [3] -1/-1/-1->4->5 [4] 5/0/-1->4->-1 [5] 6/-1/-1->4->7 [6] 7/-1/-1->4->6 [7] -1/-1/-1->4->5 | |
jwb0038:16067:16132 [0] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) | |
jwb0061:16368:16432 [0] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ca61000000 | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ca61000e00 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x145c43000000 | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ca61001000 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x145c43000e00 | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ca61001e00 | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ca61002000 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x145c43001000 | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ca61002e00 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x145c43001e00 | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ca61003000 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x145c43002000 | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ca61003e00 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x145c43002e00 | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ca61004000 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x145c43003000 | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ca61004e00 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x145c43003e00 | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ca61005000 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14f22f000000 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x145c43004000 | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ca61005e00 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14f22f000e00 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x145c43004e00 | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ca61006000 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14f22f001000 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x145c43005000 | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ca61006e00 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x145c43005e00 | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ca61007000 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14f22f001e00 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x15320f000000 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x145c43006000 | |
jwb0061:16370:16435 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ca61007e00 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14f22f002000 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14660d000000 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x145c43006e00 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x145c43007000 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x15320f000e00 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14f22f002e00 | |
jwb0061:16368:16432 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x145c43007e00 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x15320f001000 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x15320f001e00 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14f22f003000 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14f22f003e00 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x15320f002000 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14660d000e00 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x15320f002e00 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x15320f003000 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14f22f004000 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x15320f003e00 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14f22f004e00 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14660d001000 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x15320f004000 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14660d001e00 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14f22f005000 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x15320f004e00 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14f22f005e00 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14660d002000 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14660d002e00 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14f22f006000 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14f22f006e00 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14660d003000 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14660d003e00 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14f22f007000 | |
jwb0038:16068:16138 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14f22f007e00 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14660d004000 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14660d004e00 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14660d005000 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14660d005e00 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14660d006000 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14660d006e00 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x15320f005000 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x15320f005e00 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x15320f006000 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14660d007000 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x15320f006e00 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x15320f007000 | |
jwb0061:16371:16434 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x15320f007e00 | |
jwb0038:16069:16139 [2] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14660d007e00 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ce1d000000 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x151a75000000 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x151a75000e00 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x151a75001000 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x151a75001e00 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x151a75002000 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x151a75002e00 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x151a75003000 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x151a75003e00 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x151a75004000 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x151a75004e00 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x151a75005000 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x151a75005e00 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x151a75006000 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x151a75006e00 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x151a75007000 | |
jwb0038:16067:16132 [0] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x151a75007e00 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ce1d000e00 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ce1d001000 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ce1d001e00 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ce1d002000 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ce1d002e00 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ce1d003000 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ce1d003e00 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ce1d004000 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ce1d004e00 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ce1d005000 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ce1d005e00 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ce1d006000 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ce1d006e00 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x14ce1d007000 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x148ae7000000 | |
jwb0061:16369:16436 [1] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x14ce1d007e00 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x148ae7000e00 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x148ae7001000 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x148ae7001e00 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x148ae7002000 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x148ae7002e00 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x148ae7003000 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x148ae7003e00 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x148ae7004000 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x148ae7004e00 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x148ae7005000 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x148ae7005e00 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x148ae7006000 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x148ae7006e00 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:23 Cuda Alloc Size 3456 pointer 0x148ae7007000 | |
jwb0038:16070:16137 [3] NCCL INFO channel.cc:27 Cuda Alloc Size 32 pointer 0x148ae7007e00 | |
jwb0061:16370:16455 [2] NCCL INFO Mem Realloc old size 0, new size 8 pointer 0x14ca5c004c70 | |
jwb0061:16368:16453 [0] NCCL INFO Mem Realloc old size 0, new size 8 pointer 0x145c34004c70 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c004c90 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 0 from local rank 2, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO Allocated 4194656 bytes of shared memory in /dev/shm/nccl-orxzPj | |
jwb0061:16371:16454 [3] NCCL INFO Mem Realloc old size 0, new size 8 pointer 0x153200004c70 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200004c90 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 0 from local rank 3, transport 0 | |
jwb0038:16068:16152 [1] NCCL INFO Mem Realloc old size 0, new size 8 pointer 0x14f220004c70 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220004c90 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 0 from local rank 1, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO Mem Realloc old size 0, new size 8 pointer 0x146608004c70 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608004c90 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 0 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ca61200000 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c004cd0 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 1 from local rank 2, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x15320f200000 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200004cd0 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 1 from local rank 3, transport 0 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14f22f200000 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220004cd0 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 1 from local rank 1, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO Mem Realloc old size 0, new size 8 pointer 0x151a6c004c70 | |
jwb0038:16067:16155 [0] NCCL INFO Allocated 4194656 bytes of shared memory in /dev/shm/nccl-0POx2g | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14660d200000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ca61800000 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 00/0 : 6[84000] -> 7[c4000] via P2P/IPC/read | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x15320f800000 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c004d10 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 2 from local rank 2, transport 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14f22f800000 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608004cd0 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 1 from local rank 2, transport 0 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 00/0 : 1[44000] -> 2[84000] via P2P/IPC/read | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220004d10 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 2 from local rank 1, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ca56000000 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 04/0 : 6[84000] -> 7[c4000] via P2P/IPC/read | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c004d50 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 3 from local rank 2, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14660d800000 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 00/0 : 2[84000] -> 3[c4000] via P2P/IPC/read | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f226000000 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608004d10 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 2 from local rank 2, transport 0 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 04/0 : 1[44000] -> 2[84000] via P2P/IPC/read | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220004d50 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 3 from local rank 1, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ca56600000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x146602000000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f226600000 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 04/0 : 2[84000] -> 3[c4000] via P2P/IPC/read | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608004d50 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 3 from local rank 2, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x146602600000 | |
jwb0061:16369:16456 [1] NCCL INFO Mem Realloc old size 0, new size 8 pointer 0x14ce18004c70 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18004c90 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 0 from local rank 1, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 0 from local rank 0, transport 2 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34004c90 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 00/0 : 3[c4000] -> 4[3000] [receive] via NET/IB/0/GDRDMA | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 1 from local rank 0, transport 2 | |
jwb0038:16070:16153 [3] NCCL INFO Mem Realloc old size 0, new size 8 pointer 0x148ad8004c70 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8004c90 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 0 from local rank 3, transport 0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ce1d200000 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 0 from local rank 0, transport 2 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c004c90 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18004cd0 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 1 from local rank 1, transport 0 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 00/0 : 7[c4000] -> 0[3000] [receive] via NET/IB/0/GDRDMA | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 1 from local rank 0, transport 2 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ce1d800000 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 00/0 : 5[44000] -> 6[84000] via P2P/IPC/read | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18004d10 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 2 from local rank 1, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 2 from local rank 3, transport 2 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34004d10 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 00/0 : 7[c4000] -> 0[3000] [send] via NET/IB/0(4)/GDRDMA | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x148ae7200000 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8004cd0 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 1 from local rank 3, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 3 from local rank 3, transport 2 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ce12000000 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 04/0 : 5[44000] -> 6[84000] via P2P/IPC/read | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18004d50 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 3 from local rank 1, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x148ae7800000 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ce12600000 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 2 from local rank 3, transport 2 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c004d10 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 00/0 : 3[c4000] -> 4[3000] [send] via NET/IB/0(0)/GDRDMA | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 3 from local rank 3, transport 2 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34004cd0 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c004cd0 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 04/0 : 7[c4000] -> 0[3000] [receive] via NET/IB/0/GDRDMA | |
jwb0038:16067:16132 [0] NCCL INFO Channel 00/0 : 0[3000] -> 1[44000] via P2P/IPC/read | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c004d90 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 4 from local rank 0, transport 0 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 04/0 : 3[c4000] -> 4[3000] [receive] via NET/IB/0/GDRDMA | |
jwb0061:16368:16432 [0] NCCL INFO Channel 00/0 : 4[3000] -> 5[44000] via P2P/IPC/read | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34004d90 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 4 from local rank 0, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x145c43200000 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 04/0 : 4[3000] -> 5[44000] via P2P/IPC/read | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34004dd0 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 5 from local rank 0, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x151a75200000 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 04/0 : 0[3000] -> 1[44000] via P2P/IPC/read | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x145c43800000 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c004dd0 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 5 from local rank 0, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x151a75800000 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34004d50 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 04/0 : 7[c4000] -> 0[3000] [send] via NET/IB/0(4)/GDRDMA | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x145c34023000 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c004d50 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 04/0 : 3[c4000] -> 4[3000] [send] via NET/IB/0(0)/GDRDMA | |
jwb0061:16368:16453 [0] NCCL INFO NET/IB: Dev 0 Port 1 qpn 31123 mtu 5 LID 5812 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x145c34045000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x151a6c023000 | |
jwb0038:16067:16155 [0] NCCL INFO NET/IB: Dev 0 Port 1 qpn 20179 mtu 5 LID 5858 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x151a6c045000 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x145c32000000 | |
jwb0061:16369:16456 [1] NCCL INFO Allocated 4194656 bytes of shared memory in /dev/shm/nccl-Z9g10P | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16068:16152 [1] NCCL INFO Allocated 4194656 bytes of shared memory in /dev/shm/nccl-mb3ePP | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x151a6ac00000 | |
jwb0038:16067:16155 [0] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-5Bbcxj | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 4 from local rank 1, transport 2 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18004d90 | |
jwb0038:16067:16155 [0] NCCL INFO Mem Realloc old size 0, new size 768 pointer 0x151a6c031010 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 01/0 : 2[84000] -> 5[44000] [receive] via NET/IB/1/GDRDMA | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 5 from local rank 1, transport 2 | |
jwb0061:16370:16455 [2] NCCL INFO Allocated 4194656 bytes of shared memory in /dev/shm/nccl-hRKy3B | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 4 from local rank 1, transport 2 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220004d90 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 01/0 : 6[84000] -> 1[44000] [receive] via NET/IB/1/GDRDMA | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 5 from local rank 1, transport 2 | |
jwb0061:16368:16453 [0] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-uQtt2F | |
jwb0061:16368:16453 [0] NCCL INFO Mem Realloc old size 0, new size 768 pointer 0x145c34031010 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16069:16154 [2] NCCL INFO Allocated 4194656 bytes of shared memory in /dev/shm/nccl-6JqO6H | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 4 from local rank 2, transport 2 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c004d90 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 02/0 : 3[c4000] -> 6[84000] [receive] via NET/IB/2/GDRDMA | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 5 from local rank 2, transport 2 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x151a6c047000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x151a6c04e000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x145c34047000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x145c3404e000 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 4 from local rank 2, transport 2 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608004d90 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 02/0 : 7[c4000] -> 2[84000] [receive] via NET/IB/2/GDRDMA | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 5 from local rank 2, transport 2 | |
jwb0038:16067:16155 [0] NCCL INFO NCCL_IB_TIMEOUT set by environment to 20. | |
jwb0061:16368:16453 [0] NCCL INFO NCCL_IB_TIMEOUT set by environment to 20. | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x151a6c06a000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x151a6b600000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x151a74600200 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x145c3406a000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x145c33600000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x145c42600200 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x151a6c06c000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x145c3406c000 | |
jwb0038:16067:16155 [0] NCCL INFO NET/IB: Dev 0 Port 1 qpn 20182 mtu 5 LID 5858 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x151a6c083000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x151a62000000 | |
jwb0038:16067:16155 [0] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-dXNMuf | |
jwb0061:16368:16453 [0] NCCL INFO NET/IB: Dev 0 Port 1 qpn 31126 mtu 5 LID 5812 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x145c34083000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x145c30000000 | |
jwb0061:16368:16453 [0] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-q9QV0x | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x151a6c085000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x151a6c08c000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x145c34085000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x145c3408c000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x151a6c0a7000 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8004d10 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 2 from local rank 3, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x151a62a00000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x151a74602200 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x145c340a7000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x148ad6000000 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8004d50 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 3 from local rank 3, transport 0 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200004d10 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 2 from local rank 3, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x148ad6600000 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8004d90 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 4 from local rank 3, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x145c30a00000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x145c42602200 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x148ad6c00000 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8004dd0 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 5 from local rank 3, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1531fe000000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x148ad7200000 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200004d50 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 3 from local rank 3, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1531fe600000 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200004d90 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 4 from local rank 3, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1531fec00000 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 01/0 : 0[3000] -> 3[c4000] via P2P/IPC/read | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c004e10 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 6 from local rank 0, transport 0 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200004dd0 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 5 from local rank 3, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x151a63400000 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 02/0 : 0[3000] -> 3[c4000] via P2P/IPC/read | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1531ff200000 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c004e50 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 7 from local rank 0, transport 0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x151a63a00000 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 05/0 : 0[3000] -> 3[c4000] via P2P/IPC/read | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c004e90 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 8 from local rank 0, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x151a60000000 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 06/0 : 0[3000] -> 3[c4000] via P2P/IPC/read | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c004ed0 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 9 from local rank 0, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x151a60600000 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 01/0 : 4[3000] -> 7[c4000] via P2P/IPC/read | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34004e10 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 6 from local rank 0, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x145c31400000 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 02/0 : 4[3000] -> 7[c4000] via P2P/IPC/read | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34004e50 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 7 from local rank 0, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x145c31a00000 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 05/0 : 4[3000] -> 7[c4000] via P2P/IPC/read | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34004e90 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 8 from local rank 0, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x145c2e000000 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 06/0 : 4[3000] -> 7[c4000] via P2P/IPC/read | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34004ed0 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 9 from local rank 0, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x145c2e600000 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 6 from local rank 3, transport 2 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608004e10 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 02/0 : 3[c4000] -> 6[84000] [send] via NET/IB/2(2)/GDRDMA | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 7 from local rank 3, transport 2 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 6 from local rank 3, transport 2 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c004e10 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 02/0 : 7[c4000] -> 2[84000] [send] via NET/IB/2(6)/GDRDMA | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 7 from local rank 3, transport 2 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18004dd0 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 05/0 : 2[84000] -> 5[44000] [receive] via NET/IB/1/GDRDMA | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220004dd0 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 05/0 : 6[84000] -> 1[44000] [receive] via NET/IB/1/GDRDMA | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c004dd0 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 06/0 : 3[c4000] -> 6[84000] [receive] via NET/IB/2/GDRDMA | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608004dd0 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 06/0 : 7[c4000] -> 2[84000] [receive] via NET/IB/2/GDRDMA | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 6 from local rank 2, transport 2 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18004e10 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 01/0 : 6[84000] -> 1[44000] [send] via NET/IB/1(5)/GDRDMA | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 7 from local rank 2, transport 2 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 6 from local rank 2, transport 2 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220004e10 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 01/0 : 2[84000] -> 5[44000] [send] via NET/IB/1(1)/GDRDMA | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 7 from local rank 2, transport 2 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608004e50 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 06/0 : 3[c4000] -> 6[84000] [send] via NET/IB/2(2)/GDRDMA | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c004e50 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 06/0 : 7[c4000] -> 2[84000] [send] via NET/IB/2(6)/GDRDMA | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 10 from local rank 0, transport 0 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c004f10 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a5e600000 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c004f50 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 11 from local rank 0, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a5ec00000 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c004f90 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 12 from local rank 0, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a5f200000 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c004fd0 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 13 from local rank 0, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a5f800000 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005010 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 14 from local rank 0, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a5c000000 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005050 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 15 from local rank 0, transport 0 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34004f10 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 10 from local rank 0, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a5c600000 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c2c600000 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34004f50 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 11 from local rank 0, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c2cc00000 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34004f90 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 12 from local rank 0, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c2d200000 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 13 from local rank 0, transport 0 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34004fd0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c2d800000 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005010 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 14 from local rank 0, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c2a000000 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005050 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 15 from local rank 0, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c2a600000 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0038:16070:16153 [3] NCCL INFO Allocated 4194656 bytes of shared memory in /dev/shm/nccl-1i6b16 | |
jwb0061:16371:16454 [3] NCCL INFO Allocated 4194656 bytes of shared memory in /dev/shm/nccl-KboMPu | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 6 from local rank 0, transport 2 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8004e10 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 03/0 : 0[3000] -> 7[c4000] [send] via NET/IB/3(3)/GDRDMA | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 7 from local rank 0, transport 2 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 6 from local rank 0, transport 2 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200004e10 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 03/0 : 4[3000] -> 3[c4000] [send] via NET/IB/3(7)/GDRDMA | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 7 from local rank 0, transport 2 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x146608024000 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18004e50 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 05/0 : 6[84000] -> 1[44000] [send] via NET/IB/1(5)/GDRDMA | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x14ce18024000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x14f22001f000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x14f220026000 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220004e50 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 05/0 : 2[84000] -> 5[44000] [send] via NET/IB/1(1)/GDRDMA | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x14f220032000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x14ca5c024000 | |
jwb0038:16069:16154 [2] NCCL INFO NET/IB: Dev 2 Port 1 qpn 18879 mtu 5 LID 5813 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x146608046000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x146600600000 | |
jwb0038:16069:16154 [2] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-BltQnw | |
jwb0038:16069:16154 [2] NCCL INFO Mem Realloc old size 0, new size 768 pointer 0x1466080453b0 | |
jwb0061:16369:16456 [1] NCCL INFO NET/IB: Dev 1 Port 1 qpn 30060 mtu 5 LID 5803 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x14ce18046000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x14ce10600000 | |
jwb0061:16369:16456 [1] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-POYe1w | |
jwb0061:16369:16456 [1] NCCL INFO Mem Realloc old size 0, new size 768 pointer 0x14ce180453b0 | |
jwb0038:16068:16152 [1] NCCL INFO NET/IB: Dev 1 Port 1 qpn 17331 mtu 5 LID 5814 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x14f220054000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x14f21e600000 | |
jwb0038:16068:16152 [1] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-dQDje2 | |
jwb0061:16370:16455 [2] NCCL INFO NET/IB: Dev 2 Port 1 qpn 30063 mtu 5 LID 5808 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x14ca5c046000 | |
jwb0038:16068:16152 [1] NCCL INFO Mem Realloc old size 0, new size 768 pointer 0x14f220041320 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x14ca54600000 | |
jwb0061:16370:16455 [2] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-DZhFyF | |
jwb0061:16370:16455 [2] NCCL INFO Mem Realloc old size 0, new size 768 pointer 0x14ca5c0453b0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x14ce18048000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x14ce1804f000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x146608048000 | |
jwb0061:16369:16456 [1] NCCL INFO NCCL_IB_TIMEOUT set by environment to 20. | |
jwb0038:16069:16154 [2] NCCL INFO NET/IB: Dev 2 Port 1 qpn 18880 mtu 5 LID 5813 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x14660805f000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x1465fe000000 | |
jwb0038:16069:16154 [2] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-OsqIiL | |
jwb0038:16068:16152 [1] NCCL INFO NCCL_IB_TIMEOUT set by environment to 20. | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x14ce1806b000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x14ca5c048000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x14ce11000000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14ce1c600200 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x14f22006b000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x14f21f000000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14f22e600200 | |
jwb0061:16370:16455 [2] NCCL INFO NET/IB: Dev 2 Port 1 qpn 30064 mtu 5 LID 5808 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x14ca5c05f000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x14ce1806d000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x14ca52000000 | |
jwb0061:16370:16455 [2] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-WEXrGv | |
jwb0061:16369:16456 [1] NCCL INFO NET/IB: Dev 1 Port 1 qpn 30063 mtu 5 LID 5803 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x14ce18084000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x14ce0e000000 | |
jwb0061:16369:16456 [1] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-KQze9u | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x14f22006d000 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 8 from local rank 3, transport 2 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8004e90 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 03/0 : 4[3000] -> 3[c4000] [receive] via NET/IB/3/GDRDMA | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 9 from local rank 3, transport 2 | |
jwb0038:16068:16152 [1] NCCL INFO NET/IB: Dev 1 Port 1 qpn 17334 mtu 5 LID 5814 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x14f220084000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x14f216000000 | |
jwb0038:16068:16152 [1] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-40NnVL | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 8 from local rank 3, transport 2 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200004e90 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 03/0 : 0[3000] -> 7[c4000] [receive] via NET/IB/3/GDRDMA | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 9 from local rank 3, transport 2 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x14ce18086000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x14ce1808d000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x14f220086000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x14f22008d000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x14ca5c061000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x14ca5c068000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x146608061000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x146608068000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x14ce180a8000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x14f2200a8000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x14ce0ea00000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x14f216a00000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14f22e602200 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14ce1c602200 | |
jwb0061:16370:16455 [2] NCCL INFO NCCL_IB_TIMEOUT set by environment to 20. | |
jwb0038:16069:16154 [2] NCCL INFO NCCL_IB_TIMEOUT set by environment to 20. | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x14ca5c084000 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18004e90 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 8 from local rank 1, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x14ca53400000 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220004e90 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 8 from local rank 1, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x146608084000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14ca60600200 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ce0f400000 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18004ed0 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 9 from local rank 1, transport 0 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14f217400000 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220004ed0 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 9 from local rank 1, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x1465ff400000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14660c600200 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ce0fa00000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14f217a00000 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220004f10 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 10 from local rank 1, transport 0 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18004f10 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 10 from local rank 1, transport 0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ce11a00000 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18004f50 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 11 from local rank 1, transport 0 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14f21fa00000 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220004f50 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 11 from local rank 1, transport 0 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14f214000000 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 01/0 : 1[44000] -> 0[3000] via P2P/IPC/read | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220004f90 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 12 from local rank 1, transport 0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ce06000000 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 01/0 : 5[44000] -> 4[3000] via P2P/IPC/read | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18004f90 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 12 from local rank 1, transport 0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ce06600000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f214600000 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 02/0 : 1[44000] -> 0[3000] via P2P/IPC/read | |
jwb0061:16369:16436 [1] NCCL INFO Channel 02/0 : 5[44000] -> 4[3000] via P2P/IPC/read | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220004fd0 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 13 from local rank 1, transport 0 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18004fd0 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 13 from local rank 1, transport 0 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f214c00000 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 03/0 : 1[44000] -> 0[3000] via P2P/IPC/read | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 14 from local rank 1, transport 0 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005010 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ce06c00000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f215200000 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 05/0 : 1[44000] -> 0[3000] via P2P/IPC/read | |
jwb0061:16369:16436 [1] NCCL INFO Channel 03/0 : 5[44000] -> 4[3000] via P2P/IPC/read | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005050 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 15 from local rank 1, transport 0 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005010 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 14 from local rank 1, transport 0 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8004e50 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 07/0 : 0[3000] -> 7[c4000] [send] via NET/IB/3(3)/GDRDMA | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f215800000 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 06/0 : 1[44000] -> 0[3000] via P2P/IPC/read | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005090 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 16 from local rank 1, transport 0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ce07200000 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 05/0 : 5[44000] -> 4[3000] via P2P/IPC/read | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005050 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 15 from local rank 1, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x14ca5c086000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x14ca5c08d000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ce07800000 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 06/0 : 5[44000] -> 4[3000] via P2P/IPC/read | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005090 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 16 from local rank 1, transport 0 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f212000000 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 07/0 : 1[44000] -> 0[3000] via P2P/IPC/read | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 17 from local rank 1, transport 0 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f2200050d0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x146608086000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ce04000000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x14660808d000 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 07/0 : 5[44000] -> 4[3000] via P2P/IPC/read | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 17 from local rank 1, transport 0 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce180050d0 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f212600000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ce04600000 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200004e50 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 07/0 : 4[3000] -> 3[c4000] [send] via NET/IB/3(7)/GDRDMA | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x14ca5c0a8000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x14ca4a000000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14ca60602200 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x1466080a8000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x1465f6000000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14660c602200 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c004e90 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 8 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ca4aa00000 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c004ed0 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 9 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ca4b000000 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c004f10 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 10 from local rank 2, transport 0 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608004e90 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 8 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ca4b600000 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c004f50 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 11 from local rank 2, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1465f6a00000 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608004ed0 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 9 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ca55a00000 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 02/0 : 6[84000] -> 5[44000] via P2P/IPC/read | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c004f90 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 12 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ca48000000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1465f7000000 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 03/0 : 6[84000] -> 5[44000] via P2P/IPC/read | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c004fd0 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608004f10 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 10 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 13 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ca48600000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1465f7600000 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608004f50 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 11 from local rank 2, transport 0 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 06/0 : 6[84000] -> 5[44000] via P2P/IPC/read | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005010 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 14 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ca48c00000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x146601a00000 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 02/0 : 2[84000] -> 1[44000] via P2P/IPC/read | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608004f90 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 12 from local rank 2, transport 0 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 07/0 : 6[84000] -> 5[44000] via P2P/IPC/read | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005050 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 15 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ca49200000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1465f4000000 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 03/0 : 2[84000] -> 1[44000] via P2P/IPC/read | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608004fd0 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 13 from local rank 2, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1465f4600000 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 06/0 : 2[84000] -> 1[44000] via P2P/IPC/read | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005010 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 14 from local rank 2, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1465f4c00000 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 07/0 : 2[84000] -> 1[44000] via P2P/IPC/read | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005050 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 15 from local rank 2, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1465f5200000 | |
jwb0061:16369:16436 [1] NCCL INFO Connected all rings | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005110 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 18 from local rank 1, transport 0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ce00c00000 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005150 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 19 from local rank 1, transport 0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ce01200000 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 03/0 : 5[44000] -> 6[84000] via P2P/IPC/read | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005190 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 20 from local rank 1, transport 0 | |
jwb0038:16068:16138 [1] NCCL INFO Connected all rings | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005110 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 18 from local rank 1, transport 0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ce01800000 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 07/0 : 5[44000] -> 6[84000] via P2P/IPC/read | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce180051d0 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 21 from local rank 1, transport 0 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14f20ec00000 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005150 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 19 from local rank 1, transport 0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14cdfe000000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14f20f200000 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 03/0 : 1[44000] -> 2[84000] via P2P/IPC/read | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005190 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 20 from local rank 1, transport 0 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f20f800000 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 07/0 : 1[44000] -> 2[84000] via P2P/IPC/read | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 21 from local rank 1, transport 0 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f2200051d0 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f20c000000 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8004ed0 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 07/0 : 4[3000] -> 3[c4000] [receive] via NET/IB/3/GDRDMA | |
jwb0038:16070:16137 [3] NCCL INFO Channel 01/0 : 3[c4000] -> 2[84000] via P2P/IPC/read | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8004f10 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 10 from local rank 3, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x148acc600000 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 03/0 : 3[c4000] -> 2[84000] via P2P/IPC/read | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8004f50 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 11 from local rank 3, transport 0 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200004ed0 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 07/0 : 0[3000] -> 7[c4000] [receive] via NET/IB/3/GDRDMA | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x148accc00000 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 01/0 : 7[c4000] -> 6[84000] via P2P/IPC/read | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200004f10 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 05/0 : 3[c4000] -> 2[84000] via P2P/IPC/read | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 10 from local rank 3, transport 0 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8004f90 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 12 from local rank 3, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x148acd200000 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 07/0 : 3[c4000] -> 2[84000] via P2P/IPC/read | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 13 from local rank 3, transport 0 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8004fd0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1531f4600000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x148acd800000 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 03/0 : 7[c4000] -> 6[84000] via P2P/IPC/read | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200004f50 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 11 from local rank 3, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1531f4c00000 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 05/0 : 7[c4000] -> 6[84000] via P2P/IPC/read | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200004f90 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 12 from local rank 3, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1531f5200000 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 07/0 : 7[c4000] -> 6[84000] via P2P/IPC/read | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x153200028000 | |
jwb0061:16371:16454 [3] NCCL INFO NET/IB: Dev 3 Port 1 qpn 30056 mtu 5 LID 5809 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x15320004a000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x1531f2000000 | |
jwb0061:16371:16454 [3] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-1jjBtL | |
jwb0061:16371:16454 [3] NCCL INFO Mem Realloc old size 0, new size 768 pointer 0x1532000372e0 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200004fd0 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 13 from local rank 3, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1531f2a00000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x148ad8029000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x15320004c000 | |
jwb0061:16371:16454 [3] NCCL INFO NET/IB: Dev 3 Port 1 qpn 30057 mtu 5 LID 5809 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x153200063000 | |
jwb0038:16070:16153 [3] NCCL INFO NET/IB: Dev 3 Port 1 qpn 17333 mtu 5 LID 5857 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x148ad804b000 | |
jwb0038:16069:16139 [2] NCCL INFO Connected all rings | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005090 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 16 from local rank 2, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x148ac8000000 | |
jwb0038:16070:16153 [3] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-dpTnGO | |
jwb0038:16070:16153 [3] NCCL INFO Mem Realloc old size 0, new size 768 pointer 0x148ad80387c0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1465f0c00000 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x1466080050d0 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 17 from local rank 2, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1465f1200000 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 03/0 : 2[84000] -> 3[c4000] via P2P/IPC/read | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005110 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 18 from local rank 2, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x1531f3600000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1465f1800000 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 07/0 : 2[84000] -> 3[c4000] via P2P/IPC/read | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005150 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 19 from local rank 2, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1465ee000000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x148ad804b000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x148ad8052000 | |
jwb0038:16070:16153 [3] NCCL INFO NCCL_IB_TIMEOUT set by environment to 20. | |
jwb0061:16371:16454 [3] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-LF6jQ7 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x148ad806f000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x148ac8a00000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x148ae6600200 | |
jwb0061:16370:16435 [2] NCCL INFO Connected all rings | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005090 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 16 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ca44c00000 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c0050d0 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 17 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ca45200000 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 03/0 : 6[84000] -> 7[c4000] via P2P/IPC/read | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005110 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 18 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ca45800000 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 07/0 : 6[84000] -> 7[c4000] via P2P/IPC/read | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005150 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 19 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ca42000000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x148ad8071000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x153200065000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x15320006c000 | |
jwb0038:16070:16153 [3] NCCL INFO NET/IB: Dev 3 Port 1 qpn 17336 mtu 5 LID 5857 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x148ad8088000 | |
jwb0061:16371:16454 [3] NCCL INFO NCCL_IB_TIMEOUT set by environment to 20. | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:569 Cuda Alloc Size 10485760 pointer 0x148ac9400000 | |
jwb0038:16070:16153 [3] NCCL INFO Allocated 532480 bytes of shared memory in /dev/shm/nccl-BylfPW | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x153200088000 | |
jwb0061:16368:16432 [0] NCCL INFO Connected all rings | |
jwb0061:16368:16432 [0] NCCL INFO Channel 03/0 : 4[3000] -> 5[44000] via P2P/IPC/read | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x1531f0c00000 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005090 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x15320e600200 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 16 from local rank 0, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x145c26600000 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 07/0 : 4[3000] -> 5[44000] via P2P/IPC/read | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c340050d0 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 17 from local rank 0, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x145c26c00000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x148ad808a000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x148ad8091000 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 01/0 : 4[3000] -> 6[84000] via P2P/IPC/read | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005110 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 18 from local rank 0, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x148ad80ac000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x148ac6000000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x148ae6602200 | |
jwb0038:16067:16132 [0] NCCL INFO Connected all rings | |
jwb0038:16067:16132 [0] NCCL INFO Channel 03/0 : 0[3000] -> 1[44000] via P2P/IPC/read | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005090 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 16 from local rank 0, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x15320008a000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x153200091000 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 01/0 : 5[44000] -> 7[c4000] via P2P/IPC/read | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x145c24000000 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005210 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 02/0 : 4[3000] -> 6[84000] via P2P/IPC/read | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 22 from local rank 1, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x151a58600000 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 07/0 : 0[3000] -> 1[44000] via P2P/IPC/read | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005150 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 19 from local rank 0, transport 0 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c0050d0 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 17 from local rank 0, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x151a58c00000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x145c24600000 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 05/0 : 4[3000] -> 6[84000] via P2P/IPC/read | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005190 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 20 from local rank 0, transport 0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14cdfc000000 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 02/0 : 5[44000] -> 7[c4000] via P2P/IPC/read | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005250 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 23 from local rank 1, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x145c24c00000 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 06/0 : 4[3000] -> 6[84000] via P2P/IPC/read | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c340051d0 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 21 from local rank 0, transport 0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14cdfc600000 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 05/0 : 5[44000] -> 7[c4000] via P2P/IPC/read | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x145c25200000 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005290 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 24 from local rank 1, transport 0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14cdfcc00000 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 06/0 : 5[44000] -> 7[c4000] via P2P/IPC/read | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce180052d0 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 25 from local rank 1, transport 0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14cdfd200000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x1532000ac000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x1531f1600000 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 01/0 : 0[3000] -> 2[84000] via P2P/IPC/read | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005110 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 18 from local rank 0, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x15320e602200 | |
jwb0038:16070:16137 [3] NCCL INFO Connected all rings | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005010 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 14 from local rank 3, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x151a56000000 | |
jwb0061:16371:16434 [3] NCCL INFO Connected all rings | |
jwb0038:16068:16138 [1] NCCL INFO Channel 01/0 : 1[44000] -> 3[c4000] via P2P/IPC/read | |
jwb0038:16067:16132 [0] NCCL INFO Channel 02/0 : 0[3000] -> 2[84000] via P2P/IPC/read | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005010 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 14 from local rank 3, transport 0 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005150 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 22 from local rank 1, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 19 from local rank 0, transport 0 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005210 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x148ac6a00000 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005050 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 15 from local rank 3, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1531ee000000 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005050 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 15 from local rank 3, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x151a56600000 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 05/0 : 0[3000] -> 2[84000] via P2P/IPC/read | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f20a000000 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005190 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 20 from local rank 0, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x148ac7000000 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 02/0 : 1[44000] -> 3[c4000] via P2P/IPC/read | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1531ee600000 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005250 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 23 from local rank 1, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x151a56c00000 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 06/0 : 0[3000] -> 2[84000] via P2P/IPC/read | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c0051d0 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 21 from local rank 0, transport 0 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f20a600000 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 05/0 : 1[44000] -> 3[c4000] via P2P/IPC/read | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005290 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 24 from local rank 1, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 16 from local rank 3, transport 0 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005090 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1531ef800000 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x1532000050d0 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 17 from local rank 3, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 16 from local rank 3, transport 0 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005090 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f20ac00000 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 06/0 : 1[44000] -> 3[c4000] via P2P/IPC/read | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1531ec000000 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f2200052d0 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 25 from local rank 1, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x148ac4000000 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005110 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 18 from local rank 3, transport 0 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005190 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 20 from local rank 2, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x151a57200000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1531ec600000 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005150 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 19 from local rank 3, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ca40000000 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c0051d0 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 21 from local rank 2, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1531ecc00000 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad80050d0 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 17 from local rank 3, transport 0 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005190 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 20 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ca40600000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f20b200000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x148ac4600000 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005110 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 18 from local rank 3, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1465ec000000 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x1466080051d0 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 21 from local rank 2, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x148ac4c00000 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005150 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 19 from local rank 3, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1465ec600000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x148ac5200000 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005210 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 22 from local rank 2, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1465ecc00000 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005210 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 22 from local rank 2, transport 0 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005250 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 23 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ca40c00000 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005250 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 23 from local rank 2, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1465ed200000 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 20 from local rank 3, transport 2 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ca41200000 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 26 from local rank 1, transport 2 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 26 from local rank 1, transport 2 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 20 from local rank 3, transport 2 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 22 from local rank 0, transport 2 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 24 from local rank 2, transport 2 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 22 from local rank 0, transport 2 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 24 from local rank 2, transport 2 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005190 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 03/0 : 3[c4000] -> 7[c4000] [receive] via NET/IB/3/GDRDMA | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy recv connection 21 from local rank 3, transport 2 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005310 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 01/0 : 1[44000] -> 5[44000] [receive] via NET/IB/1/GDRDMA | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 27 from local rank 1, transport 2 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005190 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005310 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 03/0 : 7[c4000] -> 3[c4000] [receive] via NET/IB/3/GDRDMA | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 0 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 01/0 : 5[44000] -> 1[44000] [receive] via NET/IB/1/GDRDMA | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 0 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy recv connection 21 from local rank 3, transport 2 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 27 from local rank 1, transport 2 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005210 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 00/0 : 0[3000] -> 4[3000] [receive] via NET/IB/0/GDRDMA | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 23 from local rank 0, transport 2 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005210 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005290 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 02/0 : 6[84000] -> 2[84000] [receive] via NET/IB/2/GDRDMA | |
jwb0038:16067:16132 [0] NCCL INFO Channel 00/0 : 4[3000] -> 0[3000] [receive] via NET/IB/0/GDRDMA | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 0 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 25 from local rank 2, transport 2 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 23 from local rank 0, transport 2 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005290 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 02/0 : 2[84000] -> 6[84000] [receive] via NET/IB/2/GDRDMA | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 0 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 25 from local rank 2, transport 2 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005350 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x1532000051d0 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 07/0 : 3[c4000] -> 7[c4000] [receive] via NET/IB/3/GDRDMA | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 05/0 : 1[44000] -> 5[44000] [receive] via NET/IB/1/GDRDMA | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 22 from local rank 3, transport 2 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 28 from local rank 1, transport 2 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad80051d0 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005350 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 05/0 : 5[44000] -> 1[44000] [receive] via NET/IB/1/GDRDMA | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 28 from local rank 1, transport 2 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 07/0 : 7[c4000] -> 3[c4000] [receive] via NET/IB/3/GDRDMA | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 22 from local rank 3, transport 2 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005250 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 04/0 : 0[3000] -> 4[3000] [receive] via NET/IB/0/GDRDMA | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 24 from local rank 0, transport 2 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c0052d0 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 06/0 : 2[84000] -> 6[84000] [receive] via NET/IB/2/GDRDMA | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 26 from local rank 2, transport 2 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x1466080052d0 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005250 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 06/0 : 6[84000] -> 2[84000] [receive] via NET/IB/2/GDRDMA | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 04/0 : 4[3000] -> 0[3000] [receive] via NET/IB/0/GDRDMA | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 26 from local rank 2, transport 2 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 24 from local rank 0, transport 2 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005390 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005210 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 03/0 : 7[c4000] -> 3[c4000] [send] via NET/IB/3/GDRDMA | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 01/0 : 5[44000] -> 1[44000] [send] via NET/IB/1/GDRDMA | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 23 from local rank 3, transport 2 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 29 from local rank 1, transport 2 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005390 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005210 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 03/0 : 3[c4000] -> 7[c4000] [send] via NET/IB/3/GDRDMA | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 01/0 : 1[44000] -> 5[44000] [send] via NET/IB/1/GDRDMA | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 23 from local rank 3, transport 2 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 29 from local rank 1, transport 2 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005290 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005310 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 00/0 : 4[3000] -> 0[3000] [send] via NET/IB/0/GDRDMA | |
jwb0061:16370:16435 [2] NCCL INFO Channel 02/0 : 6[84000] -> 2[84000] [send] via NET/IB/2/GDRDMA | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 27 from local rank 2, transport 2 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 25 from local rank 0, transport 2 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005290 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005310 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 00/0 : 0[3000] -> 4[3000] [send] via NET/IB/0/GDRDMA | |
jwb0038:16069:16139 [2] NCCL INFO Channel 02/0 : 2[84000] -> 6[84000] [send] via NET/IB/2/GDRDMA | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 27 from local rank 2, transport 2 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 25 from local rank 0, transport 2 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce180053d0 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005250 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 05/0 : 5[44000] -> 1[44000] [send] via NET/IB/1/GDRDMA | |
jwb0061:16371:16434 [3] NCCL INFO Channel 07/0 : 7[c4000] -> 3[c4000] [send] via NET/IB/3/GDRDMA | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005250 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f2200053d0 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 07/0 : 3[c4000] -> 7[c4000] [send] via NET/IB/3/GDRDMA | |
jwb0038:16068:16138 [1] NCCL INFO Channel 05/0 : 1[44000] -> 5[44000] [send] via NET/IB/1/GDRDMA | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x1532000c2000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x148ad80c7000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x14f2200c7000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x14ce180c6000 | |
jwb0061:16371:16454 [3] NCCL INFO NET/IB: Dev 3 Port 1 qpn 30062 mtu 5 LID 5809 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x1532000da000 | |
jwb0061:16369:16456 [1] NCCL INFO NET/IB: Dev 1 Port 1 qpn 30066 mtu 5 LID 5803 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x14ce180de000 | |
jwb0038:16070:16153 [3] NCCL INFO NET/IB: Dev 3 Port 1 qpn 17339 mtu 5 LID 5857 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x148ad8025000 | |
jwb0038:16068:16152 [1] NCCL INFO NET/IB: Dev 1 Port 1 qpn 17337 mtu 5 LID 5814 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x14f2200df000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x1531eac00000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x15320e604200 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x148ac3200000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x14cdfb200000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x148ae6604200 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x14f209200000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x14ce1c604200 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x14f22e604200 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x1532000da000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x14ce180e0000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x148ad80de000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x14f2200e1000 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c340052d0 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005350 | |
jwb0061:16368:16432 [0] NCCL INFO Channel 04/0 : 4[3000] -> 0[3000] [send] via NET/IB/0/GDRDMA | |
jwb0061:16370:16435 [2] NCCL INFO Channel 06/0 : 6[84000] -> 2[84000] [send] via NET/IB/2/GDRDMA | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005350 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 06/0 : 2[84000] -> 6[84000] [send] via NET/IB/2/GDRDMA | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x1466080c5000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x14ca5c0c5000 | |
jwb0061:16371:16454 [3] NCCL INFO NET/IB: Dev 3 Port 1 qpn 30063 mtu 5 LID 5809 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x1532000f2000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x1531eb600000 | |
jwb0038:16070:16153 [3] NCCL INFO NET/IB: Dev 3 Port 1 qpn 17340 mtu 5 LID 5857 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x148ad8027000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x15320e686200 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c0052d0 | |
jwb0038:16067:16132 [0] NCCL INFO Channel 04/0 : 0[3000] -> 4[3000] [send] via NET/IB/0/GDRDMA | |
jwb0061:16369:16456 [1] NCCL INFO NET/IB: Dev 1 Port 1 qpn 30067 mtu 5 LID 5803 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x14ce180f7000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x148ac0000000 | |
jwb0038:16068:16152 [1] NCCL INFO NET/IB: Dev 1 Port 1 qpn 17338 mtu 5 LID 5814 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x151a6c0c3000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x14f2200f8000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x148ae6686200 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x14cdf8000000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x14ce1c686200 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x14f206000000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x145c340c3000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x14f22e686200 | |
jwb0061:16370:16455 [2] NCCL INFO NET/IB: Dev 2 Port 1 qpn 30069 mtu 5 LID 5808 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x14ca5c0dd000 | |
jwb0038:16069:16154 [2] NCCL INFO NET/IB: Dev 2 Port 1 qpn 18885 mtu 5 LID 5813 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x1466080dd000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x14ca3f200000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x14ca60604200 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x1465eb200000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x14660c604200 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x1532000f4000 | |
jwb0038:16067:16155 [0] NCCL INFO NET/IB: Dev 0 Port 1 qpn 20185 mtu 5 LID 5858 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x151a6c0db000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x1532000fb000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x148ad80f5000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x148ad80fc000 | |
jwb0061:16368:16453 [0] NCCL INFO NET/IB: Dev 0 Port 1 qpn 31129 mtu 5 LID 5812 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x145c340db000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x151a55200000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x151a74604200 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x14ce180f9000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x14ce18100000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x14f2200fa000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x14f220101000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x145c23200000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x145c42604200 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x14ca5c0df000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x1466080df000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x153200117000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x151a6c0dd000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x148ad8118000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x1531e8000000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x148ac0a00000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x15320e708200 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x148ae6708200 | |
jwb0061:16370:16455 [2] NCCL INFO NET/IB: Dev 2 Port 1 qpn 30070 mtu 5 LID 5808 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x14ca5c0f6000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:596 Ib Alloc Size 26560 pointer 0x145c340dd000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x14ca3c000000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x14ca60686200 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x14f22011d000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x14ce1811c000 | |
jwb0038:16069:16154 [2] NCCL INFO NET/IB: Dev 2 Port 1 qpn 18886 mtu 5 LID 5813 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x1466080f6000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x14f206a00000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14f22e708200 | |
jwb0038:16067:16155 [0] NCCL INFO NET/IB: Dev 0 Port 1 qpn 20186 mtu 5 LID 5858 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x151a6c0f4000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x1465e8000000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x14cdf8a00000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14ce1c708200 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x14660c686200 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x151a52000000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x151a74686200 | |
jwb0061:16368:16453 [0] NCCL INFO NET/IB: Dev 0 Port 1 qpn 31130 mtu 5 LID 5812 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:653 Ib Alloc Size 552 pointer 0x145c340f5000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x153200119000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x153200120000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:569 Cuda Alloc Size 9109504 pointer 0x145c20000000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:577 Cuda Host Alloc Size 532480 pointer 0x145c42686200 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x148ad811a000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x148ad8121000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x14ca5c0f8000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x14ca5c0ff000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x14ce1811e000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x14ce18125000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x1466080f8000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x1466080ff000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x14f22011f000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x14f220126000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x151a6c0f6000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x151a6c0fd000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x145c340f7000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x15320013b000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x145c340fe000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x1531e8a00000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x15320e70a200 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x148ad813c000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x148ac1400000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x148ae670a200 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x14ca5c11b000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x14ca3ca00000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14ca60708200 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x14ce18140000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x14f220141000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x14f207400000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14f22e70a200 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x14cdf9400000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14ce1c70a200 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x151a6c119000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x14660811b000 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 01/0 : 7[c4000] -> 4[3000] via P2P/IPC/read | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005290 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 24 from local rank 3, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1531e9400000 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 02/0 : 7[c4000] -> 4[3000] via P2P/IPC/read | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x1532000052d0 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 25 from local rank 3, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x145c3411a000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x151a52a00000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x151a74708200 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x1465e8a00000 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 01/0 : 3[c4000] -> 0[3000] via P2P/IPC/read | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14660c708200 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005290 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 24 from local rank 3, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1531e9a00000 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 05/0 : 7[c4000] -> 4[3000] via P2P/IPC/read | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005310 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 26 from local rank 3, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x145c20a00000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x145c42708200 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x148abe000000 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 02/0 : 3[c4000] -> 0[3000] via P2P/IPC/read | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad80052d0 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 25 from local rank 3, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1531e6000000 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 06/0 : 7[c4000] -> 4[3000] via P2P/IPC/read | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005350 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 27 from local rank 3, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x148abe600000 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 05/0 : 3[c4000] -> 0[3000] via P2P/IPC/read | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005310 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 26 from local rank 3, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1531e6600000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x14ca5c11d000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x14ca5c124000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x148abec00000 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 06/0 : 3[c4000] -> 0[3000] via P2P/IPC/read | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005350 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 27 from local rank 3, transport 0 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005410 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 30 from local rank 1, transport 0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14cdf6000000 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005450 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 31 from local rank 1, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x148abf200000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14cdf6600000 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005490 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 32 from local rank 1, transport 0 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005410 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 30 from local rank 1, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x145c3411c000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x145c34123000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14cdf6c00000 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 33 from local rank 1, transport 0 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce180054d0 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14f204000000 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005450 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 31 from local rank 1, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x151a6c11b000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x151a6c122000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14cdf7200000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14f204600000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:683 Ib Alloc Size 21688 pointer 0x14660811d000 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005490 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 32 from local rank 1, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:696 Ib Alloc Size 552 pointer 0x146608124000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14f204c00000 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f2200054d0 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 33 from local rank 1, transport 0 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14f205200000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x14ca5c13f000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x14ca3d400000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14ca6070a200 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x145c3413e000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x145c21400000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x145c4270a200 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x151a6c13d000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x151a53400000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x151a7470a200 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net_ib.cc:771 Ib Alloc Size 552 pointer 0x14660813f000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:700 Cuda Alloc Size 9633792 pointer 0x1465e9400000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:704 Cuda Host Alloc Size 8192 pointer 0x14660c70a200 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 01/0 : 6[84000] -> 4[3000] via P2P/IPC/read | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005390 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 28 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ca3a000000 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 02/0 : 6[84000] -> 4[3000] via P2P/IPC/read | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c0053d0 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 29 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ca3a600000 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 05/0 : 6[84000] -> 4[3000] via P2P/IPC/read | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005410 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 30 from local rank 2, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ca3ac00000 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 06/0 : 6[84000] -> 4[3000] via P2P/IPC/read | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005450 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 31 from local rank 2, transport 0 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005310 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 26 from local rank 0, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ca3b200000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c1e000000 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005350 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 27 from local rank 0, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c1e600000 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005310 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 26 from local rank 0, transport 0 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005390 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 28 from local rank 0, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c1ec00000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a50000000 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c340053d0 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 29 from local rank 0, transport 0 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005350 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 27 from local rank 0, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c1f200000 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 01/0 : 2[84000] -> 0[3000] via P2P/IPC/read | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a50600000 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005390 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 28 from local rank 2, transport 0 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005390 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 28 from local rank 0, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1465e6000000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a50c00000 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 02/0 : 2[84000] -> 0[3000] via P2P/IPC/read | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x1466080053d0 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 29 from local rank 2, transport 0 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c0053d0 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 29 from local rank 0, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1465e6600000 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 05/0 : 2[84000] -> 0[3000] via P2P/IPC/read | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a51200000 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005410 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 30 from local rank 2, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1465e6c00000 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 06/0 : 2[84000] -> 0[3000] via P2P/IPC/read | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005450 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 31 from local rank 2, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1465e7200000 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 01/0 : 7[c4000] -> 5[44000] via P2P/IPC/read | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005390 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 28 from local rank 3, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 30 from local rank 0, transport 0 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005410 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 01/0 : 3[c4000] -> 1[44000] via P2P/IPC/read | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005390 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 28 from local rank 3, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1531e4600000 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 02/0 : 7[c4000] -> 5[44000] via P2P/IPC/read | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x1532000053d0 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 29 from local rank 3, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c1d200000 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 31 from local rank 0, transport 0 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005450 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1531e4c00000 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 05/0 : 7[c4000] -> 5[44000] via P2P/IPC/read | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005410 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 30 from local rank 3, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c1d800000 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005490 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 32 from local rank 0, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1531e5200000 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 06/0 : 7[c4000] -> 5[44000] via P2P/IPC/read | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005450 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 31 from local rank 3, transport 0 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c1a000000 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 30 from local rank 0, transport 0 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005410 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c340054d0 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 33 from local rank 0, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1531e5800000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x148abd200000 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 02/0 : 3[c4000] -> 1[44000] via P2P/IPC/read | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c1a600000 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad80053d0 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 29 from local rank 3, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a4f200000 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005450 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 31 from local rank 0, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x148abd800000 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 05/0 : 3[c4000] -> 1[44000] via P2P/IPC/read | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005410 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 30 from local rank 3, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a4f800000 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005490 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 32 from local rank 0, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x148aba000000 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 06/0 : 3[c4000] -> 1[44000] via P2P/IPC/read | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005450 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 31 from local rank 3, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a4c000000 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c0054d0 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 33 from local rank 0, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x148aba600000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a4c600000 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005510 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 34 from local rank 1, transport 0 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 00/0 : 7[c4000] -> 6[84000] via P2P/IPC/read | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005490 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 32 from local rank 3, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 32 from local rank 2, transport 0 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005490 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14cdf5200000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1531e3800000 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005550 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy recv connection 35 from local rank 1, transport 0 | |
jwb0061:16371:16434 [3] NCCL INFO Channel 04/0 : 7[c4000] -> 6[84000] via P2P/IPC/read | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x1532000054d0 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 33 from local rank 3, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ca39200000 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c0054d0 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy recv connection 33 from local rank 2, transport 0 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 34 from local rank 1, transport 0 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005510 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14cdf5800000 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 00/0 : 3[c4000] -> 2[84000] via P2P/IPC/read | |
jwb0061:16369:16436 [1] NCCL INFO Channel 00/0 : 5[44000] -> 4[3000] via P2P/IPC/read | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005490 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 32 from local rank 3, transport 0 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005590 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 36 from local rank 1, transport 0 | |
jwb0061:16371:16454 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1531e0000000 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 32 from local rank 2, transport 0 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005490 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 34 from local rank 0, transport 0 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005510 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14ca39800000 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 00/0 : 6[84000] -> 5[44000] via P2P/IPC/read | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005510 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 34 from local rank 2, transport 0 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14cdf2000000 | |
jwb0061:16369:16436 [1] NCCL INFO Channel 04/0 : 5[44000] -> 4[3000] via P2P/IPC/read | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c18600000 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 34 from local rank 0, transport 0 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005510 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce180055d0 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 37 from local rank 1, transport 0 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005550 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy recv connection 35 from local rank 0, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ca36000000 | |
jwb0061:16370:16435 [2] NCCL INFO Channel 04/0 : 6[84000] -> 5[44000] via P2P/IPC/read | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005550 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 35 from local rank 2, transport 0 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14f203200000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14cdf2600000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x145c18c00000 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy recv connection 35 from local rank 1, transport 0 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005550 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x148ab8600000 | |
jwb0038:16070:16137 [3] NCCL INFO Channel 04/0 : 3[c4000] -> 2[84000] via P2P/IPC/read | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1465e5200000 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad80054d0 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 33 from local rank 3, transport 0 | |
jwb0061:16370:16455 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14ca36600000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a4a600000 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x1466080054d0 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy recv connection 33 from local rank 2, transport 0 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005550 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy recv connection 35 from local rank 0, transport 0 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x14f203800000 | |
jwb0038:16068:16138 [1] NCCL INFO Channel 00/0 : 1[44000] -> 0[3000] via P2P/IPC/read | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005590 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 36 from local rank 1, transport 0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x151a4ac00000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:449 Cuda Alloc Size 6291456 pointer 0x1465e5800000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f200000000 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 00/0 : 2[84000] -> 1[44000] via P2P/IPC/read | |
jwb0038:16068:16138 [1] NCCL INFO Channel 04/0 : 1[44000] -> 0[3000] via P2P/IPC/read | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005510 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 34 from local rank 2, transport 0 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f2200055d0 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 37 from local rank 1, transport 0 | |
jwb0038:16070:16153 [3] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x148ab8c00000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x14f200600000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1465e2000000 | |
jwb0038:16069:16139 [2] NCCL INFO Channel 04/0 : 2[84000] -> 1[44000] via P2P/IPC/read | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005550 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 35 from local rank 2, transport 0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/p2p.cc:430 Cuda Alloc Size 6291456 pointer 0x1465e2600000 | |
jwb0038:16067:16132 [0] NCCL INFO Connected all trees | |
jwb0038:16067:16132 [0] NCCL INFO Latency/AlgBw | Tree/ LL | Tree/ LL128 | Tree/Simple | Ring/ LL | Ring/ LL128 | Ring/Simple | CollNetDirect/ LL | CollNetDirect/ LL128 | CollNetDirect/Simple | CollNetChain/ LL | CollNetChain/ LL128 | CollNetChain/Simple | | |
jwb0038:16067:16132 [0] NCCL INFO Max NThreads | 512 | 640 | 512 | 512 | 640 | 512 | 0 | 0 | 512 | 0 | 0 | 512 | | |
jwb0038:16067:16132 [0] NCCL INFO Broadcast | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 25.2/ 22.5 | 42.0/ 0.0 | 232.4/ 96.0 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | | |
jwb0038:16067:16132 [0] NCCL INFO Reduce | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 25.2/ 22.5 | 42.0/ 0.0 | 232.4/ 96.0 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | | |
jwb0038:16067:16132 [0] NCCL INFO AllGather | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 11.4/ 25.7 | 25.4/ 0.0 | 38.4/ 109.7 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | | |
jwb0038:16067:16132 [0] NCCL INFO ReduceScatter | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 11.4/ 25.7 | 25.4/ 0.0 | 38.4/ 109.7 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | 0.0/ 0.0 | | |
jwb0038:16067:16132 [0] NCCL INFO AllReduce | 20.0/ 11.2 | 28.9/ 0.0 | 224.0/ 47.2 | 22.9/ 12.9 | 45.0/ 0.0 | 80.8/ 54.9 | 7.4/ 0.0 | 7.4/ 0.0 | 29.7/ 0.0 | 4.4/ 0.0 | 4.4/ 0.0 | 48.0/ 0.0 | | |
jwb0038:16067:16132 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
jwb0038:16067:16132 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 36 from local rank 0, transport 2 | |
jwb0061:16368:16432 [0] NCCL INFO Connected all trees | |
jwb0061:16368:16432 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
jwb0061:16368:16432 [0] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 36 from local rank 0, transport 2 | |
jwb0061:16371:16434 [3] NCCL INFO Connected all trees | |
jwb0061:16371:16434 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
jwb0061:16371:16434 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 34 from local rank 3, transport 2 | |
jwb0038:16070:16137 [3] NCCL INFO Connected all trees | |
jwb0038:16070:16137 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
jwb0038:16070:16137 [3] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 34 from local rank 3, transport 2 | |
jwb0061:16369:16436 [1] NCCL INFO Connected all trees | |
jwb0061:16369:16436 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
jwb0061:16369:16436 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 38 from local rank 1, transport 2 | |
jwb0061:16370:16435 [2] NCCL INFO Connected all trees | |
jwb0061:16370:16435 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
jwb0061:16370:16435 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 36 from local rank 2, transport 2 | |
jwb0038:16068:16138 [1] NCCL INFO Connected all trees | |
jwb0038:16068:16138 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
jwb0038:16068:16138 [1] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 38 from local rank 1, transport 2 | |
jwb0038:16069:16139 [2] NCCL INFO Connected all trees | |
jwb0038:16069:16139 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
jwb0038:16069:16139 [2] NCCL INFO 8 coll channels, 8 p2p channels, 2 p2p channels per peer | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 36 from local rank 2, transport 2 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005590 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0038:16067:16132 [0] NCCL INFO Mem Realloc old size 0, new size 4 pointer 0x151a7045dfb0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0038:16067:16132 [0] NCCL INFO Mem Realloc old size 4, new size 8 pointer 0x151a704b6c80 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0038:16067:16132 [0] NCCL INFO Mem Realloc old size 8, new size 12 pointer 0x151a7045dfb0 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0038:16067:16132 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x151a48000000 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005590 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0061:16368:16432 [0] NCCL INFO Mem Realloc old size 0, new size 4 pointer 0x145c3c45dfb0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0061:16368:16432 [0] NCCL INFO Mem Realloc old size 4, new size 8 pointer 0x145c3c4b6820 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0061:16368:16432 [0] NCCL INFO Mem Realloc old size 8, new size 12 pointer 0x145c3c45dfb0 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0061:16368:16432 [0] NCCL INFO GPU Direct RDMA Enabled for GPU 3000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x145c16000000 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005510 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005510 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0038:16070:16137 [3] NCCL INFO Mem Realloc old size 0, new size 4 pointer 0x148ae045dfb0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0038:16070:16137 [3] NCCL INFO Mem Realloc old size 4, new size 8 pointer 0x148ae04c2650 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0038:16070:16137 [3] NCCL INFO Mem Realloc old size 8, new size 12 pointer 0x148ae045dfb0 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0038:16070:16137 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0061:16371:16434 [3] NCCL INFO Mem Realloc old size 0, new size 4 pointer 0x15320845dfb0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0061:16371:16434 [3] NCCL INFO Mem Realloc old size 4, new size 8 pointer 0x1532084c2600 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0061:16371:16434 [3] NCCL INFO Mem Realloc old size 8, new size 12 pointer 0x15320845dfb0 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0061:16371:16434 [3] NCCL INFO GPU Direct RDMA Enabled for GPU c4000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c340055d0 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 37 from local rank 3, transport 2 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 37 from local rank 3, transport 2 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c0055d0 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 39 from local rank 0, transport 2 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005650 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x145c17000000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x148ab6000000 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x151a49000000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x14f1fe600000 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 39 from local rank 0, transport 2 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005650 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x14cdf0600000 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 40 from local rank 3, transport 2 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005690 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 40 from local rank 3, transport 2 | |
jwb0038:16070:16137 [3] NCCL INFO Connection to proxy localRank 2 -> connection 0x1466080055d0 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 37 from local rank 3, transport 2 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005690 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x14f1fc000000 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 37 from local rank 3, transport 2 | |
jwb0061:16371:16434 [3] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c0055d0 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x1465e0600000 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 38 from local rank 0, transport 2 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005610 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x14cdee000000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x1531de000000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x14ca34600000 | |
jwb0038:16067:16132 [0] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005550 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 35 from local rank 0, transport 2 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005590 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce18005610 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f220005610 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0038:16068:16138 [1] NCCL INFO Mem Realloc old size 0, new size 4 pointer 0x14f22845dfb0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0038:16068:16138 [1] NCCL INFO Mem Realloc old size 4, new size 8 pointer 0x14f2284ab120 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0038:16068:16138 [1] NCCL INFO Mem Realloc old size 8, new size 12 pointer 0x14f22845dfb0 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0038:16068:16138 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x1465de000000 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0061:16370:16435 [2] NCCL INFO Mem Realloc old size 0, new size 4 pointer 0x14ca5845dfb0 | |
jwb0061:16369:16436 [1] NCCL INFO Mem Realloc old size 0, new size 4 pointer 0x14ce1445dfb0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0061:16370:16435 [2] NCCL INFO Mem Realloc old size 4, new size 8 pointer 0x14ca584b6ea0 | |
jwb0061:16369:16436 [1] NCCL INFO Mem Realloc old size 4, new size 8 pointer 0x14ce144ab080 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0061:16370:16435 [2] NCCL INFO Mem Realloc old size 8, new size 12 pointer 0x14ca5845dfb0 | |
jwb0061:16369:16436 [1] NCCL INFO Mem Realloc old size 8, new size 12 pointer 0x14ce1445dfb0 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 2 (distance 3 <= 4), read 1 | |
jwb0061:16370:16435 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0061:16369:16436 [1] NCCL INFO GPU Direct RDMA Enabled for GPU 44000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x14cdef000000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x14f1fd000000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x148ab7000000 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x14ca32000000 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 38 from local rank 0, transport 2 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005610 | |
jwb0061:16368:16432 [0] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005550 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 35 from local rank 0, transport 2 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x14ca33000000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x1531df000000 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005590 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0038:16069:16139 [2] NCCL INFO Mem Realloc old size 0, new size 4 pointer 0x14660445dfb0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0038:16069:16139 [2] NCCL INFO Mem Realloc old size 4, new size 8 pointer 0x1466044b6ea0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0038:16069:16139 [2] NCCL INFO Mem Realloc old size 8, new size 12 pointer 0x14660445dfb0 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 0 (distance 3 <= 4), read 1 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 1 (distance 3 <= 4), read 1 | |
jwb0038:16069:16139 [2] NCCL INFO GPU Direct RDMA Enabled for GPU 84000 / HCA 3 (distance 3 <= 4), read 1 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x1465df000000 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 38 from local rank 2, transport 2 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005610 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 38 from local rank 1, transport 2 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005610 | |
jwb0061:16368:16453 [0] NCCL INFO New proxy send connection 39 from local rank 1, transport 2 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 0 -> connection 0x145c34005650 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 1 -> connection 0x14ce180056d0 | |
jwb0061:16369:16456 [1] NCCL INFO New proxy send connection 41 from local rank 2, transport 2 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x151a46000000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x145c14000000 | |
jwb0061:16369:16456 [1] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x14cdec000000 | |
jwb0061:16368:16453 [0] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x145c15000000 | |
jwb0038:16067:16155 [0] NCCL INFO New proxy send connection 39 from local rank 2, transport 2 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 0 -> connection 0x151a6c005650 | |
jwb0038:16068:16152 [1] NCCL INFO New proxy send connection 41 from local rank 2, transport 2 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 1 -> connection 0x14f2200056d0 | |
jwb0038:16067:16155 [0] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x151a47000000 | |
jwb0038:16068:16152 [1] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x14f1fa000000 | |
jwb0038:16069:16154 [2] NCCL INFO New proxy send connection 39 from local rank 1, transport 2 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 2 -> connection 0x146608005650 | |
jwb0061:16370:16455 [2] NCCL INFO New proxy send connection 39 from local rank 1, transport 2 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 2 -> connection 0x14ca5c005650 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 36 from local rank 2, transport 2 | |
jwb0061:16370:16435 [2] NCCL INFO Connection to proxy localRank 3 -> connection 0x153200005590 | |
jwb0061:16370:16455 [2] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x14ca30000000 | |
jwb0038:16069:16154 [2] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x1465dc000000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x1531dc000000 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 36 from local rank 2, transport 2 | |
jwb0038:16069:16139 [2] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad8005590 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x148ab4000000 | |
jwb0038:16070:16153 [3] NCCL INFO New proxy send connection 37 from local rank 1, transport 2 | |
jwb0038:16068:16138 [1] NCCL INFO Connection to proxy localRank 3 -> connection 0x148ad80055d0 | |
jwb0061:16371:16454 [3] NCCL INFO New proxy send connection 37 from local rank 1, transport 2 | |
jwb0061:16369:16436 [1] NCCL INFO Connection to proxy localRank 3 -> connection 0x1532000055d0 | |
jwb0038:16069:16139 [2] NCCL INFO init.cc:367 Cuda Alloc Size 5168 pointer 0x14660d008000 | |
jwb0038:16068:16138 [1] NCCL INFO init.cc:367 Cuda Alloc Size 5168 pointer 0x14f22f008000 | |
jwb0038:16070:16137 [3] NCCL INFO init.cc:367 Cuda Alloc Size 5168 pointer 0x148ae7008000 | |
jwb0038:16067:16132 [0] NCCL INFO init.cc:367 Cuda Alloc Size 5168 pointer 0x151a75008000 | |
jwb0038:16070:16153 [3] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x148ab5000000 | |
jwb0061:16371:16454 [3] NCCL INFO transport/net.cc:381 Cuda Alloc Size 16777216 pointer 0x1531dd000000 | |
jwb0061:16370:16435 [2] NCCL INFO init.cc:367 Cuda Alloc Size 5168 pointer 0x14ca61008000 | |
jwb0061:16368:16432 [0] NCCL INFO init.cc:367 Cuda Alloc Size 5168 pointer 0x145c43008000 | |
jwb0061:16369:16436 [1] NCCL INFO init.cc:367 Cuda Alloc Size 5168 pointer 0x14ce1d008000 | |
jwb0061:16371:16434 [3] NCCL INFO init.cc:367 Cuda Alloc Size 5168 pointer 0x15320f008000 | |
jwb0038:16068:16138 [1] NCCL INFO init.cc:392 Cuda Host Alloc Size 33554432 pointer 0x14f1f8000000 | |
jwb0038:16068:16138 [1] NCCL INFO init.cc:398 Cuda Host Alloc Size 128 pointer 0x14f22e70c200 | |
jwb0038:16068:16138 [1] NCCL INFO comm 0x37fab430 rank 1 nranks 8 cudaDev 1 busId 44000 - Init COMPLETE | |
jwb0038:16068:16068 [1] NCCL INFO AllReduce: opCount 0 sendbuff 0x14f266c00000 recvbuff 0x14f266c00000 count 1 datatype 1 op 0 root 0 comm 0x37fab430 [nranks=8] stream 0x37d9e480 | |
jwb0038:16068:16068 [1] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) | |
jwb0061:16370:16435 [2] NCCL INFO init.cc:392 Cuda Host Alloc Size 33554432 pointer 0x14ca2e000000 | |
jwb0061:16370:16435 [2] NCCL INFO init.cc:398 Cuda Host Alloc Size 128 pointer 0x14ca6070c200 | |
jwb0061:16370:16435 [2] NCCL INFO comm 0x3724b850 rank 6 nranks 8 cudaDev 2 busId 84000 - Init COMPLETE | |
jwb0038:16070:16137 [3] NCCL INFO init.cc:392 Cuda Host Alloc Size 33554432 pointer 0x148ab2000000 | |
jwb0061:16369:16436 [1] NCCL INFO init.cc:392 Cuda Host Alloc Size 33554432 pointer 0x14cdea000000 | |
jwb0061:16369:16436 [1] NCCL INFO init.cc:398 Cuda Host Alloc Size 128 pointer 0x14ce1c70c200 | |
jwb0061:16369:16436 [1] NCCL INFO comm 0x37bcc8f0 rank 5 nranks 8 cudaDev 1 busId 44000 - Init COMPLETE | |
jwb0038:16067:16132 [0] NCCL INFO init.cc:392 Cuda Host Alloc Size 33554432 pointer 0x151a44000000 | |
jwb0061:16370:16370 [2] NCCL INFO AllReduce: opCount 0 sendbuff 0x14ca96c00000 recvbuff 0x14ca96c00000 count 1 datatype 1 op 0 root 0 comm 0x3724b850 [nranks=8] stream 0x3703fe90 | |
jwb0038:16070:16137 [3] NCCL INFO init.cc:398 Cuda Host Alloc Size 128 pointer 0x148ae670c200 | |
jwb0061:16370:16370 [2] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) | |
jwb0061:16368:16432 [0] NCCL INFO init.cc:392 Cuda Host Alloc Size 33554432 pointer 0x145c12000000 | |
jwb0038:16070:16137 [3] NCCL INFO comm 0x38b2ffe0 rank 3 nranks 8 cudaDev 3 busId c4000 - Init COMPLETE | |
jwb0038:16069:16139 [2] NCCL INFO init.cc:392 Cuda Host Alloc Size 33554432 pointer 0x1465da000000 | |
jwb0061:16368:16432 [0] NCCL INFO init.cc:398 Cuda Host Alloc Size 128 pointer 0x145c4270c200 | |
jwb0061:16369:16369 [1] NCCL INFO AllReduce: opCount 0 sendbuff 0x14ce52c00000 recvbuff 0x14ce52c00000 count 1 datatype 1 op 0 root 0 comm 0x37bcc8f0 [nranks=8] stream 0x379c0450 | |
jwb0061:16369:16369 [1] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) | |
jwb0061:16368:16432 [0] NCCL INFO comm 0x38855390 rank 4 nranks 8 cudaDev 0 busId 3000 - Init COMPLETE | |
jwb0061:16371:16434 [3] NCCL INFO init.cc:392 Cuda Host Alloc Size 33554432 pointer 0x1531da000000 | |
jwb0038:16069:16139 [2] NCCL INFO init.cc:398 Cuda Host Alloc Size 128 pointer 0x14660c70c200 | |
jwb0038:16069:16139 [2] NCCL INFO comm 0x375f9e90 rank 2 nranks 8 cudaDev 2 busId 84000 - Init COMPLETE | |
jwb0061:16371:16434 [3] NCCL INFO init.cc:398 Cuda Host Alloc Size 128 pointer 0x15320e70c200 | |
jwb0038:16070:16070 [3] NCCL INFO AllReduce: opCount 0 sendbuff 0x148b1cc00000 recvbuff 0x148b1cc00000 count 1 datatype 1 op 0 root 0 comm 0x38b2ffe0 [nranks=8] stream 0x3891fce0 | |
jwb0038:16070:16070 [3] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) | |
jwb0038:16067:16132 [0] NCCL INFO init.cc:398 Cuda Host Alloc Size 128 pointer 0x151a7470c200 | |
jwb0061:16371:16434 [3] NCCL INFO comm 0x36170d70 rank 7 nranks 8 cudaDev 3 busId c4000 - Init COMPLETE | |
jwb0038:16067:16132 [0] NCCL INFO comm 0x3868f010 rank 0 nranks 8 cudaDev 0 busId 3000 - Init COMPLETE | |
jwb0061:16368:16368 [0] NCCL INFO AllReduce: opCount 0 sendbuff 0x145c78c00000 recvbuff 0x145c78c00000 count 1 datatype 1 op 0 root 0 comm 0x38855390 [nranks=8] stream 0x386499d0 | |
jwb0061:16368:16368 [0] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) | |
jwb0061:16371:16371 [3] NCCL INFO AllReduce: opCount 0 sendbuff 0x153246e00000 recvbuff 0x153246e00000 count 1 datatype 1 op 0 root 0 comm 0x36170d70 [nranks=8] stream 0x35f648d0 | |
jwb0061:16371:16371 [3] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) | |
jwb0038:16067:16067 [0] NCCL INFO AllReduce: opCount 0 sendbuff 0x151aaac00000 recvbuff 0x151aaac00000 count 1 datatype 1 op 0 root 0 comm 0x3868f010 [nranks=8] stream 0x3862d2a0 | |
jwb0038:16067:16067 [0] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) | |
jwb0038:16069:16069 [2] NCCL INFO AllReduce: opCount 0 sendbuff 0x146646c00000 recvbuff 0x146646c00000 count 1 datatype 1 op 0 root 0 comm 0x375f9e90 [nranks=8] stream 0x373ed9f0 | |
jwb0038:16069:16069 [2] NCCL INFO misc/utils.cc:235 memory stack hunk malloc(65536) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment