Created
March 31, 2021 19:57
-
-
Save anj-s/5ff0eafd4309a16fd480cc5662aff448 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(byteps_env) anj@devfair0443:~/byteps$ python byteps_launcher.py | |
rank to be set 0 | |
os.environ environ({'SHELL': '/bin/bash', 'COLORTERM': 'truecolor', 'TERM_PROGRAM_VERSION': '1.54.3', 'CONDA_EXE': '/public/apps/anaconda3/5.0.1/bin/conda', 'ENV': '/usr/share/modules/init/profile.sh', 'PWD': '/private/home/anj/byteps', 'KRB5CCNAME': 'KEYRING:persistent:1185200796', 'LOGNAME': 'anj', 'XDG_SESSION_TYPE': 'tty', 'CONDA_PREFIX': '/private/home/anj/.conda/envs/byteps_env', 'MODULESHOME': '/usr/share/modules', 'MANPATH': ':', 'VSCODE_GIT_ASKPASS_NODE': '/private/home/anj/.vscode-server/bin/2b9aebd5354a3629c3aba0a5f5df49f43d6689f8/node', 'MOTD_SHOWN': 'pam', 'HOME': '/private/home/anj', 'LANG': 'C.UTF-8', 'CONDA_PROMPT_MODIFIER': '(byteps_env) ', 'GIT_ASKPASS': '/private/home/anj/.vscode-server/bin/2b9aebd5354a3629c3aba0a5f5df49f43d6689f8/extensions/git/dist/askpass.sh', 'SSH_CONNECTION': '100.104.68.71 50562 100.96.161.85 22', 'MODULEPATH_modshare': '/public/modulefiles:1', 'XDG_SESSION_CLASS': 'user', 'TERM': 'xterm-256color', 'USER': 'anj', 'VSCODE_GIT_IPC_HANDLE': '/run/user/1185200796/vscode-git-d2c382147c.sock', 'CONDA_SHLVL': '2', 'SHLVL': '1', 'BASH_ENV': '/usr/share/modules/init/bash', 'XDG_SESSION_ID': '7749', 'CONDA_PYTHON_EXE': '/public/apps/anaconda3/5.0.1/bin/python', 'XDG_RUNTIME_DIR': '/run/user/1185200796', 'SSH_CLIENT': '100.104.68.71 50562 22', 'CONDA_DEFAULT_ENV': 'byteps_env', 'FAIR_ENV_CLUSTER': 'h2', 'VSCODE_GIT_ASKPASS_MAIN': '/private/home/anj/.vscode-server/bin/2b9aebd5354a3629c3aba0a5f5df49f43d6689f8/extensions/git/dist/askpass-main.js', 'XDG_DATA_DIRS': '/usr/local/share:/usr/share:/var/lib/snapd/desktop', 'BROWSER': '/private/home/anj/.vscode-server/bin/2b9aebd5354a3629c3aba0a5f5df49f43d6689f8/bin/helpers/browser.sh', 'SLURM_CONF': '/public/slurm/20.11.3/etc/slurm.conf', 'PATH': '/private/home/anj/.conda/envs/byteps_env/bin:/private/home/anj/.vscode-server/bin/2b9aebd5354a3629c3aba0a5f5df49f43d6689f8/bin:/usr/local/cuda/bin:/opt/bin/:/private/home/anj/.vscode-server/bin/2b9aebd5354a3629c3aba0a5f5df49f43d6689f8/bin:/private/home/anj/.conda/envs/fairscale/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin', 'MODULEPATH': '/public/modulefiles', 'DBUS_SESSION_BUS_ADDRESS': 'unix:path=/run/user/1185200796/bus', 'CONDA_PREFIX_1': '/private/home/anj/.conda/envs/fairscale', 'MODULES_CMD': '/usr/lib/x86_64-linux-gnu/modulecmd.tcl', 'TERM_PROGRAM': 'vscode', 'VSCODE_IPC_HOOK_CLI': '/run/user/1185200796/vscode-ipc-2cae41ba-29af-4e92-bdeb-ffacc6cfc7bb.sock', 'BASH_FUNC_switchml%%': '() { typeset swfound=1;\n if [ "${MODULES_USE_COMPAT_VERSION:-0}" = \'1\' ]; then\n typeset swname=\'main\';\n if [ -e /usr/lib/x86_64-linux-gnu/modulecmd.tcl ]; then\n typeset swfound=0;\n unset MODULES_USE_COMPAT_VERSION;\n fi;\n else\n typeset swname=\'compatibility\';\n if [ -e /usr/lib/x86_64-linux-gnu/modulecmd-compat ]; then\n typeset swfound=0;\n MODULES_USE_COMPAT_VERSION=1;\n export MODULES_USE_COMPAT_VERSION;\n fi;\n fi;\n if [ $swfound -eq 0 ]; then\n echo "Switching to Modules $swname version";\n source /usr/share/modules/init/bash;\n else\n echo "Cannot switch to Modules $swname version, command not found";\n return 1;\n fi\n}', 'BASH_FUNC_module%%': '() { unset _mlshdbg;\n if [ "${MODULES_SILENT_SHELL_DEBUG:-0}" = \'1\' ]; then\n case "$-" in \n *v*x*)\n set +vx;\n _mlshdbg=\'vx\'\n ;;\n *v*)\n set +v;\n _mlshdbg=\'v\'\n ;;\n *x*)\n set +x;\n _mlshdbg=\'x\'\n ;;\n *)\n _mlshdbg=\'\'\n ;;\n esac;\n fi;\n unset _mlre _mlIFS;\n if [ -n "${IFS+x}" ]; then\n _mlIFS=$IFS;\n fi;\n IFS=\' \';\n for _mlv in ${MODULES_RUN_QUARANTINE:-};\n do\n if [ "${_mlv}" = "${_mlv##*[!A-Za-z0-9_]}" -a "${_mlv}" = "${_mlv#[0-9]}" ]; then\n if [ -n "`eval \'echo ${\'$_mlv\'+x}\'`" ]; then\n _mlre="${_mlre:-}${_mlv}_modquar=\'`eval \'echo ${\'$_mlv\'}\'`\' ";\n fi;\n _mlrv="MODULES_RUNENV_${_mlv}";\n _mlre="${_mlre:-}${_mlv}=\'`eval \'echo ${\'$_mlrv\':-}\'`\' ";\n fi;\n done;\n if [ -n "${_mlre:-}" ]; then\n eval `eval ${_mlre}/usr/bin/tclsh8.6 /usr/lib/x86_64-linux-gnu/modulecmd.tcl bash \'"$@"\'`;\n else\n eval `/usr/bin/tclsh8.6 /usr/lib/x86_64-linux-gnu/modulecmd.tcl bash "$@"`;\n fi;\n _mlstatus=$?;\n if [ -n "${_mlIFS+x}" ]; then\n IFS=$_mlIFS;\n else\n unset IFS;\n fi;\n unset _mlre _mlv _mlrv _mlIFS;\n if [ -n "${_mlshdbg:-}" ]; then\n set -$_mlshdbg;\n fi;\n unset _mlshdbg;\n return $_mlstatus\n}', '_': '/private/home/anj/.conda/envs/byteps_env/bin/python', 'DMLC_ROLE': 'worker', 'DMLC_WORKER_ID': '0', 'DMLC_NUM_WORKER': '1', 'NVIDIA_VISIBLE_DEVICES': '0,1', 'BYTEPS_LOG_LEVEL': 'INFO', 'NCCL_DEBUG': 'INFO', 'BYTEPS_ENABLE_GDB': '1', 'BYTEPS_LOCAL_RANK': '0', 'BYTEPS_LOCAL_SIZE': '2', 'DMLC_NUM_SERVER': '1', 'DMLC_PS_ROOT_URI': '10.0.0.1', 'DMLC_PS_ROOT_PORT': '1234', 'BYTEPS_CUDA_HOME': '/usr/local/cuda', 'BYTEPS_NCCL_HOME': '/usr/local/nccl'}) | |
rank to be set 1 | |
[2021-03-31 19:55:34.595026: I byteps/common/compressor/compressor_registry.cc:28] dithering_compressor compressor is registered | |
[2021-03-31 19:55:34.595038: I byteps/common/compressor/compressor_registry.cc:28] dithering_compressor compressor is registered | |
[2021-03-31 19:55:34.595058: I byteps/common/compressor/compressor_registry.cc:28] onebit_compressor compressor is registered | |
[2021-03-31 19:55:34.595066: I byteps/common/compressor/compressor_registry.cc:28] onebit_compressor compressor is registered | |
[2021-03-31 19:55:34.595066: I byteps/common/compressor/compressor_registry.cc:28] randomk_compressor compressor is registered | |
[2021-03-31 19:55:34.595076: I byteps/common/compressor/compressor_registry.cc:28] randomk_compressor compressor is registered | |
[2021-03-31 19:55:34.595080: I byteps/common/compressor/compressor_registry.cc:28] topk_compressor compressor is registered | |
[2021-03-31 19:55:34.595084: I byteps/common/compressor/compressor_registry.cc:28] topk_compressor compressor is registered | |
[2021-03-31 19:55:34.595089: I byteps/common/compressor/compressor_registry.cc:28] vanilla_ef compressor is registered | |
[2021-03-31 19:55:34.595091: I byteps/common/compressor/compressor_registry.cc:28] vanilla_ef compressor is registered | |
[2021-03-31 19:55:34.595097: I byteps/common/compressor/compressor_registry.cc:28] nesterov_momentum compressor is registered | |
[2021-03-31 19:55:34.595098: I byteps/common/compressor/compressor_registry.cc:28] nesterov_momentum compressor is registered | |
Initialize | |
Initialize | |
devfair0443:2192327:2192327 [1] NCCL INFO NET/Socket : Using [0]enp59s0:100.96.161.85<0> | |
devfair0443:2192327:2192327 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so). | |
devfair0443:2192327:2192327 [1] NCCL INFO NET/IB : Using [0]mlx5_1:1/IB ; OOB enp59s0:100.96.161.85<0> | |
devfair0443:2192328:2192328 [0] NCCL INFO NET/Socket : Using [0]enp59s0:100.96.161.85<0> | |
devfair0443:2192328:2192328 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so). | |
devfair0443:2192327:2192327 [1] NCCL INFO Setting affinity for GPU 1 to ffff,f00000ff,fff00000 | |
devfair0443:2192328:2192328 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/IB ; OOB enp59s0:100.96.161.85<0> | |
NCCL version 2.4.7+cuda10.0 | |
devfair0443:2192328:2192328 [0] NCCL INFO Setting affinity for GPU 0 to ffff,f00000ff,fff00000 | |
devfair0443:2192328:2192328 [0] NCCL INFO Channel 00 : 0 1 | |
devfair0443:2192328:2192328 [0] NCCL INFO Channel 01 : 0 1 | |
devfair0443:2192328:2192328 [0] NCCL INFO Channel 02 : 0 1 | |
devfair0443:2192328:2192328 [0] NCCL INFO Channel 03 : 0 1 | |
devfair0443:2192328:2192328 [0] NCCL INFO Channel 04 : 0 1 | |
devfair0443:2192328:2192328 [0] NCCL INFO Channel 05 : 0 1 | |
devfair0443:2192328:2192328 [0] NCCL INFO Channel 06 : 0 1 | |
devfair0443:2192328:2192328 [0] NCCL INFO Channel 07 : 0 1 | |
devfair0443:2192328:2192328 [0] NCCL INFO Ring 00 : 0[0] -> 1[1] via P2P/IPC | |
devfair0443:2192327:2192327 [1] NCCL INFO Ring 00 : 1[1] -> 0[0] via P2P/IPC | |
devfair0443:2192328:2192328 [0] NCCL INFO Ring 01 : 0[0] -> 1[1] via P2P/IPC | |
devfair0443:2192327:2192327 [1] NCCL INFO Ring 01 : 1[1] -> 0[0] via P2P/IPC | |
devfair0443:2192328:2192328 [0] NCCL INFO Ring 02 : 0[0] -> 1[1] via P2P/IPC | |
devfair0443:2192327:2192327 [1] NCCL INFO Ring 02 : 1[1] -> 0[0] via P2P/IPC | |
devfair0443:2192328:2192328 [0] NCCL INFO Ring 03 : 0[0] -> 1[1] via P2P/IPC | |
devfair0443:2192327:2192327 [1] NCCL INFO Ring 03 : 1[1] -> 0[0] via P2P/IPC | |
devfair0443:2192328:2192328 [0] NCCL INFO Ring 04 : 0[0] -> 1[1] via P2P/IPC | |
devfair0443:2192327:2192327 [1] NCCL INFO Ring 04 : 1[1] -> 0[0] via P2P/IPC | |
devfair0443:2192328:2192328 [0] NCCL INFO Ring 05 : 0[0] -> 1[1] via P2P/IPC | |
devfair0443:2192327:2192327 [1] NCCL INFO Ring 05 : 1[1] -> 0[0] via P2P/IPC | |
devfair0443:2192328:2192328 [0] NCCL INFO Ring 06 : 0[0] -> 1[1] via P2P/IPC | |
devfair0443:2192327:2192327 [1] NCCL INFO Ring 06 : 1[1] -> 0[0] via P2P/IPC | |
devfair0443:2192328:2192328 [0] NCCL INFO Ring 07 : 0[0] -> 1[1] via P2P/IPC | |
devfair0443:2192327:2192327 [1] NCCL INFO Ring 07 : 1[1] -> 0[0] via P2P/IPC | |
devfair0443:2192328:2192328 [0] NCCL INFO Using 256 threads, Min Comp Cap 6, Trees disabled | |
devfair0443:2192328:2192328 [0] NCCL INFO comm 0x55605d18fcc0 rank 0 nranks 2 cudaDev 0 nvmlDev 0 - Init COMPLETE | |
devfair0443:2192327:2192327 [1] NCCL INFO comm 0x5588e1f30fb0 rank 1 nranks 2 cudaDev 1 nvmlDev 1 - Init COMPLETE | |
Downloading MNIST | |
Downloading MNIST | |
Moving model to CUDA | |
Broadcasting Params 0 | |
Broadcasting.. | |
Synchronizing at 0 | |
synchronize:byteps_torch_wait_and_clear | |
[2021-03-31 19:55:36.512771: I byteps/common/operations.cc:360] tensor size=40 | |
Moving model to CUDA | |
Broadcasting Params 1 | |
Broadcasting.. | |
Synchronizing at 1 | |
synchronize:byteps_torch_wait_and_clear | |
[2021-03-31 19:55:36.515136: I byteps/common/operations.cc:360] tensor size=40 | |
devfair0443:2192328:2192474 [0] NCCL INFO Launch mode Parallel | |
devfair0443:2192328:2192474 [0] enqueue.cc:197 NCCL WARN Cuda failure 'invalid device function' | |
devfair0443:2192328:2192474 [0] NCCL INFO misc/group.cc:148 -> 1 | |
[2021-03-31 19:55:36.515878: F byteps/common/core_loops.cc:355] Check failed: r == ncclSuccess NCCL error: unhandled cuda error | |
devfair0443:2192327:2192476 [1] enqueue.cc:197 NCCL WARN Cuda failure 'invalid device function' | |
devfair0443:2192327:2192476 [1] NCCL INFO misc/group.cc:148 -> 1 | |
[2021-03-31 19:55:36.516247: F byteps/common/core_loops.cc:307] Check failed: r == ncclSuccess NCCL error: unhandled cuda error | |
Aborted (core dumped) | |
Traceback (most recent call last): | |
File "byteps_launcher.py", line 44, in <module> | |
mp.spawn(run_worker, args=(num_devices,), nprocs=num_devices, join=True) | |
File "/private/home/anj/.conda/envs/byteps_env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn | |
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn') | |
File "/private/home/anj/.conda/envs/byteps_env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes | |
while not context.join(): | |
File "/private/home/anj/.conda/envs/byteps_env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 118, in join | |
raise Exception(msg) | |
Exception: | |
-- Process 0 terminated with the following error: | |
Traceback (most recent call last): | |
File "/private/home/anj/.conda/envs/byteps_env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap | |
fn(i, *args) | |
File "/private/home/anj/byteps/byteps_launcher.py", line 38, in run_worker | |
subprocess.check_call(command, | |
File "/private/home/anj/.conda/envs/byteps_env/lib/python3.8/subprocess.py", line 364, in check_call | |
raise CalledProcessError(retcode, cmd) | |
subprocess.CalledProcessError: Command 'python example/pytorch/train_mnist_byteps.py' returned non-zero exit status 134. | |
(byteps_env) anj@devfair0443:~/byteps$ Aborted (core dumped) | |
(byteps_env) anj@devfair0443:~/byteps$ A |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment