Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save anj-s/5ff0eafd4309a16fd480cc5662aff448 to your computer and use it in GitHub Desktop.
(byteps_env) anj@devfair0443:~/byteps$ python byteps_launcher.py
rank to be set 0
os.environ environ({'SHELL': '/bin/bash', 'COLORTERM': 'truecolor', 'TERM_PROGRAM_VERSION': '1.54.3', 'CONDA_EXE': '/public/apps/anaconda3/5.0.1/bin/conda', 'ENV': '/usr/share/modules/init/profile.sh', 'PWD': '/private/home/anj/byteps', 'KRB5CCNAME': 'KEYRING:persistent:1185200796', 'LOGNAME': 'anj', 'XDG_SESSION_TYPE': 'tty', 'CONDA_PREFIX': '/private/home/anj/.conda/envs/byteps_env', 'MODULESHOME': '/usr/share/modules', 'MANPATH': ':', 'VSCODE_GIT_ASKPASS_NODE': '/private/home/anj/.vscode-server/bin/2b9aebd5354a3629c3aba0a5f5df49f43d6689f8/node', 'MOTD_SHOWN': 'pam', 'HOME': '/private/home/anj', 'LANG': 'C.UTF-8', 'CONDA_PROMPT_MODIFIER': '(byteps_env) ', 'GIT_ASKPASS': '/private/home/anj/.vscode-server/bin/2b9aebd5354a3629c3aba0a5f5df49f43d6689f8/extensions/git/dist/askpass.sh', 'SSH_CONNECTION': '100.104.68.71 50562 100.96.161.85 22', 'MODULEPATH_modshare': '/public/modulefiles:1', 'XDG_SESSION_CLASS': 'user', 'TERM': 'xterm-256color', 'USER': 'anj', 'VSCODE_GIT_IPC_HANDLE': '/run/user/1185200796/vscode-git-d2c382147c.sock', 'CONDA_SHLVL': '2', 'SHLVL': '1', 'BASH_ENV': '/usr/share/modules/init/bash', 'XDG_SESSION_ID': '7749', 'CONDA_PYTHON_EXE': '/public/apps/anaconda3/5.0.1/bin/python', 'XDG_RUNTIME_DIR': '/run/user/1185200796', 'SSH_CLIENT': '100.104.68.71 50562 22', 'CONDA_DEFAULT_ENV': 'byteps_env', 'FAIR_ENV_CLUSTER': 'h2', 'VSCODE_GIT_ASKPASS_MAIN': '/private/home/anj/.vscode-server/bin/2b9aebd5354a3629c3aba0a5f5df49f43d6689f8/extensions/git/dist/askpass-main.js', 'XDG_DATA_DIRS': '/usr/local/share:/usr/share:/var/lib/snapd/desktop', 'BROWSER': '/private/home/anj/.vscode-server/bin/2b9aebd5354a3629c3aba0a5f5df49f43d6689f8/bin/helpers/browser.sh', 'SLURM_CONF': '/public/slurm/20.11.3/etc/slurm.conf', 'PATH': 
'/private/home/anj/.conda/envs/byteps_env/bin:/private/home/anj/.vscode-server/bin/2b9aebd5354a3629c3aba0a5f5df49f43d6689f8/bin:/usr/local/cuda/bin:/opt/bin/:/private/home/anj/.vscode-server/bin/2b9aebd5354a3629c3aba0a5f5df49f43d6689f8/bin:/private/home/anj/.conda/envs/fairscale/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin', 'MODULEPATH': '/public/modulefiles', 'DBUS_SESSION_BUS_ADDRESS': 'unix:path=/run/user/1185200796/bus', 'CONDA_PREFIX_1': '/private/home/anj/.conda/envs/fairscale', 'MODULES_CMD': '/usr/lib/x86_64-linux-gnu/modulecmd.tcl', 'TERM_PROGRAM': 'vscode', 'VSCODE_IPC_HOOK_CLI': '/run/user/1185200796/vscode-ipc-2cae41ba-29af-4e92-bdeb-ffacc6cfc7bb.sock', 'BASH_FUNC_switchml%%': '() { typeset swfound=1;\n if [ "${MODULES_USE_COMPAT_VERSION:-0}" = \'1\' ]; then\n typeset swname=\'main\';\n if [ -e /usr/lib/x86_64-linux-gnu/modulecmd.tcl ]; then\n typeset swfound=0;\n unset MODULES_USE_COMPAT_VERSION;\n fi;\n else\n typeset swname=\'compatibility\';\n if [ -e /usr/lib/x86_64-linux-gnu/modulecmd-compat ]; then\n typeset swfound=0;\n MODULES_USE_COMPAT_VERSION=1;\n export MODULES_USE_COMPAT_VERSION;\n fi;\n fi;\n if [ $swfound -eq 0 ]; then\n echo "Switching to Modules $swname version";\n source /usr/share/modules/init/bash;\n else\n echo "Cannot switch to Modules $swname version, command not found";\n return 1;\n fi\n}', 'BASH_FUNC_module%%': '() { unset _mlshdbg;\n if [ "${MODULES_SILENT_SHELL_DEBUG:-0}" = \'1\' ]; then\n case "$-" in \n *v*x*)\n set +vx;\n _mlshdbg=\'vx\'\n ;;\n *v*)\n set +v;\n _mlshdbg=\'v\'\n ;;\n *x*)\n set +x;\n _mlshdbg=\'x\'\n ;;\n *)\n _mlshdbg=\'\'\n ;;\n esac;\n fi;\n unset _mlre _mlIFS;\n if [ -n "${IFS+x}" ]; then\n _mlIFS=$IFS;\n fi;\n IFS=\' \';\n for _mlv in ${MODULES_RUN_QUARANTINE:-};\n do\n if [ "${_mlv}" = "${_mlv##*[!A-Za-z0-9_]}" -a "${_mlv}" = "${_mlv#[0-9]}" ]; then\n if [ -n "`eval \'echo ${\'$_mlv\'+x}\'`" ]; then\n _mlre="${_mlre:-}${_mlv}_modquar=\'`eval 
\'echo ${\'$_mlv\'}\'`\' ";\n fi;\n _mlrv="MODULES_RUNENV_${_mlv}";\n _mlre="${_mlre:-}${_mlv}=\'`eval \'echo ${\'$_mlrv\':-}\'`\' ";\n fi;\n done;\n if [ -n "${_mlre:-}" ]; then\n eval `eval ${_mlre}/usr/bin/tclsh8.6 /usr/lib/x86_64-linux-gnu/modulecmd.tcl bash \'"$@"\'`;\n else\n eval `/usr/bin/tclsh8.6 /usr/lib/x86_64-linux-gnu/modulecmd.tcl bash "$@"`;\n fi;\n _mlstatus=$?;\n if [ -n "${_mlIFS+x}" ]; then\n IFS=$_mlIFS;\n else\n unset IFS;\n fi;\n unset _mlre _mlv _mlrv _mlIFS;\n if [ -n "${_mlshdbg:-}" ]; then\n set -$_mlshdbg;\n fi;\n unset _mlshdbg;\n return $_mlstatus\n}', '_': '/private/home/anj/.conda/envs/byteps_env/bin/python', 'DMLC_ROLE': 'worker', 'DMLC_WORKER_ID': '0', 'DMLC_NUM_WORKER': '1', 'NVIDIA_VISIBLE_DEVICES': '0,1', 'BYTEPS_LOG_LEVEL': 'INFO', 'NCCL_DEBUG': 'INFO', 'BYTEPS_ENABLE_GDB': '1', 'BYTEPS_LOCAL_RANK': '0', 'BYTEPS_LOCAL_SIZE': '2', 'DMLC_NUM_SERVER': '1', 'DMLC_PS_ROOT_URI': '10.0.0.1', 'DMLC_PS_ROOT_PORT': '1234', 'BYTEPS_CUDA_HOME': '/usr/local/cuda', 'BYTEPS_NCCL_HOME': '/usr/local/nccl'})
rank to be set 1
[2021-03-31 19:55:34.595026: I byteps/common/compressor/compressor_registry.cc:28] dithering_compressor compressor is registered
[2021-03-31 19:55:34.595038: I byteps/common/compressor/compressor_registry.cc:28] dithering_compressor compressor is registered
[2021-03-31 19:55:34.595058: I byteps/common/compressor/compressor_registry.cc:28] onebit_compressor compressor is registered
[2021-03-31 19:55:34.595066: I byteps/common/compressor/compressor_registry.cc:28] onebit_compressor compressor is registered
[2021-03-31 19:55:34.595066: I byteps/common/compressor/compressor_registry.cc:28] randomk_compressor compressor is registered
[2021-03-31 19:55:34.595076: I byteps/common/compressor/compressor_registry.cc:28] randomk_compressor compressor is registered
[2021-03-31 19:55:34.595080: I byteps/common/compressor/compressor_registry.cc:28] topk_compressor compressor is registered
[2021-03-31 19:55:34.595084: I byteps/common/compressor/compressor_registry.cc:28] topk_compressor compressor is registered
[2021-03-31 19:55:34.595089: I byteps/common/compressor/compressor_registry.cc:28] vanilla_ef compressor is registered
[2021-03-31 19:55:34.595091: I byteps/common/compressor/compressor_registry.cc:28] vanilla_ef compressor is registered
[2021-03-31 19:55:34.595097: I byteps/common/compressor/compressor_registry.cc:28] nesterov_momentum compressor is registered
[2021-03-31 19:55:34.595098: I byteps/common/compressor/compressor_registry.cc:28] nesterov_momentum compressor is registered
Initialize
Initialize
devfair0443:2192327:2192327 [1] NCCL INFO NET/Socket : Using [0]enp59s0:100.96.161.85<0>
devfair0443:2192327:2192327 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so).
devfair0443:2192327:2192327 [1] NCCL INFO NET/IB : Using [0]mlx5_1:1/IB ; OOB enp59s0:100.96.161.85<0>
devfair0443:2192328:2192328 [0] NCCL INFO NET/Socket : Using [0]enp59s0:100.96.161.85<0>
devfair0443:2192328:2192328 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so).
devfair0443:2192327:2192327 [1] NCCL INFO Setting affinity for GPU 1 to ffff,f00000ff,fff00000
devfair0443:2192328:2192328 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/IB ; OOB enp59s0:100.96.161.85<0>
NCCL version 2.4.7+cuda10.0
devfair0443:2192328:2192328 [0] NCCL INFO Setting affinity for GPU 0 to ffff,f00000ff,fff00000
devfair0443:2192328:2192328 [0] NCCL INFO Channel 00 : 0 1
devfair0443:2192328:2192328 [0] NCCL INFO Channel 01 : 0 1
devfair0443:2192328:2192328 [0] NCCL INFO Channel 02 : 0 1
devfair0443:2192328:2192328 [0] NCCL INFO Channel 03 : 0 1
devfair0443:2192328:2192328 [0] NCCL INFO Channel 04 : 0 1
devfair0443:2192328:2192328 [0] NCCL INFO Channel 05 : 0 1
devfair0443:2192328:2192328 [0] NCCL INFO Channel 06 : 0 1
devfair0443:2192328:2192328 [0] NCCL INFO Channel 07 : 0 1
devfair0443:2192328:2192328 [0] NCCL INFO Ring 00 : 0[0] -> 1[1] via P2P/IPC
devfair0443:2192327:2192327 [1] NCCL INFO Ring 00 : 1[1] -> 0[0] via P2P/IPC
devfair0443:2192328:2192328 [0] NCCL INFO Ring 01 : 0[0] -> 1[1] via P2P/IPC
devfair0443:2192327:2192327 [1] NCCL INFO Ring 01 : 1[1] -> 0[0] via P2P/IPC
devfair0443:2192328:2192328 [0] NCCL INFO Ring 02 : 0[0] -> 1[1] via P2P/IPC
devfair0443:2192327:2192327 [1] NCCL INFO Ring 02 : 1[1] -> 0[0] via P2P/IPC
devfair0443:2192328:2192328 [0] NCCL INFO Ring 03 : 0[0] -> 1[1] via P2P/IPC
devfair0443:2192327:2192327 [1] NCCL INFO Ring 03 : 1[1] -> 0[0] via P2P/IPC
devfair0443:2192328:2192328 [0] NCCL INFO Ring 04 : 0[0] -> 1[1] via P2P/IPC
devfair0443:2192327:2192327 [1] NCCL INFO Ring 04 : 1[1] -> 0[0] via P2P/IPC
devfair0443:2192328:2192328 [0] NCCL INFO Ring 05 : 0[0] -> 1[1] via P2P/IPC
devfair0443:2192327:2192327 [1] NCCL INFO Ring 05 : 1[1] -> 0[0] via P2P/IPC
devfair0443:2192328:2192328 [0] NCCL INFO Ring 06 : 0[0] -> 1[1] via P2P/IPC
devfair0443:2192327:2192327 [1] NCCL INFO Ring 06 : 1[1] -> 0[0] via P2P/IPC
devfair0443:2192328:2192328 [0] NCCL INFO Ring 07 : 0[0] -> 1[1] via P2P/IPC
devfair0443:2192327:2192327 [1] NCCL INFO Ring 07 : 1[1] -> 0[0] via P2P/IPC
devfair0443:2192328:2192328 [0] NCCL INFO Using 256 threads, Min Comp Cap 6, Trees disabled
devfair0443:2192328:2192328 [0] NCCL INFO comm 0x55605d18fcc0 rank 0 nranks 2 cudaDev 0 nvmlDev 0 - Init COMPLETE
devfair0443:2192327:2192327 [1] NCCL INFO comm 0x5588e1f30fb0 rank 1 nranks 2 cudaDev 1 nvmlDev 1 - Init COMPLETE
Downloading MNIST
Downloading MNIST
Moving model to CUDA
Broadcasting Params 0
Broadcasting..
Synchronizing at 0
synchronize:byteps_torch_wait_and_clear
[2021-03-31 19:55:36.512771: I byteps/common/operations.cc:360] tensor size=40
Moving model to CUDA
Broadcasting Params 1
Broadcasting..
Synchronizing at 1
synchronize:byteps_torch_wait_and_clear
[2021-03-31 19:55:36.515136: I byteps/common/operations.cc:360] tensor size=40
devfair0443:2192328:2192474 [0] NCCL INFO Launch mode Parallel
devfair0443:2192328:2192474 [0] enqueue.cc:197 NCCL WARN Cuda failure 'invalid device function'
devfair0443:2192328:2192474 [0] NCCL INFO misc/group.cc:148 -> 1
[2021-03-31 19:55:36.515878: F byteps/common/core_loops.cc:355] Check failed: r == ncclSuccess NCCL error: unhandled cuda error
devfair0443:2192327:2192476 [1] enqueue.cc:197 NCCL WARN Cuda failure 'invalid device function'
devfair0443:2192327:2192476 [1] NCCL INFO misc/group.cc:148 -> 1
[2021-03-31 19:55:36.516247: F byteps/common/core_loops.cc:307] Check failed: r == ncclSuccess NCCL error: unhandled cuda error
Aborted (core dumped)
Traceback (most recent call last):
  File "byteps_launcher.py", line 44, in <module>
    mp.spawn(run_worker, args=(num_devices,), nprocs=num_devices, join=True)
  File "/private/home/anj/.conda/envs/byteps_env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/private/home/anj/.conda/envs/byteps_env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
    while not context.join():
  File "/private/home/anj/.conda/envs/byteps_env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 118, in join
    raise Exception(msg)
Exception:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/private/home/anj/.conda/envs/byteps_env/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
    fn(i, *args)
  File "/private/home/anj/byteps/byteps_launcher.py", line 38, in run_worker
    subprocess.check_call(command,
  File "/private/home/anj/.conda/envs/byteps_env/lib/python3.8/subprocess.py", line 364, in check_call
    raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command 'python example/pytorch/train_mnist_byteps.py' returned non-zero exit status 134.
(byteps_env) anj@devfair0443:~/byteps$ Aborted (core dumped)
(byteps_env) anj@devfair0443:~/byteps$ A
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment