Skip to content

Instantly share code, notes, and snippets.

@anj-s
anj-s / distirbuted_tensorflow_setup.txt
Last active October 15, 2020 09:57
Commands required to set up a GCE instance to run Distributed TensorFlow
# Install pip (downloads the official bootstrap script and installs system-wide).
wget https://bootstrap.pypa.io/get-pip.py
sudo python get-pip.py

# Install CUDA 9 via NVIDIA's apt repository package (Ubuntu 16.04, x86_64).
# NOTE: this URL was previously broken across two lines mid-token, which made
# curl fetch a nonexistent path; it must be a single continuous URL.
curl -O http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_9.0.176-1_amd64.deb
sudo dpkg -i cuda-repo-ubuntu1604_9.0.176-1_amd64.deb
sudo apt-get update
sudo apt-get install cuda
# Build a tiny binary classifier: 100 input features -> 32 ReLU units -> 1 sigmoid output.
import tensorflow as tf

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_dim=100),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
# Adadelta with rho=0.9; binary cross-entropy matches the single sigmoid output.
model.compile(
    optimizer=tf.keras.optimizers.Adadelta(rho=0.9),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Generate dummy data
(byteps_env) anj@devfair0443:~/byteps$ python byteps_launcher.py
rank to be set 0
os.environ environ({'SHELL': '/bin/bash', 'COLORTERM': 'truecolor', 'TERM_PROGRAM_VERSION': '1.54.3', 'CONDA_EXE': '/public/apps/anaconda3/5.0.1/bin/conda', 'ENV': '/usr/share/modules/init/profile.sh', 'PWD': '/private/home/anj/byteps', 'KRB5CCNAME': 'KEYRING:persistent:1185200796', 'LOGNAME': 'anj', 'XDG_SESSION_TYPE': 'tty', 'CONDA_PREFIX': '/private/home/anj/.conda/envs/byteps_env', 'MODULESHOME': '/usr/share/modules', 'MANPATH': ':', 'VSCODE_GIT_ASKPASS_NODE': '/private/home/anj/.vscode-server/bin/2b9aebd5354a3629c3aba0a5f5df49f43d6689f8/node', 'MOTD_SHOWN': 'pam', 'HOME': '/private/home/anj', 'LANG': 'C.UTF-8', 'CONDA_PROMPT_MODIFIER': '(byteps_env) ', 'GIT_ASKPASS': '/private/home/anj/.vscode-server/bin/2b9aebd5354a3629c3aba0a5f5df49f43d6689f8/extensions/git/dist/askpass.sh', 'SSH_CONNECTION': '100.104.68.71 50562 100.96.161.85 22', 'MODULEPATH_modshare': '/public/modulefiles:1', 'XDG_SESSION_CLASS': 'user', 'TERM': 'xter
# File used to run bytePS on a local worker with 2 GPUs
import os
import subprocess
import sys
import torch
import torch.multiprocessing as mp

# NOTE(review): this snippet is truncated by the page scrape -- only the first
# line of run_worker's body survives here, and the original indentation was
# stripped (restored below). Consult the full gist before relying on it.
def run_worker(rank, world_size):
    # Pin this worker process to the GPU whose index matches its rank.
    torch.cuda.set_device(rank)
# packages in environment at /private/home/anj/.conda/envs/byteps_env:
#
# Name Version Build Channel
_libgcc_mutex 0.1 conda_forge conda-forge
_openmp_mutex 4.5 1_gnu conda-forge
appdirs 1.4.4 <pip>
appdirs 1.4.4 py_0
attrs 20.3.0 pyhd3eb1b0_0
black 19.10b0 py_0
blas 1.0 mkl
Wed Mar 31 20:15:11 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02 Driver Version: 450.80.02 CUDA Version: 11.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Quadro GP100 On | 00000000:AF:00.0 Off | 0 |
| 26% 36C P0 30W / 235W | 4638MiB / 16278MiB | 0% Default |
@anj-s
anj-s / repro_seg_fault_rpc_sync.py
Last active April 28, 2021 15:02
Repro rpc_sync segmentation fault
# Example repro for failing to profile a callback.
import torch
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
import os
import argparse
import subprocess
@anj-s
anj-s / repro_rpc_torch_script.py
Created April 28, 2021 15:25
Example demonstrating torch.jit.script + rpc_async/rpc_sync + Rrefs
# Example repro for failing to profile a callback.
import torch
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
import time
import argparse
RPC_PORT = 25001
@anj-s
anj-s / repro_bucket_rtts.py
Created April 28, 2021 16:53
Monotonically increasing bucket RTTs in parameter servers.
# Repro increasing bucket RTTs.
import argparse
import os
import socket
import threading
import subprocess
import time
import torch
@anj-s
anj-s / repro_bucket_rtts.txt
Created April 28, 2021 16:57
Output of `python repro_bucket_rtts.py --bucket_size=10 --use_cuda_tensors --num_buckets=20`
run_worker 1 with world size 2
---Warm Up-----
Callback triggered in 7664.990643 ms
Callback triggered in 7664.933709 ms
Callback triggered in 7664.819333 ms
Callback triggered in 7665.318029 ms
Callback triggered in 7668.967457 ms
Callback triggered in 7673.087738 ms
Callback triggered in 7677.450334 ms
Callback triggered in 7684.611007 ms