Skip to content

Instantly share code, notes, and snippets.

@ticapix
Created December 11, 2019 13:14
Show Gist options
  • Save ticapix/bef4d6deb5f70242f0d88c15b60845fb to your computer and use it in GitHub Desktop.
#!/bin/sh
# GPU VM smoke-test: verifies that an OVH t1-(45|90|180) instance exposes the
# expected number of V100 GPUs to the OS, to nvidia-smi, and to Docker/TensorFlow.
set -e
# Export so child processes (tput, etc.) see the terminal type even when the
# script runs from an environment with TERM unset (cron, sudo, ...).
TERM=ansi
export TERM
if [ $# -ne 1 ]; then
echo "usage: $0 <ngpus:int>" >&2
echo "This script has to be executed on the t1-(45|90|180) VM directly" >&2
exit 1
fi
NGPUS=$1
# Re-exec ourselves under sudo when not already root; quoting keeps $0 and the
# arguments intact even if they contain spaces.
if [ "$(id -u)" -ne 0 ]; then
sudo "$0" "$@"
exit 0
fi
echo "Testing VM $(hostname) with $NGPUS GPU(s)"
# 0 – Black # 1 – Red # 2 – Green # 3 – Yellow # 4 – Blue # 5 – Magenta # 6 – Cyan # 7 – White
#"GTX 1080 Ti" "10de:1b06" --precision=fp16 --batch_size=128 / --precision=fp32 --batch_size=64
#"V100" "10de:1db4" --precision=fp16 --batch_size=256 / --precision=fp32 --batch_size=128
assert_eq() {
# Compare two values and print a colored OK/FAIL line.
#   $1 - actual value, $2 - expected value, $3 - label to print
# Exits the whole script with status 1 on mismatch.
# tput calls are best-effort (|| true) so the script survives a dumb TERM.
val1=$1
val2=$2
msg=$3
tput bold || true
# printf instead of the non-portable 'echo -n' under #!/bin/sh.
printf '%s: ' "$msg"
tput sgr0 || true
# Quoting both operands keeps the test valid for empty values or values
# containing whitespace (the unquoted form was a syntax error in that case).
if [ "$val1" = "$val2" ]; then
tput setaf 2 || true
echo "OK ($val1 = $val2)"
tput sgr0 || true
else
tput setaf 1 || true
echo "FAIL ($val1 != $val2)"
tput sgr0 || true
exit 1
fi
}
check() {
# Print the test name in bold cyan, then run it.
#   $1 - name of a test function defined in this file
tput bold || true
tput setaf 6 || true
printf '%s\n' "$1"
tput sgr0 || true
# Direct invocation instead of 'eval $1': every call site passes a bare
# function name, and this avoids re-parsing attacker-controllable text.
"$1"
}
test_lspci() {
# Count V100 PCI devices (NVIDIA vendor:device id 10de:1db4) seen by the bus.
# grep -c replaces 'grep | wc -l'; '|| true' keeps set -e happy on zero matches
# (grep -c still prints 0 before exiting non-zero).
count=$(lspci -n | grep -c '10de:1db4' || true)
assert_eq "$count" "$NGPUS" "Number of devices returned by lspci"
}
test_nvidia_smi_binary() {
# Check that the nvidia-smi binary is on PATH.
# 'command -v' is the POSIX-portable replacement for 'which'.
if command -v nvidia-smi >/dev/null 2>&1; then
count=1
else
count=0
fi
assert_eq "$count" 1 "Is nvidia-smi installed"
}
test_nvidia_smi() {
# Count GPUs the driver reports; every line for a V100 card mentions 'V100'.
# grep -c replaces 'grep | wc -l'; '|| true' guards set -e on zero matches.
count=$(nvidia-smi --list-gpus | grep -c 'V100' || true)
assert_eq "$count" "$NGPUS" "Number of devices returned by nvidia-smi"
}
test_docker_hello_world() {
# Smoke-test the Docker daemon with the stock hello-world image; under set -e
# any docker failure (daemon down, no image pull) aborts the whole script.
docker run hello-world
}
test_docker_nvidia_smi() {
# Same GPU count as test_nvidia_smi, but from inside a CUDA container, to
# verify the NVIDIA container runtime passes the devices through.
# grep -c replaces 'grep | wc -l'; '|| true' guards set -e on zero matches.
count=$(docker run --runtime=nvidia --rm nvidia/cuda:10.2-base nvidia-smi --list-gpus | grep -c 'V100' || true)
assert_eq "$count" "$NGPUS" "Number of devices returned by nvidia-smi inside docker"
}
latest_tag() {
# Compute the newest NGC TensorFlow image tag likely to exist, e.g. "19.10-py3":
# NGC tags follow YY.MM, and images lag the calendar, hence the -60 day offset.
# NOTE: '--date' is a GNU date extension (fine on these Linux VMs).
# printf '%s' emits no trailing newline, matching the original 'echo -n'.
printf '%s' "$(date --date='-60 day' '+%y.%m-py3')"
}
test_tf_basic_fp16() {
# ResNet-50 training benchmark in mixed precision (fp16), one MPI rank per GPU.
# $(...) replaces backticks; "$NGPUS" is quoted defensively.
nvidia-docker run --rm nvcr.io/nvidia/tensorflow:$(latest_tag) mpiexec \
--allow-run-as-root -np "$NGPUS" python /workspace/nvidia-examples/cnn/resnet.py \
--layers=50 --precision=fp16 --batch_size=256 --num_iter=100
}
test_tf_basic_fp32() {
# ResNet-50 training benchmark in full precision (fp32); half the fp16 batch
# size since fp32 activations take twice the GPU memory.
# $(...) replaces backticks; "$NGPUS" is quoted defensively.
nvidia-docker run --rm nvcr.io/nvidia/tensorflow:$(latest_tag) \
mpiexec --allow-run-as-root -np "$NGPUS" python /workspace/nvidia-examples/cnn/resnet.py \
--layers=50 --precision=fp32 --batch_size=128 --num_iter=100
}
# Run the suite in order, cheapest checks first; assert_eq (and set -e)
# aborts on the first failure.
for test_name in \
test_lspci \
test_nvidia_smi_binary \
test_nvidia_smi \
test_docker_hello_world \
test_docker_nvidia_smi \
test_tf_basic_fp16 \
test_tf_basic_fp32
do
check "$test_name"
done
exit 0
# # run LSTM (~9400wps to t1-45)
# cd /workspace/nvidia-examples/big_lstm
# ./download_1b_words_data.sh
# python single_lm_train.py --mode=train --logdir=./logs --num_gpus=1 --datadir=./data/1-billion-word-language-modeling-benchmark-r13output/ --hpconfig run_profiler=False,max_time=90,num_steps=20,num_shards=8,num_layers=2,learning_rate=0.2,max_grad_norm=1,keep_prob=0.9,emb_size=1024,projected_size=1024,state_size=8192,num_sampled=8192,batch_size=512
# # Run basic PyTorch code
# nvidia-docker run --rm --ipc=host nvcr.io/nvidia/pytorch:`latest_tag` \
# python /opt/pytorch/examples/word_language_model/main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment