Skip to content

Instantly share code, notes, and snippets.

@ticapix
Created December 11, 2019 13:14
Show Gist options
  • Save ticapix/bef4d6deb5f70242f0d88c15b60845fb to your computer and use it in GitHub Desktop.
#!/bin/sh
# GPU VM smoke-test: verifies that an OVH t1-(45|90|180) instance exposes the
# expected number of V100 GPUs to the OS, to nvidia-smi, and to Docker/TensorFlow.
set -e
# Export so child processes (tput, etc.) see the terminal type even when the
# script runs from an environment with TERM unset (cron, sudo, ...).
TERM=ansi
export TERM
if [ $# -ne 1 ]; then
echo "usage: $0 <ngpus:int>" >&2
echo "This script has to be executed on the t1-(45|90|180) VM directly" >&2
exit 1
fi
NGPUS=$1
# Re-exec ourselves under sudo when not already root; quoting keeps $0 and the
# arguments intact even if they contain spaces.
if [ "$(id -u)" -ne 0 ]; then
sudo "$0" "$@"
exit 0
fi
echo "Testing VM $(hostname) with $NGPUS GPU(s)"
# 0 – Black # 1 – Red # 2 – Green # 3 – Yellow # 4 – Blue # 5 – Magenta # 6 – Cyan # 7 – White
#"GTX 1080 Ti" "10de:1b06" --precision=fp16 --batch_size=128 / --precision=fp32 --batch_size=64
#"V100" "10de:1db4" --precision=fp16 --batch_size=256 / --precision=fp32 --batch_size=128
assert_eq() {
# Compare two values and print a colored OK/FAIL line.
#   $1 - actual value, $2 - expected value, $3 - label to print
# Exits the whole script with status 1 on mismatch.
# tput calls are best-effort (|| true) so the script survives a dumb TERM.
val1=$1
val2=$2
msg=$3
tput bold || true
# printf instead of the non-portable 'echo -n' under #!/bin/sh.
printf '%s: ' "$msg"
tput sgr0 || true
# Quoting both operands keeps the test valid for empty values or values
# containing whitespace (the unquoted form was a syntax error in that case).
if [ "$val1" = "$val2" ]; then
tput setaf 2 || true
echo "OK ($val1 = $val2)"
tput sgr0 || true
else
tput setaf 1 || true
echo "FAIL ($val1 != $val2)"
tput sgr0 || true
exit 1
fi
}
check() {
# Print the test name in bold cyan, then run it.
#   $1 - name of a test function defined in this file
tput bold || true
tput setaf 6 || true
printf '%s\n' "$1"
tput sgr0 || true
# Direct invocation instead of 'eval $1': every call site passes a bare
# function name, and this avoids re-parsing attacker-controllable text.
"$1"
}
test_lspci() {
# Count V100 PCI devices (NVIDIA vendor:device id 10de:1db4) seen by the bus.
# grep -c replaces 'grep | wc -l'; '|| true' keeps set -e happy on zero matches
# (grep -c still prints 0 before exiting non-zero).
count=$(lspci -n | grep -c '10de:1db4' || true)
assert_eq "$count" "$NGPUS" "Number of devices returned by lspci"
}
test_nvidia_smi_binary() {
# Check that the nvidia-smi binary is on PATH.
# 'command -v' is the POSIX-portable replacement for 'which'.
if command -v nvidia-smi >/dev/null 2>&1; then
count=1
else
count=0
fi
assert_eq "$count" 1 "Is nvidia-smi installed"
}
test_nvidia_smi() {
# Count GPUs the driver reports; every line for a V100 card mentions 'V100'.
# grep -c replaces 'grep | wc -l'; '|| true' guards set -e on zero matches.
count=$(nvidia-smi --list-gpus | grep -c 'V100' || true)
assert_eq "$count" "$NGPUS" "Number of devices returned by nvidia-smi"
}
test_docker_hello_world() {
# Smoke-test the Docker daemon with the stock hello-world image; under set -e
# any docker failure (daemon down, no image pull) aborts the whole script.
docker run hello-world
}
test_docker_nvidia_smi() {
# Same GPU count as test_nvidia_smi, but from inside a CUDA container, to
# verify the NVIDIA container runtime passes the devices through.
# grep -c replaces 'grep | wc -l'; '|| true' guards set -e on zero matches.
count=$(docker run --runtime=nvidia --rm nvidia/cuda:10.2-base nvidia-smi --list-gpus | grep -c 'V100' || true)
assert_eq "$count" "$NGPUS" "Number of devices returned by nvidia-smi inside docker"
}
latest_tag() {
# Compute the newest NGC TensorFlow image tag likely to exist, e.g. "19.10-py3":
# NGC tags follow YY.MM, and images lag the calendar, hence the -60 day offset.
# NOTE: '--date' is a GNU date extension (fine on these Linux VMs).
# printf '%s' emits no trailing newline, matching the original 'echo -n'.
printf '%s' "$(date --date='-60 day' '+%y.%m-py3')"
}
test_tf_basic_fp16() {
# ResNet-50 training benchmark in mixed precision (fp16), one MPI rank per GPU.
# $(...) replaces backticks; "$NGPUS" is quoted defensively.
nvidia-docker run --rm nvcr.io/nvidia/tensorflow:$(latest_tag) mpiexec \
--allow-run-as-root -np "$NGPUS" python /workspace/nvidia-examples/cnn/resnet.py \
--layers=50 --precision=fp16 --batch_size=256 --num_iter=100
}
test_tf_basic_fp32() {
# ResNet-50 training benchmark in full precision (fp32); half the fp16 batch
# size since fp32 activations take twice the GPU memory.
# $(...) replaces backticks; "$NGPUS" is quoted defensively.
nvidia-docker run --rm nvcr.io/nvidia/tensorflow:$(latest_tag) \
mpiexec --allow-run-as-root -np "$NGPUS" python /workspace/nvidia-examples/cnn/resnet.py \
--layers=50 --precision=fp32 --batch_size=128 --num_iter=100
}
# Run the suite in order, cheapest checks first; assert_eq (and set -e)
# aborts on the first failure.
for test_name in \
test_lspci \
test_nvidia_smi_binary \
test_nvidia_smi \
test_docker_hello_world \
test_docker_nvidia_smi \
test_tf_basic_fp16 \
test_tf_basic_fp32
do
check "$test_name"
done
exit 0
# # run LSTM (~9400wps to t1-45)
# cd /workspace/nvidia-examples/big_lstm
# ./download_1b_words_data.sh
# python single_lm_train.py --mode=train --logdir=./logs --num_gpus=1 --datadir=./data/1-billion-word-language-modeling-benchmark-r13output/ --hpconfig run_profiler=False,max_time=90,num_steps=20,num_shards=8,num_layers=2,learning_rate=0.2,max_grad_norm=1,keep_prob=0.9,emb_size=1024,projected_size=1024,state_size=8192,num_sampled=8192,batch_size=512
# # Run basic PyTorch code
# nvidia-docker run --rm --ipc=host nvcr.io/nvidia/pytorch:`latest_tag` \
# python /opt/pytorch/examples/word_language_model/main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment