Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save MattsonThieme/1f3ee54b561dd182ea55d99157be887a to your computer and use it in GitHub Desktop.
Save MattsonThieme/1f3ee54b561dd182ea55d99157be887a to your computer and use it in GitHub Desktop.
Run TensorFlow CNN benchmarks on a single node with multiple workers
#!/bin/bash
# Mattson Thieme | 2018
# Run training in TensorFlow's tf_cnn benchmarks with four workers and one ps on a 36 core Skylake
#
# NOTE(review): the original first line was '# !/bin/bash' -- the space after
# '#' turns it into an ordinary comment instead of a shebang, so the kernel
# would execute the script with the caller's shell. Fixed to a real shebang.

# Activate the TensorFlow virtual environment (conda-style 'source activate';
# presumably an Intel-optimized TF build -- confirm on your machine).
source activate tensorflow_p36
# Set worker parameters - may need to update port numbers when running on your machine
ps_list="127.0.0.1:26214"
workers_list="127.0.0.1:1104,127.0.0.1:10628,127.0.0.1:1124,127.0.0.1:2003"
worker_env="export OMP_NUM_THREADS=9"
ps_env="export OMP_NUM_THREADS=9"
ps_args="--num_intra_threads 4 --num_inter_threads 2"
worker_args="--num_intra_threads 9 --num_inter_threads 4"
# Clone benchmark scripts (mkl_experiment branch).
# NOTE: with python3 you may need to change 'import cPickle' to
# 'import _pickle as cPickle' in benchmarks/scripts/tf_cnn_benchmarks/datasets.py
git clone -b mkl_experiment https://github.com/tensorflow/benchmarks.git
# Guard the cd: if the clone failed (no network, dir exists, ...), the
# original script would keep going and run 'rm *.log' and the benchmarks
# in whatever directory it happened to be in.
cd benchmarks/scripts/tf_cnn_benchmarks || exit 1
# Remove logs from any previous benchmark runs; -f keeps this from
# erroring (and printing noise) on a fresh checkout with no *.log files.
rm -f *.log
## Run training benchmark scripts
networks=( inception3 resnet50 resnet152 vgg16 ) # May run out of memory with four workers on larger networks
batch_sizes=( 32 64 96 )
num_batches=30
for network in "${networks[@]}" ; do
  # Start a single PS task for this topology and remember its pid.
  # NOTE(review): the original tore it down with 'kill $(pidof python)',
  # which kills EVERY python process on the machine (including unrelated
  # ones, or workers still flushing). Killing the captured pid is targeted.
  numactl -l python tf_cnn_benchmarks.py $ps_args --job_name ps --task_index 0 --ps_hosts $ps_list --worker_hosts $workers_list &
  ps_pid=$!
  # Start worker tasks. $worker_env expands (unquoted, word-split) to
  # 'export OMP_NUM_THREADS=9' and is executed as a command before each
  # launch. Workers 0-2 run in the background; worker 3 runs in the
  # foreground so the loop blocks until this batch-size round finishes
  # (--sync_on_finish keeps the workers roughly in lockstep).
  for bs in "${batch_sizes[@]}"; do
    echo -e "\n\n #### Starting $network and batch size = $bs ####\n\n"
    # Worker 0 -- pinned to cores 0-9,35-41
    $worker_env;nohup numactl -m 0 python tf_cnn_benchmarks.py $worker_args --sync_on_finish True --kmp_affinity="granularity=thread,proclist=[0-9,35-41],explicit,verbose" --job_name worker --task_index 0 --device cpu --data_format NCHW --cpu skl --data_name synthetic --model "$network" --learning_rate 0.001 --num_epochs_per_decay 2 --batch_size "$bs" --optimizer rmsprop --ps_hosts $ps_list --worker_hosts $workers_list --num_batches $num_batches 2>&1 | tee net_"$network"_bs_"$bs"_0.log &
    # Worker 1 -- pinned to cores 10-19,42-48
    $worker_env;nohup numactl -m 0 python tf_cnn_benchmarks.py $worker_args --sync_on_finish True --kmp_affinity="granularity=thread,proclist=[10-19,42-48],explicit,verbose" --job_name worker --task_index 1 --device cpu --data_format NCHW --cpu skl --data_name synthetic --model "$network" --learning_rate 0.001 --num_epochs_per_decay 2 --batch_size "$bs" --optimizer rmsprop --ps_hosts $ps_list --worker_hosts $workers_list --num_batches $num_batches 2>&1 | tee net_"$network"_bs_"$bs"_1.log &
    # Worker 2 -- pinned to cores 20-29,49-55
    $worker_env;nohup numactl -m 0 python tf_cnn_benchmarks.py $worker_args --sync_on_finish True --kmp_affinity="granularity=thread,proclist=[20-29,49-55],explicit,verbose" --job_name worker --task_index 2 --device cpu --data_format NCHW --cpu skl --data_name synthetic --model "$network" --learning_rate 0.001 --num_epochs_per_decay 2 --batch_size "$bs" --optimizer rmsprop --ps_hosts $ps_list --worker_hosts $workers_list --num_batches $num_batches 2>&1 | tee net_"$network"_bs_"$bs"_2.log &
    # Worker 3 -- pinned to cores 30-36,56-72; foreground (no trailing &)
    $worker_env;nohup numactl -m 0 python tf_cnn_benchmarks.py $worker_args --sync_on_finish True --kmp_affinity="granularity=thread,proclist=[30-36,56-72],explicit,verbose" --job_name worker --task_index 3 --device cpu --data_format NCHW --cpu skl --data_name synthetic --model "$network" --learning_rate 0.001 --num_epochs_per_decay 2 --batch_size "$bs" --optimizer rmsprop --ps_hosts $ps_list --worker_hosts $workers_list --num_batches $num_batches 2>&1 | tee net_"$network"_bs_"$bs"_3.log
  done
  # Kill only the PS task started above; it never exits on its own.
  kill "$ps_pid"
done
## Summarize throughput from the per-worker log files produced above.
worker_ids=( 0 1 2 3 )
sleep 1 # give the last tee processes a moment to finish writing
printf '\n Network batch_size images/second worker\n\n'
for network in "${networks[@]}"; do
  for bs in "${batch_sizes[@]}"; do
    for wid in "${worker_ids[@]}"; do
      log_file="net_${network}_bs_${bs}_${wid}.log"
      # Extract the reported rate; xargs trims surrounding whitespace.
      rate=$(grep "total images/sec:" "$log_file" | cut -d ":" -f2 | xargs)
      printf '%s %s %s %s\n' "$network" "$bs" "$rate" "$wid"
    done
  done
  printf '\n\n'
done
# Deactivate virtual environment (mirrors the 'source activate' at the top).
# NOTE(review): newer conda releases prefer 'conda deactivate'; kept as-is
# to match the activation style this script already uses -- confirm on your setup.
source deactivate
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment