Created
April 12, 2020 06:34
-
-
Save mwrnd/52799f24fcad44d24f759076c323d203 to your computer and use it in GitHub Desktop.
Generate permutations of command-line calls to tensorflow/benchmark's tf_cnn_benchmarks.py script.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Purpose: Generate permutations of command-line calls to | |
tensorflow/benchmark's tf_cnn_benchmarks.py script. | |
Usage: 0) place this file in /benchmarks/scripts/tf_cnn_benchmarks | |
1) python3 cmds.py > benchmarkcommands | |
2) bash -x <benchmarkcommands 2>&1 | tee benchmarklog | |
Notes: - XLA and ROCM are options specific to AMD's ROCm | |
- the associated parse.py will parse log output into summary form | |
https://gist.github.com/mwrnd/3c929a85a2a43632288b564bc5f9c62a | |
""" | |
import future | |
from models import model_config | |
# Batch sizes as strings; short values carry a trailing space, presumably so
# the generated --batch_size= flags line up column-wise in logs — TODO confirm.
batch_sizes = [
    "16 ",
    "32 ",
    "64 ",
    "128",
    "256",
]
# Every benchmark is emitted twice: once plain, once with half precision.
additional_arguments = ["", " --use_fp16"]
# Model names come from tf_cnn_benchmarks' own registries; sorted for a
# deterministic command ordering across runs.
imgnet_models = sorted(model_config._model_name_to_imagenet_model.keys())
cifar_models = sorted(model_config._model_name_to_cifar_model.keys())
def print_cmdline(d, b, m, s, a):
    """Print one shell command that runs tf_cnn_benchmarks.py.

    d: device ("CPU" or anything else for GPU), b: batch-size string,
    m: model name, s: dataset name (--data_name), a: extra argument
    string ("" or " --use_fp16").
    """
    # Marker echo plus a TF_* environment dump, then the benchmark wrapped in
    # `timeout` (15-minute limit, hard kill at 17 minutes) so crashed or hung
    # runs cannot stall the whole batch of commands.
    header = 'echo ======' + d + 'dev;env | grep "TF_";timeout -k 17.0m 15.0m '
    invocation = "python3 tf_cnn_benchmarks.py"
    if d == "CPU":
        device_flags = " --device=CPU"
    else:
        # --compute_lr_on_cpu speeds up runs but is not strictly GPU-focused
        device_flags = " --device=GPU --num_gpus=1"
    run_flags = (" --batch_size=" + b + " --num_batches=40"
                 " --data_name=" + s + " --model=" + m + a)
    print(header + invocation + device_flags + run_flags)
def model_batchsize_permuations(device, modelname, models, batchsizes):
    """Print one benchmark command per (batch size, model, extra-arg) combo.

    Iterates batch sizes outermost, then models, then the module-level
    additional_arguments, so related runs appear adjacent in the output.

    NOTE(review): "permuations" is a typo for "permutations"; the name is
    kept unchanged because existing call sites use it.
    """
    for size in batchsizes:
        for model in models:
            for extra in additional_arguments:
                print_cmdline(device, size, model, modelname, extra)
def all_GPU_benchmarks():
    """Queue the full GPU benchmark suite.

    Each entry is (dataset name, model list, batch-size list); the table is
    walked in order, so output ordering matches the original call sequence.
    nasnetlarge and deepspeech2 get a single reduced batch size of their own.
    """
    suites = (
        ("imagenet", imgnet_models, batch_sizes),
        ("cifar10", cifar_models, batch_sizes),
        ("coco", ["ssd3000", "trivial"], batch_sizes),
        ("imagenet", ["nasnetlarge"], ["8 "]),
        ("librispeech", ["deepspeech2"], ["16 "]),
    )
    for dataset, models, sizes in suites:
        model_batchsize_permuations("GPU", dataset, models, sizes)
### Environment / hardware report, emitted ahead of the benchmark commands
### so the log records what the numbers were measured on.
diagnostic_commands = (
    "timedatectl",
    "python3 --version",
    "python3 -c 'import future; import tensorflow; "
    'print("tensorflow version: {0}".format(tensorflow.__version__))\'',
    "dkms status | grep amd",
    "dmesg | grep kfd",
    "rocm-bandwidth-test",
    "python3 all_reduce_benchmark.py --variable_update=replicated",
    "rocm-smi",
)
for diagnostic in diagnostic_commands:
    print(diagnostic)
### GPU and CPU Benchmarks with no flags
print("unset TF_XLA_FLAGS")
print("unset TF_ROCM_FUSION_ENABLE")
all_GPU_benchmarks()
model_batchsize_permuations("CPU", "imagenet", imgnet_models, batch_sizes)
model_batchsize_permuations("CPU", "cifar10", cifar_models, batch_sizes)
### GPU Benchmarks with ROCm Fusion enabled (left disabled; uncomment to run)
#print("unset TF_XLA_FLAGS")
#print("export TF_ROCM_FUSION_ENABLE=1")
#all_GPU_benchmarks()
### GPU Benchmarks with XLA enabled (left disabled; uncomment to run)
#print("export TF_XLA_FLAGS=--tf_xla_cpu_global_jit")
#print("unset TF_ROCM_FUSION_ENABLE")
#all_GPU_benchmarks()
### GPU Benchmarks with XLA and ROCm Fusion enabled
print("export TF_XLA_FLAGS=--tf_xla_cpu_global_jit")
print("export TF_ROCM_FUSION_ENABLE=1")
all_GPU_benchmarks()
# Closing timestamp so total wall-clock duration can be read off the log.
print("timedatectl")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment