mwrnd/cmds.py

## cmds.py
"""
Purpose: Generate permutations of command-line calls to
         tensorflow/benchmark's tf_cnn_benchmarks.py script.

Usage: 00) sudo pip install future
        0) place this file in /benchmarks/scripts/tf_cnn_benchmarks
        1) python cmds.py > benchmarkcommands
        2) bash -x <benchmarkcommands 2>&1 | tee benchmarklog

Notes: - XLA and ROCM are options specific to AMD's ROCm
       - the associated parse.py will parse log output into summary form
"""

import future
from models import model_config


batch_sizes = ["16 ", "32 ", "64 ", "128", "256"]
additional_arguments = ["", " --use_fp16"]
imgnet_models = sorted(model_config._model_name_to_imagenet_model.keys())
cifar_models = sorted(model_config._model_name_to_cifar_model.keys())


def print_cmdline(d, b, m, s, a):
  # build up the command-line call to tf_cnn_benchmarks.py,
  # run through timeout to deal with crashes
  cmdline="echo ======"+d+"dev;env | grep \"TF_\";timeout -k 17.0m 15.0m " + \
          "python tf_cnn_benchmarks.py"
  if d == "CPU":
    cmdline=cmdline+" --device=CPU"
  else:
    # --compute_lr_on_cpu speeds up runs but is not strictly GPU-focused
    cmdline=cmdline+" --device=GPU --num_gpus=1"

  cmdline = cmdline + " --batch_size="+b+" --num_batches=40 --data_name="+s+ \
                    " --model=" + m + a
  print(cmdline)


def model_batchsize_permuations(device, modelname, models, batchsizes):
  for m in models:
    for b in batchsizes:
      for a in additional_arguments:
        print_cmdline(device, b, m, modelname, a)

def all_GPU_benchmarks():
  model_batchsize_permuations("GPU", "imagenet", imgnet_models, batch_sizes)
  model_batchsize_permuations("GPU", "cifar10",  cifar_models,  batch_sizes)
  model_batchsize_permuations("GPU", "coco", ["ssd3000", "trivial"], batch_sizes)
  model_batchsize_permuations("GPU", "imagenet", ["nasnetlarge"], ["8  "])
  model_batchsize_permuations("GPU", "librispeech", ["deepspeech2"], ["16 "])


print("timedatectl")
print("python --version")
print("python -c \'import future; import tensorflow; " \
  "print(\"tensorflow version: {0}\".format(tensorflow.__version__))\'")
print("dkms status | grep amd")
print("dmesg | grep kfd")
print("rocm_bandwidth_test")
print("python all_reduce_benchmark.py --variable_update=replicated")
print("rocm-smi")


### GPU and CPU Benchmarks with no flags
print("unset TF_XLA_FLAGS")
print("unset TF_ROCM_FUSION_ENABLE")
all_GPU_benchmarks()
model_batchsize_permuations("CPU", "imagenet", imgnet_models, batch_sizes)
model_batchsize_permuations("CPU", "cifar10",  cifar_models,  batch_sizes)


### GPU Benchmarks with ROCm Fusion enabled

#print("unset TF_XLA_FLAGS")
#print("export TF_ROCM_FUSION_ENABLE=1")
#all_GPU_benchmarks()


### GPU Benchmarks with XLA enabled

#print("export TF_XLA_FLAGS=--tf_xla_cpu_global_jit")
#print("unset TF_ROCM_FUSION_ENABLE")
#all_GPU_benchmarks()


### GPU Benchmarks with XLA and ROCm Fusion enabled

print("export TF_XLA_FLAGS=--tf_xla_cpu_global_jit")
print("export TF_ROCM_FUSION_ENABLE=1")
all_GPU_benchmarks()


print("timedatectl")
	"""
	Purpose: Generate permutations of command-line calls to
	tensorflow/benchmark's tf_cnn_benchmarks.py script.

	Usage: 00) sudo pip install future
	0) place this file in /benchmarks/scripts/tf_cnn_benchmarks
	1) python cmds.py > benchmarkcommands
	2) bash -x <benchmarkcommands 2>&1 \| tee benchmarklog

	Notes: - XLA and ROCM are options specific to AMD's ROCm
	- the associated parse.py will parse log output into summary form
	"""

	import future
	from models import model_config


	batch_sizes = ["16 ", "32 ", "64 ", "128", "256"]
	additional_arguments = ["", " --use_fp16"]
	imgnet_models = sorted(model_config._model_name_to_imagenet_model.keys())
	cifar_models = sorted(model_config._model_name_to_cifar_model.keys())


	def print_cmdline(d, b, m, s, a):
	# build up the command-line call to tf_cnn_benchmarks.py,
	# run through timeout to deal with crashes
	cmdline="echo ======"+d+"dev;env \| grep \"TF_\";timeout -k 17.0m 15.0m " + \
	"python tf_cnn_benchmarks.py"
	if d == "CPU":
	cmdline=cmdline+" --device=CPU"
	else:
	# --compute_lr_on_cpu speeds up runs but is not strictly GPU-focused
	cmdline=cmdline+" --device=GPU --num_gpus=1"

	cmdline = cmdline + " --batch_size="+b+" --num_batches=40 --data_name="+s+ \
	" --model=" + m + a
	print(cmdline)


	def model_batchsize_permuations(device, modelname, models, batchsizes):
	for m in models:
	for b in batchsizes:
	for a in additional_arguments:
	print_cmdline(device, b, m, modelname, a)

	def all_GPU_benchmarks():
	model_batchsize_permuations("GPU", "imagenet", imgnet_models, batch_sizes)
	model_batchsize_permuations("GPU", "cifar10", cifar_models, batch_sizes)
	model_batchsize_permuations("GPU", "coco", ["ssd3000", "trivial"], batch_sizes)
	model_batchsize_permuations("GPU", "imagenet", ["nasnetlarge"], ["8 "])
	model_batchsize_permuations("GPU", "librispeech", ["deepspeech2"], ["16 "])



	print("timedatectl")
	print("python --version")
	print("python -c \'import future; import tensorflow; " \
	"print(\"tensorflow version: {0}\".format(tensorflow.__version__))\'")
	print("dkms status \| grep amd")
	print("dmesg \| grep kfd")
	print("rocm_bandwidth_test")
	print("python all_reduce_benchmark.py --variable_update=replicated")
	print("rocm-smi")


	### GPU and CPU Benchmarks with no flags
	print("unset TF_XLA_FLAGS")
	print("unset TF_ROCM_FUSION_ENABLE")
	all_GPU_benchmarks()
	model_batchsize_permuations("CPU", "imagenet", imgnet_models, batch_sizes)
	model_batchsize_permuations("CPU", "cifar10", cifar_models, batch_sizes)


	### GPU Benchmarks with ROCm Fusion enabled

	#print("unset TF_XLA_FLAGS")
	#print("export TF_ROCM_FUSION_ENABLE=1")
	#all_GPU_benchmarks()


	### GPU Benchmarks with XLA enabled

	#print("export TF_XLA_FLAGS=--tf_xla_cpu_global_jit")
	#print("unset TF_ROCM_FUSION_ENABLE")
	#all_GPU_benchmarks()


	### GPU Benchmarks with XLA and ROCm Fusion enabled

	print("export TF_XLA_FLAGS=--tf_xla_cpu_global_jit")
	print("export TF_ROCM_FUSION_ENABLE=1")
	all_GPU_benchmarks()


	print("timedatectl")