@melgor
Created November 15, 2017 16:42
alexnet_benchmark_fp16.py
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Timing benchmark for AlexNet inference.
To run, use:
bazel run -c opt --config=cuda \
models/tutorials/image/alexnet:alexnet_benchmark
Across 100 steps on batch size = 128.
Forward pass:
Run on Tesla K40c: 145 +/- 1.5 ms / batch
Run on Titan X: 70 +/- 0.1 ms / batch
Forward-backward pass:
Run on Tesla K40c: 480 +/- 48 ms / batch
Run on Titan X: 244 +/- 30 ms / batch
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
from datetime import datetime
import math
import sys
import time
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
slim = tf.contrib.slim
from tensorflow.contrib.slim.nets import resnet_v1
from tensorflow.contrib.slim.nets import resnet_utils
FLAGS = None


def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
                                    initializer=None, regularizer=None,
                                    trainable=True, *args, **kwargs):
  """Custom variable getter that forces trainable variables to be stored in
  float32 precision and then casts them to the training precision.
  """
  storage_dtype = tf.float32 if trainable else dtype
  variable = getter(name, shape, dtype=storage_dtype,
                    initializer=initializer, regularizer=regularizer,
                    trainable=trainable, *args, **kwargs)
  if trainable and dtype != tf.float32:
    variable = tf.cast(variable, dtype)
  return variable
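
# The getter above is intended to be installed through tf.variable_scope's
# custom_getter argument, as run_benchmark() does below, e.g.:
#
#   with tf.variable_scope('fp32_storage',
#                          custom_getter=float32_variable_storage_getter):
#     ...  # build the float16 model here
#
# Trainable variables are then stored as float32 "master weights" but handed
# back to the graph cast to the requested (float16) dtype.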


def gradients_with_loss_scaling(loss, variables, loss_scale):
  """Gradient calculation with loss scaling to improve numerical stability
  when training with float16.
  """
  return [grad / loss_scale
          for grad in tf.gradients(loss * loss_scale, variables)]
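

# Sketch only (an assumption, not part of the original benchmark): how the two
# helpers above could be combined into a loss-scaled training step. The
# optimizer choice and loss_scale value are illustrative, and this function is
# not called anywhere in this file.
def _example_loss_scaled_train_op(loss, learning_rate=0.01, loss_scale=128.0):
  """Sketch: build a train op using float32 master weights and loss scaling."""
  # Trainable variables created under the 'fp32_storage' scope are stored in
  # float32 even though the model itself computes in float16.
  variables = tf.trainable_variables()
  # Scale the loss before differentiation and unscale the gradients afterwards
  # so that small float16 gradient values do not underflow to zero.
  gradients = gradients_with_loss_scaling(loss, variables, loss_scale)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  return optimizer.apply_gradients(list(zip(gradients, variables)))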


def print_activations(t):
  print(t.op.name, ' ', t.get_shape().as_list())


def inference(images, dtype):
  """Build the AlexNet model.

  Args:
    images: Images Tensor.
    dtype: TensorFlow dtype used for the model's weights, biases and
      activations (e.g. tf.float16).

  Returns:
    pool5: the last Tensor in the convolutional component of AlexNet.
    parameters: a list of Tensors corresponding to the weights and biases of
      the AlexNet model.
  """
  parameters = []

  # conv1
  with tf.name_scope('conv1') as scope:
    kernel = tf.Variable(tf.truncated_normal([11, 11, 3, 64], dtype=dtype,
                                              stddev=1e-1), name='weights')
    conv = tf.nn.conv2d(images, kernel, [1, 4, 4, 1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[64], dtype=dtype),
                         trainable=True, name='biases')
    bias = tf.nn.bias_add(conv, biases)
    conv1 = tf.nn.relu(bias, name=scope)
    print_activations(conv1)
    parameters += [kernel, biases]

  ## lrn1
  # with tf.name_scope('lrn1') as scope:
  #   lrn1 = tf.nn.local_response_normalization(conv1,
  #                                             alpha=1e-4,
  #                                             beta=0.75,
  #                                             depth_radius=2,
  #                                             bias=2.0)

  # pool1
  pool1 = tf.nn.max_pool(conv1,
                         ksize=[1, 3, 3, 1],
                         strides=[1, 2, 2, 1],
                         padding='VALID',
                         name='pool1')
  print_activations(pool1)
  # conv2
  with tf.name_scope('conv2') as scope:
    kernel = tf.Variable(tf.truncated_normal([5, 5, 64, 192], dtype=dtype,
                                              stddev=1e-1), name='weights')
    conv = tf.nn.conv2d(pool1, kernel, [1, 1, 1, 1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[192], dtype=dtype),
                         trainable=True, name='biases')
    bias = tf.nn.bias_add(conv, biases)
    conv2 = tf.nn.relu(bias, name=scope)
    parameters += [kernel, biases]
  print_activations(conv2)

  ## lrn2
  # with tf.name_scope('lrn2') as scope:
  #   lrn2 = tf.nn.local_response_normalization(conv2,
  #                                             alpha=1e-4,
  #                                             beta=0.75,
  #                                             depth_radius=2,
  #                                             bias=2.0)

  # pool2
  pool2 = tf.nn.max_pool(conv2,
                         ksize=[1, 3, 3, 1],
                         strides=[1, 2, 2, 1],
                         padding='VALID',
                         name='pool2')
  print_activations(pool2)
  # conv3
  with tf.name_scope('conv3') as scope:
    kernel = tf.Variable(tf.truncated_normal([3, 3, 192, 384],
                                              dtype=dtype,
                                              stddev=1e-1), name='weights')
    conv = tf.nn.conv2d(pool2, kernel, [1, 1, 1, 1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[384], dtype=dtype),
                         trainable=True, name='biases')
    bias = tf.nn.bias_add(conv, biases)
    conv3 = tf.nn.relu(bias, name=scope)
    parameters += [kernel, biases]
  print_activations(conv3)
  # conv4
  with tf.name_scope('conv4') as scope:
    kernel = tf.Variable(tf.truncated_normal([3, 3, 384, 256],
                                              dtype=dtype,
                                              stddev=1e-1), name='weights')
    conv = tf.nn.conv2d(conv3, kernel, [1, 1, 1, 1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[256], dtype=dtype),
                         trainable=True, name='biases')
    bias = tf.nn.bias_add(conv, biases)
    conv4 = tf.nn.relu(bias, name=scope)
    parameters += [kernel, biases]
  print_activations(conv4)
  # conv5
  with tf.name_scope('conv5') as scope:
    kernel = tf.Variable(tf.truncated_normal([3, 3, 256, 256],
                                              dtype=dtype,
                                              stddev=1e-1), name='weights')
    conv = tf.nn.conv2d(conv4, kernel, [1, 1, 1, 1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[256], dtype=dtype),
                         trainable=True, name='biases')
    bias = tf.nn.bias_add(conv, biases)
    conv5 = tf.nn.relu(bias, name=scope)
    parameters += [kernel, biases]
  print_activations(conv5)
  # pool5
  pool5 = tf.nn.max_pool(conv5,
                         ksize=[1, 3, 3, 1],
                         strides=[1, 2, 2, 1],
                         padding='VALID',
                         name='pool5')
  print_activations(pool5)

  return pool5, parameters


def time_tensorflow_run(session, target, info_string):
  """Run the computation to obtain the target tensor and print timing stats.

  Args:
    session: the TensorFlow session to run the computation under.
    target: the target Tensor that is passed to the session's run() function.
    info_string: a string summarizing this run, to be printed with the stats.

  Returns:
    None
  """
  num_steps_burn_in = 10
  total_duration = 0.0
  total_duration_squared = 0.0
  for i in xrange(FLAGS.num_batches + num_steps_burn_in):
    start_time = time.time()
    _ = session.run(target)
    duration = time.time() - start_time
    if i >= num_steps_burn_in:
      if not i % 10:
        print('%s: step %d, duration = %.3f' %
              (datetime.now(), i - num_steps_burn_in, duration))
      total_duration += duration
      total_duration_squared += duration * duration
  mn = total_duration / FLAGS.num_batches
  vr = total_duration_squared / FLAGS.num_batches - mn * mn
  sd = math.sqrt(vr)
  print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
        (datetime.now(), info_string, FLAGS.num_batches, mn, sd))


def run_benchmark():
  """Run the benchmark on AlexNet."""
  dtype = tf.float16
  with tf.device('/gpu:0'), tf.variable_scope(
      'fp32_storage', custom_getter=float32_variable_storage_getter):
    # Generate some dummy images.
    image_size = 224
    # Note that our padding definition is slightly different from cuda-convnet.
    # In order to force the model to start with the same activation sizes,
    # we add 3 to the image_size and employ VALID padding above.
    images = tf.Variable(tf.random_normal([FLAGS.batch_size,
                                           image_size,
                                           image_size, 3],
                                          dtype=dtype,
                                          stddev=1e-1))

    # Build a Graph that computes the logits predictions from the
    # inference model.
    pool5, parameters = inference(images, dtype)

    # Build an initialization operation.
    init = tf.global_variables_initializer()

    # Start running operations on the Graph.
    config = tf.ConfigProto()
    config.gpu_options.allocator_type = 'BFC'
    sess = tf.Session(config=config)
    sess.run(init)

    # Run the forward benchmark.
    time_tensorflow_run(sess, pool5, "Forward")

    # Add a simple objective so we can calculate the backward pass.
    objective = tf.nn.l2_loss(pool5)
    # Compute the gradient with respect to all the parameters.
    grad = tf.gradients(objective, parameters)
    # Run the backward benchmark.
    time_tensorflow_run(sess, grad, "Forward-backward")


def main(_):
  run_benchmark()


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--batch_size',
      type=int,
      default=128,
      help='Batch size.'
  )
  parser.add_argument(
      '--num_batches',
      type=int,
      default=100,
      help='Number of batches to run.'
  )
  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)