# alsrgv/horovod_model_parallelism.py

## horovod_model_parallelism.py
# Copyright 2018 Uber Technologies, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#!/usr/bin/env python

import tensorflow as tf
import horovod.tensorflow as hvd
# Short aliases for the tf.contrib sub-packages used by conv_model() and main().
layers = tf.contrib.layers
learn = tf.contrib.learn

# Set TensorFlow's logging verbosity to INFO.
tf.logging.set_verbosity(tf.logging.INFO)


def conv_model(feature, target, mode):
    """Build the 2-layer convolution model, split across two GPUs.

    The convolution and pooling layers are placed on '/gpu:0'; the fully
    connected layers and the loss are placed on '/gpu:1'.

    Args:
        feature: Input tensor; it is reshaped to [-1, 28, 28, 1].
        target: Class labels; cast to int32 and one-hot encoded to depth 10.
        mode: Dropout runs in training mode only when this equals
            tf.contrib.learn.ModeKeys.TRAIN.

    Returns:
        A tuple (predictions, loss): the argmax of the logits along axis 1
        and the softmax cross-entropy loss.
    """
    num_classes = 10
    pool_window = [1, 2, 2, 1]
    training = mode == tf.contrib.learn.ModeKeys.TRAIN

    # One-hot labels of shape (batch_size, 10): on-value 1, off-value 0.
    class_ids = tf.cast(target, tf.int32)
    one_hot_labels = tf.one_hot(class_ids, num_classes, 1, 0)

    # 4-d image tensor: batch, 28 x 28 spatial dimensions, one color channel.
    images = tf.reshape(feature, [-1, 28, 28, 1])

    # Horovod Model Parallelism: the convolutional stack is pinned to the
    # first visible GPU.
    with tf.device('/gpu:0'):
        # First conv layer: 32 features for each 5x5 patch, then 2x2 pooling.
        with tf.variable_scope('conv_layer1'):
            conv1 = layers.conv2d(
                images, 32, kernel_size=[5, 5], activation_fn=tf.nn.relu)
            pooled1 = tf.nn.max_pool(
                conv1, ksize=pool_window, strides=pool_window, padding='SAME')

        # Second conv layer: 64 features for each 5x5 patch, then 2x2 pooling.
        with tf.variable_scope('conv_layer2'):
            conv2 = layers.conv2d(
                pooled1, 64, kernel_size=[5, 5], activation_fn=tf.nn.relu)
            pooled2 = tf.nn.max_pool(
                conv2, ksize=pool_window, strides=pool_window, padding='SAME')
            # Flatten into a batch of vectors of length 7 * 7 * 64.
            flattened = tf.reshape(pooled2, [-1, 7 * 7 * 64])

    # Horovod Model Parallelism: the dense layers and the loss are pinned to
    # the second visible GPU.
    with tf.device('/gpu:1'):
        # Densely connected layer with 1024 neurons, followed by dropout.
        dense = layers.fully_connected(
            flattened, 1024, activation_fn=tf.nn.relu)
        dropped = layers.dropout(dense, keep_prob=0.5, is_training=training)

        # Logits (1 per class) and the loss computed from them.
        logits = layers.fully_connected(
            dropped, num_classes, activation_fn=None)
        loss = tf.losses.softmax_cross_entropy(one_hot_labels, logits)

    predictions = tf.argmax(logits, 1)
    return predictions, loss


def main(_):
    """Train the MNIST convolution model with Horovod, two GPUs per process.

    Initializes Horovod, loads MNIST into a per-rank directory, builds the
    model and a Horovod-wrapped RMSProp optimizer, then runs training steps
    in a MonitoredTrainingSession until a hook requests a stop.

    Args:
        _: Unused positional argument supplied by the caller.
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Download and load the MNIST dataset; the directory name carries the
    # Horovod rank of this process.
    data_dir = 'MNIST-data-%d' % hvd.rank()
    mnist = learn.datasets.mnist.read_data_sets(data_dir)

    # Build the model on top of two feed placeholders.
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predictions, loss = conv_model(
        image, label, tf.contrib.learn.ModeKeys.TRAIN)

    # Horovod: the learning rate is scaled by the number of processes, and
    # the optimizer is wrapped in the Horovod Distributed Optimizer.
    scaled_learning_rate = 0.001 * hvd.size()
    optimizer = hvd.DistributedOptimizer(
        tf.train.RMSPropOptimizer(scaled_learning_rate))

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = optimizer.minimize(loss, global_step=global_step)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
    # from rank 0 to all other processes, so every worker starts consistently
    # whether training begins from random weights or from a checkpoint.
    broadcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    # Horovod: the number of steps is scaled down by the number of processes.
    stop_hook = tf.train.StopAtStepHook(last_step=20000 // hvd.size())
    logging_hook = tf.train.LoggingTensorHook(
        tensors={'step': global_step, 'loss': loss}, every_n_iter=10)
    hooks = [broadcast_hook, stop_hook, logging_hook]

    # Horovod Model Parallelism: each local rank gets its own pair of
    # consecutive GPUs (two GPUs per process).
    first_gpu = hvd.local_rank() * 2
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.visible_device_list = (
        '%d,%d' % (first_gpu, first_gpu + 1))

    # Horovod: save checkpoints only on worker 0 to prevent other workers
    # from corrupting them.
    if hvd.rank() == 0:
        checkpoint_dir = './checkpoints'
    else:
        checkpoint_dir = None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=session_config) as mon_sess:
        while not mon_sess.should_stop():
            # Run one training step synchronously on a batch of 100 examples.
            batch_images, batch_labels = mnist.train.next_batch(100)
            mon_sess.run(
                train_op,
                feed_dict={image: batch_images, label: batch_labels})


# Script entry point: hand control to TensorFlow's app runner, which calls
# main().
if __name__ == '__main__':
    tf.app.run(main=main)
	# Copyright 2018 Uber Technologies, Inc. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ==============================================================================
	#!/usr/bin/env python

	import tensorflow as tf
	import horovod.tensorflow as hvd
	# Short aliases for the tf.contrib sub-packages used by conv_model() and main().
	layers = tf.contrib.layers
	learn = tf.contrib.learn

	# Set TensorFlow's logging verbosity to INFO.
	tf.logging.set_verbosity(tf.logging.INFO)


	def conv_model(feature, target, mode):
		"""2-layer convolution model, split across two GPUs.

		The convolution and pooling layers are placed on '/gpu:0'; the fully
		connected layers and the loss are placed on '/gpu:1'.

		Args:
			feature: Input tensor; it is reshaped to [-1, 28, 28, 1].
			target: Class labels; cast to int32 and one-hot encoded to depth 10.
			mode: Dropout runs in training mode only when this equals
				tf.contrib.learn.ModeKeys.TRAIN.

		Returns:
			A tuple of the argmax of the logits along axis 1 and the softmax
			cross-entropy loss.
		"""
		# NOTE(review): this definition repeats the conv_model defined earlier
		# in this file; its block structure had been flattened and is restored
		# here.

		# Convert the target to a one-hot tensor of shape (batch_size, 10) and
		# with a on-value of 1 for each one-hot vector of length 10.
		target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)

		# Reshape feature to 4d tensor with 2nd and 3rd dimensions being
		# image width and height final dimension being the number of color channels.
		feature = tf.reshape(feature, [-1, 28, 28, 1])

		# Horovod Model Parallelism: specify devices for particular layers.
		with tf.device('/gpu:0'):
			# First conv layer will compute 32 features for each 5x5 patch
			with tf.variable_scope('conv_layer1'):
				h_conv1 = layers.conv2d(
					feature, 32, kernel_size=[5, 5], activation_fn=tf.nn.relu)
				h_pool1 = tf.nn.max_pool(
					h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

			# Second conv layer will compute 64 features for each 5x5 patch.
			with tf.variable_scope('conv_layer2'):
				h_conv2 = layers.conv2d(
					h_pool1, 64, kernel_size=[5, 5], activation_fn=tf.nn.relu)
				h_pool2 = tf.nn.max_pool(
					h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
				# reshape tensor into a batch of vectors
				h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])

		with tf.device('/gpu:1'):
			# Densely connected layer with 1024 neurons.
			h_fc1 = layers.dropout(
				layers.fully_connected(
					h_pool2_flat, 1024, activation_fn=tf.nn.relu),
				keep_prob=0.5,
				is_training=mode == tf.contrib.learn.ModeKeys.TRAIN)

			# Compute logits (1 per class) and compute loss.
			logits = layers.fully_connected(h_fc1, 10, activation_fn=None)
			loss = tf.losses.softmax_cross_entropy(target, logits)

		return tf.argmax(logits, 1), loss


	def main(_):
		"""Train the MNIST convolution model with Horovod, two GPUs per process.

		Initializes Horovod, loads MNIST into a per-rank directory, builds the
		model and a Horovod-wrapped RMSProp optimizer, then runs training steps
		in a MonitoredTrainingSession until a hook requests a stop.

		Args:
			_: Unused positional argument supplied by the caller.
		"""
		# NOTE(review): this definition repeats the main defined earlier in
		# this file; its block structure had been flattened and is restored
		# here.

		# Horovod: initialize Horovod.
		hvd.init()

		# Download and load MNIST dataset.
		mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())

		# Build model...
		with tf.name_scope('input'):
			image = tf.placeholder(tf.float32, [None, 784], name='image')
			label = tf.placeholder(tf.float32, [None], name='label')
		predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

		# Horovod: adjust learning rate based on number of GPUs.
		opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

		# Horovod: add Horovod Distributed Optimizer.
		opt = hvd.DistributedOptimizer(opt)

		global_step = tf.contrib.framework.get_or_create_global_step()
		train_op = opt.minimize(loss, global_step=global_step)

		hooks = [
			# Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
			# from rank 0 to all other processes. This is necessary to ensure consistent
			# initialization of all workers when training is started with random weights
			# or restored from a checkpoint.
			hvd.BroadcastGlobalVariablesHook(0),

			# Horovod: adjust number of steps based on number of GPUs.
			tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

			tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
				every_n_iter=10),
		]

		# Horovod Model Parallelism: pin GPUs to be used to process local rank (two GPU per process)
		config = tf.ConfigProto()
		config.gpu_options.allow_growth = True
		config.gpu_options.visible_device_list = \
			'%d,%d' % (hvd.local_rank() * 2, hvd.local_rank() * 2 + 1)

		# Horovod: save checkpoints only on worker 0 to prevent other workers from
		# corrupting them.
		checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

		# The MonitoredTrainingSession takes care of session initialization,
		# restoring from a checkpoint, saving to a checkpoint, and closing when done
		# or an error occurs.
		with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
				hooks=hooks,
				config=config) as mon_sess:
			while not mon_sess.should_stop():
				# Run a training step synchronously.
				image_, label_ = mnist.train.next_batch(100)
				mon_sess.run(train_op, feed_dict={image: image_, label: label_})


	# Script entry point: hand control to TensorFlow's app runner.
	# NOTE(review): this guard repeats the one earlier in this file; the
	# indentation of its body had been flattened and is restored here.
	if __name__ == "__main__":
		tf.app.run()