Creating a Horovod image for TensorFlow 2.1

Get the latest TensorFlow GPU image

docker pull tensorflow/tensorflow:latest-gpu-py3
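
The install steps below are run inside a container started from this image; a minimal example of starting one (the --gpus flag assumes Docker 19.03+ with the NVIDIA container toolkit installed on the host):

docker run --gpus all -it tensorflow/tensorflow:latest-gpu-py3 bash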

Install MPI

# Install wget if it is not already present
apt-get update && apt-get install -y wget
wget https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.3.tar.gz
gunzip -c openmpi-4.0.3.tar.gz | tar xf -
cd openmpi-4.0.3
./configure --prefix=/usr/local
make all install

Check that the MPI installation succeeded

mpicxx -show

If the above command fails, try running

ldconfig or sudo ldconfig
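
If mpicxx still cannot find the Open MPI shared libraries after ldconfig, one possible fix (an assumption, not part of the original steps) is to register /usr/local/lib with the loader explicitly:

echo "/usr/local/lib" > /etc/ld.so.conf.d/openmpi.conf
ldconfig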

Install Horovod

pip install horovod
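
If the plain pip install does not build with TensorFlow or GPU (NCCL) support, Horovod's build can be steered with environment variables; a hedged example based on the Horovod documentation (NCCL must already be available in the image for the GPU flag to work):

HOROVOD_WITH_TENSORFLOW=1 HOROVOD_GPU_OPERATIONS=NCCL pip install --no-cache-dir horovod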

Check the Horovod installation

horovodrun --check-build

Install SSH if not installed

apt-get install openssh-client
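
For the multi-container run at the end of this gist, each container typically also needs an SSH server and passwordless keys; a minimal sketch using OpenSSH defaults (the key path and options are assumptions, adjust as needed):

apt-get install -y openssh-server
ssh-keygen -t rsa -f ~/.ssh/id_rsa -N ""
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
service ssh start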

Add this code to a file (for example, your_file.py, used in the horovodrun commands below)

import tensorflow as tf
import horovod.tensorflow as hvd
import numpy as np
# Initialize Horovod
hvd.init()

# Pin GPU to be used to process local rank (one GPU per process)
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

# Build model and dataset
fashion_mnist = tf.keras.datasets.fashion_mnist

(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
train_images = train_images[..., None]
test_images = test_images[..., None]
train_images = train_images / np.float32(255)
test_images = test_images / np.float32(255)

def create_model():
  model = tf.keras.Sequential([
      tf.keras.layers.Conv2D(32, 3, activation='relu'),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Conv2D(64, 3, activation='relu'),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(10, activation='softmax')
    ])

  return model
BATCH_SIZE = 32
dataset = tf.data.Dataset.from_tensor_slices((train_images, train_labels)).batch(BATCH_SIZE)
model = create_model()
loss = tf.losses.SparseCategoricalCrossentropy()
opt = tf.optimizers.Adam(0.001 * hvd.size())

checkpoint_dir = './checkpoints'
checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)

@tf.function
def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = model(images, training=True)
        loss_value = loss(labels, probs)

    # Horovod: add Horovod Distributed GradientTape.
    tape = hvd.DistributedGradientTape(tape)

    grads = tape.gradient(loss_value, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    return loss_value

# Horovod: adjust number of steps based on number of GPUs.
for batch, (images, labels) in enumerate(dataset.take(10000 // hvd.size())):
    loss_value = training_step(images, labels, batch == 0)

    if batch % 10 == 0 and hvd.local_rank() == 0:
        print('Step #%d\tLoss: %.6f' % (batch, loss_value))

# Horovod: save checkpoints only on worker 0 to prevent other workers from
# corrupting it.
if hvd.rank() == 0:
    checkpoint.save(checkpoint_dir)

Test the TensorFlow Horovod run

horovodrun -np 1 python3 your_file.py
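
To sanity-check multi-process training on a single machine with more than one GPU, horovodrun's host:slots form can be used; a hedged example (adjust the slot count to the number of GPUs available):

horovodrun -np 2 -H localhost:2 python3 your_file.py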

To run across multiple Docker containers

  • Create a Docker bridge network
  • Make sure passwordless SSH works between the containers
  • horovodrun -np 2 -H container1:1,container2:1 python3 your_file.py (where container1 and container2 are the names or IDs of the two containers on that network)
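
Putting the steps above together, a possible Dockerfile sketch (untested; it repeats the same versions and paths used above, and the Horovod build flag is an assumption taken from the Horovod documentation):

FROM tensorflow/tensorflow:latest-gpu-py3

# Build tools and SSH for horovodrun/MPI
RUN apt-get update && apt-get install -y --no-install-recommends \
        wget build-essential openssh-client openssh-server && \
    rm -rf /var/lib/apt/lists/*

# Open MPI 4.0.3 installed to /usr/local, as above
RUN wget https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.3.tar.gz && \
    gunzip -c openmpi-4.0.3.tar.gz | tar xf - && \
    cd openmpi-4.0.3 && ./configure --prefix=/usr/local && \
    make all install && ldconfig && \
    cd .. && rm -rf openmpi-4.0.3 openmpi-4.0.3.tar.gz

# Horovod built against TensorFlow
RUN HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod

Containers started from such an image can then be attached to a user-defined bridge network (for example, docker network create horovodnet followed by docker run --network horovodnet ...) so they can reach each other by name for the horovodrun command above.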