@staceysv
Created February 25, 2020 20:33
data-parallel distributed training in Keras with wandb
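"""
Data-parallel distributed training in Keras, instrumented with Weights & Biases.

Builds a small convolutional image classifier, optionally replicates it across
several GPUs with multi_gpu_model for data-parallel training, streams labeled
images from directories with ImageDataGenerator, and logs hyperparameters and
metrics to W&B via wandb.config and WandbCallback.
"""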
import argparse
import time
import os
import tensorflow as tf
from tensorflow.keras import optimizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense
from tensorflow.keras.utils import multi_gpu_model
from tensorflow.keras import backend as K
import wandb
from wandb.keras import WandbCallback
# training config
#--------------------------------
img_width, img_height = 299, 299
# project name for logging to W&B
PROJECT_NAME = "estuary"
# data location
# remote
data_remote = '/home/stacey/keras/nature_data'
# local
train_data_local = '/Users/stacey/Code/nature_data/train'
val_data_local = '/Users/stacey/Code/nature_data/val'
def build_core_model(dropout, num_classes):
    """ Build core 7-layer model """
    if K.image_data_format() == 'channels_first':
        input_shape = (3, img_width, img_height)
    else:
        input_shape = (img_width, img_height, 3)
    model = Sequential()
    model.add(Conv2D(16, (3, 3), input_shape=input_shape))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(32, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(32, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(128, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dropout(dropout))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    return model
def build_parallel_gpu_model(optimizer, dropout, num_classes, gpus=1, lr=None):
    """ Build a data-parallel model replicated over several gpus """
    with tf.device('/cpu:0'):
        model = build_core_model(dropout, num_classes)
    # if a learning rate is supplied, use SGD with momentum=0.8;
    # otherwise use the optimizer passed in
    if lr:
        lr_optimizer = optimizers.SGD(lr=lr, momentum=0.8)
    else:
        lr_optimizer = optimizer
    # multi_gpu_model() requires at least 2 GPUs; fall back to the
    # single-device model otherwise
    if gpus > 1:
        parallel_model = multi_gpu_model(model, gpus=gpus)
    else:
        parallel_model = model
    parallel_model.compile(loss='categorical_crossentropy',
                           optimizer=lr_optimizer,
                           metrics=['accuracy'])
    parallel_model.summary()
    return parallel_model
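# Note on the data-parallel setup: per the tf.keras docs, multi_gpu_model() keeps the
# template model on the CPU and places one replica on each GPU; every input batch is
# split into `gpus` sub-batches processed in parallel, and the results are concatenated
# back on the CPU. The effective per-GPU batch size is therefore batch_size / gpus.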
def log_model_params(model, wandb_config, args):
    """ Extract params of interest about the model (e.g. number of different layer types).
    Log these and any experiment-level settings to wandb """
    num_conv_layers = 0
    num_fc_layers = 0
    # NOTE: the model architecture is obscured when using Keras multi_gpu_model()
    # so this only logs accurately with 1 GPU
    for l in model.layers:
        layer_type = l.get_config()["name"].split("_")[0]
        if layer_type == "conv2d":
            num_conv_layers += 1
        elif layer_type == "dense":
            num_fc_layers += 1
    wandb_config.update({
        "epochs": args.epochs,
        "batch_size": args.batch_size,
        "n_conv_layers": num_conv_layers,
        "n_fc_layers": num_fc_layers,
        "img_dim": img_width,
        "num_classes": args.num_classes,
        "n_train": args.num_train,
        "n_valid": args.num_valid,
        "optimizer": args.optimizer,
        "dropout": args.dropout
    })
def run_distrib_exp(args):
    """ Experiment with data-parallel training across multiple GPUs """
    wandb.init(project=PROJECT_NAME, name=args.model_name)
    if args.local_mode:
        # reset paths to local
        args.train_data = train_data_local
        args.val_data = val_data_local
    # data generators from the Keras finetuning tutorial
    # consider modifying the data augmentation strategy for more or less variety
    train_datagen = ImageDataGenerator(
        rescale=1. / 255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)
    test_datagen = ImageDataGenerator(rescale=1. / 255)
    model = build_parallel_gpu_model(args.optimizer, args.dropout, args.num_classes, args.gpus)
    log_model_params(model, wandb.config, args)
    train_generator = train_datagen.flow_from_directory(
        args.train_data,
        target_size=(img_width, img_height),
        batch_size=args.batch_size,
        class_mode='categorical',
        follow_links=True)
    validation_generator = test_datagen.flow_from_directory(
        args.val_data,
        target_size=(img_width, img_height),
        batch_size=args.batch_size,
        class_mode='categorical',
        follow_links=True)
    callbacks = [WandbCallback()]
    model.fit_generator(
        train_generator,
        steps_per_epoch=args.num_train // args.batch_size,
        epochs=args.epochs,
        validation_data=validation_generator,
        callbacks=callbacks,
        validation_steps=args.num_valid // args.batch_size)
    # save model weights
    save_model_filename = args.model_name + ".h5"
    model.save_weights(save_model_filename)
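# Illustrative sketch (not part of the original gist): the weights saved above can be
# restored later by rebuilding the same architecture with the same settings and calling
# load_weights, e.g. for a single-GPU run whose hypothetical name was "my_run":
#   model = build_parallel_gpu_model("rmsprop", 0.3, 10, gpus=1)
#   model.load_weights("my_run.h5")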
if __name__ == "__main__":
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    #############################
    # Frequently updated params
    #----------------------------
    parser.add_argument(
        "-m",
        "--model_name",
        type=str,
        default="",
        help="Name of this model/run (model will be saved to this file)")
    parser.add_argument(
        "-g",
        "--gpus",
        type=int,
        default=1,
        help="Distribute run over this many GPUs")
    parser.add_argument(
        "-b",
        "--batch_size",
        type=int,
        default=32,
        help="Batch size")
    #############################
    # Mostly fixed params
    #--------------------------
    parser.add_argument(
        "-c",
        "--num_classes",
        type=int,
        default=10,
        help="Number of classes to predict")
    parser.add_argument(
        "-d",
        "--dropout",
        type=float,
        default=0.3,
        help="Dropout for last fc layer")
    parser.add_argument(
        "-e",
        "--epochs",
        type=int,
        default=50,
        help="Number of training epochs")
    parser.add_argument(
        "-nt",
        "--num_train",
        type=int,
        default=5000,
        help="Number of training examples per class")
    parser.add_argument(
        "-nv",
        "--num_valid",
        type=int,
        default=800,
        help="Number of validation examples per class")
    parser.add_argument(
        "-o",
        "--optimizer",
        type=str,
        default="rmsprop",
        help="Learning optimizer")
    parser.add_argument(
        "-t",
        "--train_data",
        type=str,
        default="/home/stacey/keras/nature_data/train",
        help="Absolute path to training data")
    parser.add_argument(
        "-v",
        "--val_data",
        type=str,
        default="/home/stacey/keras/nature_data/val",
        help="Absolute path to validation data")
    parser.add_argument(
        "-l",
        "--local_mode",
        action="store_true",
        help="Run locally if flag is set")
    parser.add_argument(
        "-q",
        "--dry_run",
        action="store_true",
        help="Dry run (do not log to wandb)")
    args = parser.parse_args()
    # easier testing: don't log to wandb if dry run is set
    if args.dry_run:
        os.environ['WANDB_MODE'] = 'dryrun'
    run_distrib_exp(args)
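# Example invocations (illustrative only -- the script filename and flag values here
# are assumptions, not part of the gist):
#   python train_distrib.py -m local_smoke_test -l -q
#   python train_distrib.py -m parallel_4gpu -g 4 -b 128 -e 50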