data-parallel distributed training in Keras with wandb
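"""
Data-parallel distributed training in Keras, tracked with Weights & Biases.

Trains a small CNN on an image-folder dataset, optionally replicating the
model across multiple GPUs with multi_gpu_model().
"""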
import argparse
import time
import os

import tensorflow as tf
from tensorflow.keras import optimizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense
from tensorflow.keras.utils import multi_gpu_model
from tensorflow.keras import backend as K

import wandb
from wandb.keras import WandbCallback
# training config
#--------------------------------
img_width, img_height = 299, 299

# project name for logging to W&B
PROJECT_NAME = "estuary"

# data locations
# remote
data_remote = '/home/stacey/keras/nature_data'
# local
train_data_local = '/Users/stacey/Code/nature_data/train'
val_data_local = '/Users/stacey/Code/nature_data/val'
def build_core_model(dropout, num_classes):
    """ Build core 7-layer model (5 conv + 2 dense) """
    if K.image_data_format() == 'channels_first':
        input_shape = (3, img_width, img_height)
    else:
        input_shape = (img_width, img_height, 3)
    model = Sequential()
    model.add(Conv2D(16, (3, 3), input_shape=input_shape))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(32, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(32, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(128, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dropout(dropout))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    return model
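# Shape trace for the default 299x299 input: each 3x3 'valid' conv shaves off
# 2 pixels and each 2x2 pool halves the feature map, so the spatial size goes
# 299 -> 148 -> 73 -> 35 -> 16 -> 7 across the five blocks, and Flatten() sees
# a 7x7x128 volume (6272 features) before the dense layers.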
def build_parallel_gpu_model(optimizer, dropout, num_classes, gpus=1, lr=None):
    """ Build a data-parallel model replicated over several GPUs """
    # keep the template model's weights on the CPU so the GPU replicas share them
    with tf.device('/cpu:0'):
        model = build_core_model(dropout, num_classes)
    # if a learning rate is supplied, use SGD with momentum=0.8;
    # otherwise use the optimizer passed in by name
    if lr:
        lr_optimizer = optimizers.SGD(lr=lr, momentum=0.8)
    else:
        lr_optimizer = optimizer
    # multi_gpu_model() requires gpus >= 2; fall back to the plain model otherwise
    if gpus > 1:
        parallel_model = multi_gpu_model(model, gpus=gpus)
    else:
        parallel_model = model
    parallel_model.compile(loss='categorical_crossentropy',
                           optimizer=lr_optimizer,
                           metrics=['accuracy'])
    parallel_model.summary()
    return parallel_model
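# NOTE: multi_gpu_model() is deprecated in modern TensorFlow in favor of
# tf.distribute.MirroredStrategy. A minimal sketch of the equivalent setup,
# assuming the same build_core_model() helper (not used by this script):
def build_mirrored_model(optimizer, dropout, num_classes):
    """ Data-parallel model via tf.distribute.MirroredStrategy (sketch) """
    strategy = tf.distribute.MirroredStrategy()
    # building and compiling inside the strategy scope mirrors the
    # variables across all visible GPUs automatically
    with strategy.scope():
        model = build_core_model(dropout, num_classes)
        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=['accuracy'])
    return model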
def log_model_params(model, wandb_config, args):
    """ Extract params of interest about the model (e.g. number of different layer types).
    Log these and any experiment-level settings to wandb """
    num_conv_layers = 0
    num_fc_layers = 0
    # NOTE: the model architecture is obscured when using Keras multi_gpu_model(),
    # so these counts only log accurately with 1 GPU
    for l in model.layers:
        layer_type = l.get_config()["name"].split("_")[0]
        if layer_type == "conv2d":
            num_conv_layers += 1
        elif layer_type == "dense":
            num_fc_layers += 1
    wandb_config.update({
        "epochs" : args.epochs,
        "batch_size" : args.batch_size,
        "n_conv_layers" : num_conv_layers,
        "n_fc_layers" : num_fc_layers,
        "img_dim" : img_width,
        "num_classes" : args.num_classes,
        "n_train" : args.num_train,
        "n_valid" : args.num_valid,
        "optimizer" : args.optimizer,
        "dropout" : args.dropout
    })
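# NOTE: a more robust way to count layer types, independent of Keras'
# auto-generated layer names, would be to check the class directly, e.g.
# l.__class__.__name__ == "Conv2D" or "Dense".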
def run_distrib_exp(args):
    """ Experiment with data-parallel training across multiple GPUs """
    wandb.init(project=PROJECT_NAME, name=args.model_name)
    if args.local_mode:
        # reset paths to local
        args.train_data = train_data_local
        args.val_data = val_data_local

    # data generators from the Keras finetuning tutorial
    # consider modifying the data augmentation strategy for more or less variety
    train_datagen = ImageDataGenerator(
        rescale=1. / 255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)
    test_datagen = ImageDataGenerator(rescale=1. / 255)

    model = build_parallel_gpu_model(args.optimizer, args.dropout, args.num_classes, args.gpus)
    log_model_params(model, wandb.config, args)

    train_generator = train_datagen.flow_from_directory(
        args.train_data,
        target_size=(img_width, img_height),
        batch_size=args.batch_size,
        class_mode='categorical',
        follow_links=True)
    validation_generator = test_datagen.flow_from_directory(
        args.val_data,
        target_size=(img_width, img_height),
        batch_size=args.batch_size,
        class_mode='categorical',
        follow_links=True)

    callbacks = [WandbCallback()]
    model.fit_generator(
        train_generator,
        steps_per_epoch=args.num_train // args.batch_size,
        epochs=args.epochs,
        validation_data=validation_generator,
        callbacks=callbacks,
        validation_steps=args.num_valid // args.batch_size)

    # save model weights
    save_model_filename = args.model_name + ".h5"
    model.save_weights(save_model_filename)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    #############################
    # Frequently updated params
    #----------------------------
    parser.add_argument(
        "-m",
        "--model_name",
        type=str,
        default="",
        help="Name of this model/run (model will be saved to this file)")
    parser.add_argument(
        "-g",
        "--gpus",
        type=int,
        default=1,
        help="Distribute run over this many GPUs")
    parser.add_argument(
        "-b",
        "--batch_size",
        type=int,
        default=32,
        help="Batch size")
    #############################
    # Mostly fixed params
    #--------------------------
    parser.add_argument(
        "-c",
        "--num_classes",
        type=int,
        default=10,
        help="Number of classes to predict")
    parser.add_argument(
        "-d",
        "--dropout",
        type=float,
        default=0.3,
        help="Dropout for last fc layer")
    parser.add_argument(
        "-e",
        "--epochs",
        type=int,
        default=50,
        help="Number of training epochs")
    parser.add_argument(
        "-nt",
        "--num_train",
        type=int,
        default=5000,
        help="Number of training examples per class")
    parser.add_argument(
        "-nv",
        "--num_valid",
        type=int,
        default=800,
        help="Number of validation examples per class")
    parser.add_argument(
        "-o",
        "--optimizer",
        type=str,
        default="rmsprop",
        help="Learning optimizer")
    parser.add_argument(
        "-t",
        "--train_data",
        type=str,
        default="/home/stacey/keras/nature_data/train",
        help="Absolute path to training data")
    parser.add_argument(
        "-v",
        "--val_data",
        type=str,
        default="/home/stacey/keras/nature_data/val",
        help="Absolute path to validation data")
    parser.add_argument(
        "-l",
        "--local_mode",
        action="store_true",
        help="Run locally if flag is set")
    parser.add_argument(
        "-q",
        "--dry_run",
        action="store_true",
        help="Dry run (do not log to wandb)")
    args = parser.parse_args()

    # easier testing: don't log to wandb if dry run is set
    if args.dry_run:
        os.environ['WANDB_MODE'] = 'dryrun'
    run_distrib_exp(args)
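# Example invocations (the script filename and run names here are hypothetical):
#   python train_distrib.py -m baseline_1gpu -g 1 -b 32
#   python train_distrib.py -m parallel_4gpu -g 4 -b 128
#   python train_distrib.py -m local_test --local_mode --dry_run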