Created
May 20, 2019 05:41
-
-
Save koshian2/c94ed2c2ec9bc83a14c01c1a38ad987d to your computer and use it in GitHub Desktop.
gpu_tpu_2.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tensorflow as tf | |
import tensorflow.python.keras as keras | |
import tensorflow.python.keras.layers as layers | |
import tensorflow.python.keras.backend as K | |
from tensorflow.contrib.tpu.python.tpu import keras_support | |
import datetime | |
import time | |
import pickle | |
import os | |
import numpy as np | |
# channels-first layout throughout this file
def conv_bn_relu(input, ch, reps):
    """Append `reps` Conv(3x3)-BatchNorm-ReLU units of width `ch` to `input`."""
    x = input
    for _ in range(reps):
        x = layers.Conv2D(ch, 3, padding="same", data_format="channels_first")(x)
        bn = layers.BatchNormalization(axis=1)(x)
        x = layers.Activation("relu")(bn)
    return x
def create_10layers_model():
    """Build a plain 10-layer VGG-style CNN for CIFAR-10 (channels-first)."""
    inp = layers.Input((3, 32, 32))
    x = inp
    # Three conv stages of widths 64/128/256; average-pool between stages only.
    for stage, width in enumerate([64, 128, 256]):
        x = conv_bn_relu(x, width, 3)
        if stage < 2:
            x = layers.AveragePooling2D(2, data_format="channels_first")(x)
    x = layers.GlobalAveragePooling2D(data_format="channels_first")(x)
    x = layers.Dense(10, activation="softmax")(x)
    return keras.models.Model(inp, x)
def _create_normal_residual_block(inputs, ch, N, stride):
    """Stack N residual units of width `ch`; only the first unit applies `stride`."""
    x = inputs
    for i in range(N):
        is_first = (i == 0)
        if is_first:
            # Projection shortcut: 1x1 conv adjusts channels and resolution.
            skip = layers.Conv2D(ch, 1, strides=stride, data_format="channels_first")(x)
            skip = layers.BatchNormalization(axis=1)(skip)
            skip = layers.Activation("relu")(skip)
        else:
            # Identity shortcut once shapes already match.
            skip = x
        # Two 3x3 conv-BN-ReLU layers; downsampling only in the first conv
        # of the first unit.
        for conv_stride in (stride if is_first else 1, 1):
            x = layers.Conv2D(ch, 3, padding="same", strides=conv_stride,
                              data_format="channels_first")(x)
            x = layers.BatchNormalization(axis=1)(x)
            x = layers.Activation("relu")(x)
        x = layers.Add()([x, skip])
    return x
def create_normal_wide_resnet(N=4, k=10):
    """Create a vanilla conv Wide ResNet (defaults: N=4, k=10) for CIFAR-10."""
    inp = layers.Input((3, 32, 32))
    # Stem: plain 16-channel conv-BN-ReLU.
    x = layers.Conv2D(16, 3, padding="same", data_format="channels_first")(inp)
    x = layers.BatchNormalization(axis=1)(x)
    x = layers.Activation("relu")(x)
    # Three residual stages; the 2nd and 3rd downsample by 2.
    for width, stride in [(16 * k, 1), (32 * k, 2), (64 * k, 2)]:
        x = _create_normal_residual_block(x, width, N, stride)
    # Classifier head.
    x = layers.GlobalAveragePooling2D(data_format="channels_first")(x)
    x = layers.Dense(10, activation="softmax")(x)
    return keras.models.Model(inp, x)
class TimeCallback(keras.callbacks.Callback):
    """Keras callback that records per-epoch wall-clock durations.

    After training: `times` holds one duration (seconds) per epoch,
    `train_begin` / `train_end` hold datetimes bracketing the fit call.
    """

    def __init__(self):
        # Fix: the original skipped super().__init__(), leaving the Callback
        # base class uninitialized.
        super().__init__()
        self.times = []              # per-epoch durations in seconds
        self.last_time = time.time()

    def on_train_begin(self, logs=None):
        self.train_begin = datetime.datetime.now()
        # Fix: reset the epoch timer here; the original measured the first
        # epoch from __init__, silently including data-loading/model-build time.
        self.last_time = time.time()

    def on_train_end(self, logs=None):
        self.train_end = datetime.datetime.now()

    def on_epoch_end(self, epoch, logs=None):
        # `logs=None` defaults match the Keras callback API, which may invoke
        # hooks without a logs argument.
        now = time.time()
        self.times.append(now - self.last_time)
        self.last_time = now
def train(batch_size, network, device):
    """Train one CIFAR-10 model configuration and pickle its run history.

    Args:
        batch_size: minibatch size; initial LR scales linearly with it.
        network: 0 = plain 10-layer CNN, 1 = Wide ResNet.
        device: "gpu", "multigpu", or "tpu".

    Side effects: writes result/{device}_{network}_{batch_size}.pkl.
    """
    assert device in ["gpu", "multigpu", "tpu"]
    assert network in [0, 1]

    (X_train, y_train), (X_test, y_test) = keras.datasets.cifar10.load_data()
    X_train, X_test = X_train / 255.0, X_test / 255.0
    # NHWC -> NCHW: the models are built channels-first.
    X_train, X_test = np.transpose(X_train, [0, 3, 1, 2]), np.transpose(X_test, [0, 3, 1, 2])
    y_train = keras.utils.to_categorical(y_train)
    y_test = keras.utils.to_categorical(y_test)

    if network == 0:
        model = create_10layers_model()
    elif network == 1:
        model = create_normal_wide_resnet()
    if device == "multigpu":
        model = keras.utils.multi_gpu_model(model, gpus=2)

    # Linear LR scaling rule: lr = 0.1 at batch size 128.
    initial_lr = 0.1 * batch_size / 128
    model.compile(keras.optimizers.SGD(initial_lr, 0.9), "categorical_crossentropy", ["acc"])

    def scheduler(epoch):
        # Step decay: divide by 10 at epoch 50 and again at epoch 80.
        # BUG FIX: the original compared the learning rate value (x) against
        # 50/80 instead of the epoch number, so the decay never triggered.
        x = initial_lr
        if epoch >= 50:
            x /= 10.0
        if epoch >= 80:
            x /= 10.0
        return x

    if device == "tpu":
        # Colab TPU attachment (TF 1.x contrib API).
        tpu_grpc_url = "grpc://" + os.environ["COLAB_TPU_ADDR"]
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(tpu_grpc_url)
        strategy = keras_support.TPUDistributionStrategy(tpu_cluster_resolver)
        model = tf.contrib.tpu.keras_to_tpu_model(model, strategy=strategy)

    hist = keras.callbacks.History()
    lr_step = keras.callbacks.LearningRateScheduler(scheduler)
    times = TimeCallback()
    model.fit(X_train, y_train, validation_data=(X_test, y_test),
              batch_size=batch_size, epochs=100, callbacks=[hist, times, lr_step],
              verbose=1 if device != "tpu" else 0)

    history = {**hist.history,
               "train_begin": times.train_begin,
               "train_end": times.train_end,
               "times": times.times}
    if not os.path.exists("result"):
        os.mkdir("result")
    with open(f"result/{device}_{network}_{batch_size}.pkl", "wb") as fp:
        pickle.dump(history, fp)
# main
def train_gpus():
    """Run the full GPU benchmark grid: network x device x batch size."""
    for net in [0, 1]:
        for dev in ["gpu", "multigpu"]:
            for batch in [128, 256, 512, 1024, 2048]:
                # Skip configurations known to hit out-of-memory errors.
                oom = (
                    (net == 0 and dev == "gpu" and batch > 1024)
                    or (net == 1 and dev == "gpu" and batch > 128)
                    or (net == 1 and dev == "multigpu" and batch > 256)
                )
                if oom:
                    continue
                keras.backend.clear_session()
                print("Network", net, dev, "Batch", batch, "Starts")
                train(batch, net, dev)
                # Let the GPU cool down / memory settle between runs.
                time.sleep(60)
# Entry point: run the GPU benchmark sweep when executed as a script.
if __name__ == "__main__":
    train_gpus()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment