resnet_error_example_with_tensorpack.py

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# File: cifar10-resnet.py
# Author: Yuxin Wu <ppwwyyxxc@gmail.com>

import argparse
import os

import tensorflow as tf
from tensorflow.contrib.framework import add_model_variable
from tensorflow.contrib.layers import variance_scaling_initializer
from tensorflow.python.training import moving_averages

from tensorpack import *
from tensorpack.dataflow import dataset
from tensorpack.models.common import layer_register, VariableHolder
from tensorpack.tfutils.summary import *
from tensorpack.tfutils.symbolic_functions import *
from tensorpack.utils.gpu import get_nr_gpu
""" | |
CIFAR10 ResNet example. See: | |
Deep Residual Learning for Image Recognition, arxiv:1512.03385 | |
This implementation uses the variants proposed in: | |
Identity Mappings in Deep Residual Networks, arxiv:1603.05027 | |
I can reproduce the results on 2 TitanX for | |
n=5, about 7.1% val error after 67k steps (20.4 step/s) | |
n=18, about 5.95% val error after 80k steps (5.6 step/s, not converged) | |
n=30: a 182-layer network, about 5.6% val error after 51k steps (3.4 step/s) | |
This model uses the whole training set instead of a train-val split. | |
To train: | |
./cifar10-resnet.py --gpu 0,1 | |
""" | |

BATCH_SIZE = 128
NUM_UNITS = None
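

# Track per-channel running max/min of the activations with exponential moving
# averages, mirroring how BatchNorm maintains its mean/variance EMAs. The
# update ops are added to tf.GraphKeys.UPDATE_OPS so they can be run alongside
# each training step; the function returns its input tensor unchanged.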
def update_ema(xn, moving_max, moving_min, decay):
    batch_max = tf.reduce_max(xn, axis=[0, 1, 2])
    batch_min = tf.reduce_min(xn, axis=[0, 1, 2])
    update_op1 = moving_averages.assign_moving_average(
        moving_max, batch_max, decay, zero_debias=False,
        name='max_ema_op')
    update_op2 = moving_averages.assign_moving_average(
        moving_min, batch_min, decay, zero_debias=False,
        name='min_ema_op')
    # Only add to model variables when we update them.
    add_model_variable(moving_min)
    add_model_variable(moving_max)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op1)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op2)
    return xn
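

# TensorFlow 1.x has no built-in base-2 logarithm op, so compute it by change
# of base: log2(x) = ln(x) / ln(2).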
def log2(x):
    numerator = tf.log(x)
    denominator = tf.log(tf.constant(2, dtype=numerator.dtype))
    return numerator / denominator
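

# RescaleActivationLayer shifts the input by the running minimum and divides by
# 2 ** (log2(moving_max) - bit_a), i.e. multiplies by 2**bit_a / moving_max,
# mapping [min, max] roughly onto [0, 2**bit_a]. Two caveats, either of which
# may be the error this gist reproduces:
#   * tf.constant_initializer() defaults to zero, so log2(moving_max) is -inf
#     until the first EMA update has run;
#   * sizing the EMA variables by in_shape[-1] and reducing over axes
#     [0, 1, 2] assumes channels-last (NHWC) input, but the model below feeds
#     this layer NCHW data, so the statistics are per spatial column rather
#     than per channel.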
@layer_register(log_shape=True)
def RescaleActivationLayer(inputs, decay=0.9, bit_a=8):
    in_shape = inputs.get_shape().as_list()
    moving_max = tf.get_variable('activation_max/EMA', [in_shape[-1]],
                                 initializer=tf.constant_initializer(),
                                 trainable=False)
    moving_min = tf.get_variable('activation_min/EMA', [in_shape[-1]],
                                 initializer=tf.constant_initializer(),
                                 trainable=False)
    named_inputs = tf.identity(inputs, name='rescaling_input_activation')
    xn = (named_inputs - moving_min) / tf.pow(
        tf.constant(2.0), log2(moving_max) - tf.constant(float(bit_a)))
    # Name the output tensor and keep it on the returned path.
    named_xn = tf.identity(xn, name='rescaled_activation')
    ret = update_ema(named_xn, moving_max, moving_min, decay)
    # Expose the EMA variables; note they hold max/min, despite the
    # BatchNorm-style mean/variance attribute names.
    ret.variables = VariableHolder(mean=moving_max, variance=moving_min)
    return ret
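

# The model below follows tensorpack's standard pre-activation CIFAR-10 ResNet
# example (see the file header above), with RescaleActivationLayer inserted
# after the first convolution.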
class Model(ModelDesc):

    def __init__(self, n):
        super(Model, self).__init__()
        self.n = n

    def _get_inputs(self):
        return [InputDesc(tf.float32, [None, 32, 32, 3], 'input'),
                InputDesc(tf.int32, [None], 'label')]

    def _build_graph(self, inputs):
        image, label = inputs
        image = image / 128.0
        assert tf.test.is_gpu_available()
        image = tf.transpose(image, [0, 3, 1, 2])

        def residual(name, l, increase_dim=False, first=False):
            shape = l.get_shape().as_list()
            in_channel = shape[1]

            if increase_dim:
                out_channel = in_channel * 2
                stride1 = 2
            else:
                out_channel = in_channel
                stride1 = 1

            with tf.variable_scope(name):
                b1 = l if first else BNReLU(l)
                c1 = Conv2D('conv1', b1, out_channel, stride=stride1, nl=BNReLU)
                c2 = Conv2D('conv2', c1, out_channel)
                if increase_dim:
                    l = AvgPooling('pool', l, 2)
                    l = tf.pad(l, [[0, 0], [in_channel // 2, in_channel // 2], [0, 0], [0, 0]])
                l = c2 + l
                return l
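
        # When increase_dim is True, the shortcut branch is downsampled with
        # average pooling and zero-padded along the channel axis (axis 1 in
        # NCHW) to double its channels: the parameter-free "option (A)"
        # shortcut from arXiv:1512.03385.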
        with argscope([Conv2D, AvgPooling, BatchNorm, GlobalAvgPooling], data_format='NCHW'), \
                argscope(Conv2D, nl=tf.identity, use_bias=False, kernel_shape=3,
                         W_init=variance_scaling_initializer(mode='FAN_OUT')):
            l = Conv2D('conv0', image, 16, nl=BNReLU)
            l = RescaleActivationLayer('rescale', l)
            l = residual('res1.0', l, first=True)
            for k in range(1, self.n):
                l = residual('res1.{}'.format(k), l)
            # 32x32, c=16
            l = residual('res2.0', l, increase_dim=True)
            for k in range(1, self.n):
                l = residual('res2.{}'.format(k), l)
            # 16x16, c=32
            l = residual('res3.0', l, increase_dim=True)
            for k in range(1, self.n):
                l = residual('res3.{}'.format(k), l)
            l = BNReLU('bnlast', l)
            # 8x8, c=64
            l = GlobalAvgPooling('gap', l)

        logits = FullyConnected('linear', l, out_dim=10, nl=tf.identity)
        prob = tf.nn.softmax(logits, name='output')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = prediction_incorrect(logits, label)
        # monitor training error
        add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

        # weight decay on every W matched by the regex (conv and fc weights)
        wd_w = tf.train.exponential_decay(0.0002, get_global_step_var(),
                                          480000, 0.2, True)
        wd_cost = tf.multiply(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost')
        add_moving_summary(cost, wd_cost)

        add_param_summary(('.*/W', ['histogram']))  # monitor W
        self.cost = tf.add_n([cost, wd_cost], name='cost')

    def _get_optimizer(self):
        lr = get_scalar_var('learning_rate', 0.01, summary=True)
        opt = tf.train.MomentumOptimizer(lr, 0.9)
        return opt
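

# Build the CIFAR-10 dataflow: for training, pad to 40x40, random-crop back to
# 32x32, flip horizontally, and subtract the per-pixel mean; the test split
# only subtracts the mean. BatchData keeps the last partial batch only at test
# time, and PrefetchData runs the training pipeline in background processes.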
def get_data(train_or_test):
    isTrain = train_or_test == 'train'
    ds = dataset.Cifar10(train_or_test)
    pp_mean = ds.get_per_pixel_mean()
    if isTrain:
        augmentors = [
            imgaug.CenterPaste((40, 40)),
            imgaug.RandomCrop((32, 32)),
            imgaug.Flip(horiz=True),
            imgaug.MapImage(lambda x: x - pp_mean),
        ]
    else:
        augmentors = [
            imgaug.MapImage(lambda x: x - pp_mean)
        ]
    ds = AugmentImageComponent(ds, augmentors)
    ds = BatchData(ds, BATCH_SIZE, remainder=not isTrain)
    if isTrain:
        ds = PrefetchData(ds, 3, 2)
    return ds
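

# Training setup: the learning rate starts at 0.1 (set at epoch 1) and decays
# to 0.01, 0.001, and 0.0002 at epochs 82, 123, and 300; InferenceRunner
# evaluates cost and classification error on the test set each epoch, and
# training runs synchronously over all visible GPUs.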
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('-n', '--num_units',
                        help='number of units in each stage',
                        type=int, default=18)
    parser.add_argument('--load', help='load model')
    args = parser.parse_args()
    NUM_UNITS = args.num_units

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    logger.auto_set_dir()

    dataset_train = get_data('train')
    dataset_test = get_data('test')

    config = TrainConfig(
        model=Model(n=NUM_UNITS),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(dataset_test,
                            [ScalarStats('cost'), ClassificationError()]),
            ScheduledHyperParamSetter('learning_rate',
                                      [(1, 0.1), (82, 0.01), (123, 0.001), (300, 0.0002)])
        ],
        max_epoch=400,
        nr_tower=max(get_nr_gpu(), 1),
        session_init=SaverRestore(args.load) if args.load else None
    )
    SyncMultiGPUTrainerParameterServer(config).train()