resnet_error_example_with_tensorpack.py

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# File: cifar10-resnet.py
# Author: Yuxin Wu <ppwwyyxxc@gmail.com>

import argparse
import os

import tensorflow as tf
from tensorflow.contrib.framework import add_model_variable
from tensorflow.contrib.layers import variance_scaling_initializer
from tensorflow.python.training import moving_averages

from tensorpack import *
from tensorpack.dataflow import dataset
from tensorpack.models.common import layer_register, VariableHolder
from tensorpack.tfutils.summary import *
from tensorpack.tfutils.symbolic_functions import *
from tensorpack.utils.gpu import get_nr_gpu
""" | |
CIFAR10 ResNet example. See: | |
Deep Residual Learning for Image Recognition, arxiv:1512.03385 | |
This implementation uses the variants proposed in: | |
Identity Mappings in Deep Residual Networks, arxiv:1603.05027 | |
I can reproduce the results on 2 TitanX for | |
n=5, about 7.1% val error after 67k steps (20.4 step/s) | |
n=18, about 5.95% val error after 80k steps (5.6 step/s, not converged) | |
n=30: a 182-layer network, about 5.6% val error after 51k steps (3.4 step/s) | |
This model uses the whole training set instead of a train-val split. | |
To train: | |
./cifar10-resnet.py --gpu 0,1 | |
""" | |

BATCH_SIZE = 128
NUM_UNITS = None
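

# Track per-channel running max/min of the activations with exponential moving
# averages, mirroring how BatchNorm maintains its mean/variance EMAs. The
# update ops are added to tf.GraphKeys.UPDATE_OPS so they can be run alongside
# each training step; the function returns its input tensor unchanged.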
def update_ema(xn, moving_max, moving_min, decay):
    batch_max = tf.reduce_max(xn, axis=[0, 1, 2])
    batch_min = tf.reduce_min(xn, axis=[0, 1, 2])
    update_op1 = moving_averages.assign_moving_average(
        moving_max, batch_max, decay, zero_debias=False,
        name='max_ema_op')
    update_op2 = moving_averages.assign_moving_average(
        moving_min, batch_min, decay, zero_debias=False,
        name='min_ema_op')
    # Only add to model variables when we update them.
    add_model_variable(moving_min)
    add_model_variable(moving_max)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op1)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op2)
    return xn
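

# TensorFlow 1.x has no built-in base-2 logarithm op, so compute it by change
# of base: log2(x) = ln(x) / ln(2).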
def log2(x):
    numerator = tf.log(x)
    denominator = tf.log(tf.constant(2, dtype=numerator.dtype))
    return numerator / denominator
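

# RescaleActivationLayer shifts the input by the running minimum and divides by
# 2 ** (log2(moving_max) - bit_a), i.e. multiplies by 2**bit_a / moving_max,
# mapping [min, max] roughly onto [0, 2**bit_a]. Two caveats, either of which
# may be the error this gist reproduces:
#   * tf.constant_initializer() defaults to zero, so log2(moving_max) is -inf
#     until the first EMA update has run;
#   * sizing the EMA variables by in_shape[-1] and reducing over axes
#     [0, 1, 2] assumes channels-last (NHWC) input, but the model below feeds
#     this layer NCHW data, so the statistics are per spatial column rather
#     than per channel.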
@layer_register(log_shape=True)
def RescaleActivationLayer(inputs, decay=0.9, bit_a=8):
    in_shape = inputs.get_shape().as_list()
    moving_max = tf.get_variable('activation_max/EMA', [in_shape[-1]],
                                 initializer=tf.constant_initializer(),
                                 trainable=False)
    moving_min = tf.get_variable('activation_min/EMA', [in_shape[-1]],
                                 initializer=tf.constant_initializer(),
                                 trainable=False)
    named_inputs = tf.identity(inputs, name='rescaling_input_activation')
    xn = (named_inputs - moving_min) / tf.pow(
        tf.constant(2.0), log2(moving_max) - tf.constant(float(bit_a)))
    # Name the output tensor and keep it on the returned path.
    named_xn = tf.identity(xn, name='rescaled_activation')
    ret = update_ema(named_xn, moving_max, moving_min, decay)
    # Expose the EMA variables; note they hold max/min, despite the
    # BatchNorm-style mean/variance attribute names.
    ret.variables = VariableHolder(mean=moving_max, variance=moving_min)
    return ret
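

# The model below follows tensorpack's standard pre-activation CIFAR-10 ResNet
# example (see the file header above), with RescaleActivationLayer inserted
# after the first convolution.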
class Model(ModelDesc):

    def __init__(self, n):
        super(Model, self).__init__()
        self.n = n

    def _get_inputs(self):
        return [InputDesc(tf.float32, [None, 32, 32, 3], 'input'),
                InputDesc(tf.int32, [None], 'label')]

    def _build_graph(self, inputs):
        image, label = inputs
        image = image / 128.0
        assert tf.test.is_gpu_available()
        image = tf.transpose(image, [0, 3, 1, 2])

        def residual(name, l, increase_dim=False, first=False):
            shape = l.get_shape().as_list()
            in_channel = shape[1]

            if increase_dim:
                out_channel = in_channel * 2
                stride1 = 2
            else:
                out_channel = in_channel
                stride1 = 1

            with tf.variable_scope(name):
                b1 = l if first else BNReLU(l)
                c1 = Conv2D('conv1', b1, out_channel, stride=stride1, nl=BNReLU)
                c2 = Conv2D('conv2', c1, out_channel)
                if increase_dim:
                    l = AvgPooling('pool', l, 2)
                    l = tf.pad(l, [[0, 0], [in_channel // 2, in_channel // 2], [0, 0], [0, 0]])
                l = c2 + l
                return l
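
        # When increase_dim is True, the shortcut branch is downsampled with
        # average pooling and zero-padded along the channel axis (axis 1 in
        # NCHW) to double its channels: the parameter-free "option (A)"
        # shortcut from arXiv:1512.03385.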
        with argscope([Conv2D, AvgPooling, BatchNorm, GlobalAvgPooling], data_format='NCHW'), \
                argscope(Conv2D, nl=tf.identity, use_bias=False, kernel_shape=3,
                         W_init=variance_scaling_initializer(mode='FAN_OUT')):
            l = Conv2D('conv0', image, 16, nl=BNReLU)
            l = RescaleActivationLayer('rescale', l)
            l = residual('res1.0', l, first=True)
            for k in range(1, self.n):
                l = residual('res1.{}'.format(k), l)
            # 32x32, c=16
            l = residual('res2.0', l, increase_dim=True)
            for k in range(1, self.n):
                l = residual('res2.{}'.format(k), l)
            # 16x16, c=32
            l = residual('res3.0', l, increase_dim=True)
            for k in range(1, self.n):
                l = residual('res3.{}'.format(k), l)
            l = BNReLU('bnlast', l)
            # 8x8, c=64
            l = GlobalAvgPooling('gap', l)

        logits = FullyConnected('linear', l, out_dim=10, nl=tf.identity)
        prob = tf.nn.softmax(logits, name='output')

        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

        wrong = prediction_incorrect(logits, label)
        # monitor training error
        add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

        # weight decay on every W matched by the regex (conv and fc weights)
        wd_w = tf.train.exponential_decay(0.0002, get_global_step_var(),
                                          480000, 0.2, True)
        wd_cost = tf.multiply(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost')
        add_moving_summary(cost, wd_cost)

        add_param_summary(('.*/W', ['histogram']))  # monitor W
        self.cost = tf.add_n([cost, wd_cost], name='cost')

    def _get_optimizer(self):
        lr = get_scalar_var('learning_rate', 0.01, summary=True)
        opt = tf.train.MomentumOptimizer(lr, 0.9)
        return opt
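

# Build the CIFAR-10 dataflow: for training, pad to 40x40, random-crop back to
# 32x32, flip horizontally, and subtract the per-pixel mean; the test split
# only subtracts the mean. BatchData keeps the last partial batch only at test
# time, and PrefetchData runs the training pipeline in background processes.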
def get_data(train_or_test):
    isTrain = train_or_test == 'train'
    ds = dataset.Cifar10(train_or_test)
    pp_mean = ds.get_per_pixel_mean()
    if isTrain:
        augmentors = [
            imgaug.CenterPaste((40, 40)),
            imgaug.RandomCrop((32, 32)),
            imgaug.Flip(horiz=True),
            imgaug.MapImage(lambda x: x - pp_mean),
        ]
    else:
        augmentors = [
            imgaug.MapImage(lambda x: x - pp_mean)
        ]
    ds = AugmentImageComponent(ds, augmentors)
    ds = BatchData(ds, BATCH_SIZE, remainder=not isTrain)
    if isTrain:
        ds = PrefetchData(ds, 3, 2)
    return ds
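

# Training setup: the learning rate starts at 0.1 (set at epoch 1) and decays
# to 0.01, 0.001, and 0.0002 at epochs 82, 123, and 300; InferenceRunner
# evaluates cost and classification error on the test set each epoch, and
# training runs synchronously over all visible GPUs.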
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('-n', '--num_units',
                        help='number of units in each stage',
                        type=int, default=18)
    parser.add_argument('--load', help='load model')
    args = parser.parse_args()
    NUM_UNITS = args.num_units

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    logger.auto_set_dir()

    dataset_train = get_data('train')
    dataset_test = get_data('test')

    config = TrainConfig(
        model=Model(n=NUM_UNITS),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(dataset_test,
                            [ScalarStats('cost'), ClassificationError()]),
            ScheduledHyperParamSetter('learning_rate',
                                      [(1, 0.1), (82, 0.01), (123, 0.001), (300, 0.0002)])
        ],
        max_epoch=400,
        nr_tower=max(get_nr_gpu(), 1),
        session_init=SaverRestore(args.load) if args.load else None
    )
    SyncMultiGPUTrainerParameterServer(config).train()