@skaae
Last active May 11, 2016 13:02
import numpy as np
import theano
import theano.tensor as T
from theano import ifelse
from .. import init
from .. import nonlinearities
from .base import Layer

__all__ = [
    "BatchNormalizationLayer",
]

class BatchNormalizationLayer(Layer):
    """
    Batch normalization layer [1].

    The user is required to set up updates for the moving averages of the
    batch mean and variance used at inference time (gamma and beta are
    returned by get_params() and trained like any other parameters). The
    values necessary for creating these updates can be obtained by passing
    a dict as the moving_avg_hooks keyword to get_output().

    REF:
        [1] http://arxiv.org/abs/1502.03167

    :parameters:
        - incoming : `Layer` instance
            The layer from which this layer will obtain its input.

        - nonlinearity : callable or None (default: lasagne.nonlinearities.rectify)
            The nonlinearity that is applied to the layer activations. If
            None is provided, the layer will be linear.

        - epsilon : scalar float
            Stabilizes training. Setting this too close to zero will
            result in NaNs.

    :usage:
        >>> import itertools
        >>> from lasagne.layers import InputLayer, BatchNormalizationLayer, DenseLayer
        >>> from lasagne.nonlinearities import linear, rectify
        >>> l_in = InputLayer((100, 20))
        >>> l_dense = DenseLayer(l_in, 50, nonlinearity=linear)
        >>> l_bn = BatchNormalizationLayer(l_dense, nonlinearity=rectify)
        >>> hooks, input, updates = {}, T.matrix(), []
        >>> l_out = l_bn.get_output(
        ...     input, deterministic=False, moving_avg_hooks=hooks)
        >>> mulfac = 1.0/100.0
        >>> batchnormparams = list(itertools.chain(
        ...     *[i[1] for i in hooks['BatchNormalizationLayer:movingavg']]))
        >>> batchnormvalues = list(itertools.chain(
        ...     *[i[0] for i in hooks['BatchNormalizationLayer:movingavg']]))
        >>> for tensor, param in zip(batchnormvalues, batchnormparams):
        ...     updates.append((param, (1.0-mulfac)*param + mulfac*tensor))
        >>> # append these updates to your normal update list
    """
    def __init__(self, incoming,
                 gamma=init.Uniform([0.95, 1.05]),
                 beta=init.Constant(0.),
                 nonlinearity=nonlinearities.rectify,
                 epsilon=0.001,
                 **kwargs):
        super(BatchNormalizationLayer, self).__init__(incoming, **kwargs)
        if nonlinearity is None:
            self.nonlinearity = nonlinearities.identity
        else:
            self.nonlinearity = nonlinearity
        self.num_units = int(np.prod(self.input_shape[1:]))
        self.gamma = self.create_param(gamma, (self.num_units,),
                                       name="BatchNormalizationLayer:gamma")
        self.beta = self.create_param(beta, (self.num_units,),
                                      name="BatchNormalizationLayer:beta")
        self.epsilon = epsilon
        self.mean_inference = theano.shared(
            np.zeros((1, self.num_units), dtype=theano.config.floatX),
            borrow=True,
            broadcastable=(True, False))
        self.mean_inference.name = "shared:mean"
        self.variance_inference = theano.shared(
            np.zeros((1, self.num_units), dtype=theano.config.floatX),
            borrow=True,
            broadcastable=(True, False))
        self.variance_inference.name = "shared:variance"

    def get_params(self):
        return [self.gamma, self.beta]

    def get_output_shape_for(self, input_shape):
        return input_shape

    def get_output_for(self, input, moving_avg_hooks=None,
                       deterministic=False, *args, **kwargs):
        # Flatten inputs with more than 2 dimensions (e.g. conv outputs) to
        # (batch_size, num_units) and remember whether we did, so the output
        # can be reshaped back at the end.
        flattened = False
        if input.ndim > 2:
            output_shape = input.shape
            input = input.flatten(2)
            flattened = True

        if deterministic is False:
            m = T.mean(input, axis=0, keepdims=True)
            v = T.sqrt(T.var(input, axis=0, keepdims=True) + self.epsilon)
            m.name = "tensor:mean"
            v.name = "tensor:variance"

            # Expose the batch statistics and the shared variables holding
            # their moving averages so the user can build update rules.
            key = "BatchNormalizationLayer:movingavg"
            if key not in moving_avg_hooks:
                moving_avg_hooks[key] = []
            moving_avg_hooks[key].append(
                [[m, v], [self.mean_inference, self.variance_inference]])
        else:
            m = self.mean_inference
            v = self.variance_inference

        input_hat = (input - m) / v              # normalize
        y = self.gamma * input_hat + self.beta   # scale and shift

        if flattened:
            y = T.reshape(y, output_shape)
        return self.nonlinearity(y)
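
For reference, this is what the layer computes on a 2-D input when deterministic=False, written as a minimal NumPy sketch; the array names and values are illustrative, not part of the gist:

    import numpy as np

    rng = np.random.RandomState(0)
    x = rng.randn(100, 20).astype('float32')   # (batch_size, num_units)
    gamma = np.ones(20, dtype='float32')       # the layer draws these from Uniform([0.95, 1.05])
    beta = np.zeros(20, dtype='float32')       # the layer initialises these to 0
    epsilon = 0.001

    m = x.mean(axis=0, keepdims=True)                    # batch mean
    v = np.sqrt(x.var(axis=0, keepdims=True) + epsilon)  # sqrt(var + eps), as in the layer
    x_hat = (x - m) / v                                  # normalize
    y = gamma * x_hat + beta                             # scale and shift

Note that the shared variable named "shared:variance" stores this sqrt(var + epsilon) term, i.e. the moving average is kept over the stabilised standard deviation rather than the raw variance.
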
@DediGadot

Hi,

I am trying to use your (excellently written) code above.
When I try to add a BatchNormalizationLayer between two conv layers, the code fails.

The way I'm structuring the network:

    dimshuffle=True

    l_in = layers.InputLayer(
        shape=(batch_size, in_channels, input_width, input_height),
    ) #### =(128,2,51,51)

    l_conv1 = layers.cuda_convnet.Conv2DCCLayer(
        l_in,
        num_filters=64,
        filter_size=(7, 7),
        nonlinearity=nonlinearities.linear,
        W=lasagne.init.Uniform(),
        dimshuffle=dimshuffle,
    )

    l_norm1 = BatchNormalizationLayer(
        l_conv1,
        nonlinearity=nonlinearities.rectify
    )

    l_conv2 = layers.cuda_convnet.Conv2DCCLayer(
        l_norm1,
        num_filters=32,
        filter_size=(5, 5),
        nonlinearity=nonlinearities.linear,
        W=lasagne.init.Uniform(),
        dimshuffle=dimshuffle,
    )
    l_out = lasagne.layers.DenseLayer(
        l_conv2,#l_norm_h3,
        num_units=output_dim,
        nonlinearity=nonlinearities.linear,
        W=lasagne.init.Uniform(),
    )

    return l_out

Then, when creating the update functions, I wrote something similar to your mnist_bn example:

    hooks = {}
    loss_train = objective.get_loss(X_batch, target=y_batch,moving_avg_hooks=hooks)
    loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True)

    batchnormparams = list(itertools.chain(*[i[1] for i in hooks['BatchNormalizationLayer:movingavg']]))
    batchnormvalues = list(itertools.chain(*[i[0] for i in hooks['BatchNormalizationLayer:movingavg']]))

    # create updates
    def batchnormalizeupdates(tensors, params, avglen):
        updates = []
        mulfac = 1.0/avglen
        for tensor, param in zip(tensors, params):
            updates.append((param, (1.0-mulfac)*param + mulfac*tensor))
        return updates

    batchupd = batchnormalizeupdates(batchnormvalues, batchnormparams, 100)

    pred = output_layer.get_output(X_batch, deterministic=True)
    accuracy = T.mean((pred-y_batch)**2, dtype=theano.config.floatX)

    all_params = layers.get_all_params(output_layer)
    ups = updates.nesterov_momentum(
        loss_train, all_params, learning_rate, momentum)

    # add batchnormalize updates
    ups += batchupd

    iter_train = theano.function(
        [batch_index], loss_train,
        updates=ups,
        givens={
            X_batch: dataset['X_train'][batch_slice],
            y_batch: dataset['y_train'][batch_slice],
        },
    )

Yet I am getting this error:


  File "/home/dedigadot/test_las_stacked.py", line 170, in create_iter_functions
    loss_train = objective.get_loss(X_batch, target=y_batch,moving_avg_hooks=hooks)

  File "/home/dedigadot/Envs/env2/local/lib/python2.7/site-packages/Lasagne-0.1dev-py2.7.egg/lasagne/objectives.py", line 91, in get_loss
    network_output = self.input_layer.get_output(input, *args, **kwargs)

  File "/home/dedigadot/Envs/env2/local/lib/python2.7/site-packages/Lasagne-0.1dev-py2.7.egg/lasagne/layers/base.py", line 128, in get_output
    layer_input = self.input_layer.get_output(input, *args, **kwargs)

  File "/home/dedigadot/Envs/env2/local/lib/python2.7/site-packages/Lasagne-0.1dev-py2.7.egg/lasagne/layers/base.py", line 129, in get_output
    return self.get_output_for(layer_input, *args, **kwargs)

  File "/home/dedigadot/Envs/env2/local/lib/python2.7/site-packages/Lasagne-0.1dev-py2.7.egg/lasagne/layers/cuda_convnet.py", line 149, in get_output_for
    input = input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b

  File "/home/dedigadot/Envs/env2/local/lib/python2.7/site-packages/theano/tensor/var.py", line 334, in dimshuffle
    pattern)

  File "/home/dedigadot/Envs/env2/local/lib/python2.7/site-packages/theano/tensor/elemwise.py", line 141, in __init__
    (i, j, len(input_broadcastable)))

ValueError: new_order[1] is 2, but the input only has 2 axes.

Is it possible that the code is not meant to be used with conv-layers?

Thanks,
David

@DediGadot

The bug is in the final reshape of get_output_for: you decide whether to reshape the output back based on input.ndim, but by that point the input has already been flattened to 2-D, so the check never fires for conv inputs.
I added a Boolean variable specifying whether the input was indeed reshaped, and guard the final reshape on that instead.

Cheers,
David
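
For readers hitting the same issue, here is a minimal NumPy sketch of the guarded flatten/reshape path David describes; the shapes and names are illustrative, and the real layer does the equivalent with Theano tensors inside get_output_for:

    import numpy as np

    # e.g. a conv output of shape (batch, channels, rows, cols)
    x = np.random.randn(128, 64, 45, 45).astype('float32')
    epsilon = 0.001

    flattened = False
    if x.ndim > 2:
        output_shape = x.shape
        x = x.reshape(x.shape[0], -1)    # flatten to (batch_size, num_units)
        flattened = True

    m = x.mean(axis=0, keepdims=True)
    v = np.sqrt(x.var(axis=0, keepdims=True) + epsilon)
    y = (x - m) / v                      # gamma/beta omitted for brevity

    if flattened:                        # guard on the flag, not on the (now 2-D) input
        y = y.reshape(output_shape)

    assert y.shape == (128, 64, 45, 45)  # shape restored, so the next conv layer works
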
