Skip to content

Instantly share code, notes, and snippets.

@udibr udibr/gruln.py
Last active Jul 26, 2019

Embed
What would you like to do?
Keras GRU with Layer Normalization
import numpy as np
from keras.layers import GRU, initializations, K
from collections import OrderedDict
class GRULN(GRU):
'''Gated Recurrent Unit with Layer Normalization
Current impelemtation only works with consume_less = 'gpu' which is already
set.
# Arguments
output_dim: dimension of the internal projections and the final output.
...: see GRU documentation for all other arguments.
gamma_init: name of initialization function for scale parameter.
The default is 1, but in some cases this is too high resulting
in NaN loss while training. If this happens try reducing to 0.2
# References
-[Layer Normalization](https://arxiv.org/abs/1607.06450)
'''
def __init__(self, output_dim, gamma_init=1., **kwargs):
if 'consume_less' in kwargs:
assert kwargs['consume_less'] == 'gpu'
else:
kwargs = kwargs.copy()
kwargs['consume_less']='gpu'
super(GRULN, self).__init__(output_dim, **kwargs)
def gamma_init_func(shape, c=gamma_init, **kwargs):
if c == 1.:
return initializations.get('one')(shape, **kwargs)
return K.variable(np.ones(shape) * c, **kwargs)
self.gamma_init = gamma_init_func
self.beta_init = initializations.get('zero')
self.epsilon = 1e-5
def build(self, input_shape):
super(GRULN, self).build(input_shape)
shape = (self.output_dim,)
shape1 = (2*self.output_dim,)
# LN is applied in 4 inputs/outputs (fields) of the cell
gammas = OrderedDict()
betas = OrderedDict()
# each location has its own BN
for slc, shp in zip(['state_below', 'state_belowx', 'preact', 'preactx'], [shape1, shape, shape1, shape]):
gammas[slc] = self.gamma_init(shp,
name='{}_gamma_{}'.format(
self.name, slc))
betas[slc] = self.beta_init(shp,
name='{}_beta_{}'.format(
self.name, slc))
self.gammas = gammas
self.betas = betas
self.trainable_weights += self.gammas.values() + self.betas.values()
def ln(self, x, slc):
# sample-wise normalization
m = K.mean(x, axis=-1, keepdims=True)
std = K.sqrt(K.var(x, axis=-1, keepdims=True) + self.epsilon)
x_normed = (x - m) / (std + self.epsilon)
x_normed = self.gammas[slc] * x_normed + self.betas[slc]
return x_normed
def step(self, x, states):
h_tm1 = states[0] # previous memory
B_U = states[1] # dropout matrices for recurrent units
B_W = states[2]
matrix_x = K.dot(x * B_W[0], self.W) + self.b
x_ = self.ln(matrix_x[:, : 2 * self.output_dim], 'state_below')
xx_ = self.ln(matrix_x[:, 2 * self.output_dim:], 'state_belowx')
matrix_inner = self.ln(K.dot(h_tm1 * B_U[0], self.U[:, :2 * self.output_dim]), 'preact')
x_z = x_[:, :self.output_dim]
x_r = x_[:, self.output_dim: 2 * self.output_dim]
inner_z = matrix_inner[:, :self.output_dim]
inner_r = matrix_inner[:, self.output_dim: 2 * self.output_dim]
z = self.inner_activation(x_z + inner_z)
r = self.inner_activation(x_r + inner_r)
x_h = xx_
inner_h = r * self.ln(K.dot(h_tm1 * B_U[0], self.U[:, 2 * self.output_dim:]), 'preactx')
hh = self.activation(x_h + inner_h)
h = z * h_tm1 + (1 - z) * hh
return h, [h]
if __name__ == '__main__':
from keras.layers import Input
from keras.engine.training import Model
np.random.seed(42)
input = Input(batch_shape=(5, 6, 7), dtype='float32',name='input')
rnn = GRULN(10)
output = rnn(input)
model = Model(input=input, output=output)
model.compile(loss='mse', optimizer='sgd')
data = np.ones((5,6,7), dtype='float32')
probs = model.predict(data,batch_size=5)
print probs.shape,probs.mean()
# (5, 10) 0.0689924
print rnn.trainable_weights
@EdwardRaff

This comment has been minimized.

Copy link

commented Aug 5, 2016

Have you tested this code? I tried it with the Keras MNIST example (used your GRULN here ) and it produces a NaN loss and doesn't learn

@udibr

This comment has been minimized.

Copy link
Owner Author

commented Aug 14, 2016

I also noticed NaN in some cases, this is also discussed on twitter
You can fix this problem by reducing gamma to 0.2 in the updated code.

@rylanchiu

This comment has been minimized.

Copy link

commented Aug 23, 2016

Hi @udibr , I have used your code in standard fully connected neural network here. I have already reduced the gamma to 0.2 as you said. But it still could not learn and the loss are all NaN. Could you please take a look and see what's wrong? Thank you in advance!

@hassyGo

This comment has been minimized.

Copy link

commented Sep 21, 2016

In my experiments on RNNs and FFNs, NNs with layer normalization never outperform NNs without layer normalization.
Does anyone observe positive results using layer normalization?
If so, please share the tips :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.