Skip to content

Instantly share code, notes, and snippets.

Last active Nov 7, 2020
What would you like to do?
Keras GRU with Layer Normalization
import numpy as np
from keras.layers import GRU, initializations, K
from collections import OrderedDict
class GRULN(GRU):
    '''Gated Recurrent Unit with Layer Normalization.

    The current implementation only works with consume_less = 'gpu'. That mode
    is selected automatically when the caller does not pass `consume_less`.

    # Arguments
        output_dim: dimension of the internal projections and the final output.
        gamma_init: float, initial value of every layer-normalization scale
            (gamma) parameter.
            The default is 1, but in some cases this is too high, resulting
            in NaN loss while training. If this happens try reducing to 0.2
        ...: see GRU documentation for all other arguments.

    # References
        - [Layer Normalization](https://arxiv.org/abs/1607.06450)
    '''

    def __init__(self, output_dim, gamma_init=1., **kwargs):
        '''Create the layer and the initializers of the LN parameters.

        Raises an AssertionError when `consume_less` is passed with any value
        other than 'gpu'.
        '''
        if 'consume_less' in kwargs:
            assert kwargs['consume_less'] == 'gpu', \
                "GRULN only supports consume_less='gpu'"
        else:
            # step() multiplies by the fused self.W / self.U / self.b, so the
            # layer has to run in 'gpu' mode; make that the default.
            kwargs = kwargs.copy()
            kwargs['consume_less'] = 'gpu'
        super(GRULN, self).__init__(output_dim, **kwargs)

        def gamma_init_func(shape, c=gamma_init, **kwargs):
            # The stock 'one' initializer covers the default; any other scale
            # is built as a constant-filled backend variable.
            if c == 1.:
                return initializations.get('one')(shape, **kwargs)
            return K.variable(np.ones(shape) * c, **kwargs)

        self.gamma_init = gamma_init_func
        self.beta_init = initializations.get('zero')
        self.epsilon = 1e-5

    def build(self, input_shape):
        '''Build the GRU weights, then add one (gamma, beta) pair per LN site.'''
        super(GRULN, self).build(input_shape)
        shape = (self.output_dim,)
        shape1 = (2 * self.output_dim,)
        # LN is applied in 4 inputs/outputs (fields) of the cell:
        # the two sites with a 2 * output_dim width carry the z and r gates
        # side by side, the other two carry the candidate state.
        sites = OrderedDict([
            ('state_below', shape1),
            ('state_belowx', shape),
            ('preact', shape1),
            ('preactx', shape),
        ])
        gammas = OrderedDict()
        betas = OrderedDict()
        # each location has its own LN parameters
        for slc, shp in sites.items():
            gammas[slc] = self.gamma_init(
                shp, name='{}_gamma_{}'.format(self.name, slc))
            betas[slc] = self.beta_init(
                shp, name='{}_beta_{}'.format(self.name, slc))
        self.gammas = gammas
        self.betas = betas
        # list(...) is required on Python 3, where dict.values() returns a
        # view that does not support `+`.
        self.trainable_weights += (list(self.gammas.values()) +
                                   list(self.betas.values()))

    def ln(self, x, slc):
        '''Layer-normalize `x` over its last axis using the parameters of `slc`.

        `slc` is one of the keys created in build(): 'state_below',
        'state_belowx', 'preact' or 'preactx'.
        '''
        # sample-wise normalization
        m = K.mean(x, axis=-1, keepdims=True)
        std = K.sqrt(K.var(x, axis=-1, keepdims=True) + self.epsilon)
        # NOTE: epsilon enters twice (inside the sqrt above and again in the
        # denominator here); kept as-is so numerical results do not change.
        x_normed = (x - m) / (std + self.epsilon)
        x_normed = self.gammas[slc] * x_normed + self.betas[slc]
        return x_normed

    def step(self, x, states):
        '''Compute one GRU time step with layer-normalized projections.

        `states` is read as [h_tm1, B_U, B_W]. Returns `(h, [h])`, the new
        hidden state as output and as the next state list.
        '''
        h_tm1 = states[0]  # previous memory
        B_U = states[1]  # dropout matrices for recurrent units
        B_W = states[2]

        # Input projection: the first 2 * output_dim columns feed the z and r
        # gates, the remaining columns feed the candidate state. Each part is
        # normalized separately.
        matrix_x = K.dot(x * B_W[0], self.W) + self.b
        x_ = self.ln(matrix_x[:, : 2 * self.output_dim], 'state_below')
        xx_ = self.ln(matrix_x[:, 2 * self.output_dim:], 'state_belowx')

        # Recurrent projection for both gates.
        matrix_inner = self.ln(
            K.dot(h_tm1 * B_U[0], self.U[:, :2 * self.output_dim]), 'preact')

        x_z = x_[:, :self.output_dim]
        x_r = x_[:, self.output_dim: 2 * self.output_dim]
        inner_z = matrix_inner[:, :self.output_dim]
        inner_r = matrix_inner[:, self.output_dim: 2 * self.output_dim]

        z = self.inner_activation(x_z + inner_z)
        r = self.inner_activation(x_r + inner_r)

        x_h = xx_
        # The reset gate multiplies the already-normalized recurrent term.
        inner_h = r * self.ln(
            K.dot(h_tm1 * B_U[0], self.U[:, 2 * self.output_dim:]), 'preactx')
        hh = self.activation(x_h + inner_h)

        h = z * h_tm1 + (1 - z) * hh
        return h, [h]
if __name__ == '__main__':
    # Smoke test: build a tiny model around GRULN, run one prediction and
    # show the output shape/mean plus the layer's trainable weights.
    from keras.layers import Input
    from keras.models import Model

    # `inputs` rather than `input`, to avoid shadowing the builtin.
    inputs = Input(batch_shape=(5, 6, 7), dtype='float32', name='input')
    rnn = GRULN(10)
    output = rnn(inputs)
    model = Model(input=inputs, output=output)
    model.compile(loss='mse', optimizer='sgd')

    data = np.ones((5, 6, 7), dtype='float32')
    probs = model.predict(data, batch_size=5)
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print('{} {}'.format(probs.shape, probs.mean()))
    # (5, 10) 0.0689924
    print(rnn.trainable_weights)
Copy link

EdwardRaff commented Aug 5, 2016

Have you tested this code? I tried it with the Keras MNIST example (using your GRULN here), and it produces a NaN loss and doesn't learn.

Copy link

udibr commented Aug 14, 2016

I also noticed NaN in some cases; this is also discussed on Twitter.
You can fix this problem by reducing `gamma_init` to 0.2 in the updated code.

Copy link

ghost commented Aug 23, 2016

Hi @udibr, I have used your code in a standard fully connected neural network here. I have already reduced gamma to 0.2 as you said, but it still cannot learn and the losses are all NaN. Could you please take a look and see what's wrong? Thank you in advance!

Copy link

hassyGo commented Sep 21, 2016

In my experiments on RNNs and FFNs, NNs with layer normalization never outperform NNs without layer normalization.
Has anyone observed positive results using layer normalization?
If so, please share your tips :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment