Skip to content

Instantly share code, notes, and snippets.

Created November 1, 2016 08:06
  • Star 33 You must be signed in to star a gist
  • Fork 13 You must be signed in to fork a gist
Star You must be signed in to star a gist
What would you like to do?
A keras attention layer that wraps RNN layers.
A keras attention layer that wraps RNN layers.
Based on TensorFlow's `attention_decoder`
and the paper "Grammar as a Foreign Language" (Vinyals et al., arXiv:1412.7449).
date: 20161101
author: wassname
from keras import backend as K
from keras.engine import InputSpec
from keras.layers import LSTM, activations, Wrapper, Recurrent
class Attention(Wrapper):
    """Wrap a recurrent layer so every time step attends over the full input.

    At each step an attention distribution over all time steps of the input
    sequence is computed from the wrapped layer's previous output; the
    attention-weighted input is merged with the current input and the result
    is fed to the wrapped layer's own ``step``.

    # Arguments:
        layer: `Recurrent` instance with consume_less='gpu' or 'mem'

    # Examples:

    ```python
    model = Sequential()
    model.add(LSTM(10, return_sequences=True, batch_input_shape=(4, 5, 10)))
    model.add(Attention(LSTM(10, return_sequences=True, consume_less='gpu')))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    ```

    # References
    - Grammar as a Foreign Language (Vinyals et al.)
    """

    def __init__(self, layer, **kwargs):
        """Validate the wrapped layer and hand it to `Wrapper`.

        # Arguments
            layer: the `Recurrent` instance to wrap.
            **kwargs: forwarded unchanged to `Wrapper.__init__`.

        # Raises
            AssertionError: if `layer` is not a `Recurrent` instance.
            Exception: if the layer was configured with consume_less='cpu'.
        """
        assert isinstance(layer, Recurrent)
        if layer.get_config()['consume_less'] == 'cpu':
            raise Exception("AttentionLSTMWrapper doesn't support RNN's with consume_less='cpu'")
        self.supports_masking = True
        super(Attention, self).__init__(layer, **kwargs)

    def build(self, input_shape):
        """Build the wrapped layer (if needed) and create the attention weights.

        # Arguments
            input_shape: 3-tuple `(nb_samples, nb_time, input_dim)`.
        """
        assert len(input_shape) >= 3
        self.input_spec = [InputSpec(shape=input_shape)]
        nb_samples, nb_time, input_dim = input_shape

        if not self.layer.built:
            # Actually build the wrapped layer before flagging it as built;
            # otherwise its weights would never be created.
            self.layer.build(input_shape)
            self.layer.built = True

        super(Attention, self).build()

        # W1: 1x1 convolution kernel projecting every input time step.
        self.W1 = self.layer.init((input_dim, input_dim, 1, 1),
                                  name='{}_W1'.format(self.name))
        # W2, b2: projection of the previous output into the input space.
        self.W2 = self.layer.init((self.layer.output_dim, input_dim),
                                  name='{}_W2'.format(self.name))
        self.b2 = K.zeros((input_dim,), name='{}_b2'.format(self.name))
        # W3, b3: maps concat(input, attended input) back to input_dim.
        self.W3 = self.layer.init((input_dim * 2, input_dim),
                                  name='{}_W3'.format(self.name))
        self.b3 = K.zeros((input_dim,), name='{}_b3'.format(self.name))
        # V: scoring vector applied to the tanh activations.
        self.V = self.layer.init((input_dim,), name='{}_V'.format(self.name))

        # NOTE(review): this lists only the attention weights; confirm that
        # the wrapped layer's own weights are still collected for training
        # by the Keras version in use.
        self.trainable_weights = [self.W1, self.W2, self.W3, self.V, self.b2, self.b3]

    def get_output_shape_for(self, input_shape):
        """Return the output shape of the wrapped layer for `input_shape`."""
        return self.layer.get_output_shape_for(input_shape)

    def step(self, x, states):
        """Run one attention-augmented step of the wrapped layer.

        First the attention weights over the input sequence are computed:
            attn = softmax(V^T * tanh(W1 * X + W2 * h + b2))
        then the current input is concatenated with the attention-weighted
        inputs and projected back to the input size:
            x = W3 * concat(x, sum_t(attn_t * X_t)) + b3
        and finally the wrapped cell is run on the merged input:
            h, state = cell(x, states)

        # Arguments
            x: input for the current time step.
            states: the wrapped layer's states and constants, followed by the
                two constants appended in `get_constants` (`xW1`, then the
                full input sequence).

        # Returns
            Tuple `(h, new_states)` as returned by the wrapped layer's `step`.
        """
        _, nb_time, input_dim = self.input_spec[0].shape
        h = states[0]
        X = states[-1]      # full input sequence, appended last in get_constants
        xW1 = states[-2]    # precomputed W1 projection of the sequence

        Xr = K.reshape(X, (-1, nb_time, 1, input_dim))
        hW2 = K.dot(h, self.W2) + self.b2
        # Reshape so the projection broadcasts over the time axis of xW1.
        hW2 = K.reshape(hW2, (-1, 1, 1, input_dim))
        u = K.tanh(xW1 + hW2)
        a = K.sum(self.V * u, [2, 3])
        a = K.softmax(a)
        a = K.reshape(a, (-1, nb_time, 1, 1))

        # Weight the input sequence by the attention and sum over time.
        Xa = K.sum(a * Xr, [1, 2])
        Xa = K.reshape(Xa, (-1, input_dim))

        # Merge input and attention weighted inputs into one vector of the right size.
        x = K.dot(K.concatenate([x, Xa], 1), self.W3) + self.b3

        h, new_states = self.layer.step(x, states)
        return h, new_states

    def get_constants(self, x):
        """Return the wrapped layer's constants plus the attention constants.

        Two entries are appended, in this order, because `step` reads them
        from the end of `states`: the W1 projection of the input sequence
        (`states[-2]`) and the input sequence itself (`states[-1]`).
        """
        constants = self.layer.get_constants(x)

        # Compute the W1 projection of the whole sequence only once per
        # sequence by making it a constant. It is applied as a 1x1 convolution.
        # NOTE(review): the permutes assume channels-first conv2d ordering.
        _, nb_time, input_dim = self.input_spec[0].shape
        Xr = K.reshape(x, (-1, nb_time, input_dim, 1))
        Xrt = K.permute_dimensions(Xr, (0, 2, 1, 3))
        xW1t = K.conv2d(Xrt, self.W1, border_mode='same')
        xW1 = K.permute_dimensions(xW1t, (0, 2, 3, 1))
        constants.append(xW1)

        # we need to supply the full sequence of inputs to step (as the attention_vector)
        constants.append(x)

        return constants

    def call(self, x, mask=None):
        """Apply the attention-wrapped recurrent layer to a batch of sequences.

        # Arguments
            x: input tensor, shape (nb_samples, time (padded with zeros), input_dim).
            mask: optional mask forwarded to `K.rnn`.

        # Returns
            The full output sequence if the wrapped layer has
            `return_sequences` set, otherwise only the last output.

        # Raises
            Exception: on the TensorFlow backend when the number of time
                steps is not defined.
        """
        input_shape = self.input_spec[0].shape
        if K._BACKEND == 'tensorflow':
            if not input_shape[1]:
                raise Exception('When using TensorFlow, you should define '
                                'explicitly the number of timesteps of '
                                'your sequences.\n'
                                'If your first layer is an Embedding, '
                                'make sure to pass it an "input_length" '
                                'argument. Otherwise, make sure '
                                'the first layer has '
                                'an "input_shape" or "batch_input_shape" '
                                'argument, including the time axis. '
                                'Found input shape at layer ' + self.name +
                                ': ' + str(input_shape))

        if self.layer.stateful:
            initial_states = self.layer.states
        else:
            initial_states = self.layer.get_initial_states(x)
        constants = self.get_constants(x)
        preprocessed_input = self.layer.preprocess_input(x)

        last_output, outputs, states = K.rnn(self.step, preprocessed_input,
                                             initial_states,
                                             go_backwards=self.layer.go_backwards,
                                             mask=mask,
                                             constants=constants,
                                             unroll=self.layer.unroll,
                                             input_length=input_shape[1])
        if self.layer.stateful:
            # Carry the final states over to the next batch.
            self.updates = list(zip(self.layer.states, states))

        if self.layer.return_sequences:
            return outputs
        return last_output
# Tests, written in the style of the Keras test suite.
import pytest
import numpy as np
from numpy.testing import assert_allclose
from keras.utils.test_utils import keras_test
from keras.layers import wrappers, Input, recurrent, InputLayer
from keras.layers import core, convolutional, recurrent
from keras.models import Sequential, Model, model_from_json
# Smoke tests for the Attention wrapper: each section builds a small model,
# compiles it and fits it for one epoch on random data.
nb_samples, timesteps, embedding_dim, output_dim = 2, 5, 3, 4
embedding_num = 12

x = np.random.random((nb_samples, timesteps, embedding_dim))
y = np.random.random((nb_samples, timesteps, output_dim))

# base line test with LSTM
model = Sequential()
model.add(InputLayer(batch_input_shape=(nb_samples, timesteps, embedding_dim)))
model.add(Attention(recurrent.LSTM(output_dim, input_dim=embedding_dim, return_sequences=True, consume_less='mem')))
model.compile(optimizer='rmsprop', loss='mse')
model.fit(x, y, nb_epoch=1, batch_size=nb_samples)

# test stacked with all RNN layers and consume_less options
model = Sequential()
model.add(InputLayer(batch_input_shape=(nb_samples, timesteps, embedding_dim)))
# test supported consume_less options
# consume_less='cpu' is not supported by the wrapper, so this one stays disabled:
# model.add(Attention(recurrent.LSTM(embedding_dim, input_dim=embedding_dim, consume_less='cpu', return_sequences=True)))
model.add(Attention(recurrent.LSTM(output_dim, input_dim=embedding_dim, consume_less='gpu', return_sequences=True)))
model.add(Attention(recurrent.LSTM(embedding_dim, input_dim=embedding_dim, consume_less='mem', return_sequences=True)))
# test each other RNN type
model.add(Attention(recurrent.GRU(embedding_dim, input_dim=embedding_dim, consume_less='mem', return_sequences=True)))
model.add(Attention(recurrent.SimpleRNN(embedding_dim, input_dim=embedding_dim, consume_less='mem', return_sequences=True)))
model.compile(optimizer='rmsprop', loss='mse')
# NOTE(review): the last layer outputs embedding_dim features while y has
# output_dim features - confirm this target is the intended one.
model.fit(x, y, nb_epoch=1, batch_size=nb_samples)

# test with return_sequence = False
model = Sequential()
model.add(InputLayer(batch_input_shape=(nb_samples, timesteps, embedding_dim)))
model.add(Attention(recurrent.LSTM(output_dim, input_dim=embedding_dim, return_sequences=False, consume_less='mem')))
model.compile(optimizer='rmsprop', loss='mse')
model.fit(x, y[:, -1, :], nb_epoch=1, batch_size=nb_samples)

# with bidirectional encoder
model = Sequential()
model.add(InputLayer(batch_input_shape=(nb_samples, timesteps, embedding_dim)))
model.add(wrappers.Bidirectional(recurrent.LSTM(embedding_dim, input_dim=embedding_dim, return_sequences=True)))
model.add(Attention(recurrent.LSTM(output_dim, input_dim=embedding_dim, return_sequences=True, consume_less='mem')))
model.compile(optimizer='rmsprop', loss='mse')
model.fit(x, y, nb_epoch=1, batch_size=nb_samples)

# test config
model.get_config()

# test to and from json
model = model_from_json(model.to_json(), custom_objects=dict(Attention=Attention))

# test with functional API
inputs = Input(batch_shape=(nb_samples, timesteps, embedding_dim))
outputs = Attention(recurrent.LSTM(output_dim, input_dim=embedding_dim, return_sequences=True, consume_less='mem'))(inputs)
model = Model(inputs, outputs)
model.compile(optimizer='rmsprop', loss='mse')
model.fit(x, y, nb_epoch=1, batch_size=nb_samples)
Copy link

billhsia commented May 18, 2017

Hi wassname
Your wrapper is great, and I would appreciate it if you could tell me which version of Keras you used,
because when I run this script I get the error below:

AttributeError: 'LSTM' object has no attribute 'init'

Copy link

@wassname @billhsia running into the same issue now, any hints?

Copy link

@wassname @billhsia @thomasjungblut
It's because the version of keras in use. You could update the source code by referring to
However, I get another error at line:
x = K.dot(K.concatenate([x,Xa],1),self.W3)+self.b3
with error:
"ValueError: Dimensions must be equal, but are 1800 and 1200 for 'attention_1/MatMul_5' (op: 'MatMul') with input shapes: [?,1800], [1200,600]."
The dimensions cannot match. Could anyone help?

Copy link

xymtxwd commented Jun 16, 2017

Does anyone really get this code running? I met exactly the same issues as others did.

Copy link

philipperemy commented Jun 22, 2017

Thanks for this implementation!

If somebody wants a much easier and more compact implementation of the attention mechanism for RNNs, have a look at:

@xymtxwd @billhsia

Copy link

wassname commented Jul 14, 2017

@billhsia sorry I didn't see this until now, the keras version is in this requirements.txt,

everyone, latest version is here

@philipperemy that's a nice implementation. Is it really that simple, and what's the performance like? If you added some tests to the repo using example data instead of random data, I would definitely use it.

Copy link

@wassname I think the pop index error still persists with TensorFlow.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment