from keras import backend as K
from keras import initializers, regularizers, constraints
from keras.engine.topology import Layer


def dot_product(x, kernel):
    """
    Wrapper for the dot product operation, in order to be compatible with both
    Theano and TensorFlow.
    Args:
        x: input tensor
        kernel: weights
    Returns:
        A tensor with the result of the dot product.
    """
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)
class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention.

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    How to use:
        Just put it on top of an RNN layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.

        Note: The layer has been tested with Keras 2.0.6.

        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(AttentionWithContext())
            # next, add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)
    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight((input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None
    def call(self, x, mask=None):
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)

        # use the dot_product wrapper here as well, so this also works on the TensorFlow backend
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # apply the mask after the exp; the weights will be re-normalized next
        if mask is not None:
            # cast the mask to floatX to avoid float64 upcasting in Theano
            a *= K.cast(mask, K.floatx())

        # in some cases, especially in the early stages of training, the sum may be almost zero
        # and this results in NaNs. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]
I'm having the same problem, actually. I've narrowed it down to a specific cause: for some reason this layer gives the wrong output shape. It should be (None, 100), but as you can see in your table (row 3), the output shape is (None, 500, 100). Will see if I can fix it. EDIT: I found the issue. Custom layers need a compute_output_shape method in Keras 2 (the old get_output_shape_for name from Keras 1 is no longer picked up), otherwise the reduced output shape is not inferred.
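For reference, a minimal sketch of that rename (the Keras 2 method below is the same one defined in the gist above):

# Keras 1.x used:
# def get_output_shape_for(self, input_shape):
#     return input_shape[0], input_shape[-1]

# Keras 2.x expects:
def compute_output_shape(self, input_shape):
    # (samples, steps, features) -> (samples, features)
    return input_shape[0], input_shape[-1]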
Thank you for releasing your code. Have you also implemented the entire Hierarchical Attention Network (HAN), apart from the above attention layer? Any leads on how to get HAN code, preferably in Keras? I have currently found these two Keras implementations: https://github.com/richliao/textClassifier and https://github.com/synthesio/hierarchical-attention-networks/blob/master/model.py
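Not the author, but for anyone looking for a starting point, here is a minimal HAN-style sketch built around the AttentionWithContext layer above. The sizes (vocabulary, embedding dimension, sentence/document lengths, GRU width) are placeholder assumptions, not values from the paper or from this gist:

from keras.layers import Input, Embedding, GRU, Dense, Bidirectional, TimeDistributed
from keras.models import Model

MAX_SENTS, MAX_SENT_LEN = 15, 50      # assumed document/sentence lengths
VOCAB, EMB_DIM = 20000, 100           # assumed vocabulary size and embedding dimension

# word-level encoder: one sentence -> one sentence vector (masking omitted for simplicity)
sent_in = Input(shape=(MAX_SENT_LEN,), dtype='int32')
x = Embedding(VOCAB, EMB_DIM)(sent_in)
x = Bidirectional(GRU(64, return_sequences=True))(x)
sent_vec = AttentionWithContext()(x)
sent_encoder = Model(sent_in, sent_vec)

# sentence-level encoder: one document (a sequence of sentences) -> class probabilities
doc_in = Input(shape=(MAX_SENTS, MAX_SENT_LEN), dtype='int32')
d = TimeDistributed(sent_encoder)(doc_in)
d = Bidirectional(GRU(64, return_sequences=True))(d)
doc_vec = AttentionWithContext()(d)
out = Dense(2, activation='softmax')(doc_vec)

han = Model(doc_in, out)
han.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])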
Need some help with two lines.
@cbaziotis My loss is still NaN despite the small epsilon. Any recommended paths for debugging?
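Not a definitive answer, but two things worth trying (a hedged sketch; the layer sizes, clipnorm value, and learning rate are arbitrary assumptions): make sure padded timesteps are masked so the attention weights are not normalized over garbage steps, and clip gradients with a smaller learning rate, which often stops early-training NaNs.

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.optimizers import Adam

model = Sequential()
model.add(Embedding(20000, 128, mask_zero=True, input_length=100))  # mask_zero masks the padding
model.add(LSTM(64, return_sequences=True))
model.add(AttentionWithContext())              # re-normalizes over the unmasked steps only
model.add(Dense(1, activation='sigmoid'))
# gradient clipping + a lower learning rate
model.compile(optimizer=Adam(lr=1e-4, clipnorm=1.0),
              loss='binary_crossentropy', metrics=['acc'])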
Sorry for not replying sooner, but notifications for gist comments apparently don't work. Regarding some of the errors: the layer was developed using Theano as the backend. I have updated the gist and now it also works with TensorFlow. However, I suggest using Theano, as it has better RNN performance. Please use the new version and let me know. @Helw150 do you mind sharing the code for your model?
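A quick way to check which backend is active (standard Keras API; the backend is configured in ~/.keras/keras.json):

from keras import backend as K
print(K.backend())  # prints 'tensorflow' or 'theano'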
Thank you for this! I'm using it in my school project.
Updated for Keras 2.
Line 93: I had to replace K.dot(uit, self.u) with dot_product(uit, self.u) to get this working on the TensorFlow backend.
@sreiling With that change there is no error, but the model result is different. If I make that change, AttentionWithContext's output dimension is the LSTM's hidden dimension, while compute_output_shape's output dimension is the input's last dimension (the embedding dimension). Is that right?
Really great code @cbaziotis! I've used it several times for classification problems. But I've been wondering: how would one use this in a seq2seq architecture? Many thanks!
L93 was also creating an issue for me with TensorFlow, so I reused the dot_product wrapper there as well: ait = dot_product(uit, self.u).
Thanks so much for this terrific gist (as well as your other work). One minor bug: on line 93, K.dot(uit, self.u) should be dot_product(uit, self.u), otherwise it fails on the TensorFlow backend.
Nice work. Thanks for sharing the code.
Thank you for your code.
Hello everyone, I was wondering: does anyone know how to create an attention layer with a custom (fixed or trainable) context vector? I have tried this: [...]
having also modified some other aspects, such as the [...]
However, when attempting to run [...]
I get an error on the final_merge which says: [...]
and if I comment out those specific lines, I instead get the error: [...]
Any ideas? Thanks.
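Untested, but one way to sketch it: make a variant of the layer that takes a list of two inputs, [sequence, context], and scores the timesteps against the supplied context vector instead of the internal trainable u. The class name and shapes below are my own assumptions layered on top of the gist code, not part of the original:

class AttentionWithExternalContext(Layer):
    """Like AttentionWithContext, but the context vector is passed in as a second input:
    layer([sequence, context]), with sequence of shape (samples, steps, features) and
    context of shape (samples, features). The context can come from a fixed Input
    or from any trainable branch of the model."""

    def __init__(self, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        super(AttentionWithExternalContext, self).__init__(**kwargs)

    def build(self, input_shape):
        seq_shape, ctx_shape = input_shape
        assert len(seq_shape) == 3
        self.W = self.add_weight((seq_shape[-1], seq_shape[-1]),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name))
        super(AttentionWithExternalContext, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return None

    def call(self, inputs, mask=None):
        x, context = inputs
        uit = K.tanh(dot_product(x, self.W))
        # score every timestep against the supplied context vector
        ait = K.sum(uit * K.expand_dims(context, axis=1), axis=-1)
        a = K.exp(ait)
        if mask is not None and mask[0] is not None:
            a *= K.cast(mask[0], K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        return K.sum(x * K.expand_dims(a), axis=1)

    def compute_output_shape(self, input_shape):
        seq_shape, _ = input_shape
        return seq_shape[0], seq_shape[-1]

# usage sketch (shapes are assumptions):
# seq = Bidirectional(GRU(64, return_sequences=True))(embedded)   # (samples, steps, 128)
# ctx = Dense(128)(query_vector)                                  # (samples, 128)
# attended = AttentionWithExternalContext()([seq, ctx])           # (samples, 128)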
Just wondering whether this works with variable-length input?
https://gist.github.com/cbaziotis/7ef97ccf71cbc14366835198c09809d2#gistcomment-2373145 @LeZhengThu This code works with variable-length input, I think. At least it works in my case.
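For what it's worth, a sketch of the usual way to handle variable length with this layer (the lengths, sizes, and toy data below are assumptions): pad the sequences and let the Embedding mask flow through, since the layer supports masking and re-normalizes the weights over the unmasked steps.

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense

sequences = [[1, 5, 9], [2, 7, 4, 3, 8]]                  # toy example: two inputs of different lengths
X = pad_sequences(sequences, maxlen=100, padding='post')  # zero-pad to a common length

model = Sequential()
model.add(Embedding(20000, 128, mask_zero=True, input_length=100))  # mask the padded steps
model.add(GRU(64, return_sequences=True))
model.add(AttentionWithContext())   # masked steps get zero attention weight after re-normalization
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy')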
I'm getting negative attention weights for some words using this. Is this supposed to happen? If so, is there any literature indicating it should? If not, any ideas on how to fix it?
@skywang329 Are you checking the a values or the u values? The attention weights are the a values, and the exponential normally forces the coefficients to be positive.
https://gist.github.com/cbaziotis/7ef97ccf71cbc14366835198c09809d2#gistcomment-2605022
I fixed the error I was getting by replacing [...]
Thanks for your implementation @cbaziotis! I have made some modifications to your code here in order to make it compatible with Keras 2.x and also to make it easy to recover the attention weights for visualization. By the way, have you thought about making a PR for the attention layer to keras-contrib?
inputs = Input(shape=(100,)) [...] I train it on a binary classification problem. My question is: how should I get hold of 'word_scores'? With attention_model = Model(input=model.input, output=model.layers[-2].output) I get the 'sentence' output rather than 'word_scores'. Does anyone know?
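One way to expose the per-timestep weights (the a values) is a variant of the layer that returns them as a second output. This is a sketch of my own (the class name and the return_attention flag are not part of the original gist), so treat it as a starting point:

class AttentionWithContextAndScores(AttentionWithContext):
    """Same computation as AttentionWithContext, but also returns the attention weights,
    so a Model can expose them as an extra output for visualization."""

    def __init__(self, return_attention=True, **kwargs):
        self.return_attention = return_attention
        super(AttentionWithContextAndScores, self).__init__(**kwargs)

    def call(self, x, mask=None):
        uit = dot_product(x, self.W)
        if self.bias:
            uit += self.b
        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)
        a = K.exp(ait)
        if mask is not None:
            a *= K.cast(mask, K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        weighted = K.sum(x * K.expand_dims(a), axis=1)
        if self.return_attention:
            return [weighted, a]          # a has shape (samples, steps)
        return weighted

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        return input_shape[0], input_shape[-1]

    def compute_mask(self, input, input_mask=None):
        return [None, None] if self.return_attention else None

# usage sketch: make the scores a model output, then read them back with predict()
# sent_vec, word_scores = AttentionWithContextAndScores()(rnn_out)
# viz_model = Model(inputs, [prediction, word_scores])
# preds, scores = viz_model.predict(X)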
Where is the context vector computed? I need to output a different sequence length than that of the input.
The attention layer outputs a 2D tensor of shape (None, 256). Any idea how to make it output a 3D tensor without reshaping? I reshaped it to (None, 1, 256), but then my TimeDistributed Dense layers that follow expect (None, 1, 15), and I need them to expect what they are actually receiving, (None, 20, 15), since 20 is my max sentence length. Any ideas?
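If the goal is to keep the time dimension, one hedged tweak (the class name below is my own, not from the gist): return the attention-weighted sequence itself instead of summing it, and adjust compute_output_shape and compute_mask to match. With a max sentence length of 20 and an RNN width of 256 this yields (None, 20, 256), so a TimeDistributed(Dense(15)) on top then produces (None, 20, 15).

class AttentionWeightedSequence(AttentionWithContext):
    """Variant that returns the weighted sequence (samples, steps, features) instead of
    collapsing it to (samples, features), so TimeDistributed layers can follow it."""

    def call(self, x, mask=None):
        uit = dot_product(x, self.W)
        if self.bias:
            uit += self.b
        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)
        a = K.exp(ait)
        if mask is not None:
            a *= K.cast(mask, K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        # keep the steps axis: just scale each timestep by its attention weight
        return x * K.expand_dims(a)

    def compute_output_shape(self, input_shape):
        return input_shape  # (samples, steps, features)

    def compute_mask(self, input, input_mask=None):
        return input_mask   # the sequence is preserved, so keep the mask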
Great work, thanks! I've made some small updates so that the layer works under TensorFlow 1.13 with eager execution (eager execution is awesome; its imperative model makes debugging so much easier).
Will this work for different modalities, e.g. visual and textual?
@LuisPB7 I combine the context and the key into a single tensor as input, then split them inside the Attention class. But that needs some modifications to the Attention code (tensor calculations, input/output shapes, and so on).
Will this work for images?
I am getting this error. Can anyone please help me resolve it? CODE: [...]
@cbaziotis
Thank you so much for the code. I've tried it, but I get an error about the input dimension in the dense layer. Here's my code: [...]
I'm using TensorFlow as the backend, and I've fixed the code using your suggestion in Keras issue #4962, but I'm still getting the error. Here's the output and layer diagram: [...]
And here's the error:
ValueError: Error when checking model target: expected dense_1 to have 3 dimensions, but got array with shape (25000, 2)
I'm using the IMDB sentiment analysis data for this, by the way.
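Hard to say without the full model, but that error usually means the tensor feeding the final Dense is still 3D (samples, steps, features) while the targets are 2D, here (25000, 2). A minimal sketch of a setup whose output matches 2D targets (the vocabulary size, sequence length, and layer widths are assumptions, not taken from the comment above):

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

model = Sequential()
model.add(Embedding(20000, 128, input_length=400))
model.add(LSTM(64, return_sequences=True))   # keep the full sequence for the attention layer
model.add(AttentionWithContext())            # collapses (None, 400, 64) -> (None, 64)
model.add(Dense(2, activation='softmax'))    # now matches targets of shape (25000, 2)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

If model.summary() still shows three dimensions after AttentionWithContext, that points back to the compute_output_shape issue discussed earlier in this thread.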