Text classification (NLP) using Capsule Network (aka CapsNet) layer and GRU/LSTM block

We will look at the advantage of a Capsule layer in text classification.

CapsNet Model

The architecture of our model with CapsNet is very similar to the general text-classification architecture, except for an additional Capsule layer.

Advantage of Capsule Layer in Text Classification

The diagram shows that we have used a Capsule layer instead of a pooling layer. The Capsule layer eliminates the need for forced pooling layers like MaxPool. In many cases this is desirable, because we get translational invariance without losing minute details.

What is the advantage over CNN?

In a CNN, there are pooling layers. We generally use MaxPool, which is a very primitive type of routing mechanism: the most active feature in a local pool (say a 4x4 grid) is routed to the higher layer, and the higher-level detectors don't have a say in the routing. Compare this with the routing-by-agreement mechanism introduced in CapsNet. Only those features that agree with the high-level detectors are routed. This is the advantage of CapsNet over a CNN: it has a superior, dynamic routing mechanism (dynamic because what gets routed where is determined at run time).
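
To make routing-by-agreement concrete, here is a minimal NumPy sketch of routing between a layer of low-level capsules and a layer of high-level capsules. The shapes, variable names, and iteration count are illustrative only; they are not taken from the Keras code below, and the squash here is the same simplified normalization that code uses.

import numpy as np

def softmax_np(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def squash_np(v, eps=1e-7):
    # unit-length normalization, matching the simplified squash used later
    return v / np.sqrt((v ** 2).sum(axis=-1, keepdims=True) + eps)

n_in, n_out, dim_out = 6, 3, 4                 # low-level caps, high-level caps, capsule dim
u_hat = np.random.randn(n_in, n_out, dim_out)  # each low-level capsule's "vote" per high-level capsule

b = np.zeros((n_in, n_out))                    # routing logits start neutral
for _ in range(3):                             # a few routing iterations
    c = softmax_np(b, axis=1)                  # coupling coefficients for each low-level capsule
    v = squash_np((c[..., None] * u_hat).sum(axis=0))  # weighted votes form the high-level capsules
    b = b + (u_hat * v[None]).sum(axis=-1)     # votes that agree with the output get routed more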

It's great to see that improvements from the Computer Vision field are also helping the NLP/NLU field.

# first, Keras imports
from keras import backend as K
from keras.engine import Layer
from keras.layers import (
    Activation,
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    GRU,
    Input,
    SpatialDropout1D,
)
from keras.models import Model

# CapsNet params
LEN_GRU = 128          # units in each direction of the bidirectional GRU
N_ROUTINGS = 5         # dynamic routing iterations
N_CAPS = 10            # number of capsules
DIM_CAPS = 16          # dimensionality of each capsule's output vector
DROPOUT_PROBA = 0.3    # dropout applied to the GRU and before the output layer
DROP_DENSE_RATE = 0.3  # spatial dropout applied to the embeddings

def squash(x, axis=-1):
    """Vector normalization used as the capsule activation.

    Note: this is a simplified variant; it only scales each capsule vector
    to unit length and omits the ||s||^2 / (1 + ||s||^2) factor used by the
    squash in the CapsNet paper.
    """
    squared_norm = K.sum(K.square(x), axis, keepdims=True)
    scale_factor = K.sqrt(squared_norm + K.epsilon())
    return x / scale_factor
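
# For reference, a sketch of the squash as defined in the CapsNet paper
# (Sabour et al., 2017): v = (||s||^2 / (1 + ||s||^2)) * s / ||s||.
# The name `paper_squash` is ours for illustration; the Capsule layer below
# keeps the simplified variant above.
def paper_squash(x, axis=-1):
    squared_norm = K.sum(K.square(x), axis, keepdims=True)
    scale = squared_norm / (1 + squared_norm) / K.sqrt(squared_norm + K.epsilon())
    return x * scale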

class Capsule(Layer):
    """The Capsule layer: routes a feature sequence into a set of capsules."""

    def __init__(
        self,
        n_caps,
        dim_caps,
        n_routings=3,
        activation="default",
        share_weights=True,
        kernel_size=(9, 1),
        **kwargs
    ):
        super(Capsule, self).__init__(**kwargs)
        self.n_caps = n_caps
        self.dim_caps = dim_caps
        self.n_routings = n_routings
        self.share_weights = share_weights
        self.kernel_size = kernel_size
        if activation == "default":
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        super(Capsule, self).build(input_shape)
        input_dim_caps = input_shape[-1]
        # Capsule kernel weights and weights init
        if self.share_weights:
            self.W = self.add_weight(
                name="capsule_kernel",
                shape=(1, input_dim_caps, self.n_caps * self.dim_caps),
                initializer="glorot_uniform",
                trainable=True,
            )
        else:
            input_n_caps = input_shape[-2]
            self.W = self.add_weight(
                name="capsule_kernel",
                shape=(input_n_caps, input_dim_caps, self.n_caps * self.dim_caps),
                initializer="glorot_uniform",
                trainable=True,
            )

    def call(self, u_vecs):
        batch_size = K.shape(u_vecs)[0]
        input_n_caps = K.shape(u_vecs)[1]
        # Prediction vectors u_hat: each input capsule's "vote" for every output capsule
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])
        u_hat_vecs = K.reshape(
            u_hat_vecs, (batch_size, input_n_caps, self.n_caps, self.dim_caps)
        )
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # Routing-by-agreement: h holds the routing logits, c the coupling coefficients
        h = K.zeros_like(u_hat_vecs[:, :, :, 0])
        for i in range(self.n_routings):
            h = K.permute_dimensions(h, (0, 2, 1))
            c = K.softmax(h)
            c = K.permute_dimensions(c, (0, 2, 1))
            h = K.permute_dimensions(h, (0, 2, 1))
            outs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.n_routings - 1:
                # Agreement between outputs and votes updates the routing logits
                h = K.batch_dot(outs, u_hat_vecs, [2, 3])
        return outs

    def compute_output_shape(self, input_shape):
        # Output: (batch, n_caps, dim_caps)
        return (None, self.n_caps, self.dim_caps)
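
# A quick standalone shape check for the Capsule layer (a sketch; the 50x256
# input shape is an illustrative stand-in for "timesteps x BiGRU features",
# not a value used elsewhere in this gist):
#
#   seq_in = Input(shape=(50, 256))
#   caps_out = Capsule(n_caps=N_CAPS, dim_caps=DIM_CAPS, n_routings=N_ROUTINGS)(seq_in)
#   # caps_out maps (batch, timesteps, features) -> (batch, N_CAPS, DIM_CAPS) == (None, 10, 16)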

def build_model(embedding, sequence_length):
    """Build the BiGRU + Capsule text classifier for a binary task."""
    input1 = Input(shape=(sequence_length,))
    # Frozen pre-trained word embeddings
    embedding_layer = Embedding(
        embedding.shape[0], embedding.shape[1], weights=[embedding], trainable=False
    )(input1)
    embedding_layer = SpatialDropout1D(DROP_DENSE_RATE)(embedding_layer)
    # Bidirectional GRU returns the full sequence so the Capsule layer can route it
    x = Bidirectional(
        GRU(
            LEN_GRU,
            activation="relu",
            dropout=DROPOUT_PROBA,
            recurrent_dropout=DROPOUT_PROBA,
            return_sequences=True,
        )
    )(embedding_layer)
    # Capsule layer replaces the usual pooling step
    capsule = Capsule(
        n_caps=N_CAPS, dim_caps=DIM_CAPS, n_routings=N_ROUTINGS, share_weights=True
    )(x)
    capsule = Flatten()(capsule)
    capsule = Dropout(DROPOUT_PROBA)(capsule)
    output = Dense(1, activation="sigmoid")(capsule)
    model = Model(inputs=input1, outputs=output)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model
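
A quick usage sketch: the embedding matrix, vocabulary size, and sequence length below are made-up placeholders, not values from this gist.

import numpy as np

# Hypothetical shapes: a 20k-word vocabulary, 300-d embeddings, 100-token sequences
dummy_embedding = np.random.normal(size=(20000, 300)).astype("float32")
model = build_model(dummy_embedding, sequence_length=100)
model.summary()

# X_train: int-encoded, padded sequences of shape (n_samples, 100)
# y_train: binary labels of shape (n_samples,)
# model.fit(X_train, y_train, batch_size=256, epochs=3, validation_split=0.1)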