Text classification (NLP) using Capsule Network (aka CapsNet) layer and GRU/LSTM block

We will look at the advantage of a Capsule layer in text classification.

CapsNet Model

The architecture of our model with CapsNet is very similar to the general text-classification architecture, except for an additional Capsule layer.

Advantage of Capsule Layer in Text Classification

The diagram shows that we have used a Capsule layer instead of a pooling layer. The Capsule layer eliminates the need for forced pooling layers like MaxPool. In many cases this is desirable, because we get translational invariance without losing minute details.

What is the advantage over CNN?

In a CNN, there are pooling layers. We generally use MaxPool, which is a very primitive type of routing mechanism: the most active feature in a local pool (say a 4x4 grid) is routed to the higher layer, and the higher-level detectors don't have a say in the routing. Compare this with the routing-by-agreement mechanism introduced in CapsNet. Only those features that agree with the high-level detectors are routed. This is the advantage of CapsNet over a CNN: it has a superior, dynamic routing mechanism (dynamic because what gets routed where is determined at run time).
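
To make routing-by-agreement concrete, here is a minimal NumPy sketch of routing between a layer of low-level capsules and a layer of high-level capsules. The shapes, variable names, and iteration count are illustrative only; they are not taken from the Keras code below, and the squash here is the same simplified normalization that code uses.

import numpy as np

def softmax_np(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def squash_np(v, eps=1e-7):
    # unit-length normalization, matching the simplified squash used later
    return v / np.sqrt((v ** 2).sum(axis=-1, keepdims=True) + eps)

n_in, n_out, dim_out = 6, 3, 4                 # low-level caps, high-level caps, capsule dim
u_hat = np.random.randn(n_in, n_out, dim_out)  # each low-level capsule's "vote" per high-level capsule

b = np.zeros((n_in, n_out))                    # routing logits start neutral
for _ in range(3):                             # a few routing iterations
    c = softmax_np(b, axis=1)                  # coupling coefficients for each low-level capsule
    v = squash_np((c[..., None] * u_hat).sum(axis=0))  # weighted votes form the high-level capsules
    b = b + (u_hat * v[None]).sum(axis=-1)     # votes that agree with the output get routed more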

It's great to see that improvements from the Computer Vision field are also helping the NLP/NLU field.

# first, Keras imports
from keras import backend as K
from keras.engine import Layer
from keras.layers import (
    Activation,
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    GRU,
    Input,
    SpatialDropout1D,
)
from keras.models import Model

# CapsNet params
LEN_GRU = 128          # units in each direction of the bidirectional GRU
N_ROUTINGS = 5         # dynamic routing iterations
N_CAPS = 10            # number of capsules
DIM_CAPS = 16          # dimensionality of each capsule's output vector
DROPOUT_PROBA = 0.3    # dropout applied to the GRU and before the output layer
DROP_DENSE_RATE = 0.3  # spatial dropout applied to the embeddings

def squash(x, axis=-1):
    """Vector normalization used as the capsule activation.

    Note: this is a simplified variant; it only scales each capsule vector
    to unit length and omits the ||s||^2 / (1 + ||s||^2) factor used by the
    squash in the CapsNet paper.
    """
    squared_norm = K.sum(K.square(x), axis, keepdims=True)
    scale_factor = K.sqrt(squared_norm + K.epsilon())
    return x / scale_factor
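
# For reference, a sketch of the squash as defined in the CapsNet paper
# (Sabour et al., 2017): v = (||s||^2 / (1 + ||s||^2)) * s / ||s||.
# The name `paper_squash` is ours for illustration; the Capsule layer below
# keeps the simplified variant above.
def paper_squash(x, axis=-1):
    squared_norm = K.sum(K.square(x), axis, keepdims=True)
    scale = squared_norm / (1 + squared_norm) / K.sqrt(squared_norm + K.epsilon())
    return x * scale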

class Capsule(Layer):
    """The Capsule layer: routes a feature sequence into a set of capsules."""

    def __init__(
        self,
        n_caps,
        dim_caps,
        n_routings=3,
        activation="default",
        share_weights=True,
        kernel_size=(9, 1),
        **kwargs
    ):
        super(Capsule, self).__init__(**kwargs)
        self.n_caps = n_caps
        self.dim_caps = dim_caps
        self.n_routings = n_routings
        self.share_weights = share_weights
        self.kernel_size = kernel_size
        if activation == "default":
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        super(Capsule, self).build(input_shape)
        input_dim_caps = input_shape[-1]
        # Capsule kernel weights and weights init
        if self.share_weights:
            self.W = self.add_weight(
                name="capsule_kernel",
                shape=(1, input_dim_caps, self.n_caps * self.dim_caps),
                initializer="glorot_uniform",
                trainable=True,
            )
        else:
            input_n_caps = input_shape[-2]
            self.W = self.add_weight(
                name="capsule_kernel",
                shape=(input_n_caps, input_dim_caps, self.n_caps * self.dim_caps),
                initializer="glorot_uniform",
                trainable=True,
            )

    def call(self, u_vecs):
        batch_size = K.shape(u_vecs)[0]
        input_n_caps = K.shape(u_vecs)[1]
        # Prediction vectors u_hat: each input capsule's "vote" for every output capsule
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])
        u_hat_vecs = K.reshape(
            u_hat_vecs, (batch_size, input_n_caps, self.n_caps, self.dim_caps)
        )
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # Routing-by-agreement: h holds the routing logits, c the coupling coefficients
        h = K.zeros_like(u_hat_vecs[:, :, :, 0])
        for i in range(self.n_routings):
            h = K.permute_dimensions(h, (0, 2, 1))
            c = K.softmax(h)
            c = K.permute_dimensions(c, (0, 2, 1))
            h = K.permute_dimensions(h, (0, 2, 1))
            outs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.n_routings - 1:
                # Agreement between outputs and votes updates the routing logits
                h = K.batch_dot(outs, u_hat_vecs, [2, 3])
        return outs

    def compute_output_shape(self, input_shape):
        # Output: (batch, n_caps, dim_caps)
        return (None, self.n_caps, self.dim_caps)
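
# A quick standalone shape check for the Capsule layer (a sketch; the 50x256
# input shape is an illustrative stand-in for "timesteps x BiGRU features",
# not a value used elsewhere in this gist):
#
#   seq_in = Input(shape=(50, 256))
#   caps_out = Capsule(n_caps=N_CAPS, dim_caps=DIM_CAPS, n_routings=N_ROUTINGS)(seq_in)
#   # caps_out maps (batch, timesteps, features) -> (batch, N_CAPS, DIM_CAPS) == (None, 10, 16)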

def build_model(embedding, sequence_length):
    """Build the BiGRU + Capsule text classifier for a binary task."""
    input1 = Input(shape=(sequence_length,))
    # Frozen pre-trained word embeddings
    embedding_layer = Embedding(
        embedding.shape[0], embedding.shape[1], weights=[embedding], trainable=False
    )(input1)
    embedding_layer = SpatialDropout1D(DROP_DENSE_RATE)(embedding_layer)
    # Bidirectional GRU returns the full sequence so the Capsule layer can route it
    x = Bidirectional(
        GRU(
            LEN_GRU,
            activation="relu",
            dropout=DROPOUT_PROBA,
            recurrent_dropout=DROPOUT_PROBA,
            return_sequences=True,
        )
    )(embedding_layer)
    # Capsule layer replaces the usual pooling step
    capsule = Capsule(
        n_caps=N_CAPS, dim_caps=DIM_CAPS, n_routings=N_ROUTINGS, share_weights=True
    )(x)
    capsule = Flatten()(capsule)
    capsule = Dropout(DROPOUT_PROBA)(capsule)
    output = Dense(1, activation="sigmoid")(capsule)
    model = Model(inputs=input1, outputs=output)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model
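
A quick usage sketch: the embedding matrix, vocabulary size, and sequence length below are made-up placeholders, not values from this gist.

import numpy as np

# Hypothetical shapes: a 20k-word vocabulary, 300-d embeddings, 100-token sequences
dummy_embedding = np.random.normal(size=(20000, 300)).astype("float32")
model = build_model(dummy_embedding, sequence_length=100)
model.summary()

# X_train: int-encoded, padded sequences of shape (n_samples, 100)
# y_train: binary labels of shape (n_samples,)
# model.fit(X_train, y_train, batch_size=256, epochs=3, validation_split=0.1)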