# Lanme/attn.py

## attn.py
# The attention and capsule layers below come from https://github.com/plantsgo
from keras import backend as K
from keras.layers import Layer
from keras import initializers, regularizers, constraints

def dot_product(x, kernel):
    """
    Backend-aware dot product, so that the same call works on both
    Theano and Tensorflow.

    Args:
        x (): input
        kernel (): weights
    Returns:
        The result of `K.dot(x, kernel)`. On the TensorFlow backend the
        kernel is first given an extra trailing axis and that axis is
        squeezed out of the product again.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    # TensorFlow path: expand the kernel, multiply, then drop the extra axis.
    expanded_kernel = K.expand_dims(kernel)
    product = K.dot(x, expanded_kernel)
    return K.squeeze(product, axis=-1)


class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Arguments
        W_regularizer, u_regularizer, b_regularizer: identifiers passed to
            `regularizers.get` for the weights `W`, `u` and `b`.
        W_constraint, u_constraint, b_constraint: identifiers passed to
            `constraints.get` for the weights `W`, `u` and `b`.
        bias: if true, a bias `b` is created and added before the tanh.
        **kwargs: forwarded unchanged to `Layer.__init__`.
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The original layer was reported as tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Run the base constructor first so that nothing assigned below can be
        # overwritten by it.
        # NOTE(review): Keras 2.x `Layer.__init__` is understood to reset
        # `supports_masking` to False -- confirm for the installed version.
        super(AttentionWithContext, self).__init__(**kwargs)

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias

    def build(self, input_shape):
        """Create the weights `W`, `b` (optional) and `u` from the feature size."""
        assert len(input_shape) == 3
        feature_dim = input_shape[-1]

        # `shape` is passed by keyword: on `add_weight` signatures whose first
        # positional parameter is `name`, a positional shape would collide
        # with the `name=` keyword.
        self.W = self.add_weight(shape=(feature_dim, feature_dim),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(feature_dim,),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(shape=(feature_dim,),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        # Hidden representation: tanh(x . W [+ b])
        uit = dot_product(x, self.W)
        if self.bias:
            uit += self.b
        uit = K.tanh(uit)

        # Unnormalised attention scores: exp(uit . u)
        a = K.exp(dot_product(uit, self.u))

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Add a trailing axis so the weights broadcast against `x`.
        a = K.expand_dims(a)
        return K.sum(x * a, axis=1)

    def compute_output_shape(self, input_shape):
        # Axis 1 is summed away in `call`; first and last dimensions remain.
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = {
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'u_regularizer': regularizers.serialize(self.u_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'u_constraint': constraints.serialize(self.u_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(AttentionWithContext, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

## capsule.py
#https://www.kaggle.com/chongjiujjin/capsule-net-with-gru
# A Capsule implementation in pure Keras
def squash(x, axis=-1):
    """
    Divide `x` by its Euclidean norm taken along `axis`.

    `K.epsilon()` is added under the square root, so an all-zero slice does
    not cause a division by zero.

    Note: an earlier variant scaled by sqrt(n) / (0.5 + n), with
    n = sum(x ** 2) + epsilon; it was disabled with the remark that the
    squared norm is really small.
    """
    squared_norm = K.sum(K.square(x), axis, keepdims=True)
    return x / K.sqrt(squared_norm + K.epsilon())

class Capsule(Layer):
    """
    Capsule layer with iterative routing, written against the Keras backend.

    # Arguments
        num_capsule: number of output capsules.
        dim_capsule: length of each output capsule vector.
        routings: number of routing iterations run in `call` (at least 1).
        kernel_size: stored on the layer; not used by the code in this class.
        share_weights: if true, one kernel is shared by all input capsules
            (applied with `K.conv1d`); otherwise one kernel per input capsule
            is created (applied with `K.local_conv1d`).
        activation: 'default' selects the module-level `squash` function; any
            other value is wrapped in a Keras `Activation` layer.
        **kwargs: forwarded unchanged to `Layer.__init__`.
    # Input shape
        3D tensor `(batch, input_num_capsule, input_dim_capsule)`.
    # Output shape
        3D tensor `(batch, num_capsule, dim_capsule)`.
    """

    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        # With zero iterations `call` would never assign its result.
        if routings < 1:
            raise ValueError('routings must be >= 1, got {}'.format(routings))
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        # Keep the raw identifier so `get_config` can report it.
        self._activation_identifier = activation
        if activation == 'default':
            self.activation = squash
        else:
            # Imported here because the module-level imports do not provide
            # `Activation`; without it this branch raised NameError.
            from keras.layers import Activation
            self.activation = Activation(activation)

    def build(self, input_shape):
        """Create the transformation kernel `W`."""
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            # A single kernel (leading dimension 1) shared by every input capsule.
            kernel_shape = (1, input_dim_capsule,
                            self.num_capsule * self.dim_capsule)
        else:
            # One kernel per input capsule.
            input_num_capsule = input_shape[-2]
            kernel_shape = (input_num_capsule, input_dim_capsule,
                            self.num_capsule * self.dim_capsule)
        self.W = self.add_weight(name='capsule_kernel',
                                 shape=kernel_shape,
                                 initializer='glorot_uniform',
                                 trainable=True)

    def call(self, u_vecs):
        """Transform the input capsules and route them to the output capsules."""
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        # NOTE(review): the axes handling of `K.batch_dot` for operands of
        # different rank is understood to differ between Keras releases --
        # confirm the two calls below against the installed version.
        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            # Softmax acts on the last axis, so move num_capsule there and back.
            c = K.softmax(K.permute_dimensions(b, (0, 2, 1)))
            c = K.permute_dimensions(c, (0, 2, 1))
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            # The logits are only needed if another iteration follows.
            if i < self.routings - 1:
                b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        # Carry the incoming batch dimension through instead of forcing None.
        return (input_shape[0], self.num_capsule, self.dim_capsule)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = {
            'num_capsule': self.num_capsule,
            'dim_capsule': self.dim_capsule,
            'routings': self.routings,
            'kernel_size': self.kernel_size,
            'share_weights': self.share_weights,
            'activation': self._activation_identifier,
        }
        base_config = super(Capsule, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
	# The attention and capsule layers below come from https://github.com/plantsgo
	from keras import backend as K
	from keras.layers import Layer
	from keras import initializers, regularizers, constraints

	def dot_product(x, kernel):
		"""
		Backend-aware dot product, so that the same call works on both
		Theano and Tensorflow.

		Args:
			x (): input
			kernel (): weights
		Returns:
			The result of `K.dot(x, kernel)`. On the TensorFlow backend the
			kernel is first given an extra trailing axis and that axis is
			squeezed out of the product again.
		"""
		if K.backend() == 'tensorflow':
			return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
		else:
			return K.dot(x, kernel)


	class AttentionWithContext(Layer):
		"""
		Attention operation, with a context/query vector, for temporal data.
		Supports Masking.
		Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
		"Hierarchical Attention Networks for Document Classification"
		by using a context vector to assist the attention
		# Arguments
			W_regularizer, u_regularizer, b_regularizer: identifiers passed to
				`regularizers.get` for the weights `W`, `u` and `b`.
			W_constraint, u_constraint, b_constraint: identifiers passed to
				`constraints.get` for the weights `W`, `u` and `b`.
			bias: if true, a bias `b` is created and added before the tanh.
			**kwargs: forwarded unchanged to `Layer.__init__`.
		# Input shape
			3D tensor with shape: `(samples, steps, features)`.
		# Output shape
			2D tensor with shape: `(samples, features)`.
		How to use:
		Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
		The dimensions are inferred based on the output shape of the RNN.
		Note: The original layer was reported as tested with Keras 2.0.6
		Example:
			model.add(LSTM(64, return_sequences=True))
			model.add(AttentionWithContext())
			# next add a Dense layer (for classification/regression) or whatever...
		"""

		def __init__(self,
				W_regularizer=None, u_regularizer=None, b_regularizer=None,
				W_constraint=None, u_constraint=None, b_constraint=None,
				bias=True, **kwargs):
			# Run the base constructor first so that nothing assigned below can be
			# overwritten by it.
			# NOTE(review): Keras 2.x `Layer.__init__` is understood to reset
			# `supports_masking` to False -- confirm for the installed version.
			super(AttentionWithContext, self).__init__(**kwargs)

			self.supports_masking = True
			self.init = initializers.get('glorot_uniform')

			self.W_regularizer = regularizers.get(W_regularizer)
			self.u_regularizer = regularizers.get(u_regularizer)
			self.b_regularizer = regularizers.get(b_regularizer)

			self.W_constraint = constraints.get(W_constraint)
			self.u_constraint = constraints.get(u_constraint)
			self.b_constraint = constraints.get(b_constraint)

			self.bias = bias

		def build(self, input_shape):
			"""Create the weights `W`, `b` (optional) and `u` from the feature size."""
			assert len(input_shape) == 3
			feature_dim = input_shape[-1]

			# `shape` is passed by keyword: on `add_weight` signatures whose first
			# positional parameter is `name`, a positional shape would collide
			# with the `name=` keyword.
			self.W = self.add_weight(
				shape=(feature_dim, feature_dim),
				initializer=self.init,
				name='{}_W'.format(self.name),
				regularizer=self.W_regularizer,
				constraint=self.W_constraint)
			if self.bias:
				self.b = self.add_weight(
					shape=(feature_dim,),
					initializer='zeros',
					name='{}_b'.format(self.name),
					regularizer=self.b_regularizer,
					constraint=self.b_constraint)

			self.u = self.add_weight(
				shape=(feature_dim,),
				initializer=self.init,
				name='{}_u'.format(self.name),
				regularizer=self.u_regularizer,
				constraint=self.u_constraint)

			super(AttentionWithContext, self).build(input_shape)

		def compute_mask(self, input, input_mask=None):
			# do not pass the mask to the next layers
			return None

		def call(self, x, mask=None):
			"""Return the attention-weighted sum of `x` over axis 1."""
			# Hidden representation: tanh(x . W [+ b])
			uit = dot_product(x, self.W)
			if self.bias:
				uit += self.b
			uit = K.tanh(uit)

			# Unnormalised attention scores: exp(uit . u)
			a = K.exp(dot_product(uit, self.u))

			# apply mask after the exp. will be re-normalized next
			if mask is not None:
				# Cast the mask to floatX to avoid float64 upcasting in theano
				a *= K.cast(mask, K.floatx())

			# in some cases especially in the early stages of training the sum may be almost zero
			# and this results in NaN's. A workaround is to add a very small positive number to the sum.
			a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

			# Add a trailing axis so the weights broadcast against `x`.
			a = K.expand_dims(a)
			return K.sum(x * a, axis=1)

		def compute_output_shape(self, input_shape):
			# Axis 1 is summed away in `call`; first and last dimensions remain.
			return input_shape[0], input_shape[-1]

		def get_config(self):
			"""Return the constructor arguments so the layer can be re-created."""
			config = {
				'W_regularizer': regularizers.serialize(self.W_regularizer),
				'u_regularizer': regularizers.serialize(self.u_regularizer),
				'b_regularizer': regularizers.serialize(self.b_regularizer),
				'W_constraint': constraints.serialize(self.W_constraint),
				'u_constraint': constraints.serialize(self.u_constraint),
				'b_constraint': constraints.serialize(self.b_constraint),
				'bias': self.bias,
			}
			base_config = super(AttentionWithContext, self).get_config()
			return dict(list(base_config.items()) + list(config.items()))
	#https://www.kaggle.com/chongjiujjin/capsule-net-with-gru
	# A Capsule implementation in pure Keras
	def squash(x, axis=-1):
		"""
		Divide `x` by its Euclidean norm taken along `axis`.

		`K.epsilon()` is added under the square root, so an all-zero slice does
		not cause a division by zero.

		Note: an earlier variant scaled by sqrt(n) / (0.5 + n), with
		n = sum(x ** 2) + epsilon; it was disabled with the remark that the
		squared norm is really small.
		"""
		s_squared_norm = K.sum(K.square(x), axis, keepdims=True)
		scale = K.sqrt(s_squared_norm + K.epsilon())
		return x / scale

	class Capsule(Layer):
		"""
		Capsule layer with iterative routing, written against the Keras backend.

		# Arguments
			num_capsule: number of output capsules.
			dim_capsule: length of each output capsule vector.
			routings: number of routing iterations run in `call` (at least 1).
			kernel_size: stored on the layer; not used by the code in this class.
			share_weights: if true, one kernel is shared by all input capsules
				(applied with `K.conv1d`); otherwise one kernel per input capsule
				is created (applied with `K.local_conv1d`).
			activation: 'default' selects the module-level `squash` function; any
				other value is wrapped in a Keras `Activation` layer.
			**kwargs: forwarded unchanged to `Layer.__init__`.
		# Input shape
			3D tensor `(batch, input_num_capsule, input_dim_capsule)`.
		# Output shape
			3D tensor `(batch, num_capsule, dim_capsule)`.
		"""

		def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
				activation='default', **kwargs):
			super(Capsule, self).__init__(**kwargs)
			# With zero iterations `call` would never assign its result.
			if routings < 1:
				raise ValueError('routings must be >= 1, got {}'.format(routings))
			self.num_capsule = num_capsule
			self.dim_capsule = dim_capsule
			self.routings = routings
			self.kernel_size = kernel_size
			self.share_weights = share_weights
			# Keep the raw identifier so `get_config` can report it.
			self._activation_identifier = activation
			if activation == 'default':
				self.activation = squash
			else:
				# Imported here because the module-level imports do not provide
				# `Activation`; without it this branch raised NameError.
				from keras.layers import Activation
				self.activation = Activation(activation)

		def build(self, input_shape):
			"""Create the transformation kernel `W`."""
			super(Capsule, self).build(input_shape)
			input_dim_capsule = input_shape[-1]
			if self.share_weights:
				# A single kernel (leading dimension 1) shared by every input capsule.
				kernel_shape = (1, input_dim_capsule,
					self.num_capsule * self.dim_capsule)
			else:
				# One kernel per input capsule.
				input_num_capsule = input_shape[-2]
				kernel_shape = (input_num_capsule, input_dim_capsule,
					self.num_capsule * self.dim_capsule)
			self.W = self.add_weight(
				name='capsule_kernel',
				shape=kernel_shape,
				initializer='glorot_uniform',
				trainable=True)

		def call(self, u_vecs):
			"""Transform the input capsules and route them to the output capsules."""
			if self.share_weights:
				u_hat_vecs = K.conv1d(u_vecs, self.W)
			else:
				u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

			batch_size = K.shape(u_vecs)[0]
			input_num_capsule = K.shape(u_vecs)[1]
			u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
				self.num_capsule, self.dim_capsule))
			u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
			# final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

			# NOTE(review): the axes handling of `K.batch_dot` for operands of
			# different rank is understood to differ between Keras releases --
			# confirm the two calls below against the installed version.
			b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
			for i in range(self.routings):
				# Softmax acts on the last axis, so move num_capsule there and back.
				c = K.softmax(K.permute_dimensions(b, (0, 2, 1)))
				c = K.permute_dimensions(c, (0, 2, 1))
				outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
				# The logits are only needed if another iteration follows.
				if i < self.routings - 1:
					b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

			return outputs

		def compute_output_shape(self, input_shape):
			# Carry the incoming batch dimension through instead of forcing None.
			return (input_shape[0], self.num_capsule, self.dim_capsule)

		def get_config(self):
			"""Return the constructor arguments so the layer can be re-created."""
			config = {
				'num_capsule': self.num_capsule,
				'dim_capsule': self.dim_capsule,
				'routings': self.routings,
				'kernel_size': self.kernel_size,
				'share_weights': self.share_weights,
				'activation': self._activation_identifier,
			}
			base_config = super(Capsule, self).get_config()
			return dict(list(base_config.items()) + list(config.items()))