Trung Tran (ChunML) · https://trungtran.io
def call(self, sequence, encoder_output):
    # EMBEDDING AND POSITIONAL EMBEDDING
    embed_out = []
    for i in range(sequence.shape[1]):
        embed = self.embedding(tf.expand_dims(sequence[:, i], axis=1))
        embed_out.append(embed + pes[i, :])

    embed_out = tf.concat(embed_out, axis=1)
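The call() snippets on this page index a precomputed table pes of sinusoidal positional encodings that is not shown here. Below is a minimal sketch of how such a table could be built, following the formula from the Transformer paper; the names positional_encoding, max_length, and MODEL_SIZE are illustrative assumptions, not the author's exact code.

import numpy as np
import tensorflow as tf

def positional_encoding(pos, model_size):
    # One sinusoidal encoding vector for position `pos` (Vaswani et al., 2017)
    PE = np.zeros((1, model_size))
    for i in range(model_size):
        if i % 2 == 0:
            PE[:, i] = np.sin(pos / 10000 ** (i / model_size))
        else:
            PE[:, i] = np.cos(pos / 10000 ** ((i - 1) / model_size))
    return PE

# Hypothetical sizes; pes is the table indexed as pes[i, :] in the call() methods
max_length = 20
MODEL_SIZE = 128
pes = np.concatenate([positional_encoding(i, MODEL_SIZE) for i in range(max_length)], axis=0)
pes = tf.constant(pes, dtype=tf.float32)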
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, model_size, num_layers, h):
        super(Decoder, self).__init__()
        self.model_size = model_size
        self.num_layers = num_layers
        self.h = h
        self.embedding = tf.keras.layers.Embedding(vocab_size, model_size)
        self.attention_bot = [MultiHeadAttention(model_size, h) for _ in range(num_layers)]
        self.attention_bot_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]
        self.attention_mid = [MultiHeadAttention(model_size, h) for _ in range(num_layers)]
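        # The gist preview is truncated at this point. The remaining layers of
        # Decoder.__init__ are not shown; the lines below are a sketch of what they
        # plausibly look like, mirroring the Encoder (the hidden width 512 is an assumption).
        self.attention_mid_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]

        # Position-wise feed-forward network per layer
        self.dense_1 = [tf.keras.layers.Dense(512, activation='relu') for _ in range(num_layers)]
        self.dense_2 = [tf.keras.layers.Dense(model_size) for _ in range(num_layers)]
        self.ffn_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]

        # Final projection back to the vocabulary
        self.dense = tf.keras.layers.Dense(vocab_size)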
def call(self, sequence):
    sub_in = []
    for i in range(sequence.shape[1]):
        # Compute the embedded vector
        embed = self.embedding(tf.expand_dims(sequence[:, i], axis=1))
        # Add positional encoding to the embedded vector
        sub_in.append(embed + pes[i, :])

    # Concatenate the result so that the shape is (batch_size, length, model_size)
    sub_in = tf.concat(sub_in, axis=1)

    # We will have num_layers of (Attention + FFN)
    for i in range(self.num_layers):
        sub_out = []

        # Iterate along the sequence length
        for j in range(sub_in.shape[1]):
            # Compute the context vector towards the whole sequence
            attention = self.attention[i](
                tf.expand_dims(sub_in[:, j, :], axis=1), sub_in)
            sub_out.append(attention)

        # Concatenate the result so that the shape is (batch_size, length, model_size)
        sub_out = tf.concat(sub_out, axis=1)

        # Residual connection
        sub_out = sub_in + sub_out
        # Normalize the output
        sub_out = self.attention_norm[i](sub_out)

        # The FFN input is the output of the Multi-Head Attention
        ffn_in = sub_out

        ffn_out = self.dense_2[i](self.dense_1[i](ffn_in))
        # Add the residual connection
        ffn_out = ffn_in + ffn_out
        # Normalize the output
        ffn_out = self.ffn_norm[i](ffn_out)

        # Assign the FFN output to the next layer's Multi-Head Attention input
        sub_in = ffn_out
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, model_size, num_layers, h):
        super(Encoder, self).__init__()
        self.model_size = model_size
        self.num_layers = num_layers
        self.h = h

        # One Embedding layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, model_size)
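        # The gist preview is truncated here. Judging from the attributes used in
        # the Encoder's call() above (attention, attention_norm, dense_1, dense_2,
        # ffn_norm), the rest of Encoder.__init__ presumably looks roughly like the
        # sketch below (the hidden width 512 is an assumption).
        # num_layers Multi-Head Attention blocks, each with its own normalization
        self.attention = [MultiHeadAttention(model_size, h) for _ in range(num_layers)]
        self.attention_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]

        # num_layers position-wise feed-forward networks, each with its own normalization
        self.dense_1 = [tf.keras.layers.Dense(512, activation='relu') for _ in range(num_layers)]
        self.dense_2 = [tf.keras.layers.Dense(model_size) for _ in range(num_layers)]
        self.ffn_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]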
def call(self, query, value):
    # query has shape (batch, query_len, model_size)
    # value has shape (batch, value_len, model_size)
    heads = []
    for i in range(self.h):
        score = tf.matmul(self.wq[i](query), self.wk[i](value), transpose_b=True)

        # Here we scale the score as described in the paper
        score /= tf.math.sqrt(tf.dtypes.cast(self.key_size, tf.float32))
        # score has shape (batch, query_len, value_len)
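        # The preview stops mid-loop. The lines below are a sketch of the usual
        # continuation, assuming self.wv and self.wo are defined in __init__
        # alongside self.wq and self.wk.
        alignment = tf.nn.softmax(score, axis=2)
        # alignment has shape (batch, query_len, value_len)
        head = tf.matmul(alignment, self.wv[i](value))
        # head has shape (batch, query_len, key_size)
        heads.append(head)

    # Concatenate all the attention heads and project back to model_size
    heads = tf.concat(heads, axis=2)
    heads = self.wo(heads)
    # heads has shape (batch, query_len, model_size)
    return heads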
def call(self, query, value):
    # query has shape (batch, query_len, model_size)
    # value has shape (batch, value_len, model_size)
    score = tf.matmul(query, value, transpose_b=True) / tf.math.sqrt(tf.dtypes.cast(self.key_size, tf.float32))
    # score has shape (batch, query_len, value_len)

    alignment = tf.nn.softmax(score, axis=2)
    # alignment has shape (batch, query_len, value_len)

    context = tf.matmul(alignment, value)
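For reference, here is a minimal self-contained sketch of how this single-head scaled dot-product attention can be wrapped as a Keras layer and exercised with random tensors; the class name DotProductAttention and the wrapper itself are illustrative assumptions, not the author's exact code.

import tensorflow as tf

class DotProductAttention(tf.keras.layers.Layer):
    # Hypothetical wrapper around the single-head call() shown above
    def __init__(self, key_size):
        super(DotProductAttention, self).__init__()
        self.key_size = key_size

    def call(self, query, value):
        # Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V,
        # with `value` also acting as the key
        score = tf.matmul(query, value, transpose_b=True) / tf.math.sqrt(
            tf.dtypes.cast(self.key_size, tf.float32))
        alignment = tf.nn.softmax(score, axis=2)
        context = tf.matmul(alignment, value)
        return context

# Quick shape check
att = DotProductAttention(key_size=64)
q = tf.random.normal((2, 5, 64))   # (batch, query_len, model_size)
v = tf.random.normal((2, 7, 64))   # (batch, value_len, model_size)
print(att(q, v).shape)             # -> (2, 5, 64)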