@tf.function
def train_step(source_seq, target_seq_in, target_seq_out):
    # Forward pass through both models, then backpropagate one step.
    with tf.GradientTape() as tape:
        encoder_output = encoder(source_seq)
        decoder_output = decoder(target_seq_in, encoder_output)
        loss = loss_func(target_seq_out, decoder_output)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss
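
A minimal driver loop sketch for train_step, assuming a tf.data pipeline named dataset that yields (source_seq, target_seq_in, target_seq_out) batches; the dataset name, the Adam settings, and NUM_EPOCHS are assumptions, not part of the original snippets:

optimizer = tf.keras.optimizers.Adam()

NUM_EPOCHS = 10  # assumed value
for epoch in range(NUM_EPOCHS):
    for source_seq, target_seq_in, target_seq_out in dataset:
        loss = train_step(source_seq, target_seq_in, target_seq_out)
    print('Epoch {} loss {:.4f}'.format(epoch + 1, loss.numpy()))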
crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True)

def loss_func(targets, logits):
    # Mask out padding (token id 0) so padded positions add nothing to the loss.
    mask = tf.math.logical_not(tf.math.equal(targets, 0))
    mask = tf.cast(mask, dtype=tf.int64)
    loss = crossentropy(targets, logits, sample_weight=mask)
    return loss
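
A quick check of the masking behavior; the token ids and shapes below are illustrative, with 110 taken from the target vocabulary size in the log further down:

targets = tf.constant([[5, 7, 2, 0, 0]])  # last two positions are padding
logits = tf.random.normal((1, 5, 110))    # (batch, seq_len, vocab)
print(loss_func(targets, logits))         # padded positions contribute zero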
import re
import unicodedata

def unicode_to_ascii(s):
    # Strip accents: decompose characters, then drop combining marks ('Mn').
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

def normalize_string(s):
    s = unicode_to_ascii(s)
    s = re.sub(r'([!.?])', r' \1', s)      # pad punctuation with a space
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)  # keep only letters and . ! ?
    return s
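
A quick sanity check of the normalizer (the input sentence is illustrative):

print(normalize_string('Il a décidé!'))  # -> 'Il a decide !'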
Input vocabulary size 98
Encoder input shape (2, 10)
Encoder output shape (2, 10, 128)
Target vocabulary size 110
Decoder input shape (2, 14)
Decoder output shape (2, 14, 110)
MODEL_SIZE = 128  # matches the (2, 10, 128) encoder output in the log above
H = 2             # number of attention heads
NUM_LAYERS = 2

en_vocab_size = len(en_tokenizer.word_index) + 1
encoder = Encoder(en_vocab_size, MODEL_SIZE, NUM_LAYERS, H)

en_sequence_in = tf.constant([[1, 2, 3, 4, 6, 7, 8, 0, 0, 0],
                              [1, 2, 3, 4, 6, 7, 8, 0, 0, 0]])
encoder_output = encoder(en_sequence_in)
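
A matching decoder-side check, which together with the lines above would produce a log like the one shown earlier. The fr_tokenizer name, the token ids, and the Decoder constructor mirroring the Encoder's are assumptions; the decoder call signature matches its use in train_step:

fr_vocab_size = len(fr_tokenizer.word_index) + 1
decoder = Decoder(fr_vocab_size, MODEL_SIZE, NUM_LAYERS, H)

fr_sequence_in = tf.constant([[14, 24, 36, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                              [14, 24, 36, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
decoder_output = decoder(fr_sequence_in, encoder_output)

print('Target vocabulary size', fr_vocab_size)
print('Decoder input shape', fr_sequence_in.shape)
print('Decoder output shape', decoder_output.shape)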
def call(self, sequence, encoder_output):
    # EMBEDDING AND POSITIONAL EMBEDDING
    # (pes is the precomputed positional-encoding matrix; a sketch follows this block)
    embed_out = []
    for i in range(sequence.shape[1]):
        embed = self.embedding(tf.expand_dims(sequence[:, i], axis=1))
        embed_out.append(embed + pes[i, :])

    embed_out = tf.concat(embed_out, axis=1)

    bot_sub_in = embed_out

    for i in range(self.num_layers):
        # BOTTOM MULTIHEAD SUB LAYER (masked self-attention)
        bot_sub_out = []
        for j in range(bot_sub_in.shape[1]):
            # the value vector must not contain tokens that lie on the right of the current token
            values = bot_sub_in[:, :j+1, :]
            attention = self.attention_bot[i](
                tf.expand_dims(bot_sub_in[:, j, :], axis=1), values)
            bot_sub_out.append(attention)
        bot_sub_out = tf.concat(bot_sub_out, axis=1)

        # residual connection and layer normalization
        bot_sub_out = bot_sub_out + bot_sub_in
        bot_sub_out = self.attention_bot_norm[i](bot_sub_out)

        # MIDDLE MULTIHEAD SUB LAYER (attention over the encoder output)
        mid_sub_in = bot_sub_out
        mid_sub_out = []
        for j in range(mid_sub_in.shape[1]):
            attention = self.attention_mid[i](
                tf.expand_dims(mid_sub_in[:, j, :], axis=1), encoder_output)
            mid_sub_out.append(attention)
        mid_sub_out = tf.concat(mid_sub_out, axis=1)

        # residual connection and layer normalization
        mid_sub_out = mid_sub_out + mid_sub_in
        mid_sub_out = self.attention_mid_norm[i](mid_sub_out)

        # FFN
        ffn_in = mid_sub_out
        ffn_out = self.dense_2[i](self.dense_1[i](ffn_in))
        ffn_out = ffn_out + ffn_in
        ffn_out = self.ffn_norm[i](ffn_out)

        bot_sub_in = ffn_out

    logits = self.dense(ffn_out)
    return logits
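
pes above is a precomputed positional-encoding matrix that these snippets never define. A minimal sketch of one way to build it, using the sinusoidal formula from "Attention Is All You Need"; the positional_encoding helper, max_length, and its value of 14 (the decoder input length above) are assumptions here:

import numpy as np

def positional_encoding(pos, model_size):
    # PE(pos, 2i)   = sin(pos / 10000^(2i / model_size))
    # PE(pos, 2i+1) = cos(pos / 10000^(2i / model_size))
    pe = np.zeros((1, model_size))
    for i in range(model_size):
        if i % 2 == 0:
            pe[:, i] = np.sin(pos / 10000 ** (i / model_size))
        else:
            pe[:, i] = np.cos(pos / 10000 ** ((i - 1) / model_size))
    return pe

max_length = 14  # assumed: the longest target sequence above
pes = np.concatenate([positional_encoding(i, MODEL_SIZE)
                      for i in range(max_length)], axis=0)
pes = tf.constant(pes, dtype=tf.float32)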