{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"execution": {
"iopub.execute_input": "2020-09-27T01:27:46.765993Z",
"iopub.status.busy": "2020-09-27T01:27:46.765450Z",
"iopub.status.idle": "2020-09-27T01:27:46.767350Z",
"shell.execute_reply": "2020-09-27T01:27:46.766844Z"
},
"id": "9SoX0-vd1hue"
},
"outputs": [],
"source": [
"class DecoderLayer(tf.keras.layers.Layer):\n",
"    def __init__(self, d_model, num_heads, dff, rate=0.1):\n",
"        super(DecoderLayer, self).__init__()\n",
"\n",
"        self.mha1 = MultiHeadAttention(d_model, num_heads)\n",
"        self.mha2 = MultiHeadAttention(d_model, num_heads)\n",
"\n",
"        self.ffn = point_wise_feed_forward_network(d_model, dff)\n",
"\n",
"        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n",
"        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n",
"        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n",
"\n",
"        self.dropout1 = tf.keras.layers.Dropout(rate)\n",
"        self.dropout2 = tf.keras.layers.Dropout(rate)\n",
"        self.dropout3 = tf.keras.layers.Dropout(rate)\n",
"\n",
"    def call(self, x, enc_output, training,\n",
"             look_ahead_mask, padding_mask):\n",
"        # enc_output.shape == (batch_size, input_seq_len, d_model)\n",
"\n",
"        '''\n",
"        You generate all the \"queries\", \"keys\", and \"values\" from the same target\n",
"        sentence, and you apply a look-ahead mask to the first multi-head attention\n",
"        block of the decoder.\n",
"        '''\n",
"        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)\n",
"        attn1 = self.dropout1(attn1, training=training)\n",
"        out1 = self.layernorm1(attn1 + x)\n",
"\n",
"        '''\n",
"        Very importantly, you generate the \"keys\" and \"values\" from the encoder\n",
"        outputs, while the \"queries\" come from the output of the first attention\n",
"        block. You apply a normal padding mask to the second multi-head attention\n",
"        block of the decoder.\n",
"        '''\n",
"        attn2, attn_weights_block2 = self.mha2(\n",
"            enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)\n",
"        attn2 = self.dropout2(attn2, training=training)\n",
"        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)\n",
"\n",
"        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)\n",
"        ffn_output = self.dropout3(ffn_output, training=training)\n",
"        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)\n",
"\n",
"        return out3, attn_weights_block1, attn_weights_block2"
]
},
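{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `look_ahead_mask` and `padding_mask` passed into `DecoderLayer.call()` are built elsewhere in this notebook. Below is a minimal, illustrative sketch of what they could look like, following the standard TensorFlow Transformer tutorial formulation; the helper names are assumptions, not this notebook's own definitions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch (assumed helpers, after the TensorFlow Transformer tutorial).\n",
"# A 1 in a mask marks a position to block: it is typically multiplied by -1e9 and\n",
"# added to the attention logits before the softmax.\n",
"\n",
"def create_padding_mask(seq):\n",
"    # Mark positions whose token ID is 0 (the padding token).\n",
"    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)\n",
"    # Shape (batch_size, 1, 1, seq_len), so it broadcasts over heads and query positions.\n",
"    return seq[:, tf.newaxis, tf.newaxis, :]\n",
"\n",
"def create_look_ahead_mask(size):\n",
"    # Strictly upper-triangular matrix: position i may only attend to positions j <= i.\n",
"    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)  # (size, size)"
]
},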
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"execution": {
"iopub.execute_input": "2020-09-27T01:27:47.163368Z",
"iopub.status.busy": "2020-09-27T01:27:47.162774Z",
"iopub.status.idle": "2020-09-27T01:27:47.164447Z",
"shell.execute_reply": "2020-09-27T01:27:47.164782Z"
},
"id": "d5_d5-PLQXwY"
},
"outputs": [],
"source": [
"class Decoder(tf.keras.layers.Layer):\n",
"    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,\n",
"                 maximum_position_encoding, rate=0.1):\n",
"        super(Decoder, self).__init__()\n",
"\n",
"        self.d_model = d_model\n",
"        self.num_layers = num_layers\n",
"\n",
"        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)\n",
"        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)\n",
"\n",
"        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)\n",
"                           for _ in range(num_layers)]\n",
"        self.dropout = tf.keras.layers.Dropout(rate)\n",
"\n",
"    def call(self, x, enc_output, training,\n",
"             look_ahead_mask, padding_mask):\n",
"        seq_len = tf.shape(x)[1]\n",
"\n",
"        attention_weights = {}\n",
"\n",
"        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)\n",
"        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))\n",
"        x += self.pos_encoding[:, :seq_len, :]\n",
"\n",
"        x = self.dropout(x, training=training)\n",
"\n",
"        for i in range(self.num_layers):\n",
"            x, block1, block2 = self.dec_layers[i](x, enc_output, training,\n",
"                                                   look_ahead_mask, padding_mask)\n",
"\n",
"            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1\n",
"            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2\n",
"\n",
"        # x.shape == (batch_size, target_seq_len, d_model)\n",
"        return x, attention_weights"
]
},
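{
"cell_type": "markdown",
"metadata": {},
"source": [
"`positional_encoding()` is defined earlier in this notebook. For reference, here is a minimal sketch of the standard sinusoidal encoding it presumably implements, returning shape `(1, position, d_model)` so that it can be sliced with `[:, :seq_len, :]` as in `Decoder.call()`. It is renamed here so it does not shadow the notebook's real definition."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# Illustrative sketch of the sinusoidal encoding assumed above:\n",
"# PE(pos, 2i) = sin(pos / 10000^(2i/d_model)), PE(pos, 2i+1) = cos(...).\n",
"def positional_encoding_sketch(position, d_model):\n",
"    pos = np.arange(position)[:, np.newaxis]  # (position, 1)\n",
"    i = np.arange(d_model)[np.newaxis, :]     # (1, d_model)\n",
"    angle_rads = pos / np.power(10000, (2 * (i // 2)) / np.float32(d_model))\n",
"    # sin on the even embedding dimensions, cos on the odd ones.\n",
"    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])\n",
"    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])\n",
"    # Leading batch axis: (1, position, d_model).\n",
"    return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)"
]
},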
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"num_layers = 4\n",
"d_model = 128\n",
"dff = 512\n",
"num_heads = 4\n",
"batch_size = 64\n",
"vocab_size = 10000 + 2  # +2 reserves IDs for the start and end tokens"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# You need an encoder output for the decoder.\n",
"sample_encoder = Encoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads,\n",
"                         dff=dff, input_vocab_size=vocab_size,\n",
"                         maximum_position_encoding=10000)\n",
"temp_enc_input = tf.random.uniform((batch_size, 37), dtype=tf.int64, minval=0, maxval=200)\n",
"\n",
"sample_encoder_output = sample_encoder(temp_enc_input, training=False, mask=None)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"sample_decoder = Decoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads,\n",
"                         dff=dff, target_vocab_size=vocab_size,\n",
"                         maximum_position_encoding=10000)\n",
"\n",
"temp_dec_input = tf.random.uniform((batch_size, 39), dtype=tf.int64, minval=0, maxval=200)\n",
"\n",
"output, attn = sample_decoder(temp_dec_input,\n",
"                              enc_output=sample_encoder_output,\n",
"                              training=False,\n",
"                              look_ahead_mask=None,\n",
"                              padding_mask=None)"
]
},
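{
"cell_type": "markdown",
"metadata": {},
"source": [
"The sample call above passes `None` for both masks, which is fine for a shape check. During training you would pass a real look-ahead mask; as an illustrative example, reusing the `create_look_ahead_mask()` sketch from earlier (an assumed helper, not this notebook's own):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative: the same decoder call with an actual look-ahead mask.\n",
"look_ahead = create_look_ahead_mask(tf.shape(temp_dec_input)[1])  # (39, 39)\n",
"\n",
"masked_output, masked_attn = sample_decoder(temp_dec_input,\n",
"                                            enc_output=sample_encoder_output,\n",
"                                            training=False,\n",
"                                            look_ahead_mask=look_ahead,\n",
"                                            padding_mask=None)\n",
"masked_output.shape  # still (64, 39, 128): masking changes values, not shapes"
]
},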
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TensorShape([64, 39, 128])"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''\n",
"You can see that the decoder also gives out outputs shaped like those of the\n",
"encoder: (batch_size, target_seq_len, d_model).\n",
"'''\n",
"output.shape"
]
},
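{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the full Transformer, this `(batch_size, target_seq_len, d_model)` decoder output is projected to vocabulary logits by one final dense layer. A minimal illustrative sketch (the layer name is an assumption):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative: the final projection a full Transformer applies to the decoder output.\n",
"final_layer = tf.keras.layers.Dense(vocab_size)\n",
"logits = final_layer(output)\n",
"logits.shape  # (batch_size, target_seq_len, vocab_size)"
]
}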
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [
"s_qNSzzyaCbD"
],
"name": "transformer.ipynb",
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 1
}