Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@YasuThompson
Created March 17, 2021 12:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save YasuThompson/4b0ff91622cbd8cb38bf24c0b771fd14 to your computer and use it in GitHub Desktop.
Save YasuThompson/4b0ff91622cbd8cb38bf24c0b771fd14 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"execution": {
"iopub.execute_input": "2020-09-27T01:27:46.765993Z",
"iopub.status.busy": "2020-09-27T01:27:46.765450Z",
"iopub.status.idle": "2020-09-27T01:27:46.767350Z",
"shell.execute_reply": "2020-09-27T01:27:46.766844Z"
},
"id": "9SoX0-vd1hue"
},
"outputs": [],
"source": [
"class DecoderLayer(tf.keras.layers.Layer):\n",
" def __init__(self, d_model, num_heads, dff, rate=0.1):\n",
" super(DecoderLayer, self).__init__()\n",
"\n",
" self.mha1 = MultiHeadAttention(d_model, num_heads)\n",
" self.mha2 = MultiHeadAttention(d_model, num_heads)\n",
"\n",
" self.ffn = point_wise_feed_forward_network(d_model, dff)\n",
" \n",
" self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n",
" self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n",
" self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n",
" \n",
" self.dropout1 = tf.keras.layers.Dropout(rate)\n",
" self.dropout2 = tf.keras.layers.Dropout(rate)\n",
" self.dropout3 = tf.keras.layers.Dropout(rate)\n",
" \n",
" \n",
" def call(self, x, enc_output, training, \n",
" look_ahead_mask, padding_mask):\n",
" # enc_output.shape == (batch_size, input_seq_len, d_model)\n",
" \n",
" '''\n",
" You can generate all the \"queries\", \"keys\", and \"values\" from the same target sentence. \n",
" And you apply a look-ahead mask to the first multi-head attention of the decoder part. \n",
" '''\n",
" attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask) # (batch_size, target_seq_len, d_model)\n",
" attn1 = self.dropout1(attn1, training=training)\n",
" out1 = self.layernorm1(attn1 + x)\n",
" \n",
" '''\n",
" Very importantly, you generate the \"keys\" and \"values\" from the outputs of the encoder part, while the \"queries\" come from the decoder's own first sub-layer output. \n",
" You apply a normal padding mask to the second multi-head attention of the decoder part. \n",
" '''\n",
" attn2, attn_weights_block2 = self.mha2(\n",
" enc_output, enc_output, out1, padding_mask) # (batch_size, target_seq_len, d_model)\n",
" attn2 = self.dropout2(attn2, training=training)\n",
" out2 = self.layernorm2(attn2 + out1) # (batch_size, target_seq_len, d_model)\n",
" \n",
" ffn_output = self.ffn(out2) # (batch_size, target_seq_len, d_model)\n",
" ffn_output = self.dropout3(ffn_output, training=training)\n",
" out3 = self.layernorm3(ffn_output + out2) # (batch_size, target_seq_len, d_model)\n",
" \n",
" return out3, attn_weights_block1, attn_weights_block2"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"execution": {
"iopub.execute_input": "2020-09-27T01:27:47.163368Z",
"iopub.status.busy": "2020-09-27T01:27:47.162774Z",
"iopub.status.idle": "2020-09-27T01:27:47.164447Z",
"shell.execute_reply": "2020-09-27T01:27:47.164782Z"
},
"id": "d5_d5-PLQXwY"
},
"outputs": [],
"source": [
"class Decoder(tf.keras.layers.Layer):\n",
" def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,\n",
" maximum_position_encoding, rate=0.1):\n",
" super(Decoder, self).__init__()\n",
"\n",
" self.d_model = d_model\n",
" self.num_layers = num_layers\n",
" \n",
" #self.pre_embedding = tf.keras.layers.Dense(target_vocab_size, 100)\n",
" \n",
" self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)\n",
" self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)\n",
" \n",
" self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) \n",
" for _ in range(num_layers)]\n",
" self.dropout = tf.keras.layers.Dropout(rate)\n",
" \n",
" def call(self, x, enc_output, training, \n",
" look_ahead_mask, padding_mask):\n",
" #print(\"The shape of 'x' is \" + str(tf.shape(x)))\n",
"\n",
" seq_len = tf.shape(x)[1]\n",
" #print(\"'seq_len' is \" + str(seq_len))\n",
" \n",
" attention_weights = {}\n",
" \n",
" #x = self.pre_embedding(x)\n",
"\n",
" x = self.embedding(x) # (batch_size, target_seq_len, d_model)\n",
" x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))\n",
" x += self.pos_encoding[:, :seq_len, :]\n",
" \n",
" x = self.dropout(x, training=training)\n",
"\n",
" for i in range(self.num_layers):\n",
" x, block1, block2 = self.dec_layers[i](x, enc_output, training,\n",
" look_ahead_mask, padding_mask)\n",
" \n",
" attention_weights['decoder_layer{}_block1'.format(i+1)] = block1\n",
" attention_weights['decoder_layer{}_block2'.format(i+1)] = block2\n",
" \n",
" # x.shape == (batch_size, target_seq_len, d_model)\n",
" #print(\"The shape of 'x' is \" + str(tf.shape(x)))\n",
" \n",
" return x, attention_weights"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"num_layers = 4\n",
"d_model = 128\n",
"dff = 512\n",
"num_heads = 4\n",
"batch_size = 64\n",
"vocab_size = 10000 + 2"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# You need an encoder output for the decoder.\n",
"sample_encoder = Encoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, \n",
" dff=dff, input_vocab_size=vocab_size,\n",
" maximum_position_encoding=10000)\n",
"temp_enc_input = tf.random.uniform((64, 37), dtype=tf.int64, minval=0, maxval=200)\n",
"\n",
"sample_encoder_output = sample_encoder(temp_enc_input, training=False, mask=None)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"sample_decoder = Decoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, \n",
" dff=dff, target_vocab_size=vocab_size,\n",
" maximum_position_encoding=10000)\n",
"\n",
"temp_dec_input = tf.random.uniform((64, 39), dtype=tf.int64, minval=0, maxval=200)\n",
"\n",
"output, attn = sample_decoder(temp_dec_input, \n",
" enc_output=sample_encoder_output, \n",
" training=False,\n",
" look_ahead_mask=None, \n",
" padding_mask=None)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TensorShape([64, 39, 128])"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''\n",
"You can see that the decoder also gives out outputs shaped like those of the encoder.\n",
"'''\n",
"output.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [
"s_qNSzzyaCbD"
],
"name": "transformer.ipynb",
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment