{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"execution": {
"iopub.execute_input": "2020-09-27T01:27:46.765993Z",
"iopub.status.busy": "2020-09-27T01:27:46.765450Z",
"iopub.status.idle": "2020-09-27T01:27:46.767350Z",
"shell.execute_reply": "2020-09-27T01:27:46.766844Z"
},
"id": "9SoX0-vd1hue"
},
"outputs": [],
"source": [
"class DecoderLayer(tf.keras.layers.Layer):\n",
"    def __init__(self, d_model, num_heads, dff, rate=0.1):\n",
"        super(DecoderLayer, self).__init__()\n",
"\n",
"        self.mha1 = MultiHeadAttention(d_model, num_heads)\n",
"        self.mha2 = MultiHeadAttention(d_model, num_heads)\n",
"\n",
"        self.ffn = point_wise_feed_forward_network(d_model, dff)\n",
"\n",
"        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n",
"        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n",
"        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n",
"\n",
"        self.dropout1 = tf.keras.layers.Dropout(rate)\n",
"        self.dropout2 = tf.keras.layers.Dropout(rate)\n",
"        self.dropout3 = tf.keras.layers.Dropout(rate)\n",
"\n",
"    def call(self, x, enc_output, training,\n",
"             look_ahead_mask, padding_mask):\n",
"        # enc_output.shape == (batch_size, input_seq_len, d_model)\n",
"\n",
"        '''\n",
"        You generate all the \"queries\", \"keys\", and \"values\" from the same target\n",
"        sentence, and you apply a look-ahead mask to the first multi-head attention\n",
"        block of the decoder.\n",
"        '''\n",
"        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)\n",
"        attn1 = self.dropout1(attn1, training=training)\n",
"        out1 = self.layernorm1(attn1 + x)\n",
"\n",
"        '''\n",
"        Very importantly, you generate the \"keys\" and \"values\" from the encoder\n",
"        outputs, while the \"queries\" come from the output of the first attention\n",
"        block. You apply a normal padding mask to the second multi-head attention\n",
"        block of the decoder.\n",
"        '''\n",
"        attn2, attn_weights_block2 = self.mha2(\n",
"            enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)\n",
"        attn2 = self.dropout2(attn2, training=training)\n",
"        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)\n",
"\n",
"        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)\n",
"        ffn_output = self.dropout3(ffn_output, training=training)\n",
"        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)\n",
"\n",
"        return out3, attn_weights_block1, attn_weights_block2"
]
},
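{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `look_ahead_mask` and `padding_mask` passed into `DecoderLayer.call()` are built elsewhere in this notebook. Below is a minimal, illustrative sketch of what they could look like, following the standard TensorFlow Transformer tutorial formulation; the helper names are assumptions, not this notebook's own definitions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch (assumed helpers, after the TensorFlow Transformer tutorial).\n",
"# A 1 in a mask marks a position to block: it is typically multiplied by -1e9 and\n",
"# added to the attention logits before the softmax.\n",
"\n",
"def create_padding_mask(seq):\n",
"    # Mark positions whose token ID is 0 (the padding token).\n",
"    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)\n",
"    # Shape (batch_size, 1, 1, seq_len), so it broadcasts over heads and query positions.\n",
"    return seq[:, tf.newaxis, tf.newaxis, :]\n",
"\n",
"def create_look_ahead_mask(size):\n",
"    # Strictly upper-triangular matrix: position i may only attend to positions j <= i.\n",
"    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)  # (size, size)"
]
},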
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"execution": {
"iopub.execute_input": "2020-09-27T01:27:47.163368Z",
"iopub.status.busy": "2020-09-27T01:27:47.162774Z",
"iopub.status.idle": "2020-09-27T01:27:47.164447Z",
"shell.execute_reply": "2020-09-27T01:27:47.164782Z"
},
"id": "d5_d5-PLQXwY"
},
"outputs": [],
"source": [
"class Decoder(tf.keras.layers.Layer):\n",
"    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,\n",
"                 maximum_position_encoding, rate=0.1):\n",
"        super(Decoder, self).__init__()\n",
"\n",
"        self.d_model = d_model\n",
"        self.num_layers = num_layers\n",
"\n",
"        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)\n",
"        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)\n",
"\n",
"        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)\n",
"                           for _ in range(num_layers)]\n",
"        self.dropout = tf.keras.layers.Dropout(rate)\n",
"\n",
"    def call(self, x, enc_output, training,\n",
"             look_ahead_mask, padding_mask):\n",
"        seq_len = tf.shape(x)[1]\n",
"\n",
"        attention_weights = {}\n",
"\n",
"        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)\n",
"        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))\n",
"        x += self.pos_encoding[:, :seq_len, :]\n",
"\n",
"        x = self.dropout(x, training=training)\n",
"\n",
"        for i in range(self.num_layers):\n",
"            x, block1, block2 = self.dec_layers[i](x, enc_output, training,\n",
"                                                   look_ahead_mask, padding_mask)\n",
"\n",
"            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1\n",
"            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2\n",
"\n",
"        # x.shape == (batch_size, target_seq_len, d_model)\n",
"        return x, attention_weights"
]
},
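{
"cell_type": "markdown",
"metadata": {},
"source": [
"`positional_encoding()` is defined earlier in this notebook. For reference, here is a minimal sketch of the standard sinusoidal encoding it presumably implements, returning shape `(1, position, d_model)` so that it can be sliced with `[:, :seq_len, :]` as in `Decoder.call()`. It is renamed here so it does not shadow the notebook's real definition."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# Illustrative sketch of the sinusoidal encoding assumed above:\n",
"# PE(pos, 2i) = sin(pos / 10000^(2i/d_model)), PE(pos, 2i+1) = cos(...).\n",
"def positional_encoding_sketch(position, d_model):\n",
"    pos = np.arange(position)[:, np.newaxis]  # (position, 1)\n",
"    i = np.arange(d_model)[np.newaxis, :]     # (1, d_model)\n",
"    angle_rads = pos / np.power(10000, (2 * (i // 2)) / np.float32(d_model))\n",
"    # sin on the even embedding dimensions, cos on the odd ones.\n",
"    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])\n",
"    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])\n",
"    # Leading batch axis: (1, position, d_model).\n",
"    return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)"
]
},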
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"num_layers = 4\n",
"d_model = 128\n",
"dff = 512\n",
"num_heads = 4\n",
"batch_size = 64\n",
"vocab_size = 10000 + 2  # +2 reserves IDs for the start and end tokens"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# You need an encoder output for the decoder.\n",
"sample_encoder = Encoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads,\n",
"                         dff=dff, input_vocab_size=vocab_size,\n",
"                         maximum_position_encoding=10000)\n",
"temp_enc_input = tf.random.uniform((batch_size, 37), dtype=tf.int64, minval=0, maxval=200)\n",
"\n",
"sample_encoder_output = sample_encoder(temp_enc_input, training=False, mask=None)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"sample_decoder = Decoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads,\n",
"                         dff=dff, target_vocab_size=vocab_size,\n",
"                         maximum_position_encoding=10000)\n",
"\n",
"temp_dec_input = tf.random.uniform((batch_size, 39), dtype=tf.int64, minval=0, maxval=200)\n",
"\n",
"output, attn = sample_decoder(temp_dec_input,\n",
"                              enc_output=sample_encoder_output,\n",
"                              training=False,\n",
"                              look_ahead_mask=None,\n",
"                              padding_mask=None)"
]
},
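{
"cell_type": "markdown",
"metadata": {},
"source": [
"The sample call above passes `None` for both masks, which is fine for a shape check. During training you would pass a real look-ahead mask; as an illustrative example, reusing the `create_look_ahead_mask()` sketch from earlier (an assumed helper, not this notebook's own):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative: the same decoder call with an actual look-ahead mask.\n",
"look_ahead = create_look_ahead_mask(tf.shape(temp_dec_input)[1])  # (39, 39)\n",
"\n",
"masked_output, masked_attn = sample_decoder(temp_dec_input,\n",
"                                            enc_output=sample_encoder_output,\n",
"                                            training=False,\n",
"                                            look_ahead_mask=look_ahead,\n",
"                                            padding_mask=None)\n",
"masked_output.shape  # still (64, 39, 128): masking changes values, not shapes"
]
},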
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TensorShape([64, 39, 128])"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''\n",
"You can see that the decoder also gives out outputs shaped like those of the\n",
"encoder: (batch_size, target_seq_len, d_model).\n",
"'''\n",
"output.shape"
]
},
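{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the full Transformer, this `(batch_size, target_seq_len, d_model)` decoder output is projected to vocabulary logits by one final dense layer. A minimal illustrative sketch (the layer name is an assumption):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative: the final projection a full Transformer applies to the decoder output.\n",
"final_layer = tf.keras.layers.Dense(vocab_size)\n",
"logits = final_layer(output)\n",
"logits.shape  # (batch_size, target_seq_len, vocab_size)"
]
}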
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [
"s_qNSzzyaCbD"
],
"name": "transformer.ipynb",
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 1
}