@YasuThompson
Created March 15, 2021 01:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"execution": {
"iopub.execute_input": "2020-09-27T01:27:46.699328Z",
"iopub.status.busy": "2020-09-27T01:27:46.698640Z",
"iopub.status.idle": "2020-09-27T01:27:46.700613Z",
"shell.execute_reply": "2020-09-27T01:27:46.700958Z"
},
"id": "ncyS-Ms3i2x_"
},
"outputs": [],
"source": [
"class EncoderLayer(tf.keras.layers.Layer):\n",
" def __init__(self, d_model, num_heads, dff, rate=0.1):\n",
" super(EncoderLayer, self).__init__()\n",
"\n",
" self.mha = MultiHeadAttention(d_model, num_heads)\n",
" self.ffn = point_wise_feed_forward_network(d_model, dff)\n",
"\n",
" self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n",
" self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n",
" \n",
" self.dropout1 = tf.keras.layers.Dropout(rate)\n",
" self.dropout2 = tf.keras.layers.Dropout(rate)\n",
" \n",
" def call(self, x, training, mask):\n",
" \n",
" '''\n",
" Very importantly, in encoder layers, you generate \"query\", \"key\", and \"value\" from the same input sentences. \n",
" That is why the three inputs of the MultiHeadAttention() class below are all 'x.'\n",
"\n",
" The part 'self.layernorm1(x + attn_output)' means you apply a layer normalization with \n",
" an input through the residual connection. \n",
" \n",
" You should also keep it in mind that the outputs of all the parts have the same shape. \n",
" '''\n",
" \n",
" attn_output, _ = self.mha(x, x, x, mask) # (batch_size, input_seq_len, d_model)\n",
" attn_output = self.dropout1(attn_output, training=training)\n",
" out1 = self.layernorm1(x + attn_output) # (batch_size, input_seq_len, d_model)\n",
" \n",
" ffn_output = self.ffn(out1) # (batch_size, input_seq_len, d_model)\n",
" ffn_output = self.dropout2(ffn_output, training=training)\n",
" out2 = self.layernorm2(out1 + ffn_output) # (batch_size, input_seq_len, d_model)\n",
" \n",
" return out2\n"
]
},
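{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"EncoderLayer() above relies on MultiHeadAttention() and point_wise_feed_forward_network(), both of which\n",
"are defined in the earlier articles of this series. In case you are reading this gist on its own, the cell\n",
"below is a minimal sketch of the point-wise feed-forward network, assuming the usual two-layer form\n",
"Dense(dff, ReLU) followed by Dense(d_model) from the original paper.\n",
"'''\n",
"\n",
"import tensorflow as tf\n",
"\n",
"def point_wise_feed_forward_network(d_model, dff):\n",
"    # Applied to every position independently: expand to dff, then project back to d_model.\n",
"    return tf.keras.Sequential([\n",
"        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)\n",
"        tf.keras.layers.Dense(d_model)                  # (batch_size, seq_len, d_model)\n",
"    ])"
]
},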
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(64, 37, 128)\n"
]
}
],
"source": [
"num_layers = 4\n",
"d_model = 128\n",
"dff = 512\n",
"num_heads = 4\n",
"batch_size = 64\n",
"vocab_size = 10000 + 2\n",
"\n",
"sample_encoder_layer = EncoderLayer(d_model, num_heads, dff)\n",
"\n",
"'''\n",
"Let the maximum length of sentences be 37 . \n",
"In this case, a sentence is nodenoted as a matrix with the size of (37, d_model=128). \n",
"'''\n",
"sample_input = tf.random.uniform((batch_size, 37, d_model))\n",
"sample_encoder_layer_output = sample_encoder_layer(sample_input, False, None)\n",
"\n",
"print(sample_encoder_layer_output.shape) # (batch_size, input_seq_len, d_model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class Encoder(tf.keras.layers.Layer):\n",
" def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,\n",
" maximum_position_encoding, rate=0.1):\n",
" super(Encoder, self).__init__()\n",
"\n",
" self.d_model = d_model\n",
" self.num_layers = num_layers\n",
" \n",
" self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)\n",
" self.pos_encoding = positional_encoding(maximum_position_encoding, \n",
" self.d_model)\n",
" \n",
" \n",
" self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) \n",
" for _ in range(num_layers)]\n",
" \n",
" self.dropout = tf.keras.layers.Dropout(rate)\n",
" \n",
" def call(self, x, training, mask):\n",
"\n",
" seq_len = tf.shape(x)[1]\n",
"\n",
" '''\n",
" Fisrst you convert integers which denote words into d_model dimensional vectors\n",
" with an embedding layer, as I explained in the first article. \n",
" \n",
" I don't know why, but you multiply the embeddnig layer by √d_model, according to the original paper. \n",
" \n",
" You just add positional encodng to the input x, depending on the length of input sentences so that \n",
" Transformer can learn relative and definite positions of input tokens, as I explained in the last article.\n",
" That is equal to cropping the heat map in the last article and adding it to the each (input_seq_len, d_model)\n",
" sized matrix. \n",
" \n",
" You also apply a dropout to mitigate overfitting. \n",
" '''\n",
" \n",
" x = self.embedding(x) # (batch_size, input_seq_len, d_model)\n",
" x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))\n",
" x += self.pos_encoding[:, :seq_len, :]\n",
" x = self.dropout(x, training=training)\n",
" \n",
" \n",
" '''\n",
" You put the input through all the encoder layers in the loop below. \n",
" After each loop, you can keep the shape (batch_ size, input_seq_len, d_model). \n",
" '''\n",
" \n",
" for i in range(self.num_layers):\n",
" x = self.enc_layers[i](x, training, mask)\n",
" \n",
" return x #(batch_ size, input_seq_len, d_model)"
]
},
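{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"Encoder() above calls positional_encoding(), which was built in the last article. In case this gist is\n",
"read on its own, the cell below is a minimal sketch of the sinusoidal positional encoding from the\n",
"original paper: sine on even dimensions, cosine on odd dimensions, returned with shape\n",
"(1, position, d_model) so it can be cropped to the current sequence length.\n",
"'''\n",
"\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"\n",
"def get_angles(pos, i, d_model):\n",
"    # pos: (position, 1), i: (1, d_model) -> angle table of shape (position, d_model).\n",
"    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))\n",
"    return pos * angle_rates\n",
"\n",
"def positional_encoding(position, d_model):\n",
"    angle_rads = get_angles(np.arange(position)[:, np.newaxis],\n",
"                            np.arange(d_model)[np.newaxis, :],\n",
"                            d_model)\n",
"    # Apply sin to even indices (2i) and cos to odd indices (2i+1).\n",
"    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])\n",
"    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])\n",
"    pos_encoding = angle_rads[np.newaxis, ...]  # (1, position, d_model)\n",
"    return tf.cast(pos_encoding, dtype=tf.float32)"
]
},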
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(64, 37, 128)\n"
]
}
],
"source": [
"sample_encoder = Encoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, \n",
" dff=dff, input_vocab_size=vocab_size,\n",
" maximum_position_encoding=10000)\n",
"temp_input = tf.random.uniform((64, 37), dtype=tf.int64, minval=0, maxval=200)\n",
"\n",
"sample_encoder_output = sample_encoder(temp_input, training=False, mask=None)\n",
"\n",
"'''\n",
"You can see that the shape of the output of the Encoder() class is the same as that of the\n",
"EncoderLayer() class. \n",
"\n",
"In this case, all the input sentences are denoded as a matrix with a size of (37, d_model=128), \n",
"And Transformer model keeps converting input sentences, layer by layer, keeping its original \n",
"shape at the end of each layer. \n",
"'''\n",
"\n",
"print (sample_encoder_output.shape) # (batch_size, input_seq_len, d_model)"
]
}
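,
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"In the sample runs above, mask=None. In practice the encoder also receives a padding mask so that\n",
"attention ignores the padded positions of shorter sentences. The cell below is a minimal sketch,\n",
"assuming the usual convention that padding tokens have id 0 and that MultiHeadAttention() from the\n",
"earlier articles broadcasts this mask over the attention logits.\n",
"'''\n",
"\n",
"def create_padding_mask(seq):\n",
"    # 1.0 where the token id is 0 (padding), 0.0 elsewhere.\n",
"    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)\n",
"    # Add extra dimensions so the mask broadcasts over the attention logits.\n",
"    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)\n",
"\n",
"sample_mask = create_padding_mask(temp_input)\n",
"print(sample_mask.shape)  # (64, 1, 1, 37)"
]
}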
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [
"s_qNSzzyaCbD"
],
"name": "transformer.ipynb",
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 1
}