{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dental-monster",
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"import time\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.ticker as ticker\n",
"from sklearn.model_selection import train_test_split\n",
"import unicodedata\n",
"import re\n",
"import os\n",
"import io\n",
"import time\n",
"from bpemb import BPEmb"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "heated-audience",
"metadata": {},
"outputs": [],
"source": [
"bpemb_de = BPEmb(lang='de', vs=10000, dim=100)\n",
"bpemb_en = BPEmb(lang='en', vs=10000, dim=100)\n",
"\n",
"# From the corpus, you extract only the texts necessary. \n",
"path_to_file = \"../datasets/deu.txt\"\n",
"lines = io.open(path_to_file, encoding='UTF-8').read().strip().split('\\n')\n",
"temp_list = []\n",
"corpus = []\n",
"for i in range(len(lines)):\n",
" temp_list = lines[i].split('\\t')[:-1]\n",
" corpus.append(temp_list)\n",
"en, de = np.array(corpus).T\n",
"\n",
"en_encoded = []\n",
"de_encoded = []\n",
"\n",
"cnt_en = 0\n",
"cnt_de = 0\n",
"\n",
"# You encode the sentences with fewer than 40 words in each row into \n",
"# a list of integers, and you append [10000] (<start>) and [10001] (<end>) \n",
"# at the beginning and the end of each sentence. \n",
"for i in range(len(en)):\n",
" en_encoded_temp = bpemb_en.encode_ids(en[i])\n",
" de_encoded_temp = bpemb_de.encode_ids(de[i])\n",
" \n",
" if (len(en_encoded_temp)<=40) and (len(de_encoded_temp)<=40):\n",
" en_encoded.append([10000] + en_encoded_temp + [10001])\n",
" de_encoded.append([10000] + de_encoded_temp + [10001])\n",
"\n",
"# Zero padding the encoded corpus. \n",
"en_padded = tf.keras.preprocessing.sequence.pad_sequences(en_encoded, padding='post')\n",
"de_padded = tf.keras.preprocessing.sequence.pad_sequences(de_encoded, padding='post')"
]
},
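{
"cell_type": "code",
"execution_count": null,
"id": "hypothetical-bpe-demo",
"metadata": {},
"outputs": [],
"source": [
"# (Added illustration, not in the original notebook.) A quick sanity check \n",
"# of the BPE round trip: encode_ids() maps a sentence to subword ids in \n",
"# [0, 10000), so the reserved ids 10000 (<start>) and 10001 (<end>) cannot \n",
"# collide with real subwords. The sample sentence is arbitrary. \n",
"sample = \"I would like a cup of coffee.\"\n",
"ids = bpemb_en.encode_ids(sample)\n",
"print(ids)                       # plain subword ids, all < 10000\n",
"print(bpemb_en.decode_ids(ids))  # reproduces the (lowercased) sentence\n",
"print([10000] + ids + [10001])   # the form appended to en_encoded"
]
},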
{
"cell_type": "code",
"execution_count": 3,
"id": "contemporary-tyler",
"metadata": {},
"outputs": [],
"source": [
"# Splitting the corpus into traiing and validaiton datasets. \n",
"input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(en_padded, de_padded, test_size=0.2)"
]
},
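{
"cell_type": "code",
"execution_count": null,
"id": "hypothetical-split-check",
"metadata": {},
"outputs": [],
"source": [
"# (Added illustration, not in the original notebook.) With test_size=0.2, \n",
"# train_test_split holds out 20% of the sentence pairs for validation; \n",
"# a quick check that the proportions come out as expected. \n",
"n_train, n_val = len(input_tensor_train), len(input_tensor_val)\n",
"print(n_train, n_val, n_val / (n_train + n_val))  # expect roughly 0.2"
]
},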
{
"cell_type": "code",
"execution_count": 4,
"id": "departmental-stamp",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((179501, 41), (44876, 41), (179501, 42), (44876, 42))"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# (data set size, length of the longest sentence)\n",
"input_tensor_train.shape, input_tensor_val.shape, target_tensor_train.shape, target_tensor_val .shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "honey-hayes",
"metadata": {},
"outputs": [],
"source": [
"BUFFER_SIZE = len(input_tensor_train)\n",
"BATCH_SIZE = 64\n",
"steps_per_epoch = len(input_tensor_train)//BATCH_SIZE\n",
"embedding_dim = 256\n",
"units = 1024\n",
"vocab_inp_size = 10000 + 2\n",
"vocab_tar_size = 10000 + 2\n",
"\n",
"# You get an iterator for training the network. \n",
"dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)\n",
"dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)"
]
},
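{
"cell_type": "code",
"execution_count": null,
"id": "hypothetical-dataset-spec",
"metadata": {},
"outputs": [],
"source": [
"# (Added illustration, not in the original notebook.) element_spec shows the \n",
"# fixed (BATCH_SIZE, seq_len) shapes that drop_remainder=True guarantees; \n",
"# the final partial batch is discarded, so the number of batches per epoch \n",
"# equals steps_per_epoch. \n",
"print(dataset.element_spec)\n",
"print(tf.data.experimental.cardinality(dataset).numpy(), steps_per_epoch)"
]
},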
{
"cell_type": "code",
"execution_count": 6,
"id": "cardiac-hardware",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(<tf.Tensor: shape=(64, 41), dtype=int32, numpy=\n",
" array([[10000, 5451, 1616, ..., 0, 0, 0],\n",
" [10000, 386, 2689, ..., 0, 0, 0],\n",
" [10000, 2088, 6838, ..., 0, 0, 0],\n",
" ...,\n",
" [10000, 386, 9937, ..., 0, 0, 0],\n",
" [10000, 391, 73, ..., 0, 0, 0],\n",
" [10000, 509, 536, ..., 0, 0, 0]], dtype=int32)>,\n",
" <tf.Tensor: shape=(64, 42), dtype=int32, numpy=\n",
" array([[10000, 153, 83, ..., 0, 0, 0],\n",
" [10000, 3077, 5, ..., 0, 0, 0],\n",
" [10000, 4104, 284, ..., 0, 0, 0],\n",
" ...,\n",
" [10000, 3077, 6331, ..., 0, 0, 0],\n",
" [10000, 19, 115, ..., 0, 0, 0],\n",
" [10000, 249, 1503, ..., 0, 0, 0]], dtype=int32)>)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A sample batch. \n",
"sample_data_pair = next(iter(dataset))\n",
"sample_data_pair"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "previous-opportunity",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tf.Tensor(\n",
"[10000 5451 1616 9937 9915 1220 4451 352 42 3687 756 6110\n",
" 9967 10001 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0], shape=(41,), dtype=int32)\n",
"why don't you want me to tell anybody?\n"
]
}
],
"source": [
"# You can see that each row of the batch corresponds to a sentence. \n",
"# The first row, and its decoded sentence in English. \n",
"sample_sentence_en = sample_data_pair[0][0]\n",
"print(sample_sentence_en)\n",
"sample_sentence_en = sample_sentence_en.numpy()\n",
"sample_sentence_en= sample_sentence_en[np.where(sample_sentence_en!=0)][1:-1]\n",
"print(bpemb_en.decode_ids(sample_sentence_en))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "elementary-lecture",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tf.Tensor(\n",
"[10000 153 83 865 3077 234 2377 632 8005 50 9223 9974\n",
" 10001 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0], shape=(42,), dtype=int32)\n",
"warum soll ich es denn niemandem sagen?\n"
]
}
],
"source": [
"# The first row, and its decoded sentence in German. \n",
"sample_sentence_de = sample_data_pair[1][0]\n",
"print(sample_sentence_de)\n",
"sample_sentence_de = sample_sentence_de.numpy()\n",
"sample_sentence_de = sample_sentence_de[np.where(sample_sentence_de!=0)][1:-1]\n",
"print(bpemb_de.decode_ids(sample_sentence_de))"
]
},
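{
"cell_type": "code",
"execution_count": null,
"id": "hypothetical-decode-helper",
"metadata": {},
"outputs": [],
"source": [
"# (Added sketch, not in the original notebook.) The two cells above repeat \n",
"# the same steps: drop the zero padding, drop the <start>/<end> ids, then \n",
"# decode. A small hypothetical helper, decode_row, makes that reusable. \n",
"def decode_row(row, bpemb):\n",
"    ids = row.numpy()\n",
"    ids = ids[ids != 0][1:-1]  # strip padding, then <start>/<end>\n",
"    return bpemb.decode_ids(ids)\n",
"\n",
"print(decode_row(sample_data_pair[0][0], bpemb_en))\n",
"print(decode_row(sample_data_pair[1][0], bpemb_de))"
]
},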
{
"cell_type": "code",
"execution_count": null,
"id": "considerable-musician",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}