{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dental-monster",
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"import time\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.ticker as ticker\n",
"from sklearn.model_selection import train_test_split\n",
"import unicodedata\n",
"import re\n",
"import os\n",
"import io\n",
"import time\n",
"from bpemb import BPEmb"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "heated-audience",
"metadata": {},
"outputs": [],
"source": [
"bpemb_de = BPEmb(lang='de', vs=10000, dim=100)\n",
"bpemb_en = BPEmb(lang='en', vs=10000, dim=100)\n",
"\n",
"# From the corpus, you extract only the texts necessary. \n",
"path_to_file = \"../datasets/deu.txt\"\n",
"lines = io.open(path_to_file, encoding='UTF-8').read().strip().split('\\n')\n",
"temp_list = []\n",
"corpus = []\n",
"for i in range(len(lines)):\n",
" temp_list = lines[i].split('\\t')[:-1]\n",
" corpus.append(temp_list)\n",
"en, de = np.array(corpus).T\n",
"\n",
"en_encoded = []\n",
"de_encoded = []\n",
"\n",
"cnt_en = 0\n",
"cnt_de = 0\n",
"\n",
"# You encode the sentences with fewer than 40 words in each row into \n",
"# a list of integers, and you append [10000] (<start>) and [10001] (<end>) \n",
"# at the beginning and the end of each sentence. \n",
"for i in range(len(en)):\n",
" en_encoded_temp = bpemb_en.encode_ids(en[i])\n",
" de_encoded_temp = bpemb_de.encode_ids(de[i])\n",
" \n",
" if (len(en_encoded_temp)<=40) and (len(de_encoded_temp)<=40):\n",
" en_encoded.append([10000] + en_encoded_temp + [10001])\n",
" de_encoded.append([10000] + de_encoded_temp + [10001])\n",
"\n",
"# Zero padding the encoded corpus. \n",
"en_padded = tf.keras.preprocessing.sequence.pad_sequences(en_encoded, padding='post')\n",
"de_padded = tf.keras.preprocessing.sequence.pad_sequences(de_encoded, padding='post')"
]
},
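{
"cell_type": "code",
"execution_count": null,
"id": "hypothetical-bpe-demo",
"metadata": {},
"outputs": [],
"source": [
"# (Added illustration, not in the original notebook.) A quick sanity check \n",
"# of the BPE round trip: encode_ids() maps a sentence to subword ids in \n",
"# [0, 10000), so the reserved ids 10000 (<start>) and 10001 (<end>) cannot \n",
"# collide with real subwords. The sample sentence is arbitrary. \n",
"sample = \"I would like a cup of coffee.\"\n",
"ids = bpemb_en.encode_ids(sample)\n",
"print(ids)                       # plain subword ids, all < 10000\n",
"print(bpemb_en.decode_ids(ids))  # reproduces the (lowercased) sentence\n",
"print([10000] + ids + [10001])   # the form appended to en_encoded"
]
},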
{
"cell_type": "code",
"execution_count": 3,
"id": "contemporary-tyler",
"metadata": {},
"outputs": [],
"source": [
"# Splitting the corpus into traiing and validaiton datasets. \n",
"input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(en_padded, de_padded, test_size=0.2)"
]
},
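{
"cell_type": "code",
"execution_count": null,
"id": "hypothetical-split-check",
"metadata": {},
"outputs": [],
"source": [
"# (Added illustration, not in the original notebook.) With test_size=0.2, \n",
"# train_test_split holds out 20% of the sentence pairs for validation; \n",
"# a quick check that the proportions come out as expected. \n",
"n_train, n_val = len(input_tensor_train), len(input_tensor_val)\n",
"print(n_train, n_val, n_val / (n_train + n_val))  # expect roughly 0.2"
]
},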
{
"cell_type": "code",
"execution_count": 4,
"id": "departmental-stamp",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((179501, 41), (44876, 41), (179501, 42), (44876, 42))"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# (data set size, length of the longest sentence)\n",
"input_tensor_train.shape, input_tensor_val.shape, target_tensor_train.shape, target_tensor_val .shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "honey-hayes",
"metadata": {},
"outputs": [],
"source": [
"BUFFER_SIZE = len(input_tensor_train)\n",
"BATCH_SIZE = 64\n",
"steps_per_epoch = len(input_tensor_train)//BATCH_SIZE\n",
"embedding_dim = 256\n",
"units = 1024\n",
"vocab_inp_size = 10000 + 2\n",
"vocab_tar_size = 10000 + 2\n",
"\n",
"# You get an iterator for training the network. \n",
"dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)\n",
"dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)"
]
},
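{
"cell_type": "code",
"execution_count": null,
"id": "hypothetical-dataset-spec",
"metadata": {},
"outputs": [],
"source": [
"# (Added illustration, not in the original notebook.) element_spec shows the \n",
"# fixed (BATCH_SIZE, seq_len) shapes that drop_remainder=True guarantees; \n",
"# the final partial batch is discarded, so the number of batches per epoch \n",
"# equals steps_per_epoch. \n",
"print(dataset.element_spec)\n",
"print(tf.data.experimental.cardinality(dataset).numpy(), steps_per_epoch)"
]
},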
{
"cell_type": "code",
"execution_count": 6,
"id": "cardiac-hardware",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(<tf.Tensor: shape=(64, 41), dtype=int32, numpy=\n",
" array([[10000, 5451, 1616, ..., 0, 0, 0],\n",
" [10000, 386, 2689, ..., 0, 0, 0],\n",
" [10000, 2088, 6838, ..., 0, 0, 0],\n",
" ...,\n",
" [10000, 386, 9937, ..., 0, 0, 0],\n",
" [10000, 391, 73, ..., 0, 0, 0],\n",
" [10000, 509, 536, ..., 0, 0, 0]], dtype=int32)>,\n",
" <tf.Tensor: shape=(64, 42), dtype=int32, numpy=\n",
" array([[10000, 153, 83, ..., 0, 0, 0],\n",
" [10000, 3077, 5, ..., 0, 0, 0],\n",
" [10000, 4104, 284, ..., 0, 0, 0],\n",
" ...,\n",
" [10000, 3077, 6331, ..., 0, 0, 0],\n",
" [10000, 19, 115, ..., 0, 0, 0],\n",
" [10000, 249, 1503, ..., 0, 0, 0]], dtype=int32)>)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A sample batch. \n",
"sample_data_pair = next(iter(dataset))\n",
"sample_data_pair"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "previous-opportunity",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tf.Tensor(\n",
"[10000 5451 1616 9937 9915 1220 4451 352 42 3687 756 6110\n",
" 9967 10001 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0], shape=(41,), dtype=int32)\n",
"why don't you want me to tell anybody?\n"
]
}
],
"source": [
"# You can see that each row of the batch corresponds to a sentence. \n",
"# The first row, and its decoded sentence in English. \n",
"sample_sentence_en = sample_data_pair[0][0]\n",
"print(sample_sentence_en)\n",
"sample_sentence_en = sample_sentence_en.numpy()\n",
"sample_sentence_en= sample_sentence_en[np.where(sample_sentence_en!=0)][1:-1]\n",
"print(bpemb_en.decode_ids(sample_sentence_en))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "elementary-lecture",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tf.Tensor(\n",
"[10000 153 83 865 3077 234 2377 632 8005 50 9223 9974\n",
" 10001 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0], shape=(42,), dtype=int32)\n",
"warum soll ich es denn niemandem sagen?\n"
]
}
],
"source": [
"# The first row, and its decoded sentence in German. \n",
"sample_sentence_de = sample_data_pair[1][0]\n",
"print(sample_sentence_de)\n",
"sample_sentence_de = sample_sentence_de.numpy()\n",
"sample_sentence_de = sample_sentence_de[np.where(sample_sentence_de!=0)][1:-1]\n",
"print(bpemb_de.decode_ids(sample_sentence_de))"
]
},
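{
"cell_type": "code",
"execution_count": null,
"id": "hypothetical-decode-helper",
"metadata": {},
"outputs": [],
"source": [
"# (Added sketch, not in the original notebook.) The two cells above repeat \n",
"# the same steps: drop the zero padding, drop the <start>/<end> ids, then \n",
"# decode. A small hypothetical helper, decode_row, makes that reusable. \n",
"def decode_row(row, bpemb):\n",
"    ids = row.numpy()\n",
"    ids = ids[ids != 0][1:-1]  # strip padding, then <start>/<end>\n",
"    return bpemb.decode_ids(ids)\n",
"\n",
"print(decode_row(sample_data_pair[0][0], bpemb_en))\n",
"print(decode_row(sample_data_pair[1][0], bpemb_de))"
]
},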
{
"cell_type": "code",
"execution_count": null,
"id": "considerable-musician",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}