{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dental-monster",
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"import time\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.ticker as ticker\n",
"from sklearn.model_selection import train_test_split\n",
"import unicodedata\n",
"import re\n",
"import os\n",
"import io\n",
"from bpemb import BPEmb"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "heated-audience",
"metadata": {},
"outputs": [],
"source": [
"bpemb_de = BPEmb(lang='de', vs=10000, dim=100)\n",
"bpemb_en = BPEmb(lang='en', vs=10000, dim=100)\n",
"\n",
"# From each tab-separated line of the corpus, you keep only the English\n",
"# and German text columns (the trailing attribution column is dropped).\n",
"path_to_file = \"../datasets/deu.txt\"\n",
"lines = io.open(path_to_file, encoding='UTF-8').read().strip().split('\\n')\n",
"corpus = []\n",
"for i in range(len(lines)):\n",
"    temp_list = lines[i].split('\\t')[:-1]\n",
"    corpus.append(temp_list)\n",
"en, de = np.array(corpus).T\n",
"\n",
"en_encoded = []\n",
"de_encoded = []\n",
"\n",
"# You encode each sentence pair into lists of subword ids, keep only pairs\n",
"# where both sides are at most 40 subword tokens long, and append [10000]\n",
"# (<start>) and [10001] (<end>) at the beginning and end of each sentence.\n",
"for i in range(len(en)):\n",
"    en_encoded_temp = bpemb_en.encode_ids(en[i])\n",
"    de_encoded_temp = bpemb_de.encode_ids(de[i])\n",
"\n",
"    if (len(en_encoded_temp) <= 40) and (len(de_encoded_temp) <= 40):\n",
"        en_encoded.append([10000] + en_encoded_temp + [10001])\n",
"        de_encoded.append([10000] + de_encoded_temp + [10001])\n",
"\n",
"# Zero-pad the encoded corpus at the end of each sequence.\n",
"en_padded = tf.keras.preprocessing.sequence.pad_sequences(en_encoded, padding='post')\n",
"de_padded = tf.keras.preprocessing.sequence.pad_sequences(de_encoded, padding='post')"
]
},
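{
"cell_type": "code",
"execution_count": null,
"id": "bpemb-roundtrip-check",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sanity check added for illustration (not in the original\n",
"# notebook): round-trip an arbitrary toy sentence through BPEmb to see\n",
"# the subword ids that encode_ids produces and to confirm decode_ids\n",
"# inverts them. Note that BPEmb lowercases its input.\n",
"sample = \"This is a test sentence.\"\n",
"ids = bpemb_en.encode_ids(sample)\n",
"print(ids)  # a short list of subword ids, each in [0, 10000)\n",
"print(bpemb_en.decode_ids(ids))  # expect the lowercased sentence back"
]
},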
{
"cell_type": "code",
"execution_count": 3,
"id": "contemporary-tyler",
"metadata": {},
"outputs": [],
"source": [
"# Splitting the corpus into training and validation datasets.\n",
"input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(en_padded, de_padded, test_size=0.2)"
]
},
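{
"cell_type": "code",
"execution_count": null,
"id": "split-ratio-check",
"metadata": {},
"outputs": [],
"source": [
"# A small verification added for illustration (not in the original\n",
"# notebook): with test_size=0.2 the split should be roughly 80/20.\n",
"print(len(input_tensor_train), len(input_tensor_val))\n",
"print(len(input_tensor_val) / (len(input_tensor_train) + len(input_tensor_val)))  # ~0.2"
]
},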
{
"cell_type": "code",
"execution_count": 4,
"id": "departmental-stamp",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((179501, 41), (44876, 41), (179501, 42), (44876, 42))"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Each shape is (dataset size, length of the longest sentence).\n",
"input_tensor_train.shape, input_tensor_val.shape, target_tensor_train.shape, target_tensor_val.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "honey-hayes",
"metadata": {},
"outputs": [],
"source": [
"BUFFER_SIZE = len(input_tensor_train)\n",
"BATCH_SIZE = 64\n",
"steps_per_epoch = len(input_tensor_train)//BATCH_SIZE\n",
"embedding_dim = 256\n",
"units = 1024\n",
"vocab_inp_size = 10000 + 2  # 10000 subwords + <start>/<end>\n",
"vocab_tar_size = 10000 + 2\n",
"\n",
"# You build a shuffled, batched tf.data.Dataset to iterate over during training.\n",
"dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)\n",
"dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)"
]
},
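{
"cell_type": "code",
"execution_count": null,
"id": "batch-shape-check",
"metadata": {},
"outputs": [],
"source": [
"# A quick pipeline check added for illustration (not in the original\n",
"# notebook): pull one batch with dataset.take(1) and confirm its shapes\n",
"# are (BATCH_SIZE, max English length) and (BATCH_SIZE, max German length).\n",
"for en_batch, de_batch in dataset.take(1):\n",
"    print(en_batch.shape, de_batch.shape)  # expected: (64, 41) (64, 42)\n",
"print(steps_per_epoch)  # full batches per epoch; the remainder is dropped"
]
},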
{
"cell_type": "code",
"execution_count": 6,
"id": "cardiac-hardware",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(<tf.Tensor: shape=(64, 41), dtype=int32, numpy=\n",
" array([[10000, 5451, 1616, ..., 0, 0, 0],\n",
"        [10000, 386, 2689, ..., 0, 0, 0],\n",
"        [10000, 2088, 6838, ..., 0, 0, 0],\n",
"        ...,\n",
"        [10000, 386, 9937, ..., 0, 0, 0],\n",
"        [10000, 391, 73, ..., 0, 0, 0],\n",
"        [10000, 509, 536, ..., 0, 0, 0]], dtype=int32)>,\n",
" <tf.Tensor: shape=(64, 42), dtype=int32, numpy=\n",
" array([[10000, 153, 83, ..., 0, 0, 0],\n",
"        [10000, 3077, 5, ..., 0, 0, 0],\n",
"        [10000, 4104, 284, ..., 0, 0, 0],\n",
"        ...,\n",
"        [10000, 3077, 6331, ..., 0, 0, 0],\n",
"        [10000, 19, 115, ..., 0, 0, 0],\n",
"        [10000, 249, 1503, ..., 0, 0, 0]], dtype=int32)>)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A sample batch.\n",
"sample_data_pair = next(iter(dataset))\n",
"sample_data_pair"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "previous-opportunity",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tf.Tensor(\n",
"[10000 5451 1616 9937 9915 1220 4451 352 42 3687 756 6110\n",
" 9967 10001 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0], shape=(41,), dtype=int32)\n",
"why don't you want me to tell anybody?\n"
]
}
],
"source": [
"# You can see that each row of the batch corresponds to a sentence.\n",
"# The first row, and its decoded sentence in English.\n",
"sample_sentence_en = sample_data_pair[0][0]\n",
"print(sample_sentence_en)\n",
"sample_sentence_en = sample_sentence_en.numpy()\n",
"sample_sentence_en = sample_sentence_en[np.where(sample_sentence_en != 0)][1:-1]\n",
"print(bpemb_en.decode_ids(sample_sentence_en))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "elementary-lecture",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tf.Tensor(\n",
"[10000 153 83 865 3077 234 2377 632 8005 50 9223 9974\n",
" 10001 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0], shape=(42,), dtype=int32)\n",
"warum soll ich es denn niemandem sagen?\n"
]
}
],
"source": [
"# The first row, and its decoded sentence in German.\n",
"sample_sentence_de = sample_data_pair[1][0]\n",
"print(sample_sentence_de)\n",
"sample_sentence_de = sample_sentence_de.numpy()\n",
"sample_sentence_de = sample_sentence_de[np.where(sample_sentence_de != 0)][1:-1]\n",
"print(bpemb_de.decode_ids(sample_sentence_de))"
]
},
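{
"cell_type": "code",
"execution_count": null,
"id": "decode-helper",
"metadata": {},
"outputs": [],
"source": [
"# The two cells above repeat the same strip-and-decode steps, so here is a\n",
"# small helper that factors them out. The function name and signature are\n",
"# ours (hypothetical), not part of the original notebook.\n",
"def decode_row(row, bpemb_model):\n",
"    # Strip zero padding, then the leading <start> (10000) and trailing <end> (10001).\n",
"    ids = row.numpy()\n",
"    ids = ids[ids != 0][1:-1]\n",
"    return bpemb_model.decode_ids(ids)\n",
"\n",
"print(decode_row(sample_data_pair[0][0], bpemb_en))\n",
"print(decode_row(sample_data_pair[1][0], bpemb_de))"
]
},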
{
"cell_type": "code",
"execution_count": null,
"id": "considerable-musician",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
} |