Skip to content

Instantly share code, notes, and snippets.

@bharadwaj6
Created May 25, 2018 21:28
Show Gist options
  • Save bharadwaj6/6af00c692d2a25c1df299c39632cb8ff to your computer and use it in GitHub Desktop.
Save bharadwaj6/6af00c692d2a25c1df299c39632cb8ff to your computer and use it in GitHub Desktop.
telugu2vec
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2\n",
"%matplotlib inline\n",
"\n",
"import re\n",
"\n",
"import torchtext\n",
"from torchtext import vocab, data\n",
"from torchtext.datasets import language_modeling\n",
"\n",
"from fastai.learner import *\n",
"from fastai.rnn_reg import *\n",
"from fastai.rnn_train import *\n",
"from fastai.nlp import *\n",
"from fastai.lm_rnn import *\n",
"\n",
"\n",
"import dill as pickle\n",
"from IPython.display import Image\n",
"from IPython.core.display import HTML"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"PATH = 'data/teluguwiki/'\n",
"EXT_PATH = 'extract/'\n",
"TRN_PATH = 'train/'\n",
"VAL_PATH = 'valid/'\n",
"SAMPLE_PATH = 'sample/'\n",
"\n",
"EXT = f'{PATH}{EXT_PATH}'\n",
"TRN = f'{PATH}{TRN_PATH}'\n",
"VAL = f'{PATH}{VAL_PATH}'\n",
"SAMPLE = f'{PATH}{SAMPLE_PATH}'\n",
"\n",
"ext_files = !ls {EXT}\n",
"sample_files = !ls {SAMPLE}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import html\n",
"re1 = re.compile(r' +')\n",
"\n",
"def fixup(x):\n",
" x = x.replace('#39;', \"'\").replace('amp;', '&').replace('#146;', \"'\").replace(\n",
" 'nbsp;', ' ').replace('#36;', '$').replace('\\\\n', \"\\n\").replace('quot;', \"'\").replace(\n",
" '<br />', \"\\n\").replace('\\\\\"', '\"').replace('<unk>','u_n').replace(' @.@ ','.').replace(\n",
" ' @-@ ','-').replace('\\\\', ' \\\\ ')\n",
" return re1.sub(' ', html.unescape(x))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"cleaned_all = []\n",
"for ext_file in ext_files:\n",
" raw_txt = !cat {EXT}{ext_file}\n",
" cleaned_doc = []\n",
" for line in raw_txt:\n",
" # remove tags\n",
" new_line = re.sub('<[^<]+?>', '', line)\n",
" new_line = re.sub('__[^<]+?__', '', new_line)\n",
" new_line = fixup(new_line)\n",
" new_line = new_line.strip()\n",
" if new_line != '':\n",
" cleaned_doc.append(new_line)\n",
" new_doc = '\\n'.join(cleaned_doc)\n",
" with open(f'{TRN}{ext_file}.txt', 'w+') as text_file:\n",
" text_file.write(new_doc)\n",
" cleaned_all.append(cleaned_doc)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"trn_files = !ls {TRN}\n",
"val_files = !ls {VAL}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create validation set"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"random.seed = 42"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"random.shuffle(trn_files)\n",
"val_files = trn_files[:10] # about 20%\n",
"trn_files = trn_files[10:]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import shutil, os\n",
"\n",
"for root, dirs, files in os.walk(TRN):\n",
" for file in files:\n",
" if file.endswith('.txt') and file in val_files:\n",
" shutil.move(os.path.join(root, file), VAL)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"trn_files = !ls {TRN}\n",
"val_files = !ls {VAL}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data generator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Telugu tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"text = \"అందువల్ల ఇక్కడ జరుగుతున్నది అతన్ని హేళన చెయ్యటం. అతన్ని “ధీవిశాలు”డని పిలవటం కూడ దాన్లో భాగమే. పైకి పొగడ్తగా కనిపిస్తూ లోపల అవహేళన నిండిన ఈ పద్యం ఆలోచనామృతం!\"\n",
"a = tokenize(text)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"['అందువల్ల',\n",
" 'ఇక్కడ',\n",
" 'జరుగుతున్నది',\n",
" 'అతన్ని',\n",
" 'హేళన',\n",
" 'చెయ్యటం',\n",
" '.',\n",
" 'అతన్ని',\n",
" '“',\n",
" 'ధీవిశాలు',\n",
" '”',\n",
" 'డని',\n",
" 'పిలవటం',\n",
" 'కూడ',\n",
" 'దాన్లో',\n",
" 'భాగమే',\n",
" '.',\n",
" 'పైకి',\n",
" 'పొగడ్తగా',\n",
" 'కనిపిస్తూ',\n",
" 'లోపల',\n",
" 'అవహేళన',\n",
" 'నిండిన',\n",
" 'ఈ',\n",
" 'పద్యం',\n",
" 'ఆలోచనామృతం',\n",
" '!']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Lot more cleaning to do in tokenization. Punctuations, roman numerals, currency notation have to be accounted for first. And morphological analysis maybe? Gotta deal with Agglutination!"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# TEXT = data.Field(lower=True, tokenize=tokenize)\n",
"TEXT = pickle.load(open(f'{PATH}models/TEXT.pkl','rb'))\n",
"bs = 32\n",
"bptt = 70"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>.</th>\n",
" <td>2428201</td>\n",
" </tr>\n",
" <tr>\n",
" <th>,</th>\n",
" <td>1284024</td>\n",
" </tr>\n",
" <tr>\n",
" <th>కి</th>\n",
" <td>373552</td>\n",
" </tr>\n",
" <tr>\n",
" <th>నుండి</th>\n",
" <td>361665</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ఉన్నాయి</th>\n",
" <td>306000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>మీ</th>\n",
" <td>287805</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ఉంది</th>\n",
" <td>275448</td>\n",
" </tr>\n",
" <tr>\n",
" <th>గ్రామం</th>\n",
" <td>258004</td>\n",
" </tr>\n",
" <tr>\n",
" <th>దూరంలో</th>\n",
" <td>249881</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>222843</td>\n",
" </tr>\n",
" <tr>\n",
" <th>\"</th>\n",
" <td>190407</td>\n",
" </tr>\n",
" <tr>\n",
" <th>గ్రామంలో</th>\n",
" <td>183030</td>\n",
" </tr>\n",
" <tr>\n",
" <th>)</th>\n",
" <td>151912</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(</th>\n",
" <td>149897</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>148945</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0\n",
". 2428201\n",
", 1284024\n",
"కి 373552\n",
"నుండి 361665\n",
"ఉన్నాయి 306000\n",
"మీ 287805\n",
"ఉంది 275448\n",
"గ్రామం 258004\n",
"దూరంలో 249881\n",
"10 222843\n",
"\" 190407\n",
"గ్రామంలో 183030\n",
") 151912\n",
"( 149897\n",
"5 148945"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"freqs = pd.DataFrame.from_dict(TEXT.vocab.freqs, orient='index')\n",
"freqs.sort_values(0, ascending=False).head(15)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"cnt = []\n",
"for i in range(49):\n",
" row_cnt = freqs[freqs[0] >= i+1].shape[0]\n",
" cnt.append(row_cnt)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.lines.Line2D at 0x7fae77636208>"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAD8CAYAAACyyUlaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvFvnyVgAAIABJREFUeJzt3Xt0nNV57/HvMzOasTTyRbZlYyxjQzAuJiRcxCXJaQ7hagiEnDZpkrbB6aF1Vy490KSngaxksU5uTVaThtImrFKgQJsT8CLtCWQ5cR0SmjsgwOESx+DYYAsMlpExtmXdRs/5491jj4U8motGo9H7+6y11zvvfvc7e7+tw6N9mXebuyMiIlKNRL0bICIijU/BREREqqZgIiIiVVMwERGRqimYiIhI1RRMRESkagomIiJSNQUTERGpmoKJiIhULVXvBkyW+fPn+7Jly+rdjMazeXN0XLGivu0Qkbp49NFHd7t7+3jlYhNMli1bRldXV72b0XjOOy86PvhgPVshInViZs+XUk7DXCIiUrXY9EykQp/+dL1bICINQMFEirvwwnq3QEQagIa5pLiNG6MkIlKEeiZS3LXXRkdNwItIEeqZiIhI1RRMRESkauMGEzO73cx2mdlTBXlzzWyDmT0bjm0h38zsJjPbYmZPmNkZBfesDuWfNbPVBflnmtmT4Z6bzMwqraMWHnmul79d/xtyI9reWETkaErpmdwBrBqVdx3wgLsvBx4I5wCXAstDWgPcDFFgAG4AzgHOBm7IB4dQZk3BfasqqaNWfrXjVb7+o9/SNzhcy2pERBrauMHE3X8M9I7KvhK4M3y+E3h3Qf5dHvklMMfMFgGXABvcvdfd9wAbgFXh2ix3/4W7O3DXqO8qp46ayGaiNQoHBnK1qmJq++IXoyQiUkSlq7kWuvtOAHffaWYLQv5iYEdBue6QVyy/e4z8SurYObqRZraGqPfCcccdV+YjRlrSSQD2D8S0Z/LWt9a7BSLSACZ6At7GyPMK8iup4/WZ7re4e6e7d7a3j/uesjG1HuqZxDSY/PznURIRKaLSYPJyfmgpHHeF/G5gSUG5DuDFcfI7xsivpI6ayMY9mHzqU1ESESmi0mByH5BfkbUa+E5B/lVhxdW5wN4wVLUeuNjM2sLE+8XA+nBtn5mdG1ZxXTXqu8qpoybyPZPYDnOJiJRg3DkTM/sWcB4w38y6iVZlfQlYa2ZXA9uB94bi64DLgC1AH/AnAO7ea2afAx4J5T7r7vlJ/Q8TrRhrBr4XEuXWUSuHeiZazSUiclTjBhN3/8BRLl0wRlkHPnqU77kduH2M/C7gjWPkv1JuHbWQzeQn4GO6mktEpAT6Bfw4Yj8BLyJSAr3ocRzNTUkSFuNgcuON9W6BiDQABZNxmBnZdCq+E/CnnVbvFohIA9AwVwmymVR8eyY/+EGURESKUM+kBNlMMr6vU/n856OjdlwUkSLUMylBaybGw1wiIiVQMClBrIe5RERKoGBSgqx6JiIiRSmYlKA1k9Iv4EVEitAEfAliPQH/T/9U7xaISANQMClBrIe5VqyodwtEpAFomKsErekUg8MjDOVG6t2UyXf//VESESlCPZMS5N8c3DeQY3ZLzOLvV78aHa+4or7tEJEpLWb/ZazMoT1NNAkvIjImBZMSxH63RRGRcSiYlKDl0J4mCiYiImNRMCmB9jQRESlOE/AlyKZjHEz+9V/r3QIRaQAKJiU4NAEfxx8uLllS7xaISAPQMFcJ8vvAx7Jncs89URIRKUI9kxJkD/VMYhhMbr45Or7vffVth4hMaeqZlCCTSpBKWDx7JiIiJVAwKYGZaU8TEZEiFExKFO22GMMJeBGREiiYlCh6Db16JiIiY9EEfImycd0g6957690CEWkACiYlao3rnibz59e7BSLSADTMVaJsOqYT8HfcESURkSIUTEoUreaK4QS8gomIlEDBpEStmWQ8h7lEREqgYFKi/O9M3L3eTRERmXKqCiZm9pdm9rSZPWVm3zKzGWZ2vJk9ZGbPmtk9ZpYOZTPhfEu4vqzge64P+ZvN7JKC/FUhb4uZXVeQP2YdtZTNpBgecQaGY7gPvIjIOCoOJma2GPhfQKe7vxFIAu8Hvgx8zd2XA3uAq8MtVwN73P1E4GuhHGa2Mtx3CrAK+IaZJc0sCXwduBRYCXwglKVIHTWjPU1ERI6u2mGuFNBsZimgBdgJnA/kf5xwJ/Du8PnKcE64foGZWci/290H3H0bsAU4O6Qt7r7V3QeBu4Erwz1Hq6Nm8i977BuM2ST8unVREhEpouJg4u4vAF8BthMFkb3Ao8Cr7p7/870bWBw+LwZ2hHuHQ/l5hfmj7jla/rwidRzBzNaYWZeZdfX09FT6qEA0AQ8xfHNwS0uURESKqGaYq42oV3E8cCyQJRqSGi0/Y21HuTZR+a/PdL/F3TvdvbO9vX2sIiXLxnWY6xvfiJKISBHVDHNdCGxz9x53HwL+HXgrMCcMewF0AC+Gz93AEoBwfTbQW5g/6p6j5e8uUkfNtKRjuqfJ2rVREhEpoppgsh0418xawjzGBcCvgR8B7wllVgPfCZ/vC+eE6z/0aJ3tfcD7w2qv44HlwMPAI8DysHIrTTRJf1+452h11MzhCfiYzZmIiJSgmjmTh4gmwR8DngzfdQvwSeDjZraFaH7jtnDLbcC8kP9x4LrwPU8Da4kC0feBj7p7LsyJfAxYD2wC1oayFKmjZmK9da+IyDiqetGju98A3DAqeyvRSqzRZfuB9x7le74AfGGM/HXA65YSufuYddRSa5y37hURGYd+AV+i2E7Ai4iUQK+gL1FTMkE6lWB/3PY0efDBerdARBqAeiZlaNU+8CIiY1IwKUO0dW/MVnN95StREhEpQsGkDNl0DHdb/O53oyQiUoSCSRk0zCUiMjYFkzJkFUxERMakYFKG1kwMh7lEREqgpcFliOUEfHNzvVsgIg1AwaQMsRzm+t736t0CEWkAGuYqQ2smxYFB7QMvIjKagkkZspkUIw4Hh2I01PW5z0VJRKQIBZMyZOP4sscHHoiSiEgRCiZlyG/d2xe3SXgRkXEomJQhG9fdFkVExqFgUoZWvYZeRGRMWhpchpZ8MInTa+jnzat3C0SkASiYlCE/Z7I/TnMm3/52vVsgIg1Aw1xl0G6LIiJjUzApQyyDyfXXR0lEpAgNc5Uhlqu5fvGLerdARBqAeiZlSCaM5qZkvHomIiIlUDApUzaTitcEvIhICRRMytSaUc9ERGQ0zZmUKXavoe/oqHcLRKQBKJiUKRu33Rb/7d/q3QIRaQAa5ipTfk8TERE5TMGkTNEwV4wm4K+9NkoiIkVomKtMrZlkvIa5Nm6sdwtEpAGoZ1KmbDpmE/AiIiVQMClTNpOibzDHyIj2gRcRyasqmJjZHDO718x+Y2abzOwtZjbXzDaY2bPh2BbKmpndZGZbzOwJMzuj4HtWh/LPmtnqgvwzzezJcM9NZmYhf8w6JkNrHF9DLyIyjmp7Jn8PfN/dfwd4M7AJuA54wN2XAw+Ec4BLgeUhrQFuhigwADcA5wBnAzcUBIebQ9n8fatC/tHqqLnDL3uMyST8SSdFSUSkiIqDiZnNAt4O3Abg7oPu/ipwJXBnKHYn8O7w+UrgLo/8EphjZouAS4AN7t7r7nuADcCqcG2Wu//C3R24a9R3jVVHzWXDniax6ZncckuURESKqKZncgLQA/yLmT1uZreaWRZY6O47AcJxQSi/GNhRcH93yCuW3z1GPkXqOIKZrTGzLjPr6unpqfxJC2jrXhGR16smmKSAM4Cb3f104ADFh5tsjDyvIL9k7n6Lu3e6e2d7e3s5tx5VfpgrNsuD16yJkohIEdUEk26g290fCuf3EgWXl8MQFeG4q6D8koL7O4AXx8nvGCOfInXUXGvc5kyeeSZKIiJFVBxM3P0lYIeZrQhZFwC/Bu4D8iuyVgPfCZ/vA64Kq7rOBfaGIar1wMVm1hYm3i8G1odr+8zs3LCK66pR3zVWHTXXkg5zJnHpmYiIlKDaX8D/BfBNM0sDW4E/IQpQa83samA78N5Qdh1wGbAF6AtlcfdeM/sc8Ego91l37w2fPwzcATQD3wsJ4EtHqaPmWuM2zCUiUoKqgom7bwQ6x7h0wRhlHfjoUb7nduD2MfK7gDeOkf/KWHVMhljuAy8iMg69m6tMLekkZjEKJqedVu8WiEgDUDApk5mRTcdo694bb6x3C0SkAejdXBXIauteEZEjKJhUIJtJsT8uv4D/4z+OkohIERrmqkBrnPaB7+4ev4yIxJ56JhXQniYiIkdSMKlANhOjCXgRkRIomFSgVRPwIiJH0JxJBbJxmjN5y1vq3QIRaQAKJhVozaTi8zqVv/mberdARBqAhrkqkM2kGBgeYTg3Uu+miIhMCQomFYjV1r2///tREhEpQsNcFWgNW/fuHxxmdktTnVtTY6+8Uu8WiEgDUM+kAnpzsIjIkRRMKqBgIiJyJAWTCsRu614RkXFozqQC2XSMdlu8oC57kIlIg1EwqUBrnIa5PvOZerdARBqAhrkq0BJWcx2Iy2voRUTGoWBSgXzPJBbDXJdeGiURkSI0zFWBTCpBMmHxGOY6eLDeLRCRBqCeSQWifeCTWs0lIhIomFQoVi97FBEZh4JJhWL1GnoRkXFozqRC2bj0TC6/vN4tEJEGoGBSoda49Ez+6q/q3QIRaQAa5qpQNqMJeBGRPAWTCsVmmOu886IkIlKEgkmFWjMp/QJeRCRQMKmQVnOJiBymYFKh1kyKoZwzMKx5ExGRqoOJmSXN7HEz+244P97MHjKzZ83sHjNLh/xMON8Sri8r+I7rQ/5mM7ukIH9VyNtiZtcV5I9Zx2TKpsPLHjUJLyIyIT2Ta4BNBedfBr7m7suBPcDVIf9qYI+7nwh8LZTDzFYC7wdOAVYB3wgBKgl8HbgUWAl8IJQtVsekic1ui3/wB1ESESmiqmBiZh3AO4Fbw7kB5wP3hiJ3Au8On68M54TrF4TyVwJ3u/uAu28DtgBnh7TF3be6+yBwN3DlOHVMmti8OfgjH4mSiEgR1fZMbgT+GhgJ5/OAV909/1/YbmBx+LwY2AEQru8N5Q/lj7rnaPnF6pg0+Z5J33Rf0dXXFyURkSIqDiZmdjmwy90fLcweo6iPc22i8sdq4xoz6zKzrp6enrGKVCx7qGcyzedMLrssSiIiRVTTM3kb8C4ze45oCOp8op7KHDPLv6alA3gxfO4GlgCE67OB3sL8UfccLX93kTqO4O63uHunu3e2t7dX/qRjiNXWvSIi46g4mLj79e7e4e7LiCbQf+jufwT8CHhPKLYa+E74fF84J1z/obt7yH9/WO11PLAceBh4BFgeVm6lQx33hXuOVsekmTkjCia9BwYnu2oRkSmnFr8z+STwcTPbQjS/cVvIvw2YF/I/DlwH4O5PA2uBXwPfBz7q7rkwJ/IxYD3RarG1oWyxOibNotkzaGtp4lc7Xp3sqkVEppwJeWuwuz8IPBg+byVaiTW6TD/w3qPc/wXgC2PkrwPWjZE/Zh2Tycw4c2kbjz6/p57NEBGZEvQK+ip0LpvLDzbtYvf+Aea3ZurdnNr40Ifq3QIRaQAKJlXoXNoGwKPP7+GSU46pc2tqRMFEREqgd3NV4dSO2aRTiek91LV7d5RERIpQz6QKmVSSNy2ezSPP9da7KbXznrBo7sEH69oMEZna1DOp0pnL2njqhb30D03zHy+KiBShYFKls5bOZSjnWiIsIrGmYFKlM8MkfNd0njcRERmHgkmV2rJpTlzQOr0n4UVExqEJ+AnQubSNdU/uZGTESSTGeg9lA/vwh+vdAhFpAOqZTIAzl7bxWv8wW3r217spE+9974uSiEgRCiYT4KxlcwGm5xLhHTuiJCJShILJBFg6r4X5rWkefW4azpt88INREhEpQsFkApgZnUvnakWXiMSWgskE6VzWxvbePna91l/vpoiITDoFkwnSGeZN1DsRkThSMJkgpxw7ixlNCbqm47yJiMg49DuTCdKUTPDmjjl0PT/NVnR94hP1boGINAAFkwl01rK53Pxfv6VvcJiW9DT5P+0VV9S7BSLSADTMNYHOXNZGbsTZuH0avfRx8+YoiYgUoWAygc44rg2zaTYJ/+d/HiURkSIUTCbQ7OYmViycOb2CiYhICRRMJtiZS9t47Pk95Ea83k0REZk0CiYT7Kxlc9k/MMzml/bVuykiIpNGwWSCHd4sa5otERYRKWKarF+dOjramjlm1gx+ufUVrnrLsno3p3qf/nS9WyAiDUDBZIKZGZeduog7fr6NLbv2c+KC1no3qToXXljvFohIA9AwVw185B1voLkpyVfWT4PfZ2zcGCURkSIUTGpgfmuGP3v7CXz/6Zd4fHuDLxO+9tooiYgUoWBSI3/6uycwL5vmy9//De5aJiwi05uCSY20ZlL8xfkn8sutvfzXMz31bo6ISE0pmNTQH56zlCVzm/ny9zczoh8xisg0VnEwMbMlZvYjM9tkZk+b2TUhf66ZbTCzZ8OxLeSbmd1kZlvM7AkzO6Pgu1aH8s+a2eqC/DPN7Mlwz01mZsXqmGrSqQSfuGgFm3a+xv1PvFjv5oiI1Ew1PZNh4BPufjJwLvBRM1sJXAc84O7LgQfCOcClwPKQ1gA3QxQYgBuAc4CzgRsKgsPNoWz+vlUh/2h1TDnvevOxnLxoFl/9z2cYHB6pd3PK98UvRklEpIiKg4m773T3x8LnfcAmYDFwJXBnKHYn8O7w+UrgLo/8EphjZouAS4AN7t7r7nuADcCqcG2Wu//Coxnsu0Z911h1TDmJhPHJVSvY3tvHtx7eXu/mlO+tb42SiEgREzJnYmbLgNOBh4CF7r4TooADLAjFFgM7Cm7rDnnF8rvHyKdIHVPSfz+pnXNPmMs//PBZDgwM17s55fn5z6MkIlJE1cHEzFqBbwPXuvtrxYqOkecV5JfTtjVm1mVmXT099VtRZWZ8ctXvsHv/ILf+ZFvd2lGRT30qSiIiRVQVTMysiSiQfNPd/z1kvxyGqAjHXSG/G1hScHsH8OI4+R1j5Ber4wjufou7d7p7Z3t7e2UPOUFOP66NVaccwy0//i3PvKw3CovI9FLNai4DbgM2ufvfFVy6D8ivyFoNfKcg/6qwqutcYG8YoloPXGxmbWHi/WJgfbi2z8zODXVdNeq7xqpjSvv05SfTkklx1W0P88KrB+vdHBGRCVNNz+RtwAeB881sY0iXAV8CLjKzZ4GLwjnAOmArsAX4Z+AjAO7eC3wOeCSkz4Y8gA8Dt4Z7fgt8L+QfrY4praOthbv+59kcGBzmqtseovfAYL2bJCIyISwur/ro7Oz0rq6uejcDgIe2vsIHb3+YlYtm8c0/PYdsZgq/vPm886Ljgw/WsxUiUidm9qi7d45XTr+Ar4NzTpjHP37gdJ7ofpUPf/Oxqf37kxtvjJKISBEKJnVy8SnH8MX/cSo/fqaH/33vr6bu61ZOOy1KIiJFTOHxlenv/WcfxysHBvnb9ZuZl83wmctPJrwxZur4wQ+iozbJEpEiFEzq7CPnvYHd+we4/WfbODiU44YrVjKjKVnvZh32+c9HRwUTESlCwaTOzIzPvHMl6VSCf/qvrTy+fQ//+Ienc+KCmfVumohIyTRnMgUkEsb1l57MHX9yFj37BrjiH37G2kd2aFMtEWkYCiZTyHkrFrDumt/l9OPm8NfffoJr79nI/kZ7l5eIxJKCyRSzcNYM/vXqc/jERSdx/69e5PKbfsJjjb6PvIhMe/rR4hT28LZerrn7cXbu7efCkxdw7YUn8cbFsye3EZs3R8cVKya3XhGZEkr90aKCyRS3r3+IO372HP/8k6281j/MRSsXcu2Fyznl2EkOKiISSwomozRqMMl7rX+If/npc9z6063s6x/mklMWcs0FJ7Hy2Fm1rfj++6PjFVfUth4RmZIUTEZp9GCSt/fgELf/dBu3/3Qb+waGOeO4OfxB5xLe+aZFzJzRNPEV6t1cIrGmYDLKdAkmeXv7hrinaztru7rZsms/zU1JLjt1Ee/t7OCc4+dO3C/pFUxEYq3UYKIfLTao2S1NrHn7G/iz3z2BjTteZW1XN/f/6kW+/Vg3S+e18M5TF3HhyoWc1jGHRGKKvaJFRKYd9UymkYODOb731E7ufbSbh7b1khtx5rdmuPDkBVy0ciFvO3F++a9qUc9EJNbUM4mh5nSS3zujg987o4NX+wZ5cHMPGza9zHef2Mndj+xgRlOCt5wwj3NOmMfZx8/l1MWzaUrqp0YiUj31TGJgYDjHQ1t72fDrl/nZb3eztecAAM1NSc5YOoezl83jrOPbOHXx7NdP4u/YER2XLJnkVovIVKCeiRySSSV5+0ntvP2kdgB69g3Q9VwvD23r5eFtvdz4wDPk/6Y4fn6WNy6ezZsWz+aNi2dzyuJjmFWLVWIiMq2oZyLsPTjEY9v38FT3Xp58YS9PvbCXF/f2A3D5ph8zN5um++J3cdLCmZy0sJWTFs7kxAWtU+tV+SJSE+qZSMlmNzfxjhULeMeKBYfydu8f4KkX9rL8fZ+lbzDHx866iJ8828NQLvrjwwyWtLWwbH6WE+ZnWTYv+nz8/CyL5zST0lyMSKwomMiY5rdmOG/FApjTDMD6v3w7Q7kRnn/lAM+8vJ/NL+1j6+4DbNu9n8ee33PE241TCePYOc10tDWzpK2FjrZmOuY209HWwrFzmlkwM6OJf5FpRsFEStaUTHDigpmcuGAml5266FC+u7N7/yDPvXKAbT0HeO6VA3TvOUj3nj5+uHkXPfsGjvgesyhYLZo9g2NmzeCY2TNYOCtKC2ZmDh3ntDRNvW2MRWRMCiZSNTOjfWaG9pkZzlo293XX+4dyvPDqQXb09vHS3n5eeq2fl/b2s3NvP8+/0scvtr7Cvv7X79uSTiZon5lh/swM7a3p6HNr5tBxbjbNvGyatmyaOc1NGloTqSMFE6m5GU1J3tDeyhvaW49a5uBgjl37+nn5tQF27etn12sDvLyvn57XBujZP0D3noNs3LGX3gMDjBxlzcjs5ibmZtO0tTQxpyUKMLNbmpjTnKYt28Ts5iZmNYfjjPx5ikxKCwlEqqVgIsXde++kVNOcTrJ0Xpal87JFy+VGnN4Dg/TsG2BP3yC9BwbZ0zfIK/sHjzjfta+fZ17ex96+IfaNs1vljKYEs2Y0MXNGipkzooAzc0aKWeE8m06RzSRpzaRonZEim0nRmkmRTUfHlnAtk0poWE5iS8FEips/v94tOEIycXhIrVRDuRH2Hhzi1b4hXusfYu/BIV4LaW9I+/qH2dc/fOh6956+kDdE/9BIyW1rSUeBpSWdJBuOLelwnk7RnE6GvMP5zekkzU3ReXM6wYym6DyfP6MpqUAlU56CiRR3xx3R8UMfqmcrqtKUTDC/NZpnqcRwboQDgzkODAyzP6QDIe0fyNE3WJgXnR8YzNE3EB137eunbyDH/oFhDg7m6BvKkTvaWN1RmEVvLMgHlxlNCTKpI4/5oHPEseDzEWVTSTLhnhmpJOlUgkwqQSaUyZ+nEqYgJiVRMJHipkEwqVYqmWB2c4LZzRPzJgB3ZzA3EgWWwSj49A+NcHAox8HBHAeHcvQPRdf6h8J5yI/KjNA/lGNgOEf/0AgDwzl27x8+dB5dO3yshlm0ECKTSpBOJcMxQToZjqkETUkjnUqSTtqha00F1/Nlm5L5ZGQKz1MJmhJGUzJBKmmkkwlSoVzhPYWfC68r4E0NCiYik8zMyKSSZFJJ5rTUti53Z2B4hIGhEfqHc4eOhQGnf2iEweEoKEXH6PPA0AiDuZGCvMPlhkL+UM4ZHI6GEYeGD5fPXx8sONbyZRtNSSOViIJRUzJBMmE0JaKgk0oaqUR0vSlpJBOHg1EqEQWjVD5AJYxkIh+wouvJ/PVwLVXCeTJhh4759qQSRqIgv/C7E3Y4f3TZQ8eCMlMxeCqYiExjZhaGuJLMpr7vWBvOheCTi4JNYUAayo0wHK7lyw0dKucMjxxZdig3wvCIMzQ8wtCIh3ui67mRqPxQLsofHnGGc4fzciPRd/QPjTCcGz7ieuHnqNzh8tH3To3XTyWMQ8FodABKmh2RnzC45sKTeNebj61pmxRMRGRSRL0EaKZxl2K7+6Ggkj8Oh0AzNOKMHLp2ZOAa8ShIFd6bD3q5ERgeeX2ZET9cbvR9I17QhlxU35H3cKjMyIgzZ4KGaItp2GBiZquAvweSwK3u/qU6N0lEpjmz/JBYvVsy9TRkMDGzJPB14CKgG3jEzO5z91/Xt2XT0Lp19W6BiDSARn3/xNnAFnff6u6DwN3AlXVu0/TU0hIlEZEiGjWYLAZ2FJx3hzyZaN/4RpRERIpo1GAy1rq41y2zMLM1ZtZlZl09PT2T0KxpaO3aKImIFNGowaQbKNyUvAN4cXQhd7/F3TvdvbO9vX3SGiciEjeNGkweAZab2fFmlgbeD9xX5zaJiMRWQ67mcvdhM/sYsJ5oafDt7v50nZslIhJbDRlMANx9HaB1qyIiU4B5LV+YM4WYWQ/wfIW3zwd2T2BzGk2cnz/Ozw7xfn49e2Spu4876RybYFINM+ty9856t6Ne4vz8cX52iPfz69nLe/ZGnYAXEZEpRMFERESqpmBSmlvq3YA6i/Pzx/nZId7Pr2cvg+ZMRESkauqZiIhI1RRMxmFmq8xss5ltMbPr6t2eWjOz281sl5k9VZA318w2mNmz4dhWzzbWipktMbMfmdkmM3vazK4J+dP++c1shpk9bGa/Cs/+f0L+8Wb2UHj2e8IbJ6YlM0ua2eNm9t1wHqdnf87MnjSzjWbWFfLK+nevYFJEwb4plwIrgQ+Y2cr6tqrm7gBWjcq7DnjA3ZcDD4Tz6WgY+IS7nwycC3w0/P87Ds8/AJzv7m8GTgNWmdm5wJeBr4Vn3wNcXcc21to1wKaC8zg9O8A73P20giXBZf27VzApLnb7prj7j4HeUdlXAneGz3cC757URk0Sd9/p7o+Fz/uI/sOymBg8v0f2h9OmkBw4H7g35E/LZwcwsw7gncCt4dyIybMXUda/ewWT4rRvSmShu++E6D+4wII6t6fmzGwZcDrwEDF5/jDMsxHYBWwAfgu86u7DocgWIAmkAAABzklEQVR0/vd/I/DXwEg4n0d8nh2iPxz+08weNbM1Ia+sf/cN+26uSVLSvikyvZhZK/Bt4Fp3fy36I3X6c/cccJqZzQH+Azh5rGKT26raM7PLgV3u/qiZnZfPHqPotHv2Am9z9xfNbAGwwcx+U+4XqGdSXEn7psTAy2a2CCAcd9W5PTVjZk1EgeSb7v7vITs2zw/g7q8CDxLNG80xs/wfndP13//bgHeZ2XNEQ9nnE/VU4vDsALj7i+G4i+gPibMp89+9gklx2jclch+wOnxeDXynjm2pmTBOfhuwyd3/ruDStH9+M2sPPRLMrBm4kGjO6EfAe0Kxafns7n69u3e4+zKi/43/0N3/iBg8O4CZZc1sZv4zcDHwFGX+u9ePFsdhZpcR/ZWS3zflC3VuUk2Z2beA84jeGvoycAPw/4C1wHHAduC97j56kr7hmdl/A34CPMnhsfNPEc2bTOvnN7M3EU2yJon+yFzr7p81sxOI/lqfCzwO/LG7D9SvpbUVhrn+yt0vj8uzh+f8j3CaAv6vu3/BzOZRxr97BRMREamahrlERKRqCiYiIlI1BRMREamagomIiFRNwURERKqmYCIiIlVTMBERkaopmIiISNX+P9MyJ/Z8gvTLAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.plot(cnt)\n",
"plt.axvline(x=10, color='red', linestyle='--')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>శిలాశాసనాల</th>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>నేపధ్యంలో</th>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>స్ధానం</th>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ranaut</th>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>మన్హట్టాన్</th>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>తుంగభద్రకు</th>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>సందర్భాలలోను</th>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>shares</th>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>చిహ్నాలైన</th>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>కోయిలకొండలోను</th>\n",
" <td>10</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0\n",
"శిలాశాసనాల 10\n",
"నేపధ్యంలో 10\n",
"స్ధానం 10\n",
"ranaut 10\n",
"మన్హట్టాన్ 10\n",
"తుంగభద్రకు 10\n",
"సందర్భాలలోను 10\n",
"shares 10\n",
"చిహ్నాలైన 10\n",
"కోయిలకొండలోను 10"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#number of legitimate telugu words at freqs\n",
"freqs[freqs[0] <= 10].sort_values(ascending=False, by=0).head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Looks like we also have lot of english words, not surprising."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Language model"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# ignore that test is pointing to the same directory. We wont be using test set below.\n",
"FILES = dict(train=f'{TRN_PATH}', validation=f'{VAL_PATH}', test=f'{TRN_PATH}')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pickle.dump(TEXT, open(f'{PATH}models/TEXT.pkl','wb+'))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"Variable containing:\n",
" 0\n",
" 84694\n",
" 35129\n",
" 84694\n",
" 15\n",
" 1246\n",
" 11\n",
" 3\n",
" 5485\n",
" 21\n",
"[torch.cuda.LongTensor of size 10x1 (GPU 0)]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#trn_ds is list; one for each txt file\n",
"txt = md.trn_ds[0].text[:10]\n",
"TEXT.numericalize([txt]) # change to CPU/ GPU depending on hardware"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"em_sz = 300 # size of each embedding vector\n",
"nh = 500 # number of hidden activations per layer\n",
"nl = 3 # number of layers\n",
"wd = 1e-7 # weight decay"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n",
"drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15]) * 0.7"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"learner= md.get_model(opt_fn, em_sz, nh, nl, dropouti=drops[0], dropout=drops[1], \n",
" wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])\n",
"learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n",
"learner.clip = 0.3"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "39bd9fcb9a1a4fc6839c79df1a2a284f",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<p>Failed to display Jupyter Widget of type <code>HBox</code>.</p>\n",
"<p>\n",
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
" that the widgets JavaScript is still loading. If this message persists, it\n",
" likely means that the widgets JavaScript library is either not installed or\n",
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
" Widgets Documentation</a> for setup instructions.\n",
"</p>\n",
"<p>\n",
" If you're reading this message in another frontend (for example, a static\n",
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
" it may mean that your frontend doesn't currently support widgets.\n",
"</p>\n"
],
"text/plain": [
"HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" 78%|███████▊ | 8218/10518 [46:56<13:08, 2.92it/s, loss=15.1] "
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEOCAYAAACEiBAqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvFvnyVgAAIABJREFUeJzt3XeclNW9x/HPbztbYWFh6UsHCyACgqjR2BU1xRITjTWYctWrpmDUm+rVa4xJNNForClqsBu7GImoCNK7IL2zsGyvM3PuHzO7LuvussjMPDM73/frta+deeaZeb6M4/z2nPM855hzDhERSVxJXgcQERFvqRCIiCQ4FQIRkQSnQiAikuBUCEREEpwKgYhIglMhEBFJcCoEIiIJToVARCTBqRCIiCS4FK8DdESPHj1cUVGR1zFEROLKggUL9jjnCg60X1wUgqKiIubPn+91DBGRuGJmmzqyn7qGREQSnAqBiEiCUyEQEUlwKgQiIglOhUBEJMGpEIiIJDgVAhGRGFRW3cBbK3ayp7Iu4sdSIRARiUGfFlcw7W8LWLG9POLHUiEQEYlB1fV+ALLSkiN+LBUCEZEYVFUXLASZaZGfAEKFQEQkBlXW+QDISleLQEQkIZVUBQeJ87PSIn4sFQIRkRi0p7Ke9JQkstPVNSQikpD2VNTRIzsdM4v4seJiGmoRkUTz/KJtUWkNgFoEIiIxq7reF5XjRKwQmNmjZrbbzJa38tgPzcyZWY9IHV9EJJ7lZKTw7clFUTlWJFsEjwNntNxoZv2BU4HNETy2iEjcCgQclXU+cjPivGvIOfceUNLKQ78Dfgy4SB1bRCSeVdX7cA5yMlKjcryojhGY2bnANufckmgeV0QknjReTJYdpRZB1M4aMrNM4BbgtA7uPw2YBjBgwIAIJhMRiS2VtaFC0AnPGhoCDAKWmNlGoB+w0MwKW9vZOfeQc268c258QUFBFGOKiHirqUUQpUIQtRaBc24Z0LPxfqgYjHfO7YlWBhGReNA44VxWvLcIzOwpYA4wwsy2mtlVkTqWiEhnEs0J5yCCLQLn3MUHeLwoUscWEYlnVVHuGtKVxSIiMeazFoEKgYhIQor2YLEKgYhIjKmq85GSZKSnROcrWoVARCTGVNX5yEpPicoU1KBCICIScyrr/FHrFgIVAhGRmFNZ1xC1U0dBhUBEJOZU1fmjdsYQqBCIiMScyjqfuoZERBJZlQqBiEhiqwydNRQtKgQiIjFGXUMiIgnMORe6jkBnDYmIJKTahgABF715hkCFQEQkplTUNQCQo0IgIpKYKkLLVOZ2ic7C9aBCICISU8prgi2C3AwVAhGRhFTe1CJQ15CISEJSi0BEJMGVhgpBXqYKgYhIQiqtqgega5e0qB1ThUBEJIaU1jSQlZZMWpRWJwMVAhGRmFJa3UDXzOi1BkCFQEQkppRW15MXxWsIQIVARCSmlNY00C1LhUBEJGGVVtdHdaAYVAhERGJKaXVDVE8dBRUCEZGY4ZwLdg11lkJgZo+a2W4zW95s22/MbLWZLTWzF8ysa6SOLyISbyrrfPgDrlN1DT0OnNFi29vAEc650cAa4OYIHl9EJK6UVkf/qmKIYCFwzr0HlLTY9pZzzhe6+xHQL1LHFxGJN42FoGsCnT56JfC6h8cXEYkppTXB6SW6ZXWerqE2mdktgA/4Rzv7TDOz+WY2v7i4OHrhREQ8si9RWgRmdhkwFfiWc861tZ9z7iHn3Hjn3PiCgoLoBRQR8UhZdbBFEO0xguitfACY2RnAT4AvOeeqo3lsEZFY99kYQSfpGjKzp4A5wAgz22pmVwF/BHKAt81ssZn9OVLHFxGJN17MPAoRbBE45y5uZfMjkTqeiEi821ddH/WZR0FXFouIxIyy6ga6Rnl8AFQIREQ6rN4XIBBo8xyXQ1Zao0IgIhLTxvziLa59elHEXn+fBzOPggqBiEiHOOeoafDz6tIdETuGuoZERGJYvT8Q0ddvnHk02quTQZSvI4i24oo6KmobMDMstM0MGu+Z0eL35/czC95OMiMp9HiSGZZE0+0k23+/pn0s+JoiEv98/siNDUDwGgJ/wNE9Oz2ix2lNpy4E976zlr99tMnTDMGC8lnBwCApVDCSzUhKMpKTgo+lJhspyUZqUhKpyUnB28lJZKQm0SU1mS5pyWSkJgdvt7ifnZFC96w0umen0z0rjcK8DFKT1eATCZeGZi2CqjofWenh/frcU1kHQEGOCkFYnX90P44e2A1HsJI7F/wBcASbYk013rH/fk2/Xeh5joCDgAveb/zt2H97436f7RO632K/QMDhd67ptz8APn8AX8DR4A/Q4A/g8zvq/QHqfAH2VtVTs89PTYOf2gY/NfXB222dwJCabAwpyGZYrxzG9Mvj3DF96JmbEcm3W6RTa941tLuijkFhLgTFFaFCoBZBeI3p35Ux/Tvv2jfOBQtFTb2filofe6vqKamqY09FPRv2VrFmZwWLNu/jX0u2c8frqzl1VC9unTqKft0yvY4uEneadw0VV9QxqEdWWF+/rCY0vYQHg8WduhB0dmZGekoy6SnJdM1Mo39+61/w64srmTF/K3+ds5Ezfr+Hv101kaMGdItuWJE417xraNHmfUwclB/W16+sCy7Vkh3mlkZHqBM5AQwuyGb6mSN564YTyM9K44rHP2Z7aY3XsUTiSvNCcMfrq8P++tX1foCwjz10hApBAunXLZO/XjmR6no/d7/5iddxROJKQ7OuoeSk8J8N2NgiyExLDvtrH4gKQYIp6pHFJccM5OUl25vOUhCRA2veIjhuaI+wv35VnY+UJCM9yjOPggpBQrp4Yn98AcdLi7d7HUUkbjRvETQO7IZTdb2fzLRkT649UiFIQMN65TCyMIc3V+z0OopI3GhsEXTPSqM8AoWgss7nyUAxqBAkrNMOL2T+xhL2qntIpEOaCkF2GuW1kWgR+MhUIZBoOu2wXgQczFy1y+soInGh8TqCHtnplFY30M6S619IZZ3fkzOGQIUgYR3eJ5d+3brw5goVApGOaLyyeHNJNb6AY+6GkrC+fnWdjywPzhgCFYKEZWaccXgh76/dE5GBL5HOprFFUFEbPM3zo/V7w/r6lRGYv6ijVAgS2Fmje1PvDzBzpVoFIgfSOEbw2wvGANCna5ewvn51vV8tAom+o/p3pW/XLry6LHILbYh0Fo1dQ40FINxnDkViRtOOUiFIYGbGWUcWMnttMSVV9V7HEYlpjV1D3bKCk8I1dhGFi7qGxDMXjO9Pg9/xzPwtXkcRiWmNXUNpyUlkpSU3TQkRDr7QdPNZaSoE4oHhvXI4ZlA+j3+4kdoGv9dxRGJWYyFITUkiKz2FqjAWgqqmCec0RiAe+e9ThrOjrJaHZ6/3OopIzGqcYiI1KYnsjBQqwlgIquuDr6WuIfHM5CHdOf3wXtw/ax27y2u9jiMSk5paBMlGdrhbBHUqBBIDfnrWKBr8gYjMsy7SGfj8AcyCU1Bnp6dQGcbB4qq6UNdQZzt91MweNbPdZra82bZ8M3vbzNaGfmuZrBgxsHsW3ztxKC8s2sYrSzUrqUhL9X5HalISZkZWekpYB4s7c4vgceCMFtumA+8454YB74TuS4y49stDGdO/K7e8sJwtJdVexxGJKQ3+AKnJwSmic8JdCBoHizvbWUPOufeAlpNxnAc8Ebr9BPCVSB1fDl5qchL3fmMszjmue3pR2CfVEolnPn+A1NCiMWE/a6ipRdDJuoba0Ms5twMg9LtnlI8vBzCwexa3Tj2MRZtLeeLDjV7HEYkZ9X5HSlLwKzM7I7wtgspO3DV0SMxsmpnNN7P5xcXFXsdJKF8f149TRvXkF6+s5I3lWrxGBIItgrRQ11B2egoNfkedLzzX3iTa6aO7zKw3QOj37rZ2dM495Jwb75wbX1BQELWAEjwr4r6LxzGmX1eue3oRCzbt8zqSiOca/AFSkkMtgtAXdrjOHKoMnTWUmZoYXUMvA5eFbl8GvBTl40sHdUlL5tHLJ1CYm8G1Ty6kpl5XHUtia/C7psHixr/cG0/7PFTVdT4y05JJSor+esUQ2dNHnwLmACPMbKuZXQXcCZxqZmuBU0P3JUblZ6Xxm/NHs72slkc/2OB1HBFP1fsDpLZoEVTUhWcG0qp67yacA4jYkZ1zF7fx0MmROqaE3zGDu3PKqF48MGsd35jQn+7Z6V5HEvFETb2fzNAFX9nhbhHU++niUbcQxPBgscSO6WeOoLrex33//tTrKCKe2bCnit6htQiyM0JjBGFqEdT7AqSnePd1rEIgBzS0Zw4XHN2fJ+duZkdZjddxRKKuut7HttIaRvbKASA7dL5/ZZhaBA3Nup280KEjm9n1ZpZrQY+Y2UIzOy3S4SR2XHvyUByOe99Rq0ASz4Y9VQAMLsgGIDs9uDhNuM4aqvMFSIuDFsGVzrly4DSgALgCDfQmlH7dMrl44gCemb+FTXurvI4jElU7SoOz8vbrFuwaarwCOFxXF9fHSSFoPKfpLOAx59ySZtskQfzgpKGkJifxo2eWahEbSSiNZwfldgm2BBrnBArXmgQN/gBpsd41BCwws7cIFoI3zSwHCEQulsSiXrkZ3Pn1I/l4Uwk3/HMx/oDmIpLEUF4T/MLPDQ0SJyUZWWnJ4WsR+OOjRXAVwZlCJzjnqoFUgt1DkmDOG9uXW84axevLd/LrV1d6HUckKipqgy2CnIzUpm3ZGeFbk6DB99nFal7o6HUEk4HFzrkqM7sEGAf8IXKxJJZdffxgNu2t5rEPNnJY71wuGN/f60giEbWttJb8rLT9/mrPSk+hsj6cLYLYv47gAaDazMYAPwY2AX+NWCqJebecPYpjh3Tn1heXN51RIdJZbS6pYkB+5n7bcsK4Slm9Lz7GCHwuODn9ecAfnHN/AHIiF0tiXUZqMvdcOJa0lCR+8txSrV0gndrmkurPF4KMVMprw3RBmT9AWop3XUMdLQQVZnYzcCnwqpklExwnkARWmJfBT88axbwNJfxr6Q6v44hERIM/wPbSWgZ2378QdMtKY19VfViOES8tgouAOoLXE+wE+gK/iVgqiRsXju/P4X1yueO1VZqhVDql6c8twx9wn2sRdM9KoyRMhaAhHs4aCn35/wPIM7OpQK1zTmMEQnKS8bNzDmdHWS1//s86r+OIhFVJVT3PLdwKwEkj919QsVtmGuW1Phr8h34mfb0vPqaYuBCYB1wAXAjMNbPzIxlM4sfEQflMHd2bP/9nneYikk5lyZZSAJ68+hh6tJh5Nz8r2Du+r/rQWgWBgMMXcLHfIgBuIXgNwWXOuW8DE4HbIhdL4s30M0fiHPzu7TVeRxEJm8Yz4ob2yv7cY92y0gDYV3VoA8b1oRZFzLcIgCTnXPNlJfcexHMlAfTrlsmlkwfy7IKtfLKzwus4ImGxfFsZBTnpFLSyDkd+qBAc6jhBYyGIh2mo3zCzN83scjO7HHgVeC1ysSQeXfvloeRkpHLbi8t1Oql0Cku3lTG6bx5mnz+1M2yFwBcsBDHfNeSc+xHwEDAaGAM85Jz7SSSDSfzpmpnGD08fwbyNJfxnTbHXcUQOSWWdj3XFlYzu17XVx/MzQ4XgEMcIGuKoawjn3HPOuRudczc4516IZCiJXxeN709hbgZ/mb3e6ygih2Tz3mqcg2GtjA9A8zGCMLUIYrUQmFmFmZW38lNhZuXRCinxIy0liSuPK+KDT/fy9spdXscR+cJ2VQTXIOiV2/o63anJSeRkpBxy11BjiyBmu4accznOudxWfnKcc7nRCinx5YopgxjeK5v/eWl52C7BF4m24vI6AHrmZLS5T++8DLbuqz6k49T54qhrSKSjUpOTuOv8Mewqr+WuN1Z7HUfkC9kdahEU5LTeIgAY3COb9cWfTbq4cns5Fz44h3XFlR0+TmPXUDycNSRyUMb278qlkwby9LwtbN57aH8xiXihuKKO3IwUMlLbnh56cEEWm0uqufn5ZcxeW8xf52xk3oYS7ny9438AbSsNXoTZOObgBRUCiZjvnTiU5CTj9zN1kZnEn7KaBrpmtv/lPLggG1/A8dS8zVz6yDw+2RW8hubtlbs6vJzrna+vJjMtmSP6eNfbrkIgEVOYl8HlU4p4YfE2lm0t8zqOSIfVNvh5acl2MtPaXyxmeIszihZtLm26/egHG9p83o6yGm7852KKpr/K1n01TBrcnRSNEUhn9YOThpKfmcbtr63URWYSN56cuxnn4NTDerW734jCzy/L8pWxfeidl8Fdb3zCh5/u+dzjJ9z1LpPv+DfPL9rWtO13F4499NCHQIVAIio3I5XrTh7GR+tLmKWLzCROvLVyJyN65XDTaSPa3S+9leUlU5OTeOm/pgBw+eMf8+TczRRNf5X/e2M1izbvY3PJ/mNm9158FHmZ3i7vokIgEXfxxAEM7J7JrS8sp7iizus4Iu1asqWUj9aXcPoRhR3av3GdghOGFwDB2Xh75mRw4ogC6n0BfvrCMgAemLWOr97/IQDPf/9YNt55NhvvPJtzx/SJwL/i4HhSCMzsBjNbYWbLzewpM2v7RF2Je2kpSdxz4Vh2lddyz9ufeB1HpF0LN+8D4JsTB3Ro/6mjewNwy1mjWPPrM7lgfH8Arj95GAC5GSn86ZvjmvbPTk/hqP6tT1vhlZRoH9DM+gLXAYc552rMbAbwDeDxaGeR6Dl6YDcunTyQJz7cyPe+NJQBLZb9E4kVa3dX0jUztc0rilv64WkjOP3wws+NFxw1oBszrpnM4X1yyUpPYcKgk3lt6Q4uO7ao1UnsvORV11AK0MXMUoBMYLtHOSSKrjlhCGbGA//51OsoIm1ataOcYT2zO/xlnZRkjGnjL/yJg/LJSg/+vd0zJ4PLpwyKuSIAHhQC59w24G5gM7ADKHPOvdVyPzObZmbzzWx+cbEGGTuDwrwMrji2iKfmbWHWJ7sP/ASRKHPOsWpHeZszjnZWUS8EZtYNOA8YBPQBsszskpb7Oececs6Nd86NLygoiHZMiZAfnj6CAfmZ3DhjiU4nlZhTVtNAbUOA3nmJNWzpRdfQKcAG51yxc64BeB441oMc4oGM1GS+MbE/JVX1fLS+xOs4IvvZWR6cX6hQhSDiNgOTzCzTgp1lJwOrPMghHrlyyiByM1K4ccZiAgG1CiR27CwLFYJcFYKIcs7NBZ4FFgLLQhkeinYO8U5GajKXTxnEjrJa3lix0+s4Ik12h6ae7qVCEHnOuZ8550Y6545wzl3qnNNVRgnm+pOHMbRnNj9/eYUuMpOY0dg1pEIgEgXJScZ9Fx9FaU0D059bqoFjiQm7K2rplpnq6WphXkisf63ElFG9c/nx6SN4Z/Vunpq3xes4IpRWN9DtAFNPd0YqBOKpK6cMYmz/rvzmzdVUaFlL8VhZTQO5XbydAM4LKgTiqaQk42fnHMa+6gb++bFaBeKt8poG8lQIRKLvqAHdOH5YD+55ew0rt5d7HUcSVE29n3XFVWoRiHjlrvNHk52ewln3zuYnzy71Oo4kmAZ/gFH/8waVdT5GtFh1LBGoEEhM6J3Xhee/fyxHDejKP+dv4T0tYiNRNHvtZ5+3SyYN9DCJN1QIJGb065bJ09MmUdQ9OBfRxxs1BYVEx6xPislKS2bNr8884IL1nZEKgcSU9JRkfnvhWKrrfVzw5znc8doq/JqGQiJs095qBhVkJdz1A40S818tMe3ogd34900nMqhHFg++t57bXlrudSTp5FZsL2NgfpbXMTyjQiAxqTAvg3/f9CWOG9qDJ+du5t3VWr9AImN7aQ17Kuvpl9/F6yieUSGQmGVm3H3BGABumLGY6nqfx4mkM/r7R5sAOO2wji1W3xmpEEhMK8zL4JnvTqa0uoGbZixh+bYyryNJJ9NYCMYNSKxVyZpTIZCYN6Eon7OOLOT15TuZet/7/PhZrW4m4bGnso7yWh+XThoYk2sJR4sKgcSFuy8Yw21TDwNgxvytnP7795oWEXHOUa55iuQLmLlyFwBfHtnT4yTeSvE6gEhHZKalcNVxgzh3TB9uemYJ760pZtId7zD9zJE88v4Giivq6JqZyne/NITzxvahd17iDvxJx/197ib6devCiSMSe110tQgkrhTkpPPoZeO5bHLw6s87X1/dtLBNaXUDd76+msl3/Js/zFzrZUyJA+W1DSzfVs6EovyE7hYCFQKJQynJSfzivCNYcOsp9O3ahRtOGc6GO85ixjWTuenU4aSlJPG7mWuorNNZRtK2ueuDV65fOL6/x0m8p64hiVvds9P5YPqXm+5PHJTPxEH5TBnWg6/d/yFPz9vM1ccP9jChxLL31xbTJTWZcQMT92yhRmoRSKdzVP+uTBnand+9vYZNe6t4afE2bpyxmLJqDSjLZ95bu4dJg/NJT0n2Oorn1CKQTsfMuPNrozn+rnf50m9mNW1/afF2Lj+2iEsmDWRQj8SdTkBgS0k1G/ZUcWkCzjTaGrUIpFPqn5/J9ScPA+CIvrn069YFf8DxyPsbOOnuWUy4fSZXPf4xVRpHSEiz1+4B4IThPTxOEhvUIpBO64ZTh3PDqcOB4LUGT3y4kZU7ylmxvZyUJOOd1bs5697ZPH7FRLUQEsysT3bTOy+DIQWJtwhNa1QIJCGYGZdPGbTfttlri7nisY+5+KGP+Md3jtGXQoLYXV7Lu5/s5tJJRQl/2mgjdQ1Jwjp+WAF/+tY4dpbXcvJv/0NJVb3XkSQK3lm9mwa/46IJOm20kQqBJLTTDy/kV185AoBv/uUjzWGUAN5bU0xhbgbDE3Bt4raoEEjCu3TSQH5w0hBW76zgrdDcM9I51Tb4eX35To4f1kPdQs14UgjMrKuZPWtmq81slZlN9iKHSKPrTx7OyMIcbn91lVoFndisT4KL1E8e0t3jJLHFqxbBH4A3nHMjgTHAKo9yiACQlpLEVccNYnNJNS8v2d60feX2cq5/ehG1DX4P00m4rN1VAcCph/XyOElsifpZQ2aWC5wAXA7gnKsHNEonnjtxRHAq4uufXsz1Ty/mmi8N5sH/rAfg7ZW7WPbz00lOUndCPJuzfi8jC3PIyUj1OkpM8aJFMBgoBh4zs0Vm9rCZ6SRu8VxBTjp/+ua4pvuNRQCgut7P9/6+wItYEib1vgALNu3j2CG6iKwlLwpBCjAOeMA5dxRQBUxvuZOZTTOz+WY2v7i4ONoZJUGdPbo3K395Oi98/1jGDejKg5cezfr/PYsx/bvy1spdfPDpHq8jyhf0+vId1PkCHDM43+soMceiPTBmZoXAR865otD944Hpzrmz23rO+PHj3fz586OUUOTzVu8s54zfzwZg3k9PpmduhseJ5GA45xh082sArPzl6WSmJca1tGa2wDk3/kD7Rb1F4JzbCWwxsxGhTScDK6OdQ+RgjCzMbVoq8/i73tXgcZxZV1wFwAnDCxKmCBwMr84auhb4h5ktBcYC/+tRDpEOu+q4QXzrmAHU+QLc+85afP6A15Gkg+as3wvAL8893OMkscmT0uicWwwcsLkiEmtu/+qRrC+u4v5Z69hZVss9F431OpJ0wPtri+mTl8HA7pleR4lJurJY5CD97NxgF9Hzi7ZpsZs4UFbdwLufFHPa4YW6mrgNKgQiB2lkYS6vXXc8AGN++RaLt5R6nEja8/6ne6j3BTjryN5eR4lZKgQiX8BhfXI5Z0wfAL7ypw80LUUMW7atjNRkY0z/PK+jxCwVApEv6L6Lj2LaCYMBWLG93OM00pYV28sY3itHaxO3Q4VA5BB870tDAJh63/v8e7VmLo01zjmWbSvjyL5qDbRHhUDkEHTLSuNrR/UFYNpfF7CrvNbjRNLc1n01lFY3cIQKQbtUCEQO0T0XjeXBS4/GF3Bc+9Qir+NIM2tCs42O6p3jcZLYpkIgEganH17IOWP6MG9DCX95b/2BnyBR8enuSgCG9lQhaI8KgUiY/Oyc4PUFt7+2ipUaPI4Jn+6upCAnnbwumna6PSoEImHSIzudd276EgDPLdzqcRoBWLu7kmE9tTbxgagQiITRkIJsJg3O56PQ3DbiHeccn6oQdIgKgUiYTR7cgxXby1miK449tbO8lso6H0N7aXzgQFQIRMJs6pjgVAbn/ekD6nyartora3cFB4rVIjgwFQKRMBtSkM2PTg8ut/Hy4u0ep0lca3erEHSUCoFIBHz/xCH0ycvg2QVb+foDH7Jpb5XXkRLOki2l9MhOp3t2utdRYp4KgUgEmBlHF+Uzd0MJCzbt4+631jQ9NmfdXraUVHuYrvOr8/l595PdnDSiwOsocUGFQCRCzj6ysOn2wk378PkDzF2/l4v/8hEXPjjHw2Sd32/e+ISKWh9nj9bU0x2hxTtFIuSMI3rzxn8fz8cbSrjtpRUMveX1psd2lNWys6yWwrwMDxN2TrUNfh5+fwMAxw9Ti6Aj1CIQiaCRhbmcf3R/euZ81k89qEcWAI9+sMGrWJ3a3+ZsAuCBb40jOUkrknWEWgQiEdYlLZk5N5/M9tIa+nXrgplRNP1VHnpvPT89a5TX8TqN5dvKWLyllEdCrYHTDy88wDOkkVoEIlGQnGT0z89sWjP3kkkDAPjDzLX4/AEvo3UaU+97n1tfXM7O8lqmju5NkloDHaZCIOKBq48Lrmz2u5lruPn5ZVTV+bj3nbWs2tH2ZHWBgJbDbEttw/4X7l19/GCPksQni4e1VsePH+/mz5/vdQyRsHp5yXaua2X9gj55GfzivCMoqapj0uDuDOyexX3vrOVPsz7l/m+N48sje3mQNra9+8lurnjsYx67YgInjejpdZyYYWYLnHPjD7SfWgQiHjl3TB8e/vZn/482Xo28vayW7/x1Pj95bhmXPDKXFdvL+O3ba6htCPDnWVrroDVz1u0lLSWJyYO7ex0lLqkQiHjo5FE9+dHpI3h62iR+cNJQFt526n6Pbymp4ex73wfgq0f1ZcHmfeyrqvciakxbvq2MYT2zyUjVAvVfhAqBiIfMjB+cNJRJob9k87PSmPXDE1n1yzOYcc3kpv3uvmAM004YjD/guHHG4rCNFzT4A5z3pw+49cVlOOeo98XfwHW9L8DCzfuYUJTvdZS4pdNHRWJMUeg6gwlF3bj82CLGDezGuWP6AHDDKcP53cw1/ODJhTxwydHDMmhxAAAOHklEQVT7Pe/jjSV0SU0+qIXa73pjNUu2lLJkSylPzt1MwMG8n55Mz9z4udBt2bZSahsCHDNIheCL8qxFYGbJZrbIzF7xKoNILDMzfn7u4U1FAODaLw8F4PXlO5kxfwsb9lRR2+DnvTXFXPDnOUy9732q6nztvm5ZTQNXP/ExRdNf5S+zN9Al1J3S2MiYdMc7PDBrHe2dSLJ1X/XnztTxyhMfbiI9JYljND7whXnZIrgeWAXkephBJK4kJRl3fu1Ipj+/jB8/u7TVfV5esp2LJw7g9zPX8O/Vu+mZk8764ir+dvUx9O3ahdtfXcnMVbsBGFKQxSvXHs+o/3kDgK8d1ZfnF23j/95YzcRB3Th64P5/ZW8rrWHd7kq+/eg8Th7Zk0cunxDZf3AzlXU+Tr3nP5x1ZG/6du3ClKE96JmTzr+WbudbxwwgPystalk6G08KgZn1A84Gbgdu9CKDSLz6xsQBTB7SnV+9soqZq3Y1bb9s8kDmbijh5ueX0eAP8PuZa/d73jV/m8+EonxmzN9KdnoK1508lCumDCI1OYk/fXMcdT4/px1eyOaSauZv2sebK3btVwgWbCrh6w98NlneO6t3U+8LkJYS/o6FT3dXMLhHdtNFYat3lnPG72cDNF053Nz5R/cPe4ZE4sl1BGb2LHAHkAP80Dk3tb39dR2BSNtWbC9j2dYyLprQn/c/3cOlj8xremzq6N6cMqoXGalJfPfvC5u2v/xfUxjdr2ubr3n5Y/PYsKeKN//7BNJTkjAzrnhsHu9+UgzAgPxMNpdUM6Z/V176wRQA1hVXUlJVz5F989o8e6eyzkd1vY+eOW2PQcxcuYur/zqfC8f347TDCinqkckp97wHwOTB3Smtadjvwrsvj+zJo1FsmcSTjl5HEPVCYGZTgbOcc983sxNpoxCY2TRgGsCAAQOO3rRpU1RzisSrOev2ctmj8/juiUO48dThTdsf/2ADczeU8LuLxh7wNMu/fbSJ215cDsAJwws4Z3RvfvTsUg7rncuvv3oER/TJY/itn82m+odvjOX6pxc33V/ys9PIzUhpmlKj0fBbXqfeH2DZz08jJyMVgC0l1fzXU4sY3jOb2845jB/8YyGz1+75XKbJg7vz1LRJTfeLK+p4/MMN3HTqCE0n0YZYLgR3AJcCPiCD4BjB8865S9p6jloEItHV4A9wwz8X88rSHfttf/eHJzbNnrpqRzln/mH2AV/rR6eP4Mopg3ht2Q5uemZJ0/aZN57Q9Jd+S0XdM9m497PFe848opD7vzXuc4VF2hezhWC/g7fTImhOhUDEGyVV9Uy64x3qfQFmXDOZia2covnCoq3c8M/gF/zi/zmVsb98+6CPM7pfHtnpKXy4bi8As398Ev3zM/EHHJv2VjG4QOsOfxEdLQS6jkBE2pSflcaaX5/Z7j5fGduXuoYAA7tn0TUzjUW3ncrqnRUM7J7J3A17ufvNNWwrrQHgL98ez9j+XZlw+0wAfn7OYVx2bBFmhnOO+Zv2cUSfPLqkBbuukpNMRSAKNOmciESUzx/glheWM3ZAVy6eGJx++1evrGTZ1jL+8Z1jSE3WBAeREhddQx2lQiAicvA0+6iIiHSICoGISIJTIRARSXAqBCIiCU6FQEQkwakQiIgkOBUCEZEEp0IgIpLg4uKCMjMrBjYBeUBZaPOBbjf+7gF8firD9jV/vY4+1nJ7W/ejmbWtxzua9UC5Y+G99epz0Nbjh/Leep21rXytZW2+TZ+D2P0cDHTOFRzwWc65uPkBHuro7Wa/5x/KcTr6WMvtbd2PZta2Hu9o1g7k9vy99epzEIn31uusB/N+6nMQn5+Dtn7irWvoXwdxu/m2QzlORx9rub2t+9HM2tbjHc3a/HZb7/fBCvd769XnoK3HD+W99Tpry236HHTsufH0OWhVXHQNHQozm+86MNdGLIinrBBfeZU1cuIpr7K2Lt5aBF/EQ14HOAjxlBXiK6+yRk485VXWVnT6FoGIiLQvEVoEIiLSDhUCEZEEp0IgIpLgEroQmNnxZvZnM3vYzD70Ok97zCzJzG43s/vM7DKv87THzE40s9mh9/ZEr/N0hJllmdkCM5vqdZb2mNmo0Pv6rJl9z+s87TGzr5jZX8zsJTM7zes8B2Jmg83sETN71ussrQl9Rp8IvaffCudrx20hMLNHzWy3mS1vsf0MM/vEzD41s+ntvYZzbrZz7rvAK8ATsZwVOA/oCzQAW2M8qwMqgYxIZg3lCkdegJ8AMyKTsilTOD6zq0Kf2QuBiJ1aGKasLzrnvgNcDlwUqayhXOHIu945d1Ukc7Z0kLm/Bjwbek/PDWuQL3IVWiz8ACcA44DlzbYlA+uAwUAasAQ4DDiS4Jd985+ezZ43A8iN5azAdOCa0HOfjfGsSaHn9QL+EeufA+AU4BsEv7CmxnLW0HPOBT4EvhnrWUPP+y0wLtY/B82eF7H/vw4x983A2NA+T4YzRwpxyjn3npkVtdg8EfjUObcewMyeBs5zzt0BtNrkN7MBQJlzrjyWs5rZVqA+dNcfy1mb2QekRyJnozC9tycBWQT/Z6sxs9ecc4FYzBp6nZeBl83sVeDJcOcMV1YzM+BO4HXn3MJI5AxnXi8cTG6Cret+wGLC3JsTt4WgDX2BLc3ubwWOOcBzrgIei1iith1s1ueB+8zseOC9SAZrxUFlNbOvAacDXYE/RjZaqw4qr3PuFgAzuxzYE4ki0I6DfW9PJNhFkA68FtFkn3ewn9lrCba28sxsqHPuz5EM14qDfW+7A7cDR5nZzaGC4YW2ct8L/NHMzubQpqH4nM5WCKyVbe1eMeec+1mEshzIQWV1zlUTLFpeONiszxMsXF456M8BgHPu8fBHOaCDfW9nAbMiFeYADjbrvQS/vLxysHn3At+NXJwOazW3c64KuCISB4zbweI2bAX6N7vfD9juUZYDUdbIiae8yho58Za3UdRzd7ZC8DEwzMwGmVkawQHAlz3O1BZljZx4yquskRNveRtFP3e0RscjMNr+FLCDz06nvCq0/SxgDcFR91u8zqmsyqusyhvruTXpnIhIgutsXUMiInKQVAhERBKcCoGISIJTIRARSXAqBCIiCU6FQEQkwakQSNiZWWUUjnFuB6eXDucxTzSzY7/A844ys4dDty83My/mX/ocMytqOf1xK/sUmNkb0cok3lAhkJhlZsltPeace9k5d2cEjtne/FsnAgddCICfAvd9oUAec84VAzvMbIrXWSRyVAgkoszsR2b2sZktNbNfNNv+ogVXBFthZtOaba80s1+a2VxgspltNLNfmNlCM1tmZiND+zX9ZW1mj5vZvWb2oZmtN7PzQ9uTzOz+0DFeMbPXGh9rkXGWmf2vmf0HuN7MzjGzuWa2yMxmmlmv0FTB3wVuMLPFFlzdrsDMngv9+z5u7cvSzHKA0c65Ja08NtDM3gm9N++EpkTHzIaY2Ueh1/xlay0sC65W9aqZLTGz5WZ2UWj7hND7sMTM5plZTugv/9mh93Bha60aM0s2s980+291TbOHXwTCuiKWxBivL7HWT+f7ASpDv08DHiI4m2ISwQVATgg9lh/63QVYDnQP3XfAhc1eayNwbej294GHQ7cvB/4Yuv048EzoGIcRnMsd4HyCUzUnAYUE10c4v5W8s4D7m93vBk1X3V8N/DZ0++fAD5vt9yRwXOj2AGBVK699EvBcs/vNc/8LuCx0+0rgxdDtV4CLQ7e/2/h+tnjdrwN/aXY/j+AiJuuBCaFtuQRnGM4EMkLbhgHzQ7eLCC2IAkwDbg3dTgfmA4NC9/sCy7z+XOkncj+dbRpqiS2nhX4Whe5nE/wieg+4zsy+GtreP7R9L8FFd55r8TqNU1ovIDgXf2tedMF1BFaaWa/QtuOAZ0Lbd5rZu+1k/Wez2/2Af5pZb4JfrhvaeM4pwGFmTbMG55pZjnOuotk+vYHiNp4/udm/52/AXc22fyV0+0ng7laeuwy428z+D3jFOTfbzI4EdjjnPgZwocWWzCyL4Dz2Ywm+v8Nbeb3TgNHNWkx5BP+bbAB2A33a+DdIJ6BCIJFkwB3OuQf32xhcXOUUYLJzrtrMZhFc3xig1jnXcgW2utBvP21/Zuua3bYWvzuiqtnt+4B7nHMvh7L+vI3nJBH8N9S087o1fPZvO5AOT/zlnFtjZkcTnJzsDjN7i2AXTmuvcQOwCxgTylzbyj5GsOX1ZiuPZRD8d0gnpTECiaQ3gSvNLBvAzPqaWU+Cf23uCxWBkcCkCB3/feDrobGCXgQHezsiD9gWun1Zs+0VQE6z+28B/9V4J/QXd0urgKFtHOdDglMMQ7AP/v3Q7Y8Idv3Q7PH9mFkfoNo593eCLYZxwGqgj5lNCO2TExr8ziPYUggAlxJcE7elN4HvmVlq6LnDQy0JCLYg2j27SOKbCoFEjHPuLYJdG3PMbBnwLMEv0jeAFDNbCvyK4BdfJDxHcGrf5cCDwFygrAPP+znwjJnNBvY02/4v4KuNg8XAdcD40ODqSlpZ3co5t5rgUo05LR8LPf+K0PtwKXB9aPt/Azea2TyCXUutZT4SmGdmi4FbgF875+qBiwguaboEeJvgX/P3A5eZ2UcEv9SrWnm9h4GVwMLQKaUP8lnr6yTg1VaeI52EpqGWTs3Msp1zlRZcj3YeMMU5tzPKGW4AKpxzD3dw/0ygxjnnzOwbBAeOz4toyPbzvEdw0fd9XmWQyNIYgXR2r5hZV4KDvr+KdhEIeQC44CD2P5rg4K4BpQTPKPKEmRUQHC9REejE1CIQEUlwGiMQEUlwKgQiIglOhUBEJMGpEIiIJDgVAhGRBKdCICKS4P4f5PBjGrUGsOgAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"#find suitable learning rates\n",
"learner.lr_find(1e-07, 1e2)\n",
"learner.sched.plot()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"lr = 7 * 1e-4"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# learner.metrics = [accuracy]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "284a1da6aec64ed086559a68a654cb53",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<p>Failed to display Jupyter Widget of type <code>HBox</code>.</p>\n",
"<p>\n",
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
" that the widgets JavaScript is still loading. If this message persists, it\n",
" likely means that the widgets JavaScript library is either not installed or\n",
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
" Widgets Documentation</a> for setup instructions.\n",
"</p>\n",
"<p>\n",
" If you're reading this message in another frontend (for example, a static\n",
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
" it may mean that your frontend doesn't currently support widgets.\n",
"</p>\n"
],
"text/plain": [
"HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch trn_loss val_loss \n",
" 0 3.704231 3.870589 \n",
" 1 3.460246 3.67693 \n",
" 2 3.443637 3.621497 \n",
" 3 3.333726 3.584137 \n",
" 4 3.291139 3.527215 \n",
" 5 3.270766 3.492533 \n",
" 6 3.254767 3.482341 \n",
" 7 3.255783 3.508195 \n",
" 8 3.243309 3.483084 \n",
" 9 3.260235 3.45983 \n",
" 10 3.209985 3.443384 \n",
" 11 3.192712 3.427085 \n",
" 12 3.191212 3.414559 \n",
" 13 3.175301 3.407779 \n",
" 78%|███████▊ | 8182/10448 [46:56<12:59, 2.91it/s, loss=3.32]"
]
}
],
"source": [
"learner.fit(lr, 4, wds=wd, cycle_len=1, cycle_mult=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"learner.save_encoder('adam1_enc')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"learner.load_encoder('adam1_enc')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch trn_loss val_loss \n",
" 0 3.178207 3.409042 \n",
" 1 3.213273 3.501319 \n",
" 2 3.226869 3.497255 \n",
" 3 3.216037 3.48765 \n",
" 4 3.178982 3.477347 \n",
" 5 3.181992 3.469156 \n",
" \r"
]
}
],
"source": [
"# learner.fit(lr, 1, wds=1e-6, cycle_len=20, cycle_save_name='adam3_20')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# learner.save_encoder('adam3_enc')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"m = learner.model\n",
"pickle.dump(m,open(f'{PATH}models/wiki_lang.pkl','wb'))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"SequentialRNN(\n",
" (0): RNN_Encoder(\n",
" (encoder): Embedding(85686, 300, padding_idx=1)\n",
" (encoder_with_dropout): EmbeddingDropout(\n",
" (embed): Embedding(85686, 300, padding_idx=1)\n",
" )\n",
" (rnns): ModuleList(\n",
" (0): WeightDrop(\n",
" (module): LSTM(300, 500, dropout=0.105)\n",
" )\n",
" (1): WeightDrop(\n",
" (module): LSTM(500, 500, dropout=0.105)\n",
" )\n",
" (2): WeightDrop(\n",
" (module): LSTM(500, 300, dropout=0.105)\n",
" )\n",
" )\n",
" (dropouti): LockedDropout(\n",
" )\n",
" (dropouths): ModuleList(\n",
" (0): LockedDropout(\n",
" )\n",
" (1): LockedDropout(\n",
" )\n",
" (2): LockedDropout(\n",
" )\n",
" )\n",
" )\n",
" (1): LinearDecoder(\n",
" (decoder): Linear(in_features=300, out_features=85686, bias=False)\n",
" (dropout): LockedDropout(\n",
" )\n",
" )\n",
")"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# TEXT = pickle.load(open(f'{PATH}models/TEXT.pkl','rb'))\n",
"# m = pickle.load(open(f'{PATH}models/wiki_lang.pkl','rb'))\n",
"m[0].bs=1\n",
"m.eval()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"def gen_text(ss,topk):\n",
" s = [tokenize(ss)]\n",
" t = TEXT.numericalize(s)\n",
" m.reset()\n",
" pred,*_ = m(t)\n",
" pred_i = torch.topk(pred[-1], topk)[1]\n",
" return [TEXT.vocab.itos[o] for o in to_np(pred_i)]\n",
"\n",
"def gen_sentences(ss,nb_words):\n",
" result = []\n",
" s = [tokenize(ss)]\n",
" t = TEXT.numericalize(s)\n",
" m.reset()\n",
" pred,*_ = m(t)\n",
" for i in range(nb_words):\n",
" pred_i = pred[-1].topk(2)[1]\n",
" pred_i = pred_i[1] if pred_i.data[0] < 2 else pred_i[0]\n",
" result.append(TEXT.vocab.itos[pred_i.data[0]])\n",
" pred,*_ = m(pred_i[0].unsqueeze(0))\n",
" return(result)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ఆయన', '<unk>', 'ఈ', 'ఆ', 'ఆమె', '\"', 'తరువాత', 'అయితే', 'తన', 'ఆయనకు']"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_sentence = \"ఆయన కుటుంబం ఆయనకు వారి స్తోమత ప్రకారం వైద్యాన్ని అందించింది.\"\n",
"gen_text(test_sentence, 10)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'ఆయనతండ్రిపేరు\"శ్రీకృష్ణ\".ఆయనతండ్రిపేరు\"శ్రీకృష్ణ\".ఆయనతండ్రిపేరు\"శ్రీకృష్ణ\".ఆయనతండ్రిపేరు\"శ్రీకృష్ణ\".ఆయనతండ్రిపేరు\"శ్రీకృష్ణ\".ఆయనతండ్రిపేరు\"శ్రీకృష్ణ\".ఆయనతండ్రి'"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"''.join(gen_sentences(test_sentence, 50))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Embeddings"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"emb_weights = list(learner.model.named_parameters())[0][1]\n",
"emb_np = to_np(emb_weights.data)\n",
"\n",
"TEXT = pickle.load(open(f'{PATH}models/TEXT.pkl','rb'))\n",
"TEXT.vocab.set_vectors(vectors=emb_weights.data, dim=300, stoi=TEXT.vocab.stoi)\n",
"pickle.dump(TEXT, open(f'{PATH}models/TEXT_vec.pkl','wb'))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"TEXT_vec = pickle.load(open(f'{PATH}models/TEXT_vec.pkl','rb'))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>290</th>\n",
" <th>291</th>\n",
" <th>292</th>\n",
" <th>293</th>\n",
" <th>294</th>\n",
" <th>295</th>\n",
" <th>296</th>\n",
" <th>297</th>\n",
" <th>298</th>\n",
" <th>299</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>&lt;unk&gt;</th>\n",
" <td>0.162756</td>\n",
" <td>-0.118457</td>\n",
" <td>0.066381</td>\n",
" <td>-0.056356</td>\n",
" <td>-0.202901</td>\n",
" <td>0.066646</td>\n",
" <td>0.113037</td>\n",
" <td>0.568882</td>\n",
" <td>-0.354657</td>\n",
" <td>0.329644</td>\n",
" <td>...</td>\n",
" <td>-0.195009</td>\n",
" <td>0.371669</td>\n",
" <td>0.091774</td>\n",
" <td>0.007729</td>\n",
" <td>0.158614</td>\n",
" <td>0.325467</td>\n",
" <td>-0.185765</td>\n",
" <td>0.930058</td>\n",
" <td>-0.041299</td>\n",
" <td>-0.026871</td>\n",
" </tr>\n",
" <tr>\n",
" <th>&lt;pad&gt;</th>\n",
" <td>-0.479526</td>\n",
" <td>0.014009</td>\n",
" <td>-0.450860</td>\n",
" <td>-0.361312</td>\n",
" <td>0.154549</td>\n",
" <td>0.071390</td>\n",
" <td>-0.295752</td>\n",
" <td>-0.418422</td>\n",
" <td>0.770350</td>\n",
" <td>0.555413</td>\n",
" <td>...</td>\n",
" <td>0.291602</td>\n",
" <td>-0.546950</td>\n",
" <td>-0.103712</td>\n",
" <td>-0.113335</td>\n",
" <td>-0.270057</td>\n",
" <td>-0.270375</td>\n",
" <td>0.351973</td>\n",
" <td>-0.412007</td>\n",
" <td>0.125875</td>\n",
" <td>-0.121921</td>\n",
" </tr>\n",
" <tr>\n",
" <th>.</th>\n",
" <td>-0.126146</td>\n",
" <td>-0.098541</td>\n",
" <td>-0.013123</td>\n",
" <td>0.106648</td>\n",
" <td>0.528485</td>\n",
" <td>-0.003247</td>\n",
" <td>-0.646804</td>\n",
" <td>0.522134</td>\n",
" <td>0.108019</td>\n",
" <td>0.543941</td>\n",
" <td>...</td>\n",
" <td>0.040240</td>\n",
" <td>0.321279</td>\n",
" <td>0.051702</td>\n",
" <td>-0.037312</td>\n",
" <td>-0.288039</td>\n",
" <td>0.988383</td>\n",
" <td>-0.087501</td>\n",
" <td>0.600596</td>\n",
" <td>-0.159826</td>\n",
" <td>0.282194</td>\n",
" </tr>\n",
" <tr>\n",
" <th>,</th>\n",
" <td>0.043711</td>\n",
" <td>-0.091255</td>\n",
" <td>0.022957</td>\n",
" <td>0.211157</td>\n",
" <td>0.173500</td>\n",
" <td>0.058316</td>\n",
" <td>1.159627</td>\n",
" <td>0.605265</td>\n",
" <td>0.015381</td>\n",
" <td>0.610853</td>\n",
" <td>...</td>\n",
" <td>-0.099935</td>\n",
" <td>0.480227</td>\n",
" <td>0.057349</td>\n",
" <td>-0.036091</td>\n",
" <td>0.111147</td>\n",
" <td>0.623288</td>\n",
" <td>-0.184326</td>\n",
" <td>1.418015</td>\n",
" <td>0.020298</td>\n",
" <td>0.187803</td>\n",
" </tr>\n",
" <tr>\n",
" <th>కి</th>\n",
" <td>0.373087</td>\n",
" <td>0.145757</td>\n",
" <td>-0.918256</td>\n",
" <td>-1.009143</td>\n",
" <td>-1.178270</td>\n",
" <td>-0.274561</td>\n",
" <td>-0.707244</td>\n",
" <td>0.399376</td>\n",
" <td>0.197722</td>\n",
" <td>1.274530</td>\n",
" <td>...</td>\n",
" <td>0.545980</td>\n",
" <td>-0.941536</td>\n",
" <td>-0.132543</td>\n",
" <td>-0.372984</td>\n",
" <td>0.386519</td>\n",
" <td>0.748406</td>\n",
" <td>-0.599072</td>\n",
" <td>-1.956071</td>\n",
" <td>-0.498130</td>\n",
" <td>1.298580</td>\n",
" </tr>\n",
" <tr>\n",
" <th>నుండి</th>\n",
" <td>0.271972</td>\n",
" <td>0.008300</td>\n",
" <td>-0.261310</td>\n",
" <td>-0.271940</td>\n",
" <td>-1.232825</td>\n",
" <td>-0.001914</td>\n",
" <td>-1.091928</td>\n",
" <td>0.509688</td>\n",
" <td>0.286951</td>\n",
" <td>0.536997</td>\n",
" <td>...</td>\n",
" <td>-0.270993</td>\n",
" <td>-0.130284</td>\n",
" <td>-0.021892</td>\n",
" <td>-0.098667</td>\n",
" <td>-0.450791</td>\n",
" <td>0.570602</td>\n",
" <td>0.177364</td>\n",
" <td>-0.171380</td>\n",
" <td>0.128581</td>\n",
" <td>-0.475457</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ఉన్నాయి</th>\n",
" <td>-0.114024</td>\n",
" <td>-0.124042</td>\n",
" <td>-0.311016</td>\n",
" <td>-0.048162</td>\n",
" <td>0.314308</td>\n",
" <td>0.240846</td>\n",
" <td>-4.632638</td>\n",
" <td>0.775169</td>\n",
" <td>0.661511</td>\n",
" <td>0.968530</td>\n",
" <td>...</td>\n",
" <td>-0.004587</td>\n",
" <td>0.103790</td>\n",
" <td>0.014730</td>\n",
" <td>-0.635435</td>\n",
" <td>0.146665</td>\n",
" <td>1.464527</td>\n",
" <td>0.039011</td>\n",
" <td>0.259942</td>\n",
" <td>0.172892</td>\n",
" <td>0.677400</td>\n",
" </tr>\n",
" <tr>\n",
" <th>మీ</th>\n",
" <td>1.038532</td>\n",
" <td>0.625470</td>\n",
" <td>-0.522149</td>\n",
" <td>-0.468230</td>\n",
" <td>-0.561634</td>\n",
" <td>0.374316</td>\n",
" <td>-0.048730</td>\n",
" <td>0.466691</td>\n",
" <td>-0.038677</td>\n",
" <td>1.585641</td>\n",
" <td>...</td>\n",
" <td>0.840410</td>\n",
" <td>2.888513</td>\n",
" <td>0.066506</td>\n",
" <td>0.026903</td>\n",
" <td>0.290671</td>\n",
" <td>0.485242</td>\n",
" <td>0.535367</td>\n",
" <td>-1.048379</td>\n",
" <td>0.104699</td>\n",
" <td>0.856214</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ఉంది</th>\n",
" <td>0.111868</td>\n",
" <td>0.211571</td>\n",
" <td>-0.292743</td>\n",
" <td>-0.233061</td>\n",
" <td>0.190581</td>\n",
" <td>0.227745</td>\n",
" <td>-1.724061</td>\n",
" <td>0.750188</td>\n",
" <td>0.477490</td>\n",
" <td>1.293393</td>\n",
" <td>...</td>\n",
" <td>-0.310083</td>\n",
" <td>0.125917</td>\n",
" <td>-0.133561</td>\n",
" <td>-0.162669</td>\n",
" <td>0.177769</td>\n",
" <td>1.329759</td>\n",
" <td>0.129581</td>\n",
" <td>0.850391</td>\n",
" <td>0.244996</td>\n",
" <td>0.585481</td>\n",
" </tr>\n",
" <tr>\n",
" <th>గ్రామం</th>\n",
" <td>0.350491</td>\n",
" <td>-0.038242</td>\n",
" <td>0.184957</td>\n",
" <td>-0.552766</td>\n",
" <td>-1.279211</td>\n",
" <td>0.063894</td>\n",
" <td>1.263454</td>\n",
" <td>0.927973</td>\n",
" <td>0.461793</td>\n",
" <td>1.577360</td>\n",
" <td>...</td>\n",
" <td>0.149890</td>\n",
" <td>0.034798</td>\n",
" <td>-0.373647</td>\n",
" <td>0.095328</td>\n",
" <td>0.144108</td>\n",
" <td>-2.156482</td>\n",
" <td>0.859847</td>\n",
" <td>0.522844</td>\n",
" <td>0.414594</td>\n",
" <td>0.991583</td>\n",
" </tr>\n",
" <tr>\n",
" <th>దూరంలో</th>\n",
" <td>-0.230388</td>\n",
" <td>0.472743</td>\n",
" <td>-0.017128</td>\n",
" <td>-0.983632</td>\n",
" <td>3.105000</td>\n",
" <td>-0.159550</td>\n",
" <td>-0.509644</td>\n",
" <td>0.722185</td>\n",
" <td>-0.487401</td>\n",
" <td>1.580943</td>\n",
" <td>...</td>\n",
" <td>0.426283</td>\n",
" <td>0.063384</td>\n",
" <td>-0.045331</td>\n",
" <td>0.543215</td>\n",
" <td>0.471905</td>\n",
" <td>1.090400</td>\n",
" <td>-0.339869</td>\n",
" <td>-1.046772</td>\n",
" <td>0.323809</td>\n",
" <td>1.054026</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>0.744862</td>\n",
" <td>-0.247973</td>\n",
" <td>-0.146063</td>\n",
" <td>-0.577485</td>\n",
" <td>-0.807728</td>\n",
" <td>-0.236516</td>\n",
" <td>-0.613125</td>\n",
" <td>0.630741</td>\n",
" <td>0.188854</td>\n",
" <td>1.338283</td>\n",
" <td>...</td>\n",
" <td>0.179789</td>\n",
" <td>0.452522</td>\n",
" <td>-0.251397</td>\n",
" <td>-0.129144</td>\n",
" <td>-0.205054</td>\n",
" <td>0.664164</td>\n",
" <td>0.249665</td>\n",
" <td>-0.132851</td>\n",
" <td>-0.025986</td>\n",
" <td>-0.582486</td>\n",
" </tr>\n",
" <tr>\n",
" <th>\"</th>\n",
" <td>-0.049747</td>\n",
" <td>0.314193</td>\n",
" <td>0.200771</td>\n",
" <td>-0.271289</td>\n",
" <td>0.850634</td>\n",
" <td>0.149836</td>\n",
" <td>0.352507</td>\n",
" <td>0.562104</td>\n",
" <td>0.010921</td>\n",
" <td>0.360467</td>\n",
" <td>...</td>\n",
" <td>-0.042071</td>\n",
" <td>0.167196</td>\n",
" <td>0.141774</td>\n",
" <td>-0.362660</td>\n",
" <td>0.660838</td>\n",
" <td>0.777507</td>\n",
" <td>-0.263045</td>\n",
" <td>0.621559</td>\n",
" <td>-0.025154</td>\n",
" <td>0.037878</td>\n",
" </tr>\n",
" <tr>\n",
" <th>గ్రామంలో</th>\n",
" <td>1.024729</td>\n",
" <td>0.615187</td>\n",
" <td>0.286993</td>\n",
" <td>-0.175022</td>\n",
" <td>0.460615</td>\n",
" <td>0.160861</td>\n",
" <td>-0.986948</td>\n",
" <td>1.342301</td>\n",
" <td>-0.253582</td>\n",
" <td>2.256250</td>\n",
" <td>...</td>\n",
" <td>0.418742</td>\n",
" <td>-0.390475</td>\n",
" <td>0.193697</td>\n",
" <td>0.124218</td>\n",
" <td>0.177791</td>\n",
" <td>-0.998894</td>\n",
" <td>0.674249</td>\n",
" <td>1.680783</td>\n",
" <td>0.267778</td>\n",
" <td>0.704820</td>\n",
" </tr>\n",
" <tr>\n",
" <th>)</th>\n",
" <td>0.026697</td>\n",
" <td>0.348036</td>\n",
" <td>-0.162904</td>\n",
" <td>0.012066</td>\n",
" <td>-0.294381</td>\n",
" <td>0.195183</td>\n",
" <td>0.641476</td>\n",
" <td>0.902897</td>\n",
" <td>0.153347</td>\n",
" <td>0.945835</td>\n",
" <td>...</td>\n",
" <td>-0.213940</td>\n",
" <td>0.043878</td>\n",
" <td>0.072876</td>\n",
" <td>0.701433</td>\n",
" <td>0.510473</td>\n",
" <td>1.081460</td>\n",
" <td>-0.019017</td>\n",
" <td>0.141006</td>\n",
" <td>0.305660</td>\n",
" <td>-0.029373</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>15 rows × 300 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 \\\n",
"<unk> 0.162756 -0.118457 0.066381 -0.056356 -0.202901 0.066646 \n",
"<pad> -0.479526 0.014009 -0.450860 -0.361312 0.154549 0.071390 \n",
". -0.126146 -0.098541 -0.013123 0.106648 0.528485 -0.003247 \n",
", 0.043711 -0.091255 0.022957 0.211157 0.173500 0.058316 \n",
"కి 0.373087 0.145757 -0.918256 -1.009143 -1.178270 -0.274561 \n",
"నుండి 0.271972 0.008300 -0.261310 -0.271940 -1.232825 -0.001914 \n",
"ఉన్నాయి -0.114024 -0.124042 -0.311016 -0.048162 0.314308 0.240846 \n",
"మీ 1.038532 0.625470 -0.522149 -0.468230 -0.561634 0.374316 \n",
"ఉంది 0.111868 0.211571 -0.292743 -0.233061 0.190581 0.227745 \n",
"గ్రామం 0.350491 -0.038242 0.184957 -0.552766 -1.279211 0.063894 \n",
"దూరంలో -0.230388 0.472743 -0.017128 -0.983632 3.105000 -0.159550 \n",
"10 0.744862 -0.247973 -0.146063 -0.577485 -0.807728 -0.236516 \n",
"\" -0.049747 0.314193 0.200771 -0.271289 0.850634 0.149836 \n",
"గ్రామంలో 1.024729 0.615187 0.286993 -0.175022 0.460615 0.160861 \n",
") 0.026697 0.348036 -0.162904 0.012066 -0.294381 0.195183 \n",
"\n",
" 6 7 8 9 ... 290 \\\n",
"<unk> 0.113037 0.568882 -0.354657 0.329644 ... -0.195009 \n",
"<pad> -0.295752 -0.418422 0.770350 0.555413 ... 0.291602 \n",
". -0.646804 0.522134 0.108019 0.543941 ... 0.040240 \n",
", 1.159627 0.605265 0.015381 0.610853 ... -0.099935 \n",
"కి -0.707244 0.399376 0.197722 1.274530 ... 0.545980 \n",
"నుండి -1.091928 0.509688 0.286951 0.536997 ... -0.270993 \n",
"ఉన్నాయి -4.632638 0.775169 0.661511 0.968530 ... -0.004587 \n",
"మీ -0.048730 0.466691 -0.038677 1.585641 ... 0.840410 \n",
"ఉంది -1.724061 0.750188 0.477490 1.293393 ... -0.310083 \n",
"గ్రామం 1.263454 0.927973 0.461793 1.577360 ... 0.149890 \n",
"దూరంలో -0.509644 0.722185 -0.487401 1.580943 ... 0.426283 \n",
"10 -0.613125 0.630741 0.188854 1.338283 ... 0.179789 \n",
"\" 0.352507 0.562104 0.010921 0.360467 ... -0.042071 \n",
"గ్రామంలో -0.986948 1.342301 -0.253582 2.256250 ... 0.418742 \n",
") 0.641476 0.902897 0.153347 0.945835 ... -0.213940 \n",
"\n",
" 291 292 293 294 295 296 \\\n",
"<unk> 0.371669 0.091774 0.007729 0.158614 0.325467 -0.185765 \n",
"<pad> -0.546950 -0.103712 -0.113335 -0.270057 -0.270375 0.351973 \n",
". 0.321279 0.051702 -0.037312 -0.288039 0.988383 -0.087501 \n",
", 0.480227 0.057349 -0.036091 0.111147 0.623288 -0.184326 \n",
"కి -0.941536 -0.132543 -0.372984 0.386519 0.748406 -0.599072 \n",
"నుండి -0.130284 -0.021892 -0.098667 -0.450791 0.570602 0.177364 \n",
"ఉన్నాయి 0.103790 0.014730 -0.635435 0.146665 1.464527 0.039011 \n",
"మీ 2.888513 0.066506 0.026903 0.290671 0.485242 0.535367 \n",
"ఉంది 0.125917 -0.133561 -0.162669 0.177769 1.329759 0.129581 \n",
"గ్రామం 0.034798 -0.373647 0.095328 0.144108 -2.156482 0.859847 \n",
"దూరంలో 0.063384 -0.045331 0.543215 0.471905 1.090400 -0.339869 \n",
"10 0.452522 -0.251397 -0.129144 -0.205054 0.664164 0.249665 \n",
"\" 0.167196 0.141774 -0.362660 0.660838 0.777507 -0.263045 \n",
"గ్రామంలో -0.390475 0.193697 0.124218 0.177791 -0.998894 0.674249 \n",
") 0.043878 0.072876 0.701433 0.510473 1.081460 -0.019017 \n",
"\n",
" 297 298 299 \n",
"<unk> 0.930058 -0.041299 -0.026871 \n",
"<pad> -0.412007 0.125875 -0.121921 \n",
". 0.600596 -0.159826 0.282194 \n",
", 1.418015 0.020298 0.187803 \n",
"కి -1.956071 -0.498130 1.298580 \n",
"నుండి -0.171380 0.128581 -0.475457 \n",
"ఉన్నాయి 0.259942 0.172892 0.677400 \n",
"మీ -1.048379 0.104699 0.856214 \n",
"ఉంది 0.850391 0.244996 0.585481 \n",
"గ్రామం 0.522844 0.414594 0.991583 \n",
"దూరంలో -1.046772 0.323809 1.054026 \n",
"10 -0.132851 -0.025986 -0.582486 \n",
"\" 0.621559 -0.025154 0.037878 \n",
"గ్రామంలో 1.680783 0.267778 0.704820 \n",
") 0.141006 0.305660 -0.029373 \n",
"\n",
"[15 rows x 300 columns]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"telugu2vec = pd.DataFrame(to_np(TEXT_vec.vocab.vectors))\n",
"telugu2vec.index = TEXT_vec.vocab.itos\n",
"telugu2vec.head(15)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"telugu2save = telugu2vec[~telugu2vec.index.str.contains(' ')]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"# remove tokens with weird characters due to bad segmentation - still leaves out lot of bad tokens\n",
"# we can decide to keep unknown and padding tokens in the beginning if we need them later.\n",
"telugu2save = telugu2save.iloc[4:-3, :]"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(85679, 300)"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"telugu2save.shape"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"telugu2save.to_csv(f'{PATH}models/telugu2vec.vec', sep=' ', header=False, line_terminator='\\n')"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"word_list = list(telugu2save.index)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"with open('word_list.txt', 'w+') as word_file:\n",
" word_file.write('\\n'.join(word_list))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"30.236264039141496"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# perplexity approximation\n",
"math.exp(3.409042)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"_draft": {
"nbviewer_url": "https://gist.github.com/0dd0df21cf404cf2bb51d0148c8b7d8b"
},
"gist": {
"data": {
"description": "fastai.text imdb example",
"public": true
},
"id": "0dd0df21cf404cf2bb51d0148c8b7d8b"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
},
"toc": {
"colors": {
"hover_highlight": "#DAA520",
"navigate_num": "#000000",
"navigate_text": "#333333",
"running_highlight": "#FF0000",
"selected_highlight": "#FFD700",
"sidebar_border": "#EEEEEE",
"wrapper_background": "#FFFFFF"
},
"moveMenuLeft": true,
"nav_menu": {
"height": "86px",
"width": "252px"
},
"navigate_menu": true,
"number_sections": true,
"sideBar": true,
"threshold": 4,
"toc_cell": false,
"toc_section_display": "block",
"toc_window_display": false,
"widenNotebook": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment