hanfried/n_gram_language_model.ipynb

## n_gram_language_model.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-04-27T14:19:31.470988Z",
     "start_time": "2018-04-27T14:19:30.622142Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import datasets\n",
    "\n",
    "DATASET = \"opensubs\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-04-27T14:22:10.734114Z",
     "start_time": "2018-04-27T14:19:31.472495Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OpenSubtitles data files:   0%|          | 3/2317 [00:00<01:26, 26.69it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading OpenSubtitles conversations in data/opensubs.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OpenSubtitles data files:   4%|▍         | 95/2317 [00:04<01:39, 22.38it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skipping file data/opensubs/OpenSubtitles/en/Action/2004/59_84873_113518_appurushdo.xml.gz with errors.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OpenSubtitles data files:  14%|█▍        | 334/2317 [00:16<01:37, 20.25it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skipping file data/opensubs/OpenSubtitles/en/Action/2003/602_152466_207871_batoru_rowaiaru_ii_rekuiemu.xml.gz with errors.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OpenSubtitles data files:  26%|██▌       | 608/2317 [00:31<01:27, 19.58it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skipping file data/opensubs/OpenSubtitles/en/Drama/2004/146_206647_272090_eternal_sunshine_of_the_spotless_mind.xml.gz with errors.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OpenSubtitles data files:  36%|███▌      | 839/2317 [00:43<01:16, 19.33it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skipping file data/opensubs/OpenSubtitles/en/Drama/2002/3265_149497_204017_unfaithful.xml.gz with errors.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OpenSubtitles data files:  37%|███▋      | 865/2317 [00:44<01:14, 19.38it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skipping file data/opensubs/OpenSubtitles/en/Drama/2003/1723_68784_89159_big_fish.xml.gz with errors.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OpenSubtitles data files:  44%|████▎     | 1011/2317 [00:52<01:07, 19.26it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skipping file data/opensubs/OpenSubtitles/en/Drama/2000/179_88528_119102_batoru_rowaiaru.xml.gz with errors.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OpenSubtitles data files:  53%|█████▎    | 1236/2317 [01:04<00:56, 19.21it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skipping file data/opensubs/OpenSubtitles/en/Horror/1922/1166_134135_184270_nosferatu_eine_symphonie_des_grauens.xml.gz with errors.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OpenSubtitles data files:  55%|█████▍    | 1263/2317 [01:06<00:55, 19.05it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skipping file data/opensubs/OpenSubtitles/en/Comedy/2004/2480_226704_299940_little_black_book.xml.gz with errors.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OpenSubtitles data files:  65%|██████▌   | 1510/2317 [01:25<00:45, 17.59it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skipping file data/opensubs/OpenSubtitles/en/Comedy/2003/529_124078_171007_how_to_lose_a_guy_in_10_days.xml.gz with errors.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OpenSubtitles data files:  86%|████████▋ | 2004/2317 [01:59<00:18, 16.75it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skipping file data/opensubs/OpenSubtitles/en/Family/2001/3935_19508_22105_cats__dogs.xml.gz with errors.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OpenSubtitles data files: 100%|██████████| 2317/2317 [02:20<00:00, 16.52it/s]\n",
      "100%|██████████| 1648080/1648080 [00:18<00:00, 89958.80it/s]\n"
     ]
    }
   ],
   "source": [
    "dataset_path = os.path.join(\"data\", DATASET)\n",
    "data = datasets.readOpensubsData(dataset_path, max_len=35)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-04-27T14:22:10.753722Z",
     "start_time": "2018-04-27T14:22:10.736145Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('i il make you a sandwich\\n', 'make it to go\\n'),\n",
       " ('is shayna coming or what\\n', 'come on give it to me\\n'),\n",
       " ('come on give it to me\\n', 'oh shayna oh yeah\\n'),\n",
       " ('shayna yeah\\n', 'kim s been looking for you\\n'),\n",
       " ('kim s been looking for you\\n', 'she in her room kitchen i think\\n'),\n",
       " ('she in her room kitchen i think\\n', 'thanks\\n'),\n",
       " ('hey guys shayna\\n', 'oh shit what s going on\\n'),\n",
       " ('oh shit what s going on\\n', 'surprise surprise\\n'),\n",
       " ('i m sure\\n', 'coming back from a break\\n'),\n",
       " ('coming back from a break\\n', 'no\\n'),\n",
       " ('no\\n', 'just starting then\\n'),\n",
       " ('just starting then\\n', 'if you need someone to practice on\\n'),\n",
       " ('just dry a few minutes\\n', 'now what do you say\\n'),\n",
       " ('now what do you say\\n', 'cotton candy or red vine red\\n'),\n",
       " ('do you have clear sure\\n', 'clear whose kid are you\\n'),\n",
       " ('clear whose kid are you\\n', 'red is bold babycakes\\n'),\n",
       " ('red is bold babycakes\\n', 'it says look here and we il match\\n'),\n",
       " ('daddy will hate it even better\\n', 'mom\\n'),\n",
       " ('and he still thinks that\\n', 'three weeks on one week off\\n'),\n",
       " ('three weeks on one week off\\n', 'company girls\\n'),\n",
       " ('kim\\n', 'rickie lee\\n'),\n",
       " ('rickie lee\\n', 'hi i m lavender rose\\n'),\n",
       " ('hi i m lavender rose\\n', 'cat\\n'),\n",
       " ('are you sure i don t know you\\n', 'you don t\\n'),\n",
       " ('i will talk to you later\\n', 'you have fun at the party tomorrow\\n'),\n",
       " ('i will love you\\n', 'love you more\\n'),\n",
       " ('love you more\\n', 'wait a sec\\n'),\n",
       " ('yeah the nails that was me too\\n', 'what s next a g string\\n'),\n",
       " ('all day sucker what s that\\n', 'guess\\n'),\n",
       " ('maybe you could blow me\\n', 'and i could\\n'),\n",
       " ('fuck you after\\n', 'okay\\n'),\n",
       " ('so that s velvet\\n', 'knight with a k\\n'),\n",
       " ('manager\\n', 'whatever\\n'),\n",
       " ('man\\n', 'let me tell you how it goes here\\n'),\n",
       " ('because you look awfully familiar\\n', '\\n'),\n",
       " ('thirteen fourteen nineteen\\n', 'nineteen years old almost twenty\\n'),\n",
       " ('nineteen years old almost twenty\\n', '\\n'),\n",
       " ('excuse me\\n', 'so why did you stop you were good\\n'),\n",
       " ('anyhow those charges didn t stick\\n', 'i m sorry that happened to you\\n'),\n",
       " ('yeah well\\n', 'look here s what\\n'),\n",
       " ('look here s what\\n', 'end of every show\\n'),\n",
       " ('you going to tell him ever\\n', 'why would i\\n'),\n",
       " ('why would i\\n', 'it s who you are\\n'),\n",
       " ('it s who you are\\n', 'it s who i was\\n'),\n",
       " ('where s it on your ass\\n', 'no wait i ve seen your ass\\n'),\n",
       " ('yeah i m going to wonder\\n', 'you know what else i m going to do\\n'),\n",
       " ('you know what else i m going to do\\n',\n",
       "  'i m gonna keep my big mouth shut\\n'),\n",
       " ('i m gonna keep my big mouth shut\\n', 'the past is the past tay\\n'),\n",
       " ('the past is the past tay\\n', 'and this is my future\\n'),\n",
       " ('and this is my future\\n', 'okay\\n')]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[:50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-04-27T14:22:10.774077Z",
     "start_time": "2018-04-27T14:22:10.755736Z"
    }
   },
   "outputs": [],
   "source": [
    "from collections import *\n",
    "from tqdm import tqdm_notebook\n",
    "\n",
    "def train_char_lm(conversations, order=4):\n",
    "    lm = defaultdict(Counter)\n",
    "    pad = \"~\" * order\n",
    "    for dialog in tqdm_notebook([''.join([pad, s1, s2]) for s1, s2 in conversations]):\n",
    "        for i in range(len(dialog)-order):\n",
    "            for j in range(i-1):\n",
    "                history, char = dialog[i+j:i+order], dialog[i+order]\n",
    "                lm[history][char]+=1\n",
    "        def normalize(counter):\n",
    "            s = float(sum(counter.values()))\n",
    "            return [(c,cnt/s) for c,cnt in counter.items()]\n",
    "    outlm = {hist:normalize(chars) for hist, chars in lm.items()}\n",
    "    return outlm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-04-27T14:29:06.549951Z",
     "start_time": "2018-04-27T14:22:10.775560Z"
    }
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d4d6bfde64fd4d71a5052e51c2a850b0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/html": [
       "<p>Failed to display Jupyter Widget of type <code>HBox</code>.</p>\n",
       "<p>\n",
       "  If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
       "  that the widgets JavaScript is still loading. If this message persists, it\n",
       "  likely means that the widgets JavaScript library is either not installed or\n",
       "  not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
       "  Widgets Documentation</a> for setup instructions.\n",
       "</p>\n",
       "<p>\n",
       "  If you're reading this message in another frontend (for example, a static\n",
       "  rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
       "  it may mean that your frontend doesn't currently support widgets.\n",
       "</p>\n"
      ],
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=578347), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "lm = train_char_lm(data, order=12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-04-27T14:29:06.554726Z",
     "start_time": "2018-04-27T14:29:06.551509Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('\\n', 0.42470295661785024),\n",
       " (' ', 0.4600718430505665),\n",
       " ('w', 0.0978170765404808),\n",
       " ('u', 0.0038684719535783366),\n",
       " ('r', 0.004973749654600719),\n",
       " ('q', 0.0013815971262779773),\n",
       " ('s', 0.0005526388505111909),\n",
       " ('n', 0.0019342359767891683),\n",
       " ('o', 0.0008289582757667864),\n",
       " ('p', 0.003592152528322741),\n",
       " ('v', 0.00027631942525559546)]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lm['ello']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-04-27T17:59:35.626886Z",
     "start_time": "2018-04-27T17:59:35.615234Z"
    }
   },
   "outputs": [],
   "source": [
    "from random import random\n",
    "\n",
    "def generate_letter(lm, history, order):\n",
    "    for i in range(order-1):\n",
    "        history = history[-(order-i):]\n",
    "        dist = lm.get(history)\n",
    "        if not dist:\n",
    "            continue\n",
    "        x = random()\n",
    "        for c,v in dist:\n",
    "            x = x - v\n",
    "            if x <= 0: return c\n",
    "\n",
    "    return generate_letter(lm, history[:-1], order)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-04-27T17:57:35.655297Z",
     "start_time": "2018-04-27T17:57:35.616456Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to /home/janek/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk\n",
    "import pickle\n",
    "import re\n",
    "import numpy as np\n",
    "import csv\n",
    "\n",
    "nltk.download('stopwords')\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "def text_prepare(text):\n",
    "    \"\"\"Performs tokenization and simple preprocessing.\"\"\"\n",
    "    \n",
    "    replace_by_space_re = re.compile('[/(){}\\[\\]\\|@,;]')\n",
    "    bad_symbols_re = re.compile('[^0-9a-z #+_]')\n",
    "    stopwords_set = set(stopwords.words('english'))\n",
    "\n",
    "    text = text.lower()\n",
    "    text = replace_by_space_re.sub(' ', text)\n",
    "    text = bad_symbols_re.sub('', text)\n",
    "    text = ' '.join([x for x in text.split() if x and x not in stopwords_set])\n",
    "\n",
    "    return text.strip()\n",
    "\n",
    "def generate_answer(lm, question, order=4, max_len=25):\n",
    "    history = \"~\" * order + text_prepare(question) + \"\\n\"\n",
    "    out = []\n",
    "    while True:\n",
    "        c = generate_letter(lm, history, order)\n",
    "        history = history[-order:] + c\n",
    "        out.append(c)\n",
    "        if out[-1] == '\\n' or (len(out) > max_len and out[-1].isspace()):\n",
    "            return \"\".join(out[:-1])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-04-27T17:59:39.322327Z",
     "start_time": "2018-04-27T17:59:39.318069Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hello\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'hello'"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "generate_answer(lm, \"Hey\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-04-27T18:00:00.333693Z",
     "start_time": "2018-04-27T18:00:00.328499Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "that it fence\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'that it fence'"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "generate_answer(lm, \"How are you doing?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-04-27T18:00:02.765722Z",
     "start_time": "2018-04-27T18:00:02.760140Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "holy sharolina\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'holy sharolina'"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "generate_answer(lm, \"What's your hobby?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-04-27T18:00:07.273402Z",
     "start_time": "2018-04-27T18:00:07.268222Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "chucklng\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'chucklng'"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "generate_answer(lm, \"How to write a loop in python?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-04-27T14:19:31.470988Z",
	"start_time": "2018-04-27T14:19:30.622142Z"
	}
	},
	"outputs": [],
	"source": [
	"import os\n",
	"\n",
	"import datasets\n",
	"\n",
	"DATASET = \"opensubs\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-04-27T14:22:10.734114Z",
	"start_time": "2018-04-27T14:19:31.472495Z"
	}
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"OpenSubtitles data files: 0%\| \| 3/2317 [00:00<01:26, 26.69it/s]"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Loading OpenSubtitles conversations in data/opensubs.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"OpenSubtitles data files: 4%\|▍ \| 95/2317 [00:04<01:39, 22.38it/s]"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Skipping file data/opensubs/OpenSubtitles/en/Action/2004/59_84873_113518_appurushdo.xml.gz with errors.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"OpenSubtitles data files: 14%\|█▍ \| 334/2317 [00:16<01:37, 20.25it/s]"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Skipping file data/opensubs/OpenSubtitles/en/Action/2003/602_152466_207871_batoru_rowaiaru_ii_rekuiemu.xml.gz with errors.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"OpenSubtitles data files: 26%\|██▌ \| 608/2317 [00:31<01:27, 19.58it/s]"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Skipping file data/opensubs/OpenSubtitles/en/Drama/2004/146_206647_272090_eternal_sunshine_of_the_spotless_mind.xml.gz with errors.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"OpenSubtitles data files: 36%\|███▌ \| 839/2317 [00:43<01:16, 19.33it/s]"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Skipping file data/opensubs/OpenSubtitles/en/Drama/2002/3265_149497_204017_unfaithful.xml.gz with errors.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"OpenSubtitles data files: 37%\|███▋ \| 865/2317 [00:44<01:14, 19.38it/s]"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Skipping file data/opensubs/OpenSubtitles/en/Drama/2003/1723_68784_89159_big_fish.xml.gz with errors.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"OpenSubtitles data files: 44%\|████▎ \| 1011/2317 [00:52<01:07, 19.26it/s]"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Skipping file data/opensubs/OpenSubtitles/en/Drama/2000/179_88528_119102_batoru_rowaiaru.xml.gz with errors.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"OpenSubtitles data files: 53%\|█████▎ \| 1236/2317 [01:04<00:56, 19.21it/s]"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Skipping file data/opensubs/OpenSubtitles/en/Horror/1922/1166_134135_184270_nosferatu_eine_symphonie_des_grauens.xml.gz with errors.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"OpenSubtitles data files: 55%\|█████▍ \| 1263/2317 [01:06<00:55, 19.05it/s]"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Skipping file data/opensubs/OpenSubtitles/en/Comedy/2004/2480_226704_299940_little_black_book.xml.gz with errors.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"OpenSubtitles data files: 65%\|██████▌ \| 1510/2317 [01:25<00:45, 17.59it/s]"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Skipping file data/opensubs/OpenSubtitles/en/Comedy/2003/529_124078_171007_how_to_lose_a_guy_in_10_days.xml.gz with errors.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"OpenSubtitles data files: 86%\|████████▋ \| 2004/2317 [01:59<00:18, 16.75it/s]"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Skipping file data/opensubs/OpenSubtitles/en/Family/2001/3935_19508_22105_cats__dogs.xml.gz with errors.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"OpenSubtitles data files: 100%\|██████████\| 2317/2317 [02:20<00:00, 16.52it/s]\n",
	"100%\|██████████\| 1648080/1648080 [00:18<00:00, 89958.80it/s]\n"
	]
	}
	],
	"source": [
	"dataset_path = os.path.join(\"data\", DATASET)\n",
	"data = datasets.readOpensubsData(dataset_path, max_len=35)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-04-27T14:22:10.753722Z",
	"start_time": "2018-04-27T14:22:10.736145Z"
	}
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[('i il make you a sandwich\\n', 'make it to go\\n'),\n",
	" ('is shayna coming or what\\n', 'come on give it to me\\n'),\n",
	" ('come on give it to me\\n', 'oh shayna oh yeah\\n'),\n",
	" ('shayna yeah\\n', 'kim s been looking for you\\n'),\n",
	" ('kim s been looking for you\\n', 'she in her room kitchen i think\\n'),\n",
	" ('she in her room kitchen i think\\n', 'thanks\\n'),\n",
	" ('hey guys shayna\\n', 'oh shit what s going on\\n'),\n",
	" ('oh shit what s going on\\n', 'surprise surprise\\n'),\n",
	" ('i m sure\\n', 'coming back from a break\\n'),\n",
	" ('coming back from a break\\n', 'no\\n'),\n",
	" ('no\\n', 'just starting then\\n'),\n",
	" ('just starting then\\n', 'if you need someone to practice on\\n'),\n",
	" ('just dry a few minutes\\n', 'now what do you say\\n'),\n",
	" ('now what do you say\\n', 'cotton candy or red vine red\\n'),\n",
	" ('do you have clear sure\\n', 'clear whose kid are you\\n'),\n",
	" ('clear whose kid are you\\n', 'red is bold babycakes\\n'),\n",
	" ('red is bold babycakes\\n', 'it says look here and we il match\\n'),\n",
	" ('daddy will hate it even better\\n', 'mom\\n'),\n",
	" ('and he still thinks that\\n', 'three weeks on one week off\\n'),\n",
	" ('three weeks on one week off\\n', 'company girls\\n'),\n",
	" ('kim\\n', 'rickie lee\\n'),\n",
	" ('rickie lee\\n', 'hi i m lavender rose\\n'),\n",
	" ('hi i m lavender rose\\n', 'cat\\n'),\n",
	" ('are you sure i don t know you\\n', 'you don t\\n'),\n",
	" ('i will talk to you later\\n', 'you have fun at the party tomorrow\\n'),\n",
	" ('i will love you\\n', 'love you more\\n'),\n",
	" ('love you more\\n', 'wait a sec\\n'),\n",
	" ('yeah the nails that was me too\\n', 'what s next a g string\\n'),\n",
	" ('all day sucker what s that\\n', 'guess\\n'),\n",
	" ('maybe you could blow me\\n', 'and i could\\n'),\n",
	" ('fuck you after\\n', 'okay\\n'),\n",
	" ('so that s velvet\\n', 'knight with a k\\n'),\n",
	" ('manager\\n', 'whatever\\n'),\n",
	" ('man\\n', 'let me tell you how it goes here\\n'),\n",
	" ('because you look awfully familiar\\n', '\\n'),\n",
	" ('thirteen fourteen nineteen\\n', 'nineteen years old almost twenty\\n'),\n",
	" ('nineteen years old almost twenty\\n', '\\n'),\n",
	" ('excuse me\\n', 'so why did you stop you were good\\n'),\n",
	" ('anyhow those charges didn t stick\\n', 'i m sorry that happened to you\\n'),\n",
	" ('yeah well\\n', 'look here s what\\n'),\n",
	" ('look here s what\\n', 'end of every show\\n'),\n",
	" ('you going to tell him ever\\n', 'why would i\\n'),\n",
	" ('why would i\\n', 'it s who you are\\n'),\n",
	" ('it s who you are\\n', 'it s who i was\\n'),\n",
	" ('where s it on your ass\\n', 'no wait i ve seen your ass\\n'),\n",
	" ('yeah i m going to wonder\\n', 'you know what else i m going to do\\n'),\n",
	" ('you know what else i m going to do\\n',\n",
	" 'i m gonna keep my big mouth shut\\n'),\n",
	" ('i m gonna keep my big mouth shut\\n', 'the past is the past tay\\n'),\n",
	" ('the past is the past tay\\n', 'and this is my future\\n'),\n",
	" ('and this is my future\\n', 'okay\\n')]"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"data[:50]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-04-27T14:22:10.774077Z",
	"start_time": "2018-04-27T14:22:10.755736Z"
	}
	},
	"outputs": [],
	"source": [
	"from collections import *\n",
	"from tqdm import tqdm_notebook\n",
	"\n",
	"def train_char_lm(conversations, order=4):\n",
	" lm = defaultdict(Counter)\n",
	" pad = \"~\" * order\n",
	" for dialog in tqdm_notebook([''.join([pad, s1, s2]) for s1, s2 in conversations]):\n",
	" for i in range(len(dialog)-order):\n",
	" for j in range(i-1):\n",
	" history, char = dialog[i+j:i+order], dialog[i+order]\n",
	" lm[history][char]+=1\n",
	" def normalize(counter):\n",
	" s = float(sum(counter.values()))\n",
	" return [(c,cnt/s) for c,cnt in counter.items()]\n",
	" outlm = {hist:normalize(chars) for hist, chars in lm.items()}\n",
	" return outlm"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-04-27T14:29:06.549951Z",
	"start_time": "2018-04-27T14:22:10.775560Z"
	}
	},
	"outputs": [
	{
	"data": {
	"application/vnd.jupyter.widget-view+json": {
	"model_id": "d4d6bfde64fd4d71a5052e51c2a850b0",
	"version_major": 2,
	"version_minor": 0
	},
	"text/html": [
	"<p>Failed to display Jupyter Widget of type <code>HBox</code>.</p>\n",
	"<p>\n",
	" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
	" that the widgets JavaScript is still loading. If this message persists, it\n",
	" likely means that the widgets JavaScript library is either not installed or\n",
	" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
	" Widgets Documentation</a> for setup instructions.\n",
	"</p>\n",
	"<p>\n",
	" If you're reading this message in another frontend (for example, a static\n",
	" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
	" it may mean that your frontend doesn't currently support widgets.\n",
	"</p>\n"
	],
	"text/plain": [
	"HBox(children=(IntProgress(value=0, max=578347), HTML(value='')))"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"\n"
	]
	}
	],
	"source": [
	"lm = train_char_lm(data, order=12)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-04-27T14:29:06.554726Z",
	"start_time": "2018-04-27T14:29:06.551509Z"
	}
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[('\\n', 0.42470295661785024),\n",
	" (' ', 0.4600718430505665),\n",
	" ('w', 0.0978170765404808),\n",
	" ('u', 0.0038684719535783366),\n",
	" ('r', 0.004973749654600719),\n",
	" ('q', 0.0013815971262779773),\n",
	" ('s', 0.0005526388505111909),\n",
	" ('n', 0.0019342359767891683),\n",
	" ('o', 0.0008289582757667864),\n",
	" ('p', 0.003592152528322741),\n",
	" ('v', 0.00027631942525559546)]"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"lm['ello']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-04-27T17:59:35.626886Z",
	"start_time": "2018-04-27T17:59:35.615234Z"
	}
	},
	"outputs": [],
	"source": [
	"from random import random\n",
	"\n",
	"def generate_letter(lm, history, order):\n",
	" for i in range(order-1):\n",
	" history = history[-(order-i):]\n",
	" dist = lm.get(history)\n",
	" if not dist:\n",
	" continue\n",
	" x = random()\n",
	" for c,v in dist:\n",
	" x = x - v\n",
	" if x <= 0: return c\n",
	"\n",
	" return generate_letter(lm, history[:-1], order)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-04-27T17:57:35.655297Z",
	"start_time": "2018-04-27T17:57:35.616456Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[nltk_data] Downloading package stopwords to /home/janek/nltk_data...\n",
	"[nltk_data] Package stopwords is already up-to-date!\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 26,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import nltk\n",
	"import pickle\n",
	"import re\n",
	"import numpy as np\n",
	"import csv\n",
	"\n",
	"nltk.download('stopwords')\n",
	"from nltk.corpus import stopwords\n",
	"\n",
	"def text_prepare(text):\n",
	" \"\"\"Performs tokenization and simple preprocessing.\"\"\"\n",
	" \n",
	" replace_by_space_re = re.compile('[/(){}\\[\\]\\\|@,;]')\n",
	" bad_symbols_re = re.compile('[^0-9a-z #+_]')\n",
	" stopwords_set = set(stopwords.words('english'))\n",
	"\n",
	" text = text.lower()\n",
	" text = replace_by_space_re.sub(' ', text)\n",
	" text = bad_symbols_re.sub('', text)\n",
	" text = ' '.join([x for x in text.split() if x and x not in stopwords_set])\n",
	"\n",
	" return text.strip()\n",
	"\n",
	"def generate_answer(lm, question, order=4, max_len=25):\n",
	" history = \"~\" * order + text_prepare(question) + \"\\n\"\n",
	" out = []\n",
	" while True:\n",
	" c = generate_letter(lm, history, order)\n",
	" history = history[-order:] + c\n",
	" out.append(c)\n",
	" if out[-1] == '\\n' or (len(out) > max_len and out[-1].isspace()):\n",
	" return \"\".join(out[:-1])\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-04-27T17:59:39.322327Z",
	"start_time": "2018-04-27T17:59:39.318069Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"hello\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"'hello'"
	]
	},
	"execution_count": 30,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"generate_answer(lm, \"Hey\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 37,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-04-27T18:00:00.333693Z",
	"start_time": "2018-04-27T18:00:00.328499Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"that it fence\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"'that it fence'"
	]
	},
	"execution_count": 37,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"generate_answer(lm, \"How are you doing?\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 38,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-04-27T18:00:02.765722Z",
	"start_time": "2018-04-27T18:00:02.760140Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"holy sharolina\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"'holy sharolina'"
	]
	},
	"execution_count": 38,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"generate_answer(lm, \"What's your hobby?\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 40,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-04-27T18:00:07.273402Z",
	"start_time": "2018-04-27T18:00:07.268222Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"chucklng\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"'chucklng'"
	]
	},
	"execution_count": 40,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"generate_answer(lm, \"How to write a loop in python?\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}