Skip to content

Instantly share code, notes, and snippets.

@hanfried
Created April 29, 2018 17:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hanfried/5dff59c0981185c8147fab975bded082 to your computer and use it in GitHub Desktop.
Save hanfried/5dff59c0981185c8147fab975bded082 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2018-04-27T14:19:31.470988Z",
"start_time": "2018-04-27T14:19:30.622142Z"
}
},
"outputs": [],
"source": [
"import os\n",
"\n",
"import datasets\n",
"\n",
"DATASET = \"opensubs\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2018-04-27T14:22:10.734114Z",
"start_time": "2018-04-27T14:19:31.472495Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"OpenSubtitles data files: 0%| | 3/2317 [00:00<01:26, 26.69it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading OpenSubtitles conversations in data/opensubs.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"OpenSubtitles data files: 4%|▍ | 95/2317 [00:04<01:39, 22.38it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Skipping file data/opensubs/OpenSubtitles/en/Action/2004/59_84873_113518_appurushdo.xml.gz with errors.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"OpenSubtitles data files: 14%|█▍ | 334/2317 [00:16<01:37, 20.25it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Skipping file data/opensubs/OpenSubtitles/en/Action/2003/602_152466_207871_batoru_rowaiaru_ii_rekuiemu.xml.gz with errors.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"OpenSubtitles data files: 26%|██▌ | 608/2317 [00:31<01:27, 19.58it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Skipping file data/opensubs/OpenSubtitles/en/Drama/2004/146_206647_272090_eternal_sunshine_of_the_spotless_mind.xml.gz with errors.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"OpenSubtitles data files: 36%|███▌ | 839/2317 [00:43<01:16, 19.33it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Skipping file data/opensubs/OpenSubtitles/en/Drama/2002/3265_149497_204017_unfaithful.xml.gz with errors.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"OpenSubtitles data files: 37%|███▋ | 865/2317 [00:44<01:14, 19.38it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Skipping file data/opensubs/OpenSubtitles/en/Drama/2003/1723_68784_89159_big_fish.xml.gz with errors.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"OpenSubtitles data files: 44%|████▎ | 1011/2317 [00:52<01:07, 19.26it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Skipping file data/opensubs/OpenSubtitles/en/Drama/2000/179_88528_119102_batoru_rowaiaru.xml.gz with errors.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"OpenSubtitles data files: 53%|█████▎ | 1236/2317 [01:04<00:56, 19.21it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Skipping file data/opensubs/OpenSubtitles/en/Horror/1922/1166_134135_184270_nosferatu_eine_symphonie_des_grauens.xml.gz with errors.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"OpenSubtitles data files: 55%|█████▍ | 1263/2317 [01:06<00:55, 19.05it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Skipping file data/opensubs/OpenSubtitles/en/Comedy/2004/2480_226704_299940_little_black_book.xml.gz with errors.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"OpenSubtitles data files: 65%|██████▌ | 1510/2317 [01:25<00:45, 17.59it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Skipping file data/opensubs/OpenSubtitles/en/Comedy/2003/529_124078_171007_how_to_lose_a_guy_in_10_days.xml.gz with errors.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"OpenSubtitles data files: 86%|████████▋ | 2004/2317 [01:59<00:18, 16.75it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Skipping file data/opensubs/OpenSubtitles/en/Family/2001/3935_19508_22105_cats__dogs.xml.gz with errors.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"OpenSubtitles data files: 100%|██████████| 2317/2317 [02:20<00:00, 16.52it/s]\n",
"100%|██████████| 1648080/1648080 [00:18<00:00, 89958.80it/s]\n"
]
}
],
"source": [
"dataset_path = os.path.join(\"data\", DATASET)\n",
"data = datasets.readOpensubsData(dataset_path, max_len=35)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2018-04-27T14:22:10.753722Z",
"start_time": "2018-04-27T14:22:10.736145Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[('i il make you a sandwich\\n', 'make it to go\\n'),\n",
" ('is shayna coming or what\\n', 'come on give it to me\\n'),\n",
" ('come on give it to me\\n', 'oh shayna oh yeah\\n'),\n",
" ('shayna yeah\\n', 'kim s been looking for you\\n'),\n",
" ('kim s been looking for you\\n', 'she in her room kitchen i think\\n'),\n",
" ('she in her room kitchen i think\\n', 'thanks\\n'),\n",
" ('hey guys shayna\\n', 'oh shit what s going on\\n'),\n",
" ('oh shit what s going on\\n', 'surprise surprise\\n'),\n",
" ('i m sure\\n', 'coming back from a break\\n'),\n",
" ('coming back from a break\\n', 'no\\n'),\n",
" ('no\\n', 'just starting then\\n'),\n",
" ('just starting then\\n', 'if you need someone to practice on\\n'),\n",
" ('just dry a few minutes\\n', 'now what do you say\\n'),\n",
" ('now what do you say\\n', 'cotton candy or red vine red\\n'),\n",
" ('do you have clear sure\\n', 'clear whose kid are you\\n'),\n",
" ('clear whose kid are you\\n', 'red is bold babycakes\\n'),\n",
" ('red is bold babycakes\\n', 'it says look here and we il match\\n'),\n",
" ('daddy will hate it even better\\n', 'mom\\n'),\n",
" ('and he still thinks that\\n', 'three weeks on one week off\\n'),\n",
" ('three weeks on one week off\\n', 'company girls\\n'),\n",
" ('kim\\n', 'rickie lee\\n'),\n",
" ('rickie lee\\n', 'hi i m lavender rose\\n'),\n",
" ('hi i m lavender rose\\n', 'cat\\n'),\n",
" ('are you sure i don t know you\\n', 'you don t\\n'),\n",
" ('i will talk to you later\\n', 'you have fun at the party tomorrow\\n'),\n",
" ('i will love you\\n', 'love you more\\n'),\n",
" ('love you more\\n', 'wait a sec\\n'),\n",
" ('yeah the nails that was me too\\n', 'what s next a g string\\n'),\n",
" ('all day sucker what s that\\n', 'guess\\n'),\n",
" ('maybe you could blow me\\n', 'and i could\\n'),\n",
" ('fuck you after\\n', 'okay\\n'),\n",
" ('so that s velvet\\n', 'knight with a k\\n'),\n",
" ('manager\\n', 'whatever\\n'),\n",
" ('man\\n', 'let me tell you how it goes here\\n'),\n",
" ('because you look awfully familiar\\n', '\\n'),\n",
" ('thirteen fourteen nineteen\\n', 'nineteen years old almost twenty\\n'),\n",
" ('nineteen years old almost twenty\\n', '\\n'),\n",
" ('excuse me\\n', 'so why did you stop you were good\\n'),\n",
" ('anyhow those charges didn t stick\\n', 'i m sorry that happened to you\\n'),\n",
" ('yeah well\\n', 'look here s what\\n'),\n",
" ('look here s what\\n', 'end of every show\\n'),\n",
" ('you going to tell him ever\\n', 'why would i\\n'),\n",
" ('why would i\\n', 'it s who you are\\n'),\n",
" ('it s who you are\\n', 'it s who i was\\n'),\n",
" ('where s it on your ass\\n', 'no wait i ve seen your ass\\n'),\n",
" ('yeah i m going to wonder\\n', 'you know what else i m going to do\\n'),\n",
" ('you know what else i m going to do\\n',\n",
" 'i m gonna keep my big mouth shut\\n'),\n",
" ('i m gonna keep my big mouth shut\\n', 'the past is the past tay\\n'),\n",
" ('the past is the past tay\\n', 'and this is my future\\n'),\n",
" ('and this is my future\\n', 'okay\\n')]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[:50]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2018-04-27T14:22:10.774077Z",
"start_time": "2018-04-27T14:22:10.755736Z"
}
},
"outputs": [],
"source": [
"from collections import *\n",
"from tqdm import tqdm_notebook\n",
"\n",
"def train_char_lm(conversations, order=4):\n",
"    \"\"\"Train a character-level n-gram model that supports back-off.\n",
"\n",
"    Each (s1, s2) pair is joined into one string and left-padded with\n",
"    ``order`` copies of '~'. For every character of that string, one count\n",
"    is recorded under each suffix of its preceding context, from ``order``\n",
"    characters down to the empty string, so a lookup can fall back to a\n",
"    shorter history when a longer one was never seen.\n",
"\n",
"    Args:\n",
"        conversations: iterable of (s1, s2) string pairs.\n",
"        order: maximum history length in characters.\n",
"\n",
"    Returns:\n",
"        dict mapping each history string to a list of (char, probability)\n",
"        pairs; the probabilities of one history sum to 1.\n",
"    \"\"\"\n",
"    counts = defaultdict(Counter)\n",
"    pad = \"~\" * order\n",
"    dialogs = [''.join([pad, s1, s2]) for s1, s2 in conversations]\n",
"    for dialog in tqdm_notebook(dialogs):\n",
"        for i in range(len(dialog) - order):\n",
"            char = dialog[i + order]\n",
"            # Exactly one count per history length order, order-1, ..., 0,\n",
"            # for every position -- including the first characters that\n",
"            # follow the padding.\n",
"            for j in range(order + 1):\n",
"                counts[dialog[i + j:i + order]][char] += 1\n",
"\n",
"    def normalize(counter):\n",
"        total = float(sum(counter.values()))\n",
"        return [(c, cnt / total) for c, cnt in counter.items()]\n",
"\n",
"    return {hist: normalize(chars) for hist, chars in counts.items()}"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2018-04-27T14:29:06.549951Z",
"start_time": "2018-04-27T14:22:10.775560Z"
}
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d4d6bfde64fd4d71a5052e51c2a850b0",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<p>Failed to display Jupyter Widget of type <code>HBox</code>.</p>\n",
"<p>\n",
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
" that the widgets JavaScript is still loading. If this message persists, it\n",
" likely means that the widgets JavaScript library is either not installed or\n",
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
" Widgets Documentation</a> for setup instructions.\n",
"</p>\n",
"<p>\n",
" If you're reading this message in another frontend (for example, a static\n",
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
" it may mean that your frontend doesn't currently support widgets.\n",
"</p>\n"
],
"text/plain": [
"HBox(children=(IntProgress(value=0, max=578347), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"lm = train_char_lm(data, order=12)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2018-04-27T14:29:06.554726Z",
"start_time": "2018-04-27T14:29:06.551509Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[('\\n', 0.42470295661785024),\n",
" (' ', 0.4600718430505665),\n",
" ('w', 0.0978170765404808),\n",
" ('u', 0.0038684719535783366),\n",
" ('r', 0.004973749654600719),\n",
" ('q', 0.0013815971262779773),\n",
" ('s', 0.0005526388505111909),\n",
" ('n', 0.0019342359767891683),\n",
" ('o', 0.0008289582757667864),\n",
" ('p', 0.003592152528322741),\n",
" ('v', 0.00027631942525559546)]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm['ello']"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"ExecuteTime": {
"end_time": "2018-04-27T17:59:35.626886Z",
"start_time": "2018-04-27T17:59:35.615234Z"
}
},
"outputs": [],
"source": [
"from random import random\n",
"\n",
"def generate_letter(lm, history, order):\n",
" for i in range(order-1):\n",
" history = history[-(order-i):]\n",
" dist = lm.get(history)\n",
" if not dist:\n",
" continue\n",
" x = random()\n",
" for c,v in dist:\n",
" x = x - v\n",
" if x <= 0: return c\n",
"\n",
" return generate_letter(lm, history[:-1], order)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"ExecuteTime": {
"end_time": "2018-04-27T17:57:35.655297Z",
"start_time": "2018-04-27T17:57:35.616456Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /home/janek/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import nltk\n",
"import pickle\n",
"import re\n",
"import numpy as np\n",
"import csv\n",
"\n",
"nltk.download('stopwords')\n",
"from nltk.corpus import stopwords\n",
"\n",
"# Compiled once; raw strings keep the regex backslashes literal.\n",
"_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\\[\\]\\|@,;]')\n",
"_BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')\n",
"_STOPWORDS_CACHE = {}\n",
"\n",
"\n",
"def _english_stopwords():\n",
"    \"\"\"Return the NLTK English stop-word set, loading it on first use only.\"\"\"\n",
"    if 'english' not in _STOPWORDS_CACHE:\n",
"        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))\n",
"    return _STOPWORDS_CACHE['english']\n",
"\n",
"\n",
"def text_prepare(text):\n",
"    \"\"\"Performs tokenization and simple preprocessing.\n",
"\n",
"    The text is lower-cased, each of the characters / ( ) { } [ ] | @ , ;\n",
"    is replaced by a space, every remaining character other than 0-9, a-z,\n",
"    space, '#', '+' and '_' is removed, and English stop words are dropped.\n",
"\n",
"    Args:\n",
"        text: input string.\n",
"\n",
"    Returns:\n",
"        The surviving tokens joined by single spaces (possibly empty).\n",
"    \"\"\"\n",
"    stopwords_set = _english_stopwords()\n",
"    text = text.lower()\n",
"    text = _REPLACE_BY_SPACE_RE.sub(' ', text)\n",
"    text = _BAD_SYMBOLS_RE.sub('', text)\n",
"    # split() never yields empty tokens, so the joined result has no\n",
"    # leading or trailing whitespace.\n",
"    return ' '.join(x for x in text.split() if x not in stopwords_set)\n",
"\n",
"def generate_answer(lm, question, order=4, max_len=25):\n",
"    \"\"\"Generate a reply to ``question`` one character at a time.\n",
"\n",
"    The question is cleaned with ``text_prepare``, prefixed with ``order``\n",
"    copies of '~' and followed by a newline; characters are then drawn\n",
"    with ``generate_letter``. Generation stops at the first newline, or at\n",
"    the first whitespace character produced after more than ``max_len``\n",
"    characters; that terminating character is not part of the result.\n",
"\n",
"    Args:\n",
"        lm: language model passed through to ``generate_letter``.\n",
"        question: raw question text.\n",
"        order: context length passed to ``generate_letter``.\n",
"        max_len: soft limit on the answer length in characters.\n",
"\n",
"    Returns:\n",
"        The generated answer string.\n",
"    \"\"\"\n",
"    context = \"\".join([\"~\" * order, text_prepare(question), \"\\n\"])\n",
"    answer_chars = []\n",
"    while True:\n",
"        next_char = generate_letter(lm, context, order)\n",
"        context = context[-order:] + next_char\n",
"        answer_chars.append(next_char)\n",
"        hit_newline = next_char == '\\n'\n",
"        past_limit_at_space = len(answer_chars) > max_len and next_char.isspace()\n",
"        if hit_newline or past_limit_at_space:\n",
"            return \"\".join(answer_chars[:-1])\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"ExecuteTime": {
"end_time": "2018-04-27T17:59:39.322327Z",
"start_time": "2018-04-27T17:59:39.318069Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hello\n"
]
},
{
"data": {
"text/plain": [
"'hello'"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"generate_answer(lm, \"Hey\")"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"ExecuteTime": {
"end_time": "2018-04-27T18:00:00.333693Z",
"start_time": "2018-04-27T18:00:00.328499Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"that it fence\n"
]
},
{
"data": {
"text/plain": [
"'that it fence'"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"generate_answer(lm, \"How are you doing?\")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"ExecuteTime": {
"end_time": "2018-04-27T18:00:02.765722Z",
"start_time": "2018-04-27T18:00:02.760140Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"holy sharolina\n"
]
},
{
"data": {
"text/plain": [
"'holy sharolina'"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"generate_answer(lm, \"What's your hobby?\")"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"ExecuteTime": {
"end_time": "2018-04-27T18:00:07.273402Z",
"start_time": "2018-04-27T18:00:07.268222Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"chucklng\n"
]
},
{
"data": {
"text/plain": [
"'chucklng'"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"generate_answer(lm, \"How to write a loop in python?\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment