Created
April 29, 2018 17:10
-
-
Save hanfried/5dff59c0981185c8147fab975bded082 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-04-27T14:19:31.470988Z", | |
"start_time": "2018-04-27T14:19:30.622142Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"\n", | |
"import datasets\n", | |
"\n", | |
"# Dataset name; it is joined onto the data/ directory to build the load path.\n", | |
"DATASET = \"opensubs\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-04-27T14:22:10.734114Z", | |
"start_time": "2018-04-27T14:19:31.472495Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"OpenSubtitles data files: 0%| | 3/2317 [00:00<01:26, 26.69it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Loading OpenSubtitles conversations in data/opensubs.\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"OpenSubtitles data files: 4%|▍ | 95/2317 [00:04<01:39, 22.38it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Skipping file data/opensubs/OpenSubtitles/en/Action/2004/59_84873_113518_appurushdo.xml.gz with errors.\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"OpenSubtitles data files: 14%|█▍ | 334/2317 [00:16<01:37, 20.25it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Skipping file data/opensubs/OpenSubtitles/en/Action/2003/602_152466_207871_batoru_rowaiaru_ii_rekuiemu.xml.gz with errors.\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"OpenSubtitles data files: 26%|██▌ | 608/2317 [00:31<01:27, 19.58it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Skipping file data/opensubs/OpenSubtitles/en/Drama/2004/146_206647_272090_eternal_sunshine_of_the_spotless_mind.xml.gz with errors.\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"OpenSubtitles data files: 36%|███▌ | 839/2317 [00:43<01:16, 19.33it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Skipping file data/opensubs/OpenSubtitles/en/Drama/2002/3265_149497_204017_unfaithful.xml.gz with errors.\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"OpenSubtitles data files: 37%|███▋ | 865/2317 [00:44<01:14, 19.38it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Skipping file data/opensubs/OpenSubtitles/en/Drama/2003/1723_68784_89159_big_fish.xml.gz with errors.\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"OpenSubtitles data files: 44%|████▎ | 1011/2317 [00:52<01:07, 19.26it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Skipping file data/opensubs/OpenSubtitles/en/Drama/2000/179_88528_119102_batoru_rowaiaru.xml.gz with errors.\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"OpenSubtitles data files: 53%|█████▎ | 1236/2317 [01:04<00:56, 19.21it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Skipping file data/opensubs/OpenSubtitles/en/Horror/1922/1166_134135_184270_nosferatu_eine_symphonie_des_grauens.xml.gz with errors.\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"OpenSubtitles data files: 55%|█████▍ | 1263/2317 [01:06<00:55, 19.05it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Skipping file data/opensubs/OpenSubtitles/en/Comedy/2004/2480_226704_299940_little_black_book.xml.gz with errors.\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"OpenSubtitles data files: 65%|██████▌ | 1510/2317 [01:25<00:45, 17.59it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Skipping file data/opensubs/OpenSubtitles/en/Comedy/2003/529_124078_171007_how_to_lose_a_guy_in_10_days.xml.gz with errors.\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"OpenSubtitles data files: 86%|████████▋ | 2004/2317 [01:59<00:18, 16.75it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Skipping file data/opensubs/OpenSubtitles/en/Family/2001/3935_19508_22105_cats__dogs.xml.gz with errors.\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"OpenSubtitles data files: 100%|██████████| 2317/2317 [02:20<00:00, 16.52it/s]\n", | |
"100%|██████████| 1648080/1648080 [00:18<00:00, 89958.80it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"# Build the dataset directory path (data/<DATASET>) and read the conversation pairs from it.\n", | |
"dataset_path = os.path.join(\"data\", DATASET)\n", | |
"# NOTE(review): max_len presumably limits utterance length - confirm in datasets.readOpensubsData.\n", | |
"data = datasets.readOpensubsData(dataset_path, max_len=35)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-04-27T14:22:10.753722Z", | |
"start_time": "2018-04-27T14:22:10.736145Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('i il make you a sandwich\\n', 'make it to go\\n'),\n", | |
" ('is shayna coming or what\\n', 'come on give it to me\\n'),\n", | |
" ('come on give it to me\\n', 'oh shayna oh yeah\\n'),\n", | |
" ('shayna yeah\\n', 'kim s been looking for you\\n'),\n", | |
" ('kim s been looking for you\\n', 'she in her room kitchen i think\\n'),\n", | |
" ('she in her room kitchen i think\\n', 'thanks\\n'),\n", | |
" ('hey guys shayna\\n', 'oh shit what s going on\\n'),\n", | |
" ('oh shit what s going on\\n', 'surprise surprise\\n'),\n", | |
" ('i m sure\\n', 'coming back from a break\\n'),\n", | |
" ('coming back from a break\\n', 'no\\n'),\n", | |
" ('no\\n', 'just starting then\\n'),\n", | |
" ('just starting then\\n', 'if you need someone to practice on\\n'),\n", | |
" ('just dry a few minutes\\n', 'now what do you say\\n'),\n", | |
" ('now what do you say\\n', 'cotton candy or red vine red\\n'),\n", | |
" ('do you have clear sure\\n', 'clear whose kid are you\\n'),\n", | |
" ('clear whose kid are you\\n', 'red is bold babycakes\\n'),\n", | |
" ('red is bold babycakes\\n', 'it says look here and we il match\\n'),\n", | |
" ('daddy will hate it even better\\n', 'mom\\n'),\n", | |
" ('and he still thinks that\\n', 'three weeks on one week off\\n'),\n", | |
" ('three weeks on one week off\\n', 'company girls\\n'),\n", | |
" ('kim\\n', 'rickie lee\\n'),\n", | |
" ('rickie lee\\n', 'hi i m lavender rose\\n'),\n", | |
" ('hi i m lavender rose\\n', 'cat\\n'),\n", | |
" ('are you sure i don t know you\\n', 'you don t\\n'),\n", | |
" ('i will talk to you later\\n', 'you have fun at the party tomorrow\\n'),\n", | |
" ('i will love you\\n', 'love you more\\n'),\n", | |
" ('love you more\\n', 'wait a sec\\n'),\n", | |
" ('yeah the nails that was me too\\n', 'what s next a g string\\n'),\n", | |
" ('all day sucker what s that\\n', 'guess\\n'),\n", | |
" ('maybe you could blow me\\n', 'and i could\\n'),\n", | |
" ('fuck you after\\n', 'okay\\n'),\n", | |
" ('so that s velvet\\n', 'knight with a k\\n'),\n", | |
" ('manager\\n', 'whatever\\n'),\n", | |
" ('man\\n', 'let me tell you how it goes here\\n'),\n", | |
" ('because you look awfully familiar\\n', '\\n'),\n", | |
" ('thirteen fourteen nineteen\\n', 'nineteen years old almost twenty\\n'),\n", | |
" ('nineteen years old almost twenty\\n', '\\n'),\n", | |
" ('excuse me\\n', 'so why did you stop you were good\\n'),\n", | |
" ('anyhow those charges didn t stick\\n', 'i m sorry that happened to you\\n'),\n", | |
" ('yeah well\\n', 'look here s what\\n'),\n", | |
" ('look here s what\\n', 'end of every show\\n'),\n", | |
" ('you going to tell him ever\\n', 'why would i\\n'),\n", | |
" ('why would i\\n', 'it s who you are\\n'),\n", | |
" ('it s who you are\\n', 'it s who i was\\n'),\n", | |
" ('where s it on your ass\\n', 'no wait i ve seen your ass\\n'),\n", | |
" ('yeah i m going to wonder\\n', 'you know what else i m going to do\\n'),\n", | |
" ('you know what else i m going to do\\n',\n", | |
" 'i m gonna keep my big mouth shut\\n'),\n", | |
" ('i m gonna keep my big mouth shut\\n', 'the past is the past tay\\n'),\n", | |
" ('the past is the past tay\\n', 'and this is my future\\n'),\n", | |
" ('and this is my future\\n', 'okay\\n')]" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Preview the first 50 entries; each one is an (utterance, reply) tuple of newline-terminated strings.\n", | |
"data[:50]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-04-27T14:22:10.774077Z", | |
"start_time": "2018-04-27T14:22:10.755736Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"from collections import *\n", | |
"from tqdm import tqdm_notebook\n", | |
"\n", | |
"def train_char_lm(conversations, order=4):\n", | |
"    \"\"\"Train a character-level n-gram model with back-off histories.\n", | |
"\n", | |
"    Each (s1, s2) pair is joined into one string, left-padded with `order`\n", | |
"    '~' characters, and scanned once. The character at every position is\n", | |
"    counted under each suffix of the `order` characters before it, from the\n", | |
"    full-length history down to the empty string, so that a sampler can fall\n", | |
"    back to a shorter context when a longer one was never observed.\n", | |
"\n", | |
"    Args:\n", | |
"        conversations: iterable of (s1, s2) string pairs.\n", | |
"        order: maximum history length, in characters.\n", | |
"\n", | |
"    Returns:\n", | |
"        dict mapping each history string to a list of (char, probability)\n", | |
"        pairs; the probabilities of one history sum to 1.\n", | |
"    \"\"\"\n", | |
"    lm = defaultdict(Counter)\n", | |
"    pad = \"~\" * order\n", | |
"    for dialog in tqdm_notebook([''.join([pad, s1, s2]) for s1, s2 in conversations]):\n", | |
"        for i in range(len(dialog) - order):\n", | |
"            char = dialog[i + order]\n", | |
"            # j trims the history from the left, giving lengths order, order-1, ..., 0.\n", | |
"            # The bound has to depend on `order`, not on the position i: otherwise the\n", | |
"            # first characters of a dialog are never counted and late positions add\n", | |
"            # the empty history many times over (quadratic work per dialog).\n", | |
"            for j in range(order + 1):\n", | |
"                lm[dialog[i + j:i + order]][char] += 1\n", | |
"\n", | |
"    def normalize(counter):\n", | |
"        # Turn raw counts into relative frequencies.\n", | |
"        total = float(sum(counter.values()))\n", | |
"        return [(c, cnt / total) for c, cnt in counter.items()]\n", | |
"\n", | |
"    return {hist: normalize(chars) for hist, chars in lm.items()}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-04-27T14:29:06.549951Z", | |
"start_time": "2018-04-27T14:22:10.775560Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "d4d6bfde64fd4d71a5052e51c2a850b0", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/html": [ | |
"<p>Failed to display Jupyter Widget of type <code>HBox</code>.</p>\n", | |
"<p>\n", | |
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n", | |
" that the widgets JavaScript is still loading. If this message persists, it\n", | |
" likely means that the widgets JavaScript library is either not installed or\n", | |
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n", | |
" Widgets Documentation</a> for setup instructions.\n", | |
"</p>\n", | |
"<p>\n", | |
" If you're reading this message in another frontend (for example, a static\n", | |
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n", | |
" it may mean that your frontend doesn't currently support widgets.\n", | |
"</p>\n" | |
], | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=578347), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# Train on all loaded pairs with histories of up to 12 characters.\n", | |
"lm = train_char_lm(data, order=12)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-04-27T14:29:06.554726Z", | |
"start_time": "2018-04-27T14:29:06.551509Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('\\n', 0.42470295661785024),\n", | |
" (' ', 0.4600718430505665),\n", | |
" ('w', 0.0978170765404808),\n", | |
" ('u', 0.0038684719535783366),\n", | |
" ('r', 0.004973749654600719),\n", | |
" ('q', 0.0013815971262779773),\n", | |
" ('s', 0.0005526388505111909),\n", | |
" ('n', 0.0019342359767891683),\n", | |
" ('o', 0.0008289582757667864),\n", | |
" ('p', 0.003592152528322741),\n", | |
" ('v', 0.00027631942525559546)]" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Inspect the next-character distribution stored for the 4-character history 'ello'.\n", | |
"lm['ello']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-04-27T17:59:35.626886Z", | |
"start_time": "2018-04-27T17:59:35.615234Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"from random import random\n", | |
"\n", | |
"def generate_letter(lm, history, order):\n", | |
"    \"\"\"Sample the next character for `history`, backing off to shorter contexts.\n", | |
"\n", | |
"    The most recent `order` characters of `history` are looked up first. If\n", | |
"    that context is not in the model, the oldest character is dropped and the\n", | |
"    lookup is repeated, down to the empty context.\n", | |
"\n", | |
"    Args:\n", | |
"        lm: dict mapping history string -> list of (char, probability) pairs.\n", | |
"        history: text generated so far (including any padding).\n", | |
"        order: maximum number of trailing characters to condition on.\n", | |
"\n", | |
"    Returns:\n", | |
"        One character drawn from the distribution of the longest known context.\n", | |
"\n", | |
"    Raises:\n", | |
"        ValueError: if no suffix of `history` (not even '') is in `lm`.\n", | |
"    \"\"\"\n", | |
"    for length in range(min(order, len(history)), -1, -1):\n", | |
"        # Slice from the left end so that length 0 yields '' (history[-0:] would not).\n", | |
"        context = history[len(history) - length:]\n", | |
"        dist = lm.get(context)\n", | |
"        if not dist:\n", | |
"            continue\n", | |
"        x = random()\n", | |
"        for c, v in dist:\n", | |
"            x -= v\n", | |
"            if x <= 0:\n", | |
"                return c\n", | |
"        # Rounding can leave the probabilities summing to just under 1.\n", | |
"        return dist[-1][0]\n", | |
"\n", | |
"    raise ValueError(\"no distribution found for any suffix of %r\" % (history,))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-04-27T17:57:35.655297Z", | |
"start_time": "2018-04-27T17:57:35.616456Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[nltk_data] Downloading package stopwords to /home/janek/nltk_data...\n", | |
"[nltk_data] Package stopwords is already up-to-date!\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import nltk\n", | |
"import pickle\n", | |
"import re\n", | |
"import numpy as np\n", | |
"import csv\n", | |
"\n", | |
"nltk.download('stopwords')\n", | |
"from nltk.corpus import stopwords\n", | |
"\n", | |
"def text_prepare(text):\n", | |
"    \"\"\"Normalize `text` into a space-separated string of lower-case tokens.\n", | |
"\n", | |
"    Steps: lower-case; replace the separators / ( ) { } [ ] | @ , ; with a\n", | |
"    space; delete every character other than 0-9, a-z, space, #, + and _;\n", | |
"    split on whitespace and drop NLTK's English stopwords.\n", | |
"\n", | |
"    Args:\n", | |
"        text: raw input string.\n", | |
"\n", | |
"    Returns:\n", | |
"        The remaining tokens joined by single spaces ('' if none remain).\n", | |
"    \"\"\"\n", | |
"    # Raw string: \\[ \\] \\| are not valid escapes in a normal string literal.\n", | |
"    replace_by_space_re = re.compile(r'[/(){}\\[\\]\\|@,;]')\n", | |
"    bad_symbols_re = re.compile('[^0-9a-z #+_]')\n", | |
"    stopwords_set = set(stopwords.words('english'))\n", | |
"\n", | |
"    text = text.lower()\n", | |
"    text = replace_by_space_re.sub(' ', text)\n", | |
"    text = bad_symbols_re.sub('', text)\n", | |
"    # str.split() never yields empty tokens, so the join needs no further stripping.\n", | |
"    return ' '.join(token for token in text.split() if token not in stopwords_set)\n", | |
"\n", | |
"def generate_answer(lm, question, order=4, max_len=25):\n", | |
"    \"\"\"Generate a reply to `question` one character at a time.\n", | |
"\n", | |
"    The seed context is `order` '~' characters, the preprocessed question and\n", | |
"    a newline. Characters are sampled with generate_letter until a newline is\n", | |
"    produced, or until more than `max_len` characters have been produced and\n", | |
"    the latest one is whitespace. That final character is not returned.\n", | |
"\n", | |
"    Args:\n", | |
"        lm: model passed through to generate_letter.\n", | |
"        question: raw question text.\n", | |
"        order: number of trailing characters kept as context.\n", | |
"        max_len: soft length limit; generation only stops at whitespace.\n", | |
"\n", | |
"    Returns:\n", | |
"        The generated reply as a string, without the terminating character.\n", | |
"    \"\"\"\n", | |
"    context = \"~\" * order + text_prepare(question) + \"\\n\"\n", | |
"    produced = []\n", | |
"    while True:\n", | |
"        letter = generate_letter(lm, context, order)\n", | |
"        context = context[-order:] + letter\n", | |
"        produced.append(letter)\n", | |
"        if letter == '\\n':\n", | |
"            break\n", | |
"        if len(produced) > max_len and letter.isspace():\n", | |
"            break\n", | |
"    return \"\".join(produced[:-1])\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-04-27T17:59:39.322327Z", | |
"start_time": "2018-04-27T17:59:39.318069Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"hello\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"'hello'" | |
] | |
}, | |
"execution_count": 30, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Sample a reply; order is left at its default of 4 although lm was trained with order=12.\n", | |
"generate_answer(lm, \"Hey\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-04-27T18:00:00.333693Z", | |
"start_time": "2018-04-27T18:00:00.328499Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"that it fence\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"'that it fence'" | |
] | |
}, | |
"execution_count": 37, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Replies are drawn with random(), and no seed is set in this notebook, so re-running can give a different answer.\n", | |
"generate_answer(lm, \"How are you doing?\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-04-27T18:00:02.765722Z", | |
"start_time": "2018-04-27T18:00:02.760140Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"holy sharolina\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"'holy sharolina'" | |
] | |
}, | |
"execution_count": 38, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# The question goes through text_prepare first, which removes characters such as ' and ? and drops stopwords.\n", | |
"generate_answer(lm, \"What's your hobby?\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-04-27T18:00:07.273402Z", | |
"start_time": "2018-04-27T18:00:07.268222Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"chucklng\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"'chucklng'" | |
] | |
}, | |
"execution_count": 40, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Probe with a programming question; the model above was trained on the OpenSubtitles pairs only.\n", | |
"generate_answer(lm, \"How to write a loop in python?\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment