Skip to content

Instantly share code, notes, and snippets.

@zilista
Created March 14, 2020 15:53
Show Gist options
  • Save zilista/a6b46d5b43d6afa672fcfab7ccfed5bb to your computer and use it in GitHub Desktop.
Save zilista/a6b46d5b43d6afa672fcfab7ccfed5bb to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import json #Чтобы обрабатывать данные в формате json\n",
"import pandas as pd #Для анализа табличных данных\n",
"import re #Используем регулярные выражения \n",
"import nltk #Для обработки естественного языка\n",
"import numpy as np\n",
"import string\n",
"import spacy\n",
"from scipy import stats\n",
"from matplotlib import pyplot\n",
"import en_core_web_sm #загрузка модели для анализа английского языка"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"#https://raw.githubusercontent.com/taivop/joke-dataset/master/wocka.json #скачаем файл для примера\n",
"#https://github.com/taivop/joke-dataset/blob/master/reddit_jokes.json"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# wocka.json is the joke dataset whose URL is listed in the cell above. JSON text is\n",
"# UTF-8, so name the encoding explicitly instead of relying on the platform default.\n",
"with open('wocka.json', 'r', encoding='utf-8') as json_file:\n",
"    data = json.load(json_file)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Build the working frame from the parsed JSON records.\n",
"wocka = pd.DataFrame(data)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title \n",
"0 Cow With No Legs \n",
"1 Jumping Cow \n",
"2 Black, White and Red \n",
"3 Guy in a Bar \n",
"4 Progress "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Токенизация"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def tokenize_words(text):\n",
"    \"\"\"Split ``text`` into a list of word tokens.\n",
"\n",
"    A token is a maximal run of regex word characters (letters, digits and\n",
"    underscore); everything else acts as a separator. The words are matched\n",
"    directly rather than splitting on the separators, so text that starts or\n",
"    ends with punctuation or whitespace does not yield empty-string tokens.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    text : str\n",
"        Text to tokenize. Case is left unchanged.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of str\n",
"        Tokens in order of appearance; ``[]`` if ``text`` has no word characters.\n",
"    \"\"\"\n",
"    return re.findall(r'\\w+', text)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Lowercase each joke, then split it into word tokens.\n",
"wocka['tokenize'] = wocka['body'].map(lambda body: tokenize_words(body.lower()))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Удаление мусора и стоп-слов"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Before: What do you call a cow with no legs?\r\n",
"\r\n",
"Ground Beef!\r\n",
"\n",
"After: what do you call a cow with no legs ground beef\n"
]
}
],
"source": [
"def remove_punctuation(text):\n",
"    \"\"\"Lowercase ``text`` and strip punctuation from the edges of each word.\n",
"\n",
"    The text is split on whitespace, ``string.punctuation`` characters are\n",
"    stripped from both ends of every word, and the words are re-joined with\n",
"    single spaces. Punctuation inside a word (the apostrophe in ``isn't``)\n",
"    is kept. Words made up only of punctuation are dropped, so they do not\n",
"    leave doubled or dangling spaces in the result.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    text : str\n",
"        Text to clean.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        The cleaned, lowercased text; an empty string if no word is left.\n",
"    \"\"\"\n",
"    stripped = (word.strip(string.punctuation) for word in text.split())\n",
"    return ' '.join(word for word in stripped if word).lower()\n",
"\n",
"# Show the first joke before and after remove_punctuation().\n",
"print('Before: ' + wocka['body'][0] + '\\r\\n')\n",
"print('After: ' + remove_punctuation(wocka['body'][0]))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# English stop-word list from NLTK; remove_stop_words() filters tokens against it.\n",
"# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded\n",
"# beforehand (nltk.download('stopwords')) - confirm for the target environment.\n",
"stopwords = nltk.corpus.stopwords.words('english')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def remove_stop_words(text):\n",
"    \"\"\"Tokenize ``text`` and drop stop words.\n",
"\n",
"    Every ``string.punctuation`` character is deleted first (so ``isn't``\n",
"    becomes ``isnt``), the remainder is split into word tokens, and tokens\n",
"    found in the module-level ``stopwords`` collection are discarded. The\n",
"    comparison is exact (case-sensitive); this notebook lowercases the text\n",
"    before calling.\n",
"\n",
"    Word tokens are matched directly instead of splitting on separators, so\n",
"    leading or trailing whitespace does not produce empty-string tokens.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    text : str\n",
"        Text to process.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of str\n",
"        The surviving tokens in their original order.\n",
"    \"\"\"\n",
"    without_punct = text.translate(str.maketrans('', '', string.punctuation))\n",
"    return [word for word in re.findall(r'\\w+', without_punct)\n",
"            if word not in stopwords]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Lowercase each joke, then strip punctuation and stop words from it.\n",
"wocka['body_no_stop'] = wocka['body'].map(lambda body: remove_stop_words(body.lower()))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" <th>body_no_stop</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" <td>[whats, black, white, red, newspaper]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" <td>[guy, walks, bar, says, ouch]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" <td>[opposite, pro, con, isnt, opposite, progress,...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \\\n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... \n",
"\n",
" body_no_stop \n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [whats, black, white, red, newspaper] \n",
"3 [guy, walks, bar, says, ouch] \n",
"4 [opposite, pro, con, isnt, opposite, progress,... "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Расшифровка аббревиатур"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"abbreviation_map = {\n",
" 'lol': 'laugh out loud',\n",
" 'brb': 'be right back',\n",
" 'serp': 'search engine results page'\n",
"}\n",
"\n",
"\n",
"def expand_abbreviations(text, abbreviations):\n",
"    \"\"\"Replace each abbreviation that appears as a whole word with its expansion.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    text : str\n",
"        Text to process. Matching is case-sensitive.\n",
"    abbreviations : dict\n",
"        Maps an abbreviation to its expansion (both ``str``).\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        ``text`` with every stand-alone abbreviation expanded.\n",
"\n",
"    Notes\n",
"    -----\n",
"    Keys are matched literally (regex metacharacters are escaped) and only\n",
"    when not embedded in a longer word, so 'lol' does not rewrite 'lollipop'.\n",
"    All keys are applied in a single pass, which keeps text that was just\n",
"    inserted by one expansion from being expanded again by another key.\n",
"    \"\"\"\n",
"    # Longest keys first so that, of two keys sharing a prefix, the longer one wins.\n",
"    keys = sorted((key for key in abbreviations if key), key=len, reverse=True)\n",
"    if not keys:\n",
"        return text\n",
"    # Lookarounds instead of a word-boundary anchor: they also work for keys\n",
"    # that begin or end with a non-word character.\n",
"    pattern = r'(?<!\\w)(?:' + '|'.join(map(re.escape, keys)) + r')(?!\\w)'\n",
"    # A function replacement inserts the expansion verbatim, backslashes included.\n",
"    return re.sub(pattern, lambda match: abbreviations[match.group(0)], text)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"sample_sentence = remove_punctuation(\"Hold my bear whie I check the serp one moment. brb.\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Before: hold my bear whie i check the serp one moment brb\n",
"After: hold my bear whie i check the search engine results page one moment be right back\n"
]
}
],
"source": [
"print('Before: ' + sample_sentence)\n",
"print('After: ' + expand_abbreviations(sample_sentence, abbreviation_map))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Стемминг"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"ps = nltk.PorterStemmer()\n",
"\n",
"def stemming(text):\n",
"    \"\"\"Return the stem of every token in ``text`` as a new list, in the same order.\n",
"\n",
"    Stems are produced by the module-level stemmer ``ps``.\n",
"    \"\"\"\n",
"    return list(map(ps.stem, text))\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['studi', 'studi', 'studi', 'studi']\n"
]
}
],
"source": [
"word_form = ['study', 'studies', 'studied', 'studying']\n",
"\n",
"print(stemming(word_form))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# Stem the stop-word-free tokens of every joke.\n",
"wocka['body_no_stop_stemmed'] = wocka['body_no_stop'].apply(stemming)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" <th>body_no_stop</th>\n",
" <th>body_no_stop_stemmed</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[call, cow, leg, ground, beef]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[call, cow, jump, barb, wire, fenc, utter, des...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" <td>[whats, black, white, red, newspaper]</td>\n",
" <td>[what, black, white, red, newspap]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" <td>[guy, walks, bar, says, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" <td>[opposite, pro, con, isnt, opposite, progress,...</td>\n",
" <td>[opposit, pro, con, isnt, opposit, progress, c...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \\\n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... \n",
"\n",
" body_no_stop \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [whats, black, white, red, newspaper] \n",
"3 [guy, walks, bar, says, ouch] \n",
"4 [opposite, pro, con, isnt, opposite, progress,... \n",
"\n",
" body_no_stop_stemmed \n",
"0 [call, cow, leg, ground, beef] \n",
"1 [call, cow, jump, barb, wire, fenc, utter, des... \n",
"2 [what, black, white, red, newspap] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposit, pro, con, isnt, opposit, progress, c... "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 'можно', 'при', 'наконец', 'два', 'об', 'другой', 'хоть', 'после', 'над', 'больше', 'тот', 'через', 'эти', 'нас', 'про', 'всего', 'них', 'какая', 'много', 'разве', 'три', 'эту', 'моя', 'впрочем', 'хорошо', 'свою', 'этой', 'перед', 'иногда', 'лучше', 'чуть', 'том', 'нельзя', 'такой', 'им', 'более', 'всегда', 'конечно', 'всю', 'между']\n"
]
}
],
"source": [
"# NOTE: this import rebinds the name `stopwords`. Until this cell it held the English\n",
"# stop-word list that remove_stop_words() reads; from here on it is the NLTK corpus\n",
"# object, so re-running the stop-word cells after this one will not see that list.\n",
"from nltk.corpus import stopwords\n",
"print(stopwords.words(\"russian\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Лемматизация"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"from nltk.stem import WordNetLemmatizer\n",
"\n",
"lemmatize = WordNetLemmatizer()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"better: good\n"
]
}
],
"source": [
"print('better: ' + lemmatize.lemmatize(\"better\", pos='a'))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"nlp = en_core_web_sm.load()\n",
"\n",
"def lemmatization(text):\n",
"    \"\"\"Lemmatize a list of tokens with the module-level spaCy pipeline ``nlp``.\n",
"\n",
"    The tokens are joined with single spaces and parsed as one document; the\n",
"    ``lemma_`` of every token in that document is returned as a list. The\n",
"    joined string is tokenized again by ``nlp``, so the result can have a\n",
"    different length than ``text``.\n",
"    \"\"\"\n",
"    document = nlp(' '.join(text))\n",
"    return [token.lemma_ for token in document]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# Lemmatize the stop-word-free tokens of every joke.\n",
"wocka['body_no_stop_lemma'] = wocka['body_no_stop'].apply(lemmatization)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" <th>body_no_stop</th>\n",
" <th>body_no_stop_stemmed</th>\n",
" <th>body_no_stop_lemma</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[call, cow, leg, ground, beef]</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[call, cow, jump, barb, wire, fenc, utter, des...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" <td>[whats, black, white, red, newspaper]</td>\n",
" <td>[what, black, white, red, newspap]</td>\n",
" <td>[what, s, black, white, red, newspaper]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" <td>[guy, walks, bar, says, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" <td>[opposite, pro, con, isnt, opposite, progress,...</td>\n",
" <td>[opposit, pro, con, isnt, opposit, progress, c...</td>\n",
" <td>[opposite, pro, con, be, not, opposite, progre...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \\\n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... \n",
"\n",
" body_no_stop \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [whats, black, white, red, newspaper] \n",
"3 [guy, walks, bar, says, ouch] \n",
"4 [opposite, pro, con, isnt, opposite, progress,... \n",
"\n",
" body_no_stop_stemmed \\\n",
"0 [call, cow, leg, ground, beef] \n",
"1 [call, cow, jump, barb, wire, fenc, utter, des... \n",
"2 [what, black, white, red, newspap] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposit, pro, con, isnt, opposit, progress, c... \n",
"\n",
" body_no_stop_lemma \n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [what, s, black, white, red, newspaper] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposite, pro, con, be, not, opposite, progre... "
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## N-Grams"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"from nltk.util import ngrams"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"def create_ngrams(text, n):\n",
"    \"\"\"Return every run of ``n`` consecutive tokens in ``text`` as a tuple.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    text : iterable\n",
"        Token sequence.\n",
"    n : int\n",
"        Number of tokens in each n-gram.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of tuple\n",
"        The n-grams in order of appearance; empty when ``text`` has fewer than\n",
"        ``n`` tokens or when ``n`` is less than 1.\n",
"\n",
"    Notes\n",
"    -----\n",
"    Built with ``zip`` over shifted slices, which simply stops at the shortest\n",
"    slice. This avoids delegating to a generator that signals exhaustion by\n",
"    raising StopIteration, the cause of the DeprecationWarning recorded in\n",
"    this notebook's output.\n",
"    \"\"\"\n",
"    tokens = list(text)\n",
"    return list(zip(*(tokens[start:] for start in range(n))))"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:3: DeprecationWarning: generator 'ngrams' raised StopIteration\n",
" This is separate from the ipykernel package so we can avoid doing imports until\n"
]
}
],
"source": [
"# Same token lists, two window sizes; `n` is forwarded to create_ngrams by apply().\n",
"wocka['bigrams'] = wocka['tokenize'].apply(create_ngrams, n=2)\n",
"wocka['trigrams'] = wocka['tokenize'].apply(create_ngrams, n=3)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" <th>body_no_stop</th>\n",
" <th>body_no_stop_stemmed</th>\n",
" <th>body_no_stop_lemma</th>\n",
" <th>bigrams</th>\n",
" <th>trigrams</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[call, cow, leg, ground, beef]</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[call, cow, jump, barb, wire, fenc, utter, des...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" <td>[whats, black, white, red, newspaper]</td>\n",
" <td>[what, black, white, red, newspap]</td>\n",
" <td>[what, s, black, white, red, newspaper]</td>\n",
" <td>[(what, s), (s, black), (black, and), (and, wh...</td>\n",
" <td>[(what, s, black), (s, black, and), (black, an...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" <td>[guy, walks, bar, says, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[(so, this), (this, guy), (guy, walks), (walks...</td>\n",
" <td>[(so, this, guy), (this, guy, walks), (guy, wa...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" <td>[opposite, pro, con, isnt, opposite, progress,...</td>\n",
" <td>[opposit, pro, con, isnt, opposit, progress, c...</td>\n",
" <td>[opposite, pro, con, be, not, opposite, progre...</td>\n",
" <td>[(if, the), (the, opposite), (opposite, of), (...</td>\n",
" <td>[(if, the, opposite), (the, opposite, of), (op...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \\\n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... \n",
"\n",
" body_no_stop \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [whats, black, white, red, newspaper] \n",
"3 [guy, walks, bar, says, ouch] \n",
"4 [opposite, pro, con, isnt, opposite, progress,... \n",
"\n",
" body_no_stop_stemmed \\\n",
"0 [call, cow, leg, ground, beef] \n",
"1 [call, cow, jump, barb, wire, fenc, utter, des... \n",
"2 [what, black, white, red, newspap] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposit, pro, con, isnt, opposit, progress, c... \n",
"\n",
" body_no_stop_lemma \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [what, s, black, white, red, newspaper] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposite, pro, con, be, not, opposite, progre... \n",
"\n",
" bigrams \\\n",
"0 [(what, do), (do, you), (you, call), (call, a)... \n",
"1 [(what, do), (do, you), (you, call), (call, a)... \n",
"2 [(what, s), (s, black), (black, and), (and, wh... \n",
"3 [(so, this), (this, guy), (guy, walks), (walks... \n",
"4 [(if, the), (the, opposite), (opposite, of), (... \n",
"\n",
" trigrams \n",
"0 [(what, do, you), (do, you, call), (you, call,... \n",
"1 [(what, do, you), (do, you, call), (you, call,... \n",
"2 [(what, s, black), (s, black, and), (black, an... \n",
"3 [(so, this, guy), (this, guy, walks), (guy, wa... \n",
"4 [(if, the, opposite), (the, opposite, of), (op... "
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tagging"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# Loads the en_core_web_sm model again and rebinds `nlp`; the same call already ran\n",
"# in the lemmatization section of this notebook.\n",
"nlp = en_core_web_sm.load()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"def pos(text):\n",
"    \"\"\"Pair every token with its part-of-speech tag.\n",
"\n",
"    The tokens in ``text`` are joined with single spaces and parsed by the\n",
"    module-level spaCy pipeline ``nlp``. The result is a list of\n",
"    ``(token.text, token.pos_)`` tuples, one per token of the parsed document,\n",
"    for example ``('what', 'PRON')``.\n",
"\n",
"    The joined text is parsed a single time and both attributes are read from\n",
"    the same document; parsing it once per attribute doubled the pipeline cost\n",
"    for the same output.\n",
"    \"\"\"\n",
"    document = nlp(' '.join(text))\n",
"    return [(token.text, token.pos_) for token in document]"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# Tag the tokens of every joke with their part of speech.\n",
"wocka['parts_of_speech'] = wocka['tokenize'].apply(pos)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" <th>body_no_stop</th>\n",
" <th>body_no_stop_stemmed</th>\n",
" <th>body_no_stop_lemma</th>\n",
" <th>bigrams</th>\n",
" <th>trigrams</th>\n",
" <th>parts_of_speech</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[call, cow, leg, ground, beef]</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" <td>[(what, PRON), (do, AUX), (you, PRON), (call, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[call, cow, jump, barb, wire, fenc, utter, des...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" <td>[(what, PRON), (do, AUX), (you, PRON), (call, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" <td>[whats, black, white, red, newspaper]</td>\n",
" <td>[what, black, white, red, newspap]</td>\n",
" <td>[what, s, black, white, red, newspaper]</td>\n",
" <td>[(what, s), (s, black), (black, and), (and, wh...</td>\n",
" <td>[(what, s, black), (s, black, and), (black, an...</td>\n",
" <td>[(what, PRON), (s, VERB), (black, ADJ), (and, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" <td>[guy, walks, bar, says, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[(so, this), (this, guy), (guy, walks), (walks...</td>\n",
" <td>[(so, this, guy), (this, guy, walks), (guy, wa...</td>\n",
" <td>[(so, ADV), (this, DET), (guy, NOUN), (walks, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" <td>[opposite, pro, con, isnt, opposite, progress,...</td>\n",
" <td>[opposit, pro, con, isnt, opposit, progress, c...</td>\n",
" <td>[opposite, pro, con, be, not, opposite, progre...</td>\n",
" <td>[(if, the), (the, opposite), (opposite, of), (...</td>\n",
" <td>[(if, the, opposite), (the, opposite, of), (op...</td>\n",
" <td>[(if, SCONJ), (the, DET), (opposite, NOUN), (o...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \\\n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... \n",
"\n",
" body_no_stop \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [whats, black, white, red, newspaper] \n",
"3 [guy, walks, bar, says, ouch] \n",
"4 [opposite, pro, con, isnt, opposite, progress,... \n",
"\n",
" body_no_stop_stemmed \\\n",
"0 [call, cow, leg, ground, beef] \n",
"1 [call, cow, jump, barb, wire, fenc, utter, des... \n",
"2 [what, black, white, red, newspap] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposit, pro, con, isnt, opposit, progress, c... \n",
"\n",
" body_no_stop_lemma \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [what, s, black, white, red, newspaper] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposite, pro, con, be, not, opposite, progre... \n",
"\n",
" bigrams \\\n",
"0 [(what, do), (do, you), (you, call), (call, a)... \n",
"1 [(what, do), (do, you), (you, call), (call, a)... \n",
"2 [(what, s), (s, black), (black, and), (and, wh... \n",
"3 [(so, this), (this, guy), (guy, walks), (walks... \n",
"4 [(if, the), (the, opposite), (opposite, of), (... \n",
"\n",
" trigrams \\\n",
"0 [(what, do, you), (do, you, call), (you, call,... \n",
"1 [(what, do, you), (do, you, call), (you, call,... \n",
"2 [(what, s, black), (s, black, and), (black, an... \n",
"3 [(so, this, guy), (this, guy, walks), (guy, wa... \n",
"4 [(if, the, opposite), (the, opposite, of), (op... \n",
"\n",
" parts_of_speech \n",
"0 [(what, PRON), (do, AUX), (you, PRON), (call, ... \n",
"1 [(what, PRON), (do, AUX), (you, PRON), (call, ... \n",
"2 [(what, PRON), (s, VERB), (black, ADJ), (and, ... \n",
"3 [(so, ADV), (this, DET), (guy, NOUN), (walks, ... \n",
"4 [(if, SCONJ), (the, DET), (opposite, NOUN), (o... "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows, now including the parts_of_speech column.\n",
"wocka.head()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"from nltk.tokenize import sent_tokenize\n",
"from spacy import displacy"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"def remove_line_breaks(text):\n",
"    \"\"\"Collapse every whitespace run to a single space and lowercase the text.\n",
"\n",
"    Args:\n",
"        text: raw joke string, possibly containing CR/LF line breaks.\n",
"\n",
"    Returns:\n",
"        The lowercased text with words separated by exactly one space and\n",
"        no leading or trailing whitespace.\n",
"    \"\"\"\n",
"    # str.split() with no argument already drops \\r and \\n along with all other\n",
"    # whitespace, so the words need no further stripping.\n",
"    return ' '.join(text.split()).lower()\n",
"\n",
"def tokenize_sentences(text):\n",
"    \"\"\"Split a joke into lowercased sentences.\n",
"\n",
"    Args:\n",
"        text: raw joke string, possibly containing CR/LF line breaks.\n",
"\n",
"    Returns:\n",
"        list of sentence strings, each lowercased and free of line breaks.\n",
"    \"\"\"\n",
"    # Segment before lowercasing: NLTK's sentence tokenizer uses capitalization\n",
"    # as one of its cues for where a sentence starts.\n",
"    flattened = ' '.join(text.split())\n",
"    return [sentence.lower() for sentence in sent_tokenize(flattened)]"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# `nlp` was already loaded earlier in the notebook, so the model is not reloaded here.\n",
"# `doc` holds the list of sentence strings of the first joke.\n",
"doc = tokenize_sentences(wocka['body'][0])"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"068bfe01e5794e56bb73dd81eeac2cf4-0\" class=\"displacy\" width=\"950\" height=\"287.0\" direction=\"ltr\" style=\"max-width: none; height: 287.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">what</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">PRON</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"150\">do</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"150\">AUX</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"250\">you</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"250\">PRON</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"350\">call</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"350\">VERB</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"450\">a</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"450\">DET</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"550\">cow</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"550\">NOUN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"650\">with</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"650\">ADP</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"750\">no</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"750\">DET</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"850\">legs?</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"850\">NOUN</tspan>\n",
"</text>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-0\" stroke-width=\"2px\" d=\"M70,152.0 C70,2.0 350.0,2.0 350.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">dobj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M70,154.0 L62,142.0 78,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-1\" stroke-width=\"2px\" d=\"M170,152.0 C170,52.0 345.0,52.0 345.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">aux</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M170,154.0 L162,142.0 178,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-2\" stroke-width=\"2px\" d=\"M270,152.0 C270,102.0 340.0,102.0 340.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M270,154.0 L262,142.0 278,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-3\" stroke-width=\"2px\" d=\"M470,152.0 C470,102.0 540.0,102.0 540.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M470,154.0 L462,142.0 478,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-4\" stroke-width=\"2px\" d=\"M370,152.0 C370,52.0 545.0,52.0 545.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">oprd</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M545.0,154.0 L553.0,142.0 537.0,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-5\" stroke-width=\"2px\" d=\"M370,152.0 C370,2.0 650.0,2.0 650.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-5\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M650.0,154.0 L658.0,142.0 642.0,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-6\" stroke-width=\"2px\" d=\"M770,152.0 C770,102.0 840.0,102.0 840.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-6\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M770,154.0 L762,142.0 778,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-7\" stroke-width=\"2px\" d=\"M670,152.0 C670,52.0 845.0,52.0 845.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-7\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M845.0,154.0 L853.0,142.0 837.0,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"</svg>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"d5cdaa947c3e42bc8d9a2451eee1eff9-0\" class=\"displacy\" width=\"250\" height=\"187.0\" direction=\"ltr\" style=\"max-width: none; height: 187.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">ground</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">NOUN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"150\">beef!</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"150\">NOUN</tspan>\n",
"</text>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-d5cdaa947c3e42bc8d9a2451eee1eff9-0-0\" stroke-width=\"2px\" d=\"M70,52.0 C70,2.0 150.0,2.0 150.0,52.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-d5cdaa947c3e42bc8d9a2451eee1eff9-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M70,54.0 L62,42.0 78,42.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"</svg>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Render one dependency-parse diagram per sentence held in `doc`.\n",
"for x in doc:\n",
" displacy.render(nlp(x), style='dep', jupyter=True, options={'compact': False, 'distance': 100})"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# Russian sample text, rendered by the next cell with the English model bound to `nlp`.\n",
"# NOTE(review): presumably a demonstration that the English model cannot tag Russian - confirm intent.\n",
"doc = tokenize_sentences('мама мыть раму, а я учил урок')"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"844a18e83c0640abb4e861f0def39b6b-0\" class=\"displacy\" width=\"750\" height=\"187.0\" direction=\"ltr\" style=\"max-width: none; height: 187.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">мама</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">PROPN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"150\">мыть</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"150\">PROPN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"250\">раму,</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"250\">PROPN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"350\">а</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"350\">PROPN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"450\">я</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"450\">PROPN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"550\">учил</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"550\">NOUN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"650\">урок</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"650\">PROPN</tspan>\n",
"</text>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-844a18e83c0640abb4e861f0def39b6b-0-0\" stroke-width=\"2px\" d=\"M70,52.0 C70,2.0 150.0,2.0 150.0,52.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-844a18e83c0640abb4e861f0def39b6b-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M70,54.0 L62,42.0 78,42.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-844a18e83c0640abb4e861f0def39b6b-0-1\" stroke-width=\"2px\" d=\"M170,52.0 C170,2.0 250.0,2.0 250.0,52.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-844a18e83c0640abb4e861f0def39b6b-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M170,54.0 L162,42.0 178,42.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-844a18e83c0640abb4e861f0def39b6b-0-2\" stroke-width=\"2px\" d=\"M270,52.0 C270,2.0 350.0,2.0 350.0,52.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-844a18e83c0640abb4e861f0def39b6b-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">intj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M350.0,54.0 L358.0,42.0 342.0,42.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-844a18e83c0640abb4e861f0def39b6b-0-3\" stroke-width=\"2px\" d=\"M470,52.0 C470,2.0 550.0,2.0 550.0,52.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-844a18e83c0640abb4e861f0def39b6b-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M470,54.0 L462,42.0 478,42.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-844a18e83c0640abb4e861f0def39b6b-0-4\" stroke-width=\"2px\" d=\"M570,52.0 C570,2.0 650.0,2.0 650.0,52.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-844a18e83c0640abb4e861f0def39b6b-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">dobj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M650.0,54.0 L658.0,42.0 642.0,42.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"</svg>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Render one dependency-parse diagram per sentence held in `doc`.\n",
"for x in doc:\n",
" displacy.render(nlp(x), style='dep', jupyter=True, options={'compact': False, 'distance': 100})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Named Entity Recognition (NER)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): earlier cells already bind this same model to `nlp`; this reload looks redundant.\n",
"nlp = en_core_web_sm.load()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"def ner(text):\n",
"    \"\"\"Extract named entities from raw text with the module-level spaCy pipeline.\n",
"\n",
"    Args:\n",
"        text: string passed to `nlp`.\n",
"\n",
"    Returns:\n",
"        list of (entity_text, entity_label) tuples in the order of `doc.ents`.\n",
"    \"\"\"\n",
"    # Only the text and label of each entity are part of the result;\n",
"    # character offsets are not collected.\n",
"    return [(ent.text, ent.label_) for ent in nlp(text).ents]"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"[('Saddam Hussein', 'PERSON'),\n",
" ('Hussein', 'PERSON'),\n",
" ('Paddy', 'LOC'),\n",
" ('County Cavan', 'GPE'),\n",
" ('Ireland', 'GPE'),\n",
" ('Paddy', 'WORK_OF_ART'),\n",
" ('Saddam', 'PERSON'),\n",
" ('Paddy', 'PERSON'),\n",
" ('Sean', 'PERSON'),\n",
" ('Gerry', 'PERSON'),\n",
" ('8', 'CARDINAL'),\n",
" ('Saddam', 'PERSON'),\n",
" ('Paddy', 'PERSON'),\n",
" ('1 million', 'CARDINAL'),\n",
" ('Begorra', 'WORK_OF_ART'),\n",
" ('Paddy', 'PERSON'),\n",
" ('the next day', 'DATE'),\n",
" ('Paddy', 'PERSON'),\n",
" ('Hussein', 'PERSON'),\n",
" ('Paddy', 'PERSON'),\n",
" ('Saddam', 'PERSON'),\n",
" ('2', 'CARDINAL'),\n",
" ('Murphy', 'PERSON'),\n",
" ('Saddam', 'PERSON'),\n",
" ('Paddy', 'PERSON'),\n",
" ('16 thousand', 'CARDINAL'),\n",
" ('14 thousand', 'CARDINAL'),\n",
" ('1 and a half million', 'CARDINAL'),\n",
" ('Paddy', 'PERSON'),\n",
" ('Paddy', 'LOC'),\n",
" ('the next day', 'DATE'),\n",
" ('Hussein', 'PERSON'),\n",
" ('Ted', 'PERSON'),\n",
" ('Saddam', 'PERSON'),\n",
" ('a minute', 'TIME'),\n",
" ('Paddy', 'PERSON'),\n",
" ('500', 'CARDINAL'),\n",
" ('19', 'CARDINAL'),\n",
" ('2 million', 'CARDINAL'),\n",
" ('Paddy', 'PERSON'),\n",
" ('Paddy', 'LOC'),\n",
" ('the next day', 'DATE'),\n",
" ('Hussein', 'PERSON'),\n",
" ('Saddam', 'PERSON'),\n",
" ('Paddy', 'PERSON'),\n",
" ('2 million', 'CARDINAL')]"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Run entity extraction on the joke body stored under index label 13.\n",
"ner(wocka['body'][13])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Word Embeddings: word2vec, GloVe"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import nltk\n",
"import gensim\n",
"from gensim import corpora, models, similarities"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"# Pool all titles followed by all bodies into one flat list of raw texts.\n",
"corpus = wocka['title'].tolist() + wocka['body'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# Each title/body becomes one list of word tokens.\n",
"tok_corp = [nltk.word_tokenize(document) for document in corpus]"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['Cow', 'With', 'No', 'Legs'],\n",
" ['Jumping', 'Cow'],\n",
" ['Black', ',', 'White', 'and', 'Red'],\n",
" ['Guy', 'in', 'a', 'Bar'],\n",
" ['Progress']]"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Peek at the first five tokenized documents.\n",
"tok_corp[:5]"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"# Train 32-dimensional word2vec vectors on the tokenized corpus; min_count=1 keeps words seen only once.\n",
"# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` - confirm the installed version.\n",
"# NOTE(review): no seed is passed, so the similarity scores below may differ between runs.\n",
"model = gensim.models.Word2Vec(tok_corp, min_count=1, size=32)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('guy', 0.9541391134262085),\n",
" ('woman', 0.9270364046096802),\n",
" ('boy', 0.9067033529281616),\n",
" ('doctor', 0.9010869264602661),\n",
" ('priest', 0.881646990776062),\n",
" ('girl', 0.8726887702941895),\n",
" ('lady', 0.8652868270874023),\n",
" ('lawyer', 0.8572322130203247),\n",
" ('farmer', 0.8566977977752686),\n",
" ('bartender', 0.848757266998291)]"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Nearest neighbours of 'man' in the trained embedding space.\n",
"model.wv.most_similar('man')"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9270364"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Similarity score between the vectors of the two words.\n",
"model.wv.similarity('man', 'woman')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example: Joke Lines and Length"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"def split_punchline(joke):\n",
"    \"\"\"Count the segments of a joke separated by CRLF line breaks.\n",
"\n",
"    A doubled break (blank line) is consumed as a single separator, so a\n",
"    setup/punchline joke yields 2 and a joke without breaks yields 1.\n",
"    \"\"\"\n",
"    # Every separator match adds one segment to the initial single segment.\n",
"    separators = re.findall('\\r\\n\\r\\n|\\r\\n', joke)\n",
"    return len(separators) + 1\n",
"\n",
"def character_count(joke):\n",
"    \"\"\"Return the number of characters in the joke text, line breaks included.\"\"\"\n",
"    return len(joke)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"# Per-joke features: number of CRLF-separated segments and raw character length.\n",
"wocka['count_lines'] = wocka['body'].apply(split_punchline)\n",
"wocka['joke_length'] = wocka['body'].apply(character_count)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"# Download the file https://github.com/zacanger/profane-words/blob/master/words.json"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"# words.json holds the word list that detect_bad_words checks against.\n",
"# Read it as UTF-8 explicitly instead of relying on the platform default encoding.\n",
"with open('words.json', encoding='utf-8') as json_file:\n",
"    badwords_list = json.load(json_file)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"def detect_bad_words(list):\n",
"    \"\"\"Report whether any word of a tokenized joke appears in `badwords_list`.\n",
"\n",
"    Args:\n",
"        list: iterable of word strings. The name shadows the builtin; it is\n",
"            kept so that existing callers are unaffected.\n",
"\n",
"    Returns:\n",
"        True if at least one word is found in the module-level\n",
"        `badwords_list`, otherwise False.\n",
"    \"\"\"\n",
"    # any() stops at the first hit instead of filtering the whole joke.\n",
"    return any(word in badwords_list for word in list)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"# Flag jokes whose stop-word-free tokens contain at least one listed word.\n",
"wocka['dirty'] = wocka['body_no_stop'].apply(detect_bad_words)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" <th>body_no_stop</th>\n",
" <th>body_no_stop_stemmed</th>\n",
" <th>body_no_stop_lemma</th>\n",
" <th>bigrams</th>\n",
" <th>trigrams</th>\n",
" <th>parts_of_speech</th>\n",
" <th>count_lines</th>\n",
" <th>joke_length</th>\n",
" <th>dirty</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[call, cow, leg, ground, beef]</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" <td>[(what, PRON), (do, AUX), (you, PRON), (call, ...</td>\n",
" <td>2</td>\n",
" <td>52</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[call, cow, jump, barb, wire, fenc, utter, des...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" <td>[(what, PRON), (do, AUX), (you, PRON), (call, ...</td>\n",
" <td>2</td>\n",
" <td>78</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" <td>[whats, black, white, red, newspaper]</td>\n",
" <td>[what, black, white, red, newspap]</td>\n",
" <td>[what, s, black, white, red, newspaper]</td>\n",
" <td>[(what, s), (s, black), (black, and), (and, wh...</td>\n",
" <td>[(what, s, black), (s, black, and), (black, an...</td>\n",
" <td>[(what, PRON), (s, VERB), (black, ADJ), (and, ...</td>\n",
" <td>2</td>\n",
" <td>56</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" <td>[guy, walks, bar, says, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[(so, this), (this, guy), (guy, walks), (walks...</td>\n",
" <td>[(so, this, guy), (this, guy, walks), (guy, wa...</td>\n",
" <td>[(so, ADV), (this, DET), (guy, NOUN), (walks, ...</td>\n",
" <td>2</td>\n",
" <td>51</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" <td>[opposite, pro, con, isnt, opposite, progress,...</td>\n",
" <td>[opposit, pro, con, isnt, opposit, progress, c...</td>\n",
" <td>[opposite, pro, con, be, not, opposite, progre...</td>\n",
" <td>[(if, the), (the, opposite), (opposite, of), (...</td>\n",
" <td>[(if, the, opposite), (the, opposite, of), (op...</td>\n",
" <td>[(if, SCONJ), (the, DET), (opposite, NOUN), (o...</td>\n",
" <td>1</td>\n",
" <td>72</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \\\n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... \n",
"\n",
" body_no_stop \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [whats, black, white, red, newspaper] \n",
"3 [guy, walks, bar, says, ouch] \n",
"4 [opposite, pro, con, isnt, opposite, progress,... \n",
"\n",
" body_no_stop_stemmed \\\n",
"0 [call, cow, leg, ground, beef] \n",
"1 [call, cow, jump, barb, wire, fenc, utter, des... \n",
"2 [what, black, white, red, newspap] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposit, pro, con, isnt, opposit, progress, c... \n",
"\n",
" body_no_stop_lemma \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [what, s, black, white, red, newspaper] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposite, pro, con, be, not, opposite, progre... \n",
"\n",
" bigrams \\\n",
"0 [(what, do), (do, you), (you, call), (call, a)... \n",
"1 [(what, do), (do, you), (you, call), (call, a)... \n",
"2 [(what, s), (s, black), (black, and), (and, wh... \n",
"3 [(so, this), (this, guy), (guy, walks), (walks... \n",
"4 [(if, the), (the, opposite), (opposite, of), (... \n",
"\n",
" trigrams \\\n",
"0 [(what, do, you), (do, you, call), (you, call,... \n",
"1 [(what, do, you), (do, you, call), (you, call,... \n",
"2 [(what, s, black), (s, black, and), (black, an... \n",
"3 [(so, this, guy), (this, guy, walks), (guy, wa... \n",
"4 [(if, the, opposite), (the, opposite, of), (op... \n",
"\n",
" parts_of_speech count_lines \\\n",
"0 [(what, PRON), (do, AUX), (you, PRON), (call, ... 2 \n",
"1 [(what, PRON), (do, AUX), (you, PRON), (call, ... 2 \n",
"2 [(what, PRON), (s, VERB), (black, ADJ), (and, ... 2 \n",
"3 [(so, ADV), (this, DET), (guy, NOUN), (walks, ... 2 \n",
"4 [(if, SCONJ), (the, DET), (opposite, NOUN), (o... 1 \n",
"\n",
" joke_length dirty \n",
"0 52 False \n",
"1 78 False \n",
"2 56 False \n",
"3 51 False \n",
"4 72 False "
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEICAYAAABRSj9aAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFvZJREFUeJzt3Xu0ZGV95vHvE0QyChGRRqFpbDRtIs6MLatjyGiyUByFNtoagwPjIDFqmzU4E9fojKijYuIFMl4mWRPJagOKeAHiJXYiycAQL+NE0YY0AiLaYgNNd5oOFwE1ROA3f+z3xOJwLnVudc7Z/f2sVauq3npr71/vqn7Ou9/atStVhSSpv35msQuQJC0sg16Ses6gl6SeM+glqecMeknqOYNeknrOoNeMJKkkP7/YdcyXJF9M8qp5XuY9SZ4wT8t6c5I/bbdXt+3/sHla9hGt1n3mY3laugx6AQsTeEtpfW2dZyT52Byef2ySB1o43pNkR5KLkvzSYL+q2r+qbhhiWTumW2dVvbuq5mU7Jdme5DkDy76p1Xr/fCxfS5dBL83MzqraHzgAOAb4NvB/kxw33yuar5G7ZNDrIZK8Osm2JLcn2ZzksEn6PTPJzUme1e7/YpJL2/OuT/LSWa7/mCR/m+TOJFclOXbgsS8m+f0k/y/J3UkuSXLwwOMvT3JjktuSvHVsFJvkeODNwL9ro/GrBlb5+MmWN5nq7KiqtwF/Cpw1UMM/T28lWZ/kW23ZtyR5Q5JHAn8FHDawd3BY2+P4VJKPJbkL+K1J9kJ+O8nOJLuSvH5gvR9J8s6B+/+815DkfOAI4C/a+v7b+KmgVsPm9vptS/LqgWWd0fZePtr+LdcmWTfddtLSYNDrQZI8G3gP8FLgUOBG4IIJ+j0P+CTwkqr6QguvS4FPAIcAJwMfTPKUGa5/JfB54J3AQcAbgE8nWTHQ7d8Dr2jreXjrQ5KjgA8CL2u1PwpYCVBVfw28G7iwTVc8dbrlzcBngKPbNhjvHOA1VXUA8C+Bv6mqHwIn0PYO2mVn678B+BRwIPDxSdb3LGAN8Fzg9MHpmMlU1SnATcAL2vr+YIJunwR2AIcBvwm8e9yeygvp3gsHApuB/zXderU0GPQa72XAuVV1ZVXdC7wJ+JUkqwf6nAhsAtZX1ddb268D26vqw1V1X1VdCXyaLjBm4j8AF1fVxVX1QFVdCmwB1g/0+XBVfaeqfgxcBKxt7b8J/EVVfaWq/gl4GzDMyZwmW96wdgKhC8DxfgIcleTnquqOtl2m8tWq+vP2b//xJH3eUVU/rKqrgQ/T/VGdkySrgGcCb6yqf6yqrXR7KqcMdPtKe13uB84HnjrBorQEGfQa7zC6UTwAVXUPcBttZNy8DrioBc2YxwO/3KZb7kxyJ90fjcfNcP2PB04ct5xn0o3Qx/z9wO0fAfsP1H7zQO0/arVPZ7LlDWsl3R+UOyd47CV0f6RuTPKlJL8yzbJunubx8X1upPt3z9VhwO1Vdfe4ZQ++7uO308/6OcLy4Iuk8XbShS0AbTriMcAtA31OBM5JcktV/c/WdjPwpar6t3Nc/83A+VX16ml7PtQu4BfG7iT5F3S1j1moU7W+GLiyTck8SFV9A9iQZF/gtXR7DKumqGWYGlfRfQgM3bz72LTPD4FHDPQb/0d2qmXvBA5KcsBA2B/Bg193LVOO6DXeJ4BXJFmbZD+6ee3Lq2r7QJ+dwHHAf07yH1vbXwJPSnJKkn3b5ZeSPHmKdT0syc8OXPYFPga8IMnzkuzT2o9NcvgQtX+qPfffJHk48A66KZUxu4HVSeb8vk9nZZK3A6+i+6B3fJ+HJ3lZkkdV1U+Au4CxQxl3A49J8qhZrP6tSR7RPv94BXBha98KrE9yUJLH0e15DdoNTHh8f1XdDPwt8J62zf818Eom/5xAy4hBr0FVVZcBb6WbX98FPBE4aYKON9GF/RuTvKqNAp/b+u
6k280/C9hvivWdDfx44PLhFjgb6IJzD90I/78yxHu1qq4F/hPdB4a7gLuBW4F7W5c/a9e3JZlurnwyhyW5B7gH+Abwr4Bjq+qSSfqfAmxvR9H8Dt1nEFTVt+k+/LyhTVHNZPrlS8A24DLgvQPrPh+4CtgOXMJP/wCMeQ/w39v6JvrA+WRgNd3r91ng7e0zEi1z8YdHBNCC7/eq6s8Xu5b5kmR/unnzNVX1/cWuR1osjuhFmwJ4MvB3i13LXCV5QZvWeCTwXuBquhGutNcy6PdySc6i281/Y1XdOF3/ZWAD3dTDTrpjzU8qd1u1l3PqRpJ6zhG9JPXckjiO/uCDD67Vq1cvdhmStKxcccUV/1BVK6brtySCfvXq1WzZsmWxy5CkZSXJUJ+rOXUjST1n0EtSzxn0ktRz0wZ9O+/F19P9AMS1Sd7R2o9McnmS7ya5sJ1bhCT7tfvb2uOrF/afIEmayjAj+nuBZ7cfalgLHJ/kGLrzmHygqtYAd9CdAIl2fUdV/TzwAQZ+eUeSNHrDnCiq2jnJAfZtlwKeTXe2QIDzgBe12xvafdrjxyUZPIOgJGmEhpqjb6eL3Up3JsBLge8Bd1bVfa3LDn76AwUraT+M0B7/AQ8+J7gkaYSGCvqqur+q1gKHA0+nOwHWQ7q164lG7w85z0KSjUm2JNmyZ8+eYeuVJM3QjI66qao7gS8CxwAHDvyM2OH89FdudtD9Ag7t8UcBt0+wrE1Vta6q1q1YMe0XuyRJszTtN2OTrAB+UlV3tp9mew7dB6xfoPsx5guAU4HPtadsbve/2h7/G88eqJlYffrnp3x8+5nPH1ElUj8McwqEQ4HzkuxDtwdwUVX9ZZJvARckeSfdeczPaf3PAc5Pso1uJP+QXyeSJI3OtEFfVd8EnjZB+w108/Xj2/+R7sejJUlLgN+MlaSeM+glqecMeknqOYNeknrOoJeknjPoJannDHpJ6jmDXpJ6zqCXpJ4z6CWp5wx6Seo5g16Ses6gl6SeM+glqecMeknquWF+eER6CH8FSlo+HNFLUs8Z9JLUcwa9JPWcQS9JPWfQS1LPGfSS1HMGvST1nEEvST1n0EtSzxn0ktRz0wZ9klVJvpDkuiTXJvnd1n5GkluSbG2X9QPPeVOSbUmuT/K8hfwHSJKmNsy5bu4DXl9VVyY5ALgiyaXtsQ9U1XsHOyc5CjgJeApwGPB/kjypqu6fz8IlScOZNuirahewq92+O8l1wMopnrIBuKCq7gW+n2Qb8HTgq/NQr3piupOiSZo/M5qjT7IaeBpweWt6bZJvJjk3yaNb20rg5oGn7WCCPwxJNibZkmTLnj17Zly4JGk4Qwd9kv2BTwOvq6q7gLOBJwJr6Ub87xvrOsHT6yENVZuqal1VrVuxYsWMC5ckDWeooE+yL13If7yqPgNQVbur6v6qegD4EN30DHQj+FUDTz8c2Dl/JUuSZmKYo24CnANcV1XvH2g/dKDbi4Fr2u3NwElJ9ktyJLAG+Pr8lSxJmolhjrp5BnAKcHWSra3tzcDJSdbSTctsB14DUFXXJrkI+BbdETunecSNJC2eYY66+QoTz7tfPMVz3gW8aw51SZLmid+MlaSeM+glqecMeknqOYNeknrOoJeknjPoJannhjmOXpoxT1omLR2O6CWp5wx6Seo5g16Ses6gl6SeM+glqecMeknqOYNeknrOoJeknvMLU3spv9Ak7T0c0UtSzxn0ktRzBr0k9ZxBL0k9Z9BLUs8Z9JLUcwa9JPWcQS9JPWfQS1LPGfSS1HPTngIhySrgo8DjgAeATVX1h0kOAi4EVgPbgZdW1R1JAvwhsB74EfBbVXXlwpSvvdF0p2/YfubzR1SJtDwMM6K/D3h9VT0ZOAY4LclRwOnAZVW1Bris3Qc4AVjTLhuBs+e9aknS0KYN+qraNTYir6q7geuAlcAG4LzW7TzgRe32BuCj1fkacGCSQ+e9cknSUGY0R59kNfA04HLgsVW1C7o/BsAhrd
tK4OaBp+1obeOXtTHJliRb9uzZM/PKJUlDGTrok+wPfBp4XVXdNVXXCdrqIQ1Vm6pqXVWtW7FixbBlSJJmaKjz0SfZly7kP15Vn2nNu5McWlW72tTMra19B7Bq4OmHAzvnq2BpOn5YKz3YtCP6dhTNOcB1VfX+gYc2A6e226cCnxtof3k6xwA/GJvikSSN3jAj+mcApwBXJ9na2t4MnAlclOSVwE3Aie2xi+kOrdxGd3jlK+a1YknSjEwb9FX1FSaedwc4boL+BZw2x7qkBePUjvY2fjNWknrOoJeknjPoJannDHpJ6jmDXpJ6zqCXpJ4z6CWp5wx6Seo5g16Ses6gl6SeM+glqecMeknqOYNeknrOoJeknjPoJannDHpJ6jmDXpJ6zqCXpJ4z6CWp5wx6Seo5g16Ses6gl6SeM+glqecMeknqOYNeknrOoJeknps26JOcm+TWJNcMtJ2R5JYkW9tl/cBjb0qyLcn1SZ63UIVLkoYzzIj+I8DxE7R/oKrWtsvFAEmOAk4CntKe88Ek+8xXsZKkmZs26Kvqy8DtQy5vA3BBVd1bVd8HtgFPn0N9kqQ5etgcnvvaJC8HtgCvr6o7gJXA1wb67GhtD5FkI7AR4IgjjphDGZrM6tM/v9gl9NJU23X7mc8fYSXScGb7YezZwBOBtcAu4H2tPRP0rYkWUFWbqmpdVa1bsWLFLMuQJE1nVkFfVbur6v6qegD4ED+dntkBrBroejiwc24lSpLmYlZBn+TQgbsvBsaOyNkMnJRkvyRHAmuAr8+tREnSXEw7R5/kk8CxwMFJdgBvB45NspZuWmY78BqAqro2yUXAt4D7gNOq6v6FKV2SNIxpg76qTp6g+Zwp+r8LeNdcipIkzR+/GStJPTeXwyulXvKwVPWNI3pJ6jmDXpJ6zqCXpJ4z6CWp5wx6Seo5g16Ses6gl6SeM+glqef8wtQy5hd7JA3DEb0k9ZxBL0k9Z9BLUs8Z9JLUcwa9JPWcQS9JPWfQS1LPeRy9NI/m+t2G7Wc+f54qkX7KEb0k9ZxBL0k9Z9BLUs85Ry8tIdPN8TuHr9lwRC9JPWfQS1LPGfSS1HPTBn2Sc5PcmuSagbaDklya5Lvt+tGtPUn+KMm2JN9McvRCFi9Jmt4wI/qPAMePazsduKyq1gCXtfsAJwBr2mUjcPb8lClJmq1pg76qvgzcPq55A3Beu30e8KKB9o9W52vAgUkOna9iJUkzN9s5+sdW1S6Adn1Ia18J3DzQb0dre4gkG5NsSbJlz549syxDkjSd+f4wNhO01UQdq2pTVa2rqnUrVqyY5zIkSWNmG/S7x6Zk2vWtrX0HsGqg3+HAztmXJ0maq9kG/Wbg1Hb7VOBzA+0vb0ffHAP8YGyKR5K0OKY9BUKSTwLHAgcn2QG8HTgTuCjJK4GbgBNb94uB9cA24EfAKxagZkmTmOoUCp4+Ye81bdBX1cmTPHTcBH0LOG2uRUmS5o/fjJWknjPoJannDHpJ6jmDXpJ6zh8ekZaRuf74uPZOjuglqecMeknqOaduljB30yXNB0f0ktRzBr0k9ZxTN9JeYrqpQM+F01+O6CWp5xzRS1p07m0sLEf0ktRzBr0k9ZxTN5IAp0/6zBG9JPWcQS9JPWfQS1LPGfSS1HMGvST1nEEvST3n4ZWShuLhl8uXQS9pJPr6+wrL4Q+gQS9pXvQ1yPvAOXpJ6rk5jeiTbAfuBu4H7quqdUkOAi4EVgPbgZdW1R1zK1OSNFvzMaJ/VlWtrap17f7pwGVVtQa4rN2XJC2ShZi62QCc126fB7xoAdYhSRrSXIO+gEuSXJFkY2t7bFXtAmjXh0z0xCQbk2xJsmXPnj1zLEOSNJm5HnXzjKrameQQ4NIk3x72iVW1CdgEsG7duppjHZL2YsvhEMfFNKcRfVXtbNe3Ap8Fng7sTnIoQLu+da5FSpJmb9Yj+iSPBH6mqu5ut58L/B6wGTgVOLNdf24+CpWk2ZrLMf5z3RtYCnsbc5
m6eSzw2SRjy/lEVf11km8AFyV5JXATcOLcy5S0N/PLWHMz66CvqhuAp07Qfhtw3FyKkiTNH0+BsIgcpUgaBYNekqbQhwGZ57qRpJ4z6CWp5wx6Seo5g16Ses6gl6SeM+glqecMeknqOY+jX0B9OP5W0vLniF6Ses6gl6SeM+glqecMeknqOYNeknrOoJeknjPoJannDHpJ6jmDXpJ6zqCXpJ4z6CWp5zzXzTSmO1/N9jOfP6JKJGl2ln3QL3YQe+IySUvdsg/66RjEkvZ2ztFLUs8Z9JLUcwsW9EmOT3J9km1JTl+o9UiSprYgQZ9kH+CPgROAo4CTkxy1EOuSJE1toUb0Twe2VdUNVfVPwAXAhgValyRpCgt11M1K4OaB+zuAXx7skGQjsLHdvSfJ9bNc18HAP8zyuQtpqdYFS7c265oZ65qZJVlXzppTXY8fptNCBX0maKsH3anaBGya84qSLVW1bq7LmW9LtS5YurVZ18xY18zszXUt1NTNDmDVwP3DgZ0LtC5J0hQWKui/AaxJcmSShwMnAZsXaF2SpCksyNRNVd2X5LXA/wb2Ac6tqmsXYl3Mw/TPAlmqdcHSrc26Zsa6ZmavrStVNX0vSdKy5TdjJannDHpJ6rllHfRL5TQLSVYl+UKS65Jcm+R3W/sZSW5JsrVd1i9CbduTXN3Wv6W1HZTk0iTfbdePHnFNvzCwTbYmuSvJ6xZjeyU5N8mtSa4ZaJtw+6TzR+399s0kR4+4rv+R5Ntt3Z9NcmBrX53kxwPb7U9GXNekr1uSN7XtdX2S5424rgsHatqeZGtrH+X2miwbRvseq6pleaH7kPd7wBOAhwNXAUctUi2HAke32wcA36E79cMZwBsWeTttBw4e1/YHwOnt9unAWYv8Ov493Rc/Rr69gF8DjgaumW77AOuBv6L7nsgxwOUjruu5wMPa7bMG6lo92G8RtteEr1v7P3AVsB9wZPv/us+o6hr3+PuAty3C9posG0b6HlvOI/olc5qFqtpVVVe223cD19F9O3ip2gCc126fB7xoEWs5DvheVd24GCuvqi8Dt49rnmz7bAA+Wp2vAQcmOXRUdVXVJVV1X7v7Nbrvp4zUJNtrMhuAC6rq3qr6PrCN7v/tSOtKEuClwCcXYt1TmSIbRvoeW85BP9FpFhY9XJOsBp4GXN6aXtt2wc4d9RRJU8AlSa5Id9oJgMdW1S7o3ojAIYtQ15iTePB/wMXeXjD59llK77nfphv5jTkyyd8l+VKSX12EeiZ63ZbK9vpVYHdVfXegbeTba1w2jPQ9tpyDftrTLIxakv2BTwOvq6q7gLOBJwJrgV10u4+j9oyqOpruTKKnJfm1RahhQum+TPdC4M9a01LYXlNZEu+5JG8B7gM+3pp2AUdU1dOA/wJ8IsnPjbCkyV63JbG9gJN58GBi5NtrgmyYtOsEbXPeZss56JfUaRaS7Ev3Qn68qj4DUFW7q+r+qnoA+BALtNs6lara2a5vBT7batg9tjvYrm8ddV3NCcCVVbW71bjo26uZbPss+nsuyanArwMvqzap26ZGbmu3r6CbC3/SqGqa4nVbCtvrYcBvABeOtY16e02UDYz4Pbacg37JnGahzQGeA1xXVe8faB+cW3sxcM345y5wXY9McsDYbboP866h206ntm6nAp8bZV0DHjTSWuztNWCy7bMZeHk7MuIY4Adju9+jkOR44I3AC6vqRwPtK9L9BgRJngCsAW4YYV2TvW6bgZOS7JfkyFbX10dVV/Mc4NtVtWOsYZTba7JsYNTvsVF88rxQF7pPqL9D9xf5LYtYxzPpdq++CWxtl/XA+cDVrX0zcOiI63oC3VEPVwHXjm0j4DHAZcB32/VBi7DNHgHcBjxqoG3k24vuD80u4Cd0o6lXTrZ96Har/7i9364G1o24rm1087dj77E/aX1f0l7fq4ArgReMuK5JXzfgLW17XQ+cMMq6WvtHgN8Z13eU22uybBjpe8xTIEhSzy3nqRtJ0h
AMeknqOYNeknrOoJeknjPoJannDHpJ6jmDXpJ67v8Dp00T9HTmF1kAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"bins = np.linspace(0, 200, 40)\n",
"pyplot.hist(wocka['joke_length'], bins)\n",
"pyplot.title('Joke Length Distribution')\n",
"pyplot.show()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEICAYAAABWJCMKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAGhxJREFUeJzt3X+UXWV97/H3x4QfVX4kgYFCEgjU1Iq9FdIR4tV61WhIoiW0iitcC1OMpnbRVtatt0a51yhIhfbecmVVcaUkGCgIKYrJVSpMo+i1LZAJhPAj0AwxkDExmTLhlygKfu8f+xndGc6Zc8782Gea5/Naa9bZ53ues/ez95w5n7Ofvc8eRQRmZpafV7S7A2Zm1h4OADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAbFxJ+pKkz7Rp2ZJ0raR9ku4Z5bw+Jenvx6pvTSzvOUknj9G8PiHpmjQ9S1JImjxG8z4h9XXSWMzPquUAyIykHZL2SHpVqfZBSXe2sVvj5c3AO4EZEXH60AclvVVSX5UdSsv8eXrTfE5Sn6S1kt5QbhcRh0XE9ibm1bD/EfGXEfHB0fY9LXOHpHeU5v1E6utLYzF/q5YDIE+TgY+0uxOtGsGnzBOBHRHxo/HozyjsiojDgMOBucAjwP+TNG+sFzRWn/TtwOQAyNNfAx+VNGXoA7WGCCTdKemDafoPJf2zpCslPSVpu6T/nOo7Je2V1DVktkdL6pb0rKTvSDqxNO/fSI8NSHpU0vtKj31J0tWSbpP0I+BtNfp7vKT16fm9kj6U6kuBa4A3pk/an260UerNq0a7gyR9WdJXJB0s6RWSlkt6TNKT6RP9tEbLi0JfRHwy9fWK0jJC0qvT9CJJD6ft9wNJH017cP8IHF/amzg+DVXdIunvJT0D/GGd4asPSNolabekPy8td78hu/JehqTrgROA/5uW9xdDXy/DbcPUj7WSrkvr8pCkzkbbycaPAyBPPcCdwEdH+PwzgC3AUcCNwE3AG4BXA38A/K2kw0rt3w9cChwNbAZuAEhvYt1pHscA5wJfkPS60nP/K3AZxafl79Xoy5eBPuB44L3AX0qaFxGrgA8D/5qGKFY0sV4151VuIOlXgK8BLwDvi4ifAn8GnA38l/TcfcDnm1he2VeBOeWhuZJVwB9FxOHAbwLfSns1C0l7E+lnV2q/GLgFmELa1jW8DZgNzAeWl4d16omI84AngN9Ny/urGs0abcOzKF4vU4D1wN82Wq6NHwdAvj4J/KmkjhE89/sRcW0a970ZmAlcEhEvRMQdwE8pwmDQNyLiuxHxAnAxxafymcC7KYZoro2IFyPiXuArFG8cg9ZFxD9HxM8j4iflTqR5vBn4WET8JCI2U3ySPq/VFWpyXkcA3wQeAy4ojXv/EXBx+jT/AvAp4L0tDr/sAkTxxjjUz4BTJB0REfvSdhrOv0bE19I2+3GdNp+OiB9FxAPAtRThOypNbsPvRcRtadtdD7x+tMu1kXMAZCoiHgS+DiwfwdP3lKZ/nOY3tFbeA9hZWu5zwADFJ8QTgTPSUNJTkp6i2Fv41VrPreF4YCAini3VHgemt7AurcxrLvBbwOWx/1UUTwRuLa3DVuAl4NgWlj8dCOCpGo+9B1gEPJ6G0N7YYF7DbbNabR6nWP/RamYb/rA0/TxwqI9TtI8DIG8rgA+x/x/o4AHTV5Zq5TfkkZg5OJGGhqZRfOLdCXwnIqaUfg6LiD8uPXe4y9XuAqZJOrxUOwH4wQj62My87gA+C2yQVH5z3wksHLIeh0ZEK/34PeDeWgesI2JjRCymGCb7GrB28KE682rmEr8zS9MnUKw/FL//4X73Vf0+rAIOgIxFRC/FEM6flWr9FH+wfyBpkqQPAL82ykUtkvRmSQdTHAu4OyJ2UuyB/Lqk89KB1YMkvUHSa5vs/07gX4DPSjpU0m8BS6k/7j3qeaVx7xspQuDoVP4icN
ngwW1JHZIWN1qmCtMlrQA+CHyiRpuDJb1f0pER8TPgGYq9Cyj2xI6SdGSr6wv8T0mvTMdbLqB4HUBxjGaRpGmSfhW4aMjz9gA1v58wlr8Pq4YDwC4Bhh54/BDw34EngddR/FGPxo0UexsDwG9TDPOQhgrmA0soPj3+kOJMmENamPe5wKz0/FuBFRHR3cLzy59om5pXRFxK8Un8n9LZPp+jOKB5h6RngbsoDpTXc7yk54DngI3AfwLemo6f1HIesCOd1fNhigPtRMQjFAddt6fhp1aGcb4D9AIbgP9VWvb1wP3ADoo9npuHPO+zwP9Iy6t1EsFofx9WIfkfwliuJJ1FcfD61Hb3xawdvAdgWUoHHt9DcUqsWZZ89N2yk8bMdwKbgPPb3B2ztvEQkJlZpjwEZGaWqQk9BHT00UfHrFmz2t0NM7P/UDZt2vTvEdHwW/4TOgBmzZpFT4+P0ZmZtULS48208xCQmVmmGgaApNdI2lz6eUbSRembgt2StqXbqam9JF2VLgW7RdKc0ry6Uvttevklg83MrEINAyAiHo2IU9OXZX6b4gJOt1JcRGxDRMym+Dbh4EXFFlJcZnY2sAy4GiB9Y3IFxTckTwdWDIaGmZlVr9UhoHnAYxHxOMU1x9ek+hqK66GT6telf3ZxFzBF0nHAmUB3RAxExD6K68AvGPUamJnZiLQaAEsorj0CcGxE7AZIt8ek+nT2v9RsX6rVq+9H0jJJPZJ6+vv7W+yemZk1q+kASFdyPAv4h0ZNa9RimPr+hYiVEdEZEZ0dHSP5XyVmZtaMVvYAFlJcr3zwH3/sSUM7pNu9qd7H/tcan0FxZcB6dTMza4NWAuBcfjn8A8XlbwfP5OkC1pXq56ezgeYCT6chotuB+ZKmpoO/81PNzMzaoKkvgkl6JfBOiv99OuhyYK2kpRT/KPqcVL+N4t/X9VKcMXQBQEQMSLqU4vrnUFyGd2DUa2BmZiMyoS8G19nZGaP5JvCs5d+o+9iOy9814vmamU1kkjZFRGejdv4msJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZppoKAElTJN0i6RFJWyW9UdI0Sd2StqXbqamtJF0lqVfSFklzSvPpSu23Seoar5UyM7PGmt0D+BzwzYj4DeD1wFZgObAhImYDG9J9gIXA7PSzDLgaQNI0YAVwBnA6sGIwNMzMrHoNA0DSEcBbgFUAEfHTiHgKWAysSc3WAGen6cXAdVG4C5gi6TjgTKA7IgYiYh/QDSwY07UxM7OmNbMHcDLQD1wr6T5J10h6FXBsROwGSLfHpPbTgZ2l5/elWr26mZm1QTMBMBmYA1wdEacBP+KXwz21qEYthqnv/2RpmaQeST39/f1NdM/MzEaimQDoA/oi4u50/xaKQNiThnZIt3tL7WeWnj8D2DVMfT8RsTIiOiOis6Ojo5V1MTOzFjQMgIj4IbBT0mtSaR7wMLAeGDyTpwtYl6bXA+ens4HmAk+nIaLbgfmSpqaDv/NTzczM2mByk+3+FLhB0sHAduACivBYK2kp8ARwTmp7G7AI6AWeT22JiAFJlwIbU7tLImJgTNbCzMxa1lQARMRmoLPGQ/NqtA3gwjrzWQ2sbqWDZmY2PvxNYDOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy1VQASNoh6QFJmyX1pNo0Sd2StqXbqakuSVdJ6pW0RdKc0ny6UvttkrrGZ5XMzKwZrewBvC0iTo2IznR/ObAhImYDG9J9gIXA7PSzDLgaisAAVgBnAKcDKw
ZDw8zMqjeaIaDFwJo0vQY4u1S/Lgp3AVMkHQecCXRHxEBE7AO6gQWjWL6ZmY1CswEQwB2SNklalmrHRsRugHR7TKpPB3aWntuXavXq+5G0TFKPpJ7+/v7m18TMzFoyucl2b4qIXZKOAbolPTJMW9WoxTD1/QsRK4GVAJ2dnS973MzMxkZTewARsSvd7gVupRjD35OGdki3e1PzPmBm6ekzgF3D1M3MrA0aBoCkV0k6fHAamA88CKwHBs/k6QLWpen1wPnpbKC5wNNpiOh2YL6kqeng7/xUMzOzNmhmCOhY4FZJg+1vjIhvStoIrJW0FHgCOCe1vw1YBPQCzwMXAETEgKRLgY2p3SURMTBma2JmZi1pGAARsR14fY36k8C8GvUALqwzr9XA6ta7aWZmY83fBDYzy5QDwMwsUw4AM7NMOQDMzDLlADAzy5QDwMwsUw4AM7NMOQDMzDLlADAzy5QDwMwsUw4AM7NMOQDMzDLlADAzy5QDwMwsUw4AM7NMOQDMzDLlADAzy5QDwMwsUw4AM7NMOQDMzDLlADAzy1TTASBpkqT7JH093T9J0t2Stkm6WdLBqX5Iut+bHp9VmsfHU/1RSWeO9cqYmVnzWtkD+AiwtXT/CuDKiJgN7AOWpvpSYF9EvBq4MrVD0inAEuB1wALgC5Imja77ZmY2Uk0FgKQZwLuAa9J9AW8HbklN1gBnp+nF6T7p8Xmp/WLgpoh4ISK+D/QCp4/FSpiZWeua3QP4P8BfAD9P948CnoqIF9P9PmB6mp4O7ARIjz+d2v+iXuM5vyBpmaQeST39/f0trIqZmbWiYQBIejewNyI2lcs1mkaDx4Z7zi8LESsjojMiOjs6Ohp1z8zMRmhyE23eBJwlaRFwKHAExR7BFEmT06f8GcCu1L4PmAn0SZoMHAkMlOqDys8xM7OKNdwDiIiPR8SMiJhFcRD3WxHxfuDbwHtTsy5gXZpen+6THv9WRESqL0lnCZ0EzAbuGbM1MTOzljSzB1DPx4CbJH0GuA9YleqrgOsl9VJ88l8CEBEPSVoLPAy8CFwYES+NYvlmZjYKLQVARNwJ3Jmmt1PjLJ6I+AlwTp3nXwZc1monzcxs7PmbwGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmWoYAJIOlXSPpPslPSTp06l+kqS7JW2TdLOkg1P9kHS/Nz0+qzSvj6f6o5LOHK+VMjOzxprZA3gBeHtEvB44FVggaS5wBXBlRMwG9gFLU/ulwL6IeDVwZWqHpFOAJcDrgAXAFyRNGsuVMTOz5jUMgCg8l+4elH4CeDtwS6qvAc5O04vTfdLj8yQp1W+KiBci4vtAL3D6mKyFmZm1rKljAJImSdoM7AW6gceApyLixdSkD5iepqcDOwHS408DR5XrNZ5TXtYyST2Sevr7+1tfIzMza0pTARARL0XEqcAMik/tr63VLN2qzmP16kOXtTIiOiOis6Ojo5numZnZCLR0FlBEPAXcCcwFpkianB6aAexK033ATID0+JHAQLle4zlmZlaxZs4C6pA0JU3/CvAOYCvwbeC9qVkXsC5Nr0/3SY9/KyIi1Zeks4ROAmYD94zVipiZWWsmN27CccCadMbOK4C1EfF1SQ8DN0n6DHAfsCq1XwVcL6mX4pP/EoCIeEjSWuBh4EXgwoh4aWxXx8zMmtUwACJiC3Bajfp2apzFExE/Ac6pM6/LgMta76aZmY01fxPYzCxTDgAzs0w5AMzMMuUAMDPLlAPAzCxTDgAzs0w5AMzMMuUAMDPLlAPAzCxTDgAzs0w5AMzMMuUAMDPLlAPAzCxTDgAzs0w5AMzMMuUAMDPLlAPAzCxTDgAzs0w5AMzMMuUAMDPLlAPAzCxTDQNA0kxJ35a0VdJDkj6S6t
MkdUvalm6nprokXSWpV9IWSXNK8+pK7bdJ6hq/1TIzs0aa2QN4EfjziHgtMBe4UNIpwHJgQ0TMBjak+wALgdnpZxlwNRSBAawAzgBOB1YMhoaZmVWvYQBExO6IuDdNPwtsBaYDi4E1qdka4Ow0vRi4Lgp3AVMkHQecCXRHxEBE7AO6gQVjujZmZta0lo4BSJoFnAbcDRwbEbuhCAngmNRsOrCz9LS+VKtXH7qMZZJ6JPX09/e30j0zM2tB0wEg6TDgK8BFEfHMcE1r1GKY+v6FiJUR0RkRnR0dHc12z8zMWtRUAEg6iOLN/4aI+Goq70lDO6TbvaneB8wsPX0GsGuYupmZtUEzZwEJWAVsjYi/KT20Hhg8k6cLWFeqn5/OBpoLPJ2GiG4H5kuamg7+zk81MzNrg8lNtHkTcB7wgKTNqfYJ4HJgraSlwBPAOemx24BFQC/wPHABQEQMSLoU2JjaXRIRA2OyFmZm1rKGARAR36P2+D3AvBrtA7iwzrxWA6tb6aCZmY0PfxPYzCxTDgAzs0w5AMzMMuUAMDPLlAPAzCxTDgAzs0w18z2AA9Ks5d8Y9vEdl7+rop6YmbWH9wDMzDLlADAzy5QDwMwsUw4AM7NMOQDMzDLlADAzy5QDwMwsUw4AM7NMOQDMzDLlADAzy5QDwMwsUw4AM7NMOQDMzDLlADAzy1TDAJC0WtJeSQ+WatMkdUvalm6nprokXSWpV9IWSXNKz+lK7bdJ6hqf1TEzs2Y1swfwJWDBkNpyYENEzAY2pPsAC4HZ6WcZcDUUgQGsAM4ATgdWDIaGmZm1R8MAiIjvAgNDyouBNWl6DXB2qX5dFO4Cpkg6DjgT6I6IgYjYB3Tz8lAxM7MKjfQYwLERsRsg3R6T6tOBnaV2falWr25mZm0y1geBVaMWw9RfPgNpmaQeST39/f1j2jkzM/ulkQbAnjS0Q7rdm+p9wMxSuxnArmHqLxMRKyOiMyI6Ozo6Rtg9MzNrZKQBsB4YPJOnC1hXqp+fzgaaCzydhohuB+ZLmpoO/s5PNTMza5PJjRpI+jLwVuBoSX0UZ/NcDqyVtBR4AjgnNb8NWAT0As8DFwBExICkS4GNqd0lETH0wLKZmVWoYQBExLl1HppXo20AF9aZz2pgdUu9MzOzceNvApuZZcoBYGaWKQeAmVmmHABmZplyAJiZZcoBYGaWqYangeZq1vJvDPv4jsvfVVFPzMzGh/cAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uULwUxQr5UhJn9R+c9ADOzTHkPYJx4D8HMJjrvAZiZZcoBYGaWqcqHgCQtAD4HTAKuiYjLq+7DRDDcEJGHh8ysCpUGgKRJwOeBdwJ9wEZJ6yPi4Sr7MdE1On7QiAPEzJpR9R7A6UBvRGwHkHQTsBhwAIyh0QbIROZwMxs7VQfAdGBn6X4fcEa5gaRlwLJ09zlJj45ieUcD/z6K548X96s1v+iXrmhzT/Y34bfXBON+tWY0/TqxmUZVB4Bq1GK/OxErgZVjsjCpJyI6x2JeY8n9ao371Rr3qzU596vqs4D6gJml+zOAXRX3wczMqD4ANgKzJZ0k6WBgCbC+4j6YmRkVDwFFxIuS/gS4neI00NUR8dA4LnJMhpLGgfvVGverNe5Xa7LtlyKicSszMzvg+JvAZmaZcgCYmWXqgAwASQskPSqpV9LyNvZjpqRvS9oq6SFJH0n1T0n6gaTN6WdRG/q2Q9IDafk9qTZNUrekbel2asV9ek1pm2yW9Iyki9qxvSStlrRX0oOlWs3to8JV6fW2RdKcivv115IeScu+VdKUVJ8l6cel7fbFivtV9/cm6eNpez0q6cyK+3VzqU87JG1O9Sq3V733hmpfYxFxQP1QHFx+DDgZOBi4HzilTX05DpiTpg8H/g04BfgU8NE2b6cdwNFDan8FLE/Ty4Er2vx7/CHFF1oq317AW4A5wIONtg+wCP
hHiu+5zAXurrhf84HJafqKUr9mldu1YXvV/L2lv4H7gUOAk9Lf66Sq+jXk8f8NfLIN26vee0Olr7EDcQ/gF5ebiIifAoOXm6hcROyOiHvT9LPAVopvQ09Ui4E1aXoNcHYb+zIPeCwiHm/HwiPiu8DAkHK97bMYuC4KdwFTJB1XVb8i4o6IeDHdvYvi+zWVqrO96lkM3BQRL0TE94Feir/bSvslScD7gC+Px7KHM8x7Q6WvsQMxAGpdbqLtb7qSZgGnAXen0p+kXbnVVQ+1JAHcIWmTistvABwbEbuheIECx7ShX4OWsP8fZru3F9TfPhPpNfcBik+Kg06SdJ+k70j6nTb0p9bvbaJsr98B9kTEtlKt8u015L2h0tfYgRgADS83UTVJhwFfAS6KiGeAq4FfA04FdlPshlbtTRExB1gIXCjpLW3oQ00qviR4FvAPqTQRttdwJsRrTtLFwIvADam0GzghIk4D/htwo6QjKuxSvd/bhNhewLns/yGj8u1V472hbtMatVFvswMxACbU5SYkHUTxC74hIr4KEBF7IuKliPg58HeM0+7vcCJiV7rdC9ya+rBncLcy3e6tul/JQuDeiNiT+tj27ZXU2z5tf81J6gLeDbw/0qBxGmJ5Mk1vohhr//Wq+jTM720ibK/JwO8DNw/Wqt5etd4bqPg1diAGwIS53EQaY1wFbI2IvynVy2N3vwc8OPS549yvV0k6fHCa4iDigxTbqSs16wLWVdmvkv0+mbV7e5XU2z7rgfPTmRpzgacHd+OroOKfLH0MOCsini/VO1T8Dw4knQzMBrZX2K96v7f1wBJJh0g6KfXrnqr6lbwDeCQi+gYLVW6veu8NVP0aq+KId9U/FEfM/40iwS9uYz/eTLGbtgXYnH4WAdcDD6T6euC4ivt1MsVZGPcDDw1uI+AoYAOwLd1Oa8M2eyXwJHBkqVb59qIIoN3Azyg+fS2tt30ods8/n15vDwCdFferl2J8ePA19sXU9j3p93s/cC/wuxX3q+7vDbg4ba9HgYVV9ivVvwR8eEjbKrdXvfeGSl9jvhSEmVmmDsQhIDMza4IDwMwsUw4AM7NMOQDMzDLlADAzy5QDwMwsUw4AM7NM/X+uPr6OXnlceAAAAABJRU5ErkJggg==\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"bins = np.linspace(0, 200, 40)\n",
"pyplot.hist(wocka['count_lines'], bins)\n",
"pyplot.title('Number of Joke Distribution')\n",
"pyplot.show()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYEAAAD8CAYAAACRkhiPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFCNJREFUeJzt3X+MZeV93/H3JxjTpI4LDmOX7C5ZHK2jYpQseISpLFtUpLDgxoubuF3+MJi4WtsB1VbyR7ErFdcWEm1jW6VJcNdhBVQ2mBi7bGKos6ZuUCSDGQjmh9eEBRMz3hU7MQkQEREt/vaPe4a9zN6ZvTv3Mnd2n/dLuppzn/Occ773zu79zPl1n1QVkqQ2/dSkC5AkTY4hIEkNMwQkqWGGgCQ1zBCQpIYZApLUMENAkhpmCEhSwwwBSWrYayZdwKGceOKJtX79+kmXIUlHjPvuu++vq2pqmL6rPgTWr1/PzMzMpMuQpCNGkr8atq+HgySpYYaAJDXMEJCkhhkCktQwQ0CSGmYISFLDDAFJapghIEkNMwQkqWGr/o7hUay/4uvLXvbJq989xkokaXVyT0CSGmYISFLDDAFJapghIEkNMwQkqWGGgCQ17JAhkGRdkm8l2ZXkkSQf7drfkGRnkse6nyd07UlyTZLdSR5Mckbfui7p+j+W5JJX72VJkoYxzJ7AfuB3quqfAWcBlyU5FbgCuLOqNgB3ds8Bzgc2dI+twLXQCw3gSuDtwJnAlfPBIUmajEOGQFXtrar7u+nngV3AGmAzcEPX7Qbgwm56M3Bj9dwNHJ/kJOA8YGdVPVNVfwPsBDaN9dVIkg7LYZ0TSLIeOB24B3hTVe2FXlAAb+y6rQGe6ltstmtbrF2SNCFDh0CS1wG3Ah+rqueW6jqgrZZoH7StrUlmkszMzc0NW6Ik6TANFQJJjqUXAF+sqq92zU93h3nofu7r2meBdX2LrwX2LNF+kKraVlXTVTU9NTU17GuRJB2mYa4OCnAdsKuqPts3awcwf4XPJcBtfe0Xd1cJnQU82x0u+gZwbpITuhPC53ZtkqQJGeZbRN8BvB94KMkDXdsngKuBW5J8EPgh8L5u3u3ABcBu4AXgUoCqeibJp4F7u36fqqpnxvIqJEnLcsgQqKo/Z/DxfIBzBvQv4LJF1rUd2H44BUqSXj3eMSxJDTuqB5U5Eo0yEA44GI6kw+OegCQ1zBCQpIYZApLUMENAkhpmCEhSwwwBSWqYISBJDTMEJKlhhoAkNcwQkKSGGQKS1DBDQJIaZghIUsMMAUlq2DDDS25Psi/Jw31tX07yQPd4cn7EsSTrk/x937zP9y3ztiQPJdmd5Jpu2EpJ0gQNM57A9cDvATfON1TVv52fTvIZ4Nm+/o9X1cYB67kW2ArcTW8Iyk3AHYdfsiRpXA65J1BVdwEDxwLu/pr/N8BNS60jyUnA66vq293wkzcCFx5+uZKkcRr1nMA7gaer6rG+tlOS/EWSP0vyzq5tDTDb12e2axsoydYkM0lm5ubmRixRkrSYUUPgIl65F7AXOLmqTgd+G/hSktczeKD6WmylVbWtqqaranpqamrEEiVJi1n2GMNJXgP8a+Bt821V9SLwYjd9X5LHgbfQ+8t/bd/ia4E9y922JGk8RtkT+FXg+1X18mGeJFNJjumm3wxsAJ6oqr3A80nO6s4jXAzcNsK2JUljMMwlojcB3wZ+Kclskg92s7Zw8AnhdwEPJvku8BXgw1U1f1L5I8AfAruBx/HKIEmauEMeDqqqixZp/8CAtluBWxfpPwOcdpj1SZJeRd4xLEkNMwQkqWGGgCQ1zBCQpIYZApLUMENAkhpmCEhSwwwBSWqYISBJDTMEJKlhhoAkNcwQkKSGGQKS1DBDQJIaZghIUsOGGVRme5J9SR7ua/tkkh8leaB7XNA37+NJdid5NMl5fe2burbdSa4Y/0uRJB2uYfYErgc2DWj/XFVt7B
63AyQ5ld6IY2/tlvmDJMd0Q07+PnA+cCpwUddXkjRBw4wsdleS9UOubzNwczfg/A+S7AbO7ObtrqonAJLc3PX93mFXLEkam1HOCVye5MHucNEJXdsa4Km+PrNd22LtkqQJWm4IXAv8IrAR2At8pmvPgL61RPtASbYmmUkyMzc3t8wSJUmHsqwQqKqnq+qlqvoJ8AUOHPKZBdb1dV0L7FmifbH1b6uq6aqanpqaWk6JkqQhLCsEkpzU9/S9wPyVQzuALUmOS3IKsAH4DnAvsCHJKUleS+/k8Y7lly1JGodDnhhOchNwNnBiklngSuDsJBvpHdJ5EvgQQFU9kuQWeid89wOXVdVL3XouB74BHANsr6pHxv5qJEmHZZirgy4a0HzdEv2vAq4a0H47cPthVSdJelV5x7AkNcwQkKSGGQKS1DBDQJIaZghIUsMMAUlqmCEgSQ0zBCSpYYaAJDXMEJCkhhkCktQwQ0CSGmYISFLDDAFJapghIEkNMwQkqWHDjCy2HfhXwL6qOq1r+2/ArwH/ADwOXFpVf5tkPbALeLRb/O6q+nC3zNuA64Gfpje4zEeratHB5qVhrb/i68te9smr3z3GSqQjzzB7AtcDmxa07QROq6pfBv4S+HjfvMeramP3+HBf+7XAVnrjDm8YsE5J0go7ZAhU1V3AMwva/rSq9ndP7wbWLrWObmD611fVt7u//m8ELlxeyZKkcRnHOYHfBO7oe35Kkr9I8mdJ3tm1rQFm+/rMdm0DJdmaZCbJzNzc3BhKlCQNMlIIJPmPwH7gi13TXuDkqjod+G3gS0leD2TA4oueD6iqbVU1XVXTU1NTo5QoSVrCIU8MLybJJfROGJ8zf4K3ql4EXuym70vyOPAWen/59x8yWgvsWe62JUnjsaw9gSSbgP8AvKeqXuhrn0pyTDf9ZnongJ+oqr3A80nOShLgYuC2kauXJI1kmEtEbwLOBk5MMgtcSe9qoOOAnb3P9JcvBX0X8Kkk+4GXgA9X1fxJ5Y9w4BLRO3jleQRJ0gQcMgSq6qIBzdct0vdW4NZF5s0Apx1WdZKkV5V3DEtSwwwBSWqYISBJDTMEJKlhhoAkNcwQkKSGGQKS1LBlf22Ejj5+L7/UHvcEJKlhhoAkNcwQkKSGGQKS1DBDQJIaZghIUsMMAUlq2FAhkGR7kn1JHu5re0OSnUke636e0LUnyTVJdid5MMkZfctc0vV/rBueUpI0QcPuCVwPbFrQdgVwZ1VtAO7sngOcT29YyQ3AVuBa6IUGvVHJ3g6cCVw5HxySpMkYKgSq6i7gmQXNm4EbuukbgAv72m+snruB45OcBJwH7KyqZ6rqb4CdHBwskqQVNMo5gTd1A8jT/Xxj174GeKqv32zXtli7JGlCXo0TwxnQVku0H7yCZGuSmSQzc3NzYy1OknTAKCHwdHeYh+7nvq59FljX128tsGeJ9oNU1baqmq6q6ampqRFKlCQtZZQQ2AHMX+FzCXBbX/vF3VVCZwHPdoeLvgGcm+SE7oTwuV2bJGlChvoq6SQ3AWcDJyaZpXeVz9XALUk+CPwQeF/X/XbgAmA38AJwKUBVPZPk08C9Xb9PVdXCk82SpBU0VAhU1UWLzDpnQN8CLltkPduB7UNXJ0l6VXnHsCQ1zJHFNHGjjGgmaTTuCUhSwwwBSWqYISBJDTMEJKlhhoAkNcwQkKSGGQKS1DBDQJIaZghIUsMMAUlqmCEgSQ0zBCSpYYaAJDXMEJCkhi07BJL8UpIH+h7PJflYkk8m+VFf+wV9y3w8ye4kjyY5bzwvQZK0XMseT6CqHgU2AiQ5BvgR8DV6w0l+rqp+t79/klOBLcBbgZ8HvpnkLVX10nJrkCSNZlyHg84BHq+qv1qiz2bg5qp6sap+QG8M4jPHtH1J0jKMKwS2ADf1Pb88yYNJtic5oWtbAzzV12e2aztIkq1JZpLMzM3NjalESdJCI4dAktcC7wH+qGu6FvhFeoeK9gKfme86YPEatM6q2l
ZV01U1PTU1NWqJkqRFjGNP4Hzg/qp6GqCqnq6ql6rqJ8AXOHDIZxZY17fcWmDPGLYvSVqmcYTARfQdCkpyUt+89wIPd9M7gC1JjktyCrAB+M4Yti9JWqZlXx0EkORngH8JfKiv+b8m2UjvUM+T8/Oq6pEktwDfA/YDl3llkCRN1kghUFUvAD+3oO39S/S/CrhqlG1KksbHO4YlqWGGgCQ1zBCQpIYZApLUMENAkho20tVB0pFu/RVfX/ayT1797jFWIk2GewKS1DBDQJIaZghIUsMMAUlqmCEgSQ0zBCSpYYaAJDXMEJCkhhkCktSwcYwx/GSSh5I8kGSma3tDkp1JHut+ntC1J8k1SXZ3A9GfMer2JUnLN649gX9RVRurarp7fgVwZ1VtAO7snkNvPOIN3WMrvUHpJUkT8modDtoM3NBN3wBc2Nd+Y/XcDRy/YExiSdIKGscXyBXwp0kK+J9VtQ14U1XtBaiqvUne2PVdAzzVt+xs17Z3DHVIRwy/uE6rxThC4B1Vtaf7oN+Z5PtL9M2AtjqoU7KV3uEiTj755DGUKEkaZOTDQVW1p/u5D/gacCbw9Pxhnu7nvq77LLCub/G1wJ4B69xWVdNVNT01NTVqiZKkRYwUAkn+cZKfnZ8GzgUeBnYAl3TdLgFu66Z3ABd3VwmdBTw7f9hIkrTyRj0c9Cbga0nm1/Wlqvo/Se4FbknyQeCHwPu6/rcDFwC7gReAS0fcviRpBCOFQFU9AfzKgPYfA+cMaC/gslG2Ka0Wo5zclVYL7xiWpIY5xrB0hBl1D8RLTNXPPQFJapghIEkNMwQkqWGGgCQ1zBCQpIYZApLUMENAkhpmCEhSwwwBSWqYISBJDTMEJKlhhoAkNcwQkKSGGQKS1LBlh0CSdUm+lWRXkkeSfLRr/2SSHyV5oHtc0LfMx5PsTvJokvPG8QIkScs3yngC+4Hfqar7u3GG70uys5v3uar63f7OSU4FtgBvBX4e+GaSt1TVSyPUIEkawbL3BKpqb1Xd300/D+wC1iyxyGbg5qp6sap+QG+c4TOXu31J0ujGck4gyXrgdOCerunyJA8m2Z7khK5tDfBU32KzLBIaSbYmmUkyMzc3N44SJUkDjDy8ZJLXAbcCH6uq55JcC3waqO7nZ4DfBDJg8Rq0zqraBmwDmJ6eHthH0vKMMjylQ1MefUbaE0hyLL0A+GJVfRWgqp6uqpeq6ifAFzhwyGcWWNe3+FpgzyjblySNZpSrgwJcB+yqqs/2tZ/U1+29wMPd9A5gS5LjkpwCbAC+s9ztS5JGN8rhoHcA7wceSvJA1/YJ4KIkG+kd6nkS+BBAVT2S5Bbge/SuLLrMK4MkabKWHQJV9ecMPs5/+xLLXAVctdxtSpLGyzuGJalhhoAkNWzkS0QltcPLS48+7glIUsMMAUlqmCEgSQ0zBCSpYYaAJDXMq4MkrQivLFqd3BOQpIYZApLUMENAkhpmCEhSwwwBSWqYISBJDVvxS0STbAL+O3AM8IdVdfVK1yDpyDLK5aXgJaZLWdE9gSTHAL8PnA+cSm8UslNXsgZJ0gErvSdwJrC7qp4ASHIzsJnekJOS9KrwRrXFrXQIrAGe6ns+C7x9hWuQpKEd7QGy0iEwaEziOqhTshXY2j39uySPLnN7JwJ/vZwF81+WucXxOuz6J1X3gO0u+71fJax/so6K+if4OfILw3Zc6RCYBdb1PV8L7FnYqaq2AdtG3ViSmaqaHnU9k3Ik138k1w7WP2nWv3JW+hLRe4ENSU5J8lpgC7BjhWuQJHVWdE+gqvYnuRz4Br1LRLdX1SMrWYMk6YAVv0+gqm4Hbl+hzY18SGnCjuT6j+TawfonzfpXSKoOOi8rSWqEXxshSQ07KkIgyaYkjybZneSKAfOPS/Llbv49SdavfJWDDVH7B5LMJXmge/y7SdS5mCTbk+xL8vAi85Pkmu71PZjkjJWucTFD1H52kmf73vv/tNI1LiXJuiTfSrIrySNJPjqgz2
p+/4epf1X+DpL8oyTfSfLdrvb/PKDPqv3ceYWqOqIf9E4wPw68GXgt8F3g1AV9fgv4fDe9BfjypOs+jNo/APzepGtd4jW8CzgDeHiR+RcAd9C7R+Qs4J5J13wYtZ8N/Mmk61yi/pOAM7rpnwX+csC/n9X8/g9T/6r8HXTv5+u66WOBe4CzFvRZlZ87Cx9Hw57Ay19FUVX/AMx/FUW/zcAN3fRXgHOSDLpxbaUNU/uqVlV3Ac8s0WUzcGP13A0cn+SklaluaUPUvqpV1d6qur+bfh7YRe+u/H6r+f0fpv5VqXs//657emz3WHiCdbV+7rzC0RACg76KYuE/pJf7VNV+4Fng51akuqUNUzvAr3e78l9Jsm7A/NVs2Ne4Wv3zbpf/jiRvnXQxi+kONZxO7y/SfkfE+79E/bBKfwdJjknyALAP2FlVi773q+xz5xWOhhAY5qsohvq6igkYpq4/BtZX1S8D3+TAXxZHitX63g/jfuAXqupXgP8B/O8J1zNQktcBtwIfq6rnFs4esMiqev8PUf+q/R1U1UtVtZHeNx+cmeS0BV1W/XsPR0cIDPNVFC/3SfIa4J+wOg4DHLL2qvpxVb3YPf0C8LYVqm1chvqqkNWoqp6b3+Wv3v0txyY5ccJlvUKSY+l9gH6xqr46oMuqfv8PVf+R8Duoqr8F/h+wacGs1fq58wpHQwgM81UUO4BLuunfAP5vdWdrJuyQtS84fvseesdNjyQ7gIu7q1TOAp6tqr2TLmoYSf7p/DHcJGfS+//y48lWdUBX23XArqr67CLdVu37P0z9q/V3kGQqyfHd9E8Dvwp8f0G31fq58worfsfwuNUiX0WR5FPATFXtoPcP7X8l2U0vibdMruIDhqz93yd5D7CfXu0fmFjBAyS5id4VHCcmmQWupHeSjKr6PL27wy8AdgMvAJdOptKDDVH7bwAfSbIf+Htgyyr7T/wO4P3AQ92xaYBPACfD6n//Ga7+1fo7OAm4Ib2Bsn4KuKWq/uRI+NxZyDuGJalhR8PhIEnSMhkCktQwQ0CSGmYISFLDDAFJapghIEkNMwQkqWGGgCQ17P8D2KYN9QmVDiQAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"transform = np.asarray(wocka[['count_lines']].values)\n",
"dtf = stats.boxcox(transform)[0]\n",
"pyplot.hist(dtf, bins=20)\n",
"pyplot.show()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"nlp = en_core_web_sm.load()"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"stopwords = nltk.corpus.stopwords.words('english')"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"def lemmatization(text):\n",
" text = re.sub(\"[0-9]*|\\_\", \"\", text)\n",
" tokens = re.split('\\W+', text)\n",
" text = [word.lower() for word in tokens]\n",
" lemma = \" \".join([word.lemma_ for word in nlp(\" \".join([word for word in text if word not in stopwords]))])\n",
" return lemma"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"wocka['wocka_no_stop_lemma'] = wocka['body'].apply(lambda x: lemmatization(x))"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"wocka['title_no_stop_lemma'] = wocka['title'].apply(lambda x: lemmatization(x))"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" <th>body_no_stop</th>\n",
" <th>body_no_stop_stemmed</th>\n",
" <th>body_no_stop_lemma</th>\n",
" <th>bigrams</th>\n",
" <th>trigrams</th>\n",
" <th>parts_of_speech</th>\n",
" <th>count_lines</th>\n",
" <th>joke_length</th>\n",
" <th>dirty</th>\n",
" <th>wocka_no_stop_lemma</th>\n",
" <th>title_no_stop_lemma</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[call, cow, leg, ground, beef]</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" <td>[(what, PRON), (do, AUX), (you, PRON), (call, ...</td>\n",
" <td>2</td>\n",
" <td>52</td>\n",
" <td>False</td>\n",
" <td>call cow legs ground beef</td>\n",
" <td>cow legs</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[call, cow, jump, barb, wire, fenc, utter, des...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" <td>[(what, PRON), (do, AUX), (you, PRON), (call, ...</td>\n",
" <td>2</td>\n",
" <td>78</td>\n",
" <td>False</td>\n",
" <td>call cow jumping barbed wire fence utter destr...</td>\n",
" <td>jump cow</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" <td>[whats, black, white, red, newspaper]</td>\n",
" <td>[what, black, white, red, newspap]</td>\n",
" <td>[what, s, black, white, red, newspaper]</td>\n",
" <td>[(what, s), (s, black), (black, and), (and, wh...</td>\n",
" <td>[(what, s, black), (s, black, and), (black, an...</td>\n",
" <td>[(what, PRON), (s, VERB), (black, ADJ), (and, ...</td>\n",
" <td>2</td>\n",
" <td>56</td>\n",
" <td>False</td>\n",
" <td>black white red newspaper</td>\n",
" <td>black white red</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" <td>[guy, walks, bar, says, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[(so, this), (this, guy), (guy, walks), (walks...</td>\n",
" <td>[(so, this, guy), (this, guy, walks), (guy, wa...</td>\n",
" <td>[(so, ADV), (this, DET), (guy, NOUN), (walks, ...</td>\n",
" <td>2</td>\n",
" <td>51</td>\n",
" <td>False</td>\n",
" <td>guy walk bar say ouch</td>\n",
" <td>guy bar</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" <td>[opposite, pro, con, isnt, opposite, progress,...</td>\n",
" <td>[opposit, pro, con, isnt, opposit, progress, c...</td>\n",
" <td>[opposite, pro, con, be, not, opposite, progre...</td>\n",
" <td>[(if, the), (the, opposite), (opposite, of), (...</td>\n",
" <td>[(if, the, opposite), (the, opposite, of), (op...</td>\n",
" <td>[(if, SCONJ), (the, DET), (opposite, NOUN), (o...</td>\n",
" <td>1</td>\n",
" <td>72</td>\n",
" <td>False</td>\n",
" <td>opposite pro con opposite progress congress</td>\n",
" <td>progress</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \\\n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... \n",
"\n",
" body_no_stop \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [whats, black, white, red, newspaper] \n",
"3 [guy, walks, bar, says, ouch] \n",
"4 [opposite, pro, con, isnt, opposite, progress,... \n",
"\n",
" body_no_stop_stemmed \\\n",
"0 [call, cow, leg, ground, beef] \n",
"1 [call, cow, jump, barb, wire, fenc, utter, des... \n",
"2 [what, black, white, red, newspap] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposit, pro, con, isnt, opposit, progress, c... \n",
"\n",
" body_no_stop_lemma \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [what, s, black, white, red, newspaper] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposite, pro, con, be, not, opposite, progre... \n",
"\n",
" bigrams \\\n",
"0 [(what, do), (do, you), (you, call), (call, a)... \n",
"1 [(what, do), (do, you), (you, call), (call, a)... \n",
"2 [(what, s), (s, black), (black, and), (and, wh... \n",
"3 [(so, this), (this, guy), (guy, walks), (walks... \n",
"4 [(if, the), (the, opposite), (opposite, of), (... \n",
"\n",
" trigrams \\\n",
"0 [(what, do, you), (do, you, call), (you, call,... \n",
"1 [(what, do, you), (do, you, call), (you, call,... \n",
"2 [(what, s, black), (s, black, and), (black, an... \n",
"3 [(so, this, guy), (this, guy, walks), (guy, wa... \n",
"4 [(if, the, opposite), (the, opposite, of), (op... \n",
"\n",
" parts_of_speech count_lines \\\n",
"0 [(what, PRON), (do, AUX), (you, PRON), (call, ... 2 \n",
"1 [(what, PRON), (do, AUX), (you, PRON), (call, ... 2 \n",
"2 [(what, PRON), (s, VERB), (black, ADJ), (and, ... 2 \n",
"3 [(so, ADV), (this, DET), (guy, NOUN), (walks, ... 2 \n",
"4 [(if, SCONJ), (the, DET), (opposite, NOUN), (o... 1 \n",
"\n",
" joke_length dirty wocka_no_stop_lemma \\\n",
"0 52 False call cow legs ground beef \n",
"1 78 False call cow jumping barbed wire fence utter destr... \n",
"2 56 False black white red newspaper \n",
"3 51 False guy walk bar say ouch \n",
"4 72 False opposite pro con opposite progress congress \n",
"\n",
" title_no_stop_lemma \n",
"0 cow legs \n",
"1 jump cow \n",
"2 black white red \n",
"3 guy bar \n",
"4 progress "
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Count Vectorizer"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>abdomen</th>\n",
" <th>abdominal</th>\n",
" <th>abhor</th>\n",
" <th>abraham</th>\n",
" <th>accent</th>\n",
" <th>accident</th>\n",
" <th>account</th>\n",
" <th>achieve</th>\n",
" <th>acid</th>\n",
" <th>acquire</th>\n",
" <th>...</th>\n",
" <th>yes</th>\n",
" <th>yet</th>\n",
" <th>yew</th>\n",
" <th>yield</th>\n",
" <th>yo</th>\n",
" <th>yokels</th>\n",
" <th>york</th>\n",
" <th>young</th>\n",
" <th>yup</th>\n",
" <th>zealand</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 2175 columns</p>\n",
"</div>"
],
"text/plain": [
" abdomen abdominal abhor abraham accent accident account achieve \\\n",
"0 0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 0 0 0 \n",
"3 0 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 0 0 \n",
"\n",
" acid acquire ... yes yet yew yield yo yokels york young yup \\\n",
"0 0 0 ... 0 0 0 0 0 0 0 0 0 \n",
"1 0 0 ... 0 0 0 0 0 0 0 0 0 \n",
"2 0 0 ... 0 0 0 0 0 0 0 0 0 \n",
"3 0 0 ... 0 0 0 0 0 0 0 0 0 \n",
"4 0 0 ... 0 0 0 0 0 0 0 0 0 \n",
"\n",
" zealand \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
"[5 rows x 2175 columns]"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Bag-of-words counts for the first 100 lemmatised jokes.\n",
"count_vect = CountVectorizer()\n",
"X_counts = count_vect.fit_transform(wocka['wocka_no_stop_lemma'][0:100])\n",
"# get_feature_names() no longer exists in scikit-learn 1.2 and later; ordering the fitted\n",
"# vocabulary_ (term to column index) by index yields the same labels on every version.\n",
"count_terms = sorted(count_vect.vocabulary_, key=count_vect.vocabulary_.get)\n",
"X_counts_df = pd.DataFrame(X_counts.toarray(), columns=count_terms)\n",
"X_counts_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## N-Gram Vectorizer"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Count bigrams and trigrams (ngram_range=(2,3)) for the first 100 lemmatised jokes.\n",
"ngram_vector = CountVectorizer(ngram_range=(2,3))\n",
"X_counts = ngram_vector.fit_transform(wocka['wocka_no_stop_lemma'][0:100])\n",
"# get_feature_names() no longer exists in scikit-learn 1.2 and later; ordering the fitted\n",
"# vocabulary_ (n-gram to column index) by index yields the same labels on every version.\n",
"ngram_terms = sorted(ngram_vector.vocabulary_, key=ngram_vector.vocabulary_.get)\n",
"X_counts_df = pd.DataFrame(X_counts.toarray(), columns=ngram_terms)\n",
"# Nothing is displayed here; call X_counts_df.head() to inspect the n-gram counts."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TF-IDF Vectorizer"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"tfidf_vector = TfidfVectorizer()"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>abdomen</th>\n",
" <th>abdominal</th>\n",
" <th>abhor</th>\n",
" <th>abraham</th>\n",
" <th>accent</th>\n",
" <th>accident</th>\n",
" <th>account</th>\n",
" <th>achieve</th>\n",
" <th>acid</th>\n",
" <th>acquire</th>\n",
" <th>...</th>\n",
" <th>yes</th>\n",
" <th>yet</th>\n",
" <th>yew</th>\n",
" <th>yield</th>\n",
" <th>yo</th>\n",
" <th>yokels</th>\n",
" <th>york</th>\n",
" <th>young</th>\n",
" <th>yup</th>\n",
" <th>zealand</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>0.065774</td>\n",
" <td>0.131548</td>\n",
" <td>0.065774</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>86</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.027508</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.025242</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>89</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.088984</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.088896</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.043038</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.129113</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.059502</td>\n",
" <td>0.054600</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.232210</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.103903</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.048936</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.095581</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.106920</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>16 rows × 2175 columns</p>\n",
"</div>"
],
"text/plain": [
" abdomen abdominal abhor abraham accent accident account \\\n",
"84 0.065774 0.131548 0.065774 0.000000 0.0 0.000000 0.0 \n",
"85 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"86 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"87 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"88 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"89 0.000000 0.000000 0.000000 0.088984 0.0 0.000000 0.0 \n",
"90 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"91 0.000000 0.000000 0.000000 0.000000 0.0 0.043038 0.0 \n",
"92 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"93 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"94 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"95 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"96 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"97 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"98 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"99 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"\n",
" achieve acid acquire ... yes yet yew yield yo \\\n",
"84 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"85 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"86 0.000000 0.000000 0.0 ... 0.000000 0.0 0.027508 0.0 0.0 \n",
"87 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"88 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"89 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"90 0.088896 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"91 0.000000 0.129113 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"92 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"93 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"94 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"95 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"96 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"97 0.000000 0.000000 0.0 ... 0.232210 0.0 0.000000 0.0 0.0 \n",
"98 0.000000 0.000000 0.0 ... 0.048936 0.0 0.000000 0.0 0.0 \n",
"99 0.000000 0.000000 0.0 ... 0.095581 0.0 0.000000 0.0 0.0 \n",
"\n",
" yokels york young yup zealand \n",
"84 0.000000 0.000000 0.000000 0.0 0.0 \n",
"85 0.000000 0.000000 0.000000 0.0 0.0 \n",
"86 0.000000 0.025242 0.000000 0.0 0.0 \n",
"87 0.000000 0.000000 0.000000 0.0 0.0 \n",
"88 0.000000 0.000000 0.000000 0.0 0.0 \n",
"89 0.000000 0.000000 0.000000 0.0 0.0 \n",
"90 0.000000 0.000000 0.000000 0.0 0.0 \n",
"91 0.000000 0.000000 0.000000 0.0 0.0 \n",
"92 0.000000 0.000000 0.000000 0.0 0.0 \n",
"93 0.000000 0.000000 0.000000 0.0 0.0 \n",
"94 0.000000 0.000000 0.000000 0.0 0.0 \n",
"95 0.000000 0.000000 0.000000 0.0 0.0 \n",
"96 0.059502 0.054600 0.000000 0.0 0.0 \n",
"97 0.000000 0.000000 0.103903 0.0 0.0 \n",
"98 0.000000 0.000000 0.000000 0.0 0.0 \n",
"99 0.000000 0.000000 0.106920 0.0 0.0 \n",
"\n",
"[16 rows x 2175 columns]"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# TF-IDF weights for the first 100 lemmatised jokes.\n",
"X_tfidf = tfidf_vector.fit_transform(wocka['wocka_no_stop_lemma'][0:100])\n",
"# get_feature_names() no longer exists in scikit-learn 1.2 and later; ordering the fitted\n",
"# vocabulary_ (term to column index) by index yields the same labels on every version.\n",
"tfidf_terms = sorted(tfidf_vector.vocabulary_, key=tfidf_vector.vocabulary_.get)\n",
"X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_terms)\n",
"# Show the tail of the sample (rows 84 onward).\n",
"X_tfidf_df[84:]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Basic Machine Learning"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"# Read explicitly as UTF-8 so decoding does not depend on the platform's default encoding.\n",
"with open('reddit_jokes.json', 'r', encoding='utf-8') as json_file:\n",
"    data_reddit = json.load(json_file)\n",
"\n",
"reddit = pd.DataFrame.from_dict(data_reddit)\n",
"# Keep only the first 10000 rows for the steps that follow.\n",
"reddit = reddit[:10000]"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
"# Lower-case each joke body, then tokenize it.\n",
"reddit['tokenize'] = reddit['body'].apply(lambda text: tokenize_words(text.lower()))"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"# Lower-case each joke body, then pass it through remove_stop_words.\n",
"reddit['body_no_stop'] = reddit['body'].apply(lambda text: remove_stop_words(text.lower()))"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"# Apply the stemming helper to every stop-word-free body.\n",
"reddit['body_no_stop_stemmed'] = reddit['body_no_stop'].apply(stemming)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"# Two per-joke features computed from the raw body text.\n",
"reddit['count_lines'] = reddit['body'].apply(split_punchline)\n",
"reddit['joke_length'] = reddit['body'].apply(character_count)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"# Flag each joke by running detect_bad_words on its stop-word-free body.\n",
"reddit['dirty'] = reddit['body_no_stop'].apply(detect_bad_words)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"# Run the lemmatization helper on every raw joke body.\n",
"reddit['body_no_stop_lemma'] = reddit['body'].apply(lemmatization)"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"# Run the lemmatization helper on every joke title.\n",
"reddit['title_no_stop_lemma'] = reddit['title'].apply(lemmatization)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# New vectorizer instance, fitted on the lemmatised Reddit joke bodies.\n",
"tfidf_vector = TfidfVectorizer()\n",
"X_tfidf = tfidf_vector.fit_transform(reddit['body_no_stop_lemma'])\n",
"\n",
"# Dense copy of the matrix; columns keep their default integer labels.\n",
"X_tfidf_df = pd.DataFrame(data=X_tfidf.toarray())"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
"# Re-fitting tfidf_vector on the titles replaces whatever vocabulary it held before.\n",
"X_tfidf_title = tfidf_vector.fit_transform(reddit['title_no_stop_lemma'])\n",
"# Densify the title matrix; X_tfidf holds the body features and must not be reused here.\n",
"X_tfidf_df_title = pd.DataFrame(X_tfidf_title.toarray())"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"# Prefix the TF-IDF column labels so body and title features get distinct, all-string\n",
"# names instead of two colliding integer ranges mixed with the string-named columns.\n",
"X_features = pd.concat(\n",
"    [\n",
"        reddit[['dirty', 'joke_length', 'count_lines']],\n",
"        X_tfidf_df.add_prefix('body_'),\n",
"        X_tfidf_df_title.add_prefix('title_'),\n",
"    ],\n",
"    axis=1,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>dirty</th>\n",
" <th>joke_length</th>\n",
" <th>count_lines</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>...</th>\n",
" <th>14397</th>\n",
" <th>14398</th>\n",
" <th>14399</th>\n",
" <th>14400</th>\n",
" <th>14401</th>\n",
" <th>14402</th>\n",
" <th>14403</th>\n",
" <th>14404</th>\n",
" <th>14405</th>\n",
" <th>14406</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>57</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>65</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>174</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>False</td>\n",
" <td>528</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>57</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 28817 columns</p>\n",
"</div>"
],
"text/plain": [
" dirty joke_length count_lines 0 1 2 3 4 5 6 ... \\\n",
"0 False 57 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n",
"1 False 65 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n",
"2 False 174 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n",
"3 False 528 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n",
"4 False 57 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n",
"\n",
" 14397 14398 14399 14400 14401 14402 14403 14404 14405 14406 \n",
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
"[5 rows x 28817 columns]"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_features.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Random forest"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import KFold, cross_val_score"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [],
"source": [
"# Fixed seed so the forest, and any scores computed with it, are reproducible across runs.\n",
"rf = RandomForestClassifier(n_jobs=-1, random_state=42)\n",
"# Five folds; KFold does not shuffle unless asked, so folds follow row order.\n",
"k_fold = KFold(n_splits=5)"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([0.1305, 0.14 , 0.1215, 0.148 , 0.164 ])"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The target is the raw reddit['score'] column scored with 'accuracy', so each fold's\n",
"# value is the share of jokes whose exact score was predicted.\n",
"cross_val_score(estimator=rf, X=X_features, y=reddit['score'], scoring='accuracy', cv=k_fold)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment