Skip to content

Instantly share code, notes, and snippets.

@zilista
Created March 14, 2020 15:53
Show Gist options
  • Save zilista/a6b46d5b43d6afa672fcfab7ccfed5bb to your computer and use it in GitHub Desktop.
Save zilista/a6b46d5b43d6afa672fcfab7ccfed5bb to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import json #Чтобы обрабатывать данные в в формате json\n",
"import pandas as pd #Для анализа табличных данных\n",
"import re #Используем регулярные выражения \n",
"import nltk #Для обработки естественного языка\n",
"import numpy as np\n",
"import string\n",
"import spacy\n",
"from scipy import stats\n",
"from matplotlib import pyplot\n",
"import en_core_web_sm #загрузка модели для анализа английского языка"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"#https://raw.githubusercontent.com/taivop/joke-dataset/master/wocka.json #скачаем файл для примера\n",
"#https://github.com/taivop/joke-dataset/blob/master/reddit_jokes.json"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"with open('wocka.json', 'r') as json_file:\n",
" data = json.load(json_file)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"wocka = pd.DataFrame.from_dict(data)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title \n",
"0 Cow With No Legs \n",
"1 Jumping Cow \n",
"2 Black, White and Red \n",
"3 Guy in a Bar \n",
"4 Progress "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Токенизация"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def tokenize_words(text):\n",
" tokens = re.split('\\W+', text)\n",
" text = [word for word in tokens]\n",
" return text"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"wocka['tokenize'] = wocka['body'].apply(lambda x: tokenize_words(x.lower()))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Удаление мусора и стоп-слов"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Before: What do you call a cow with no legs?\r\n",
"\r\n",
"Ground Beef!\r\n",
"\n",
"After: what do you call a cow with no legs ground beef\n"
]
}
],
"source": [
"def remove_punctuation(text):\n",
" removal = ' '.join(word.strip(string.punctuation) for word in text.split())\n",
" removal = removal.lower()\n",
" return removal\n",
"\n",
"print('Before: ' + wocka['body'][0] + '\\r\\n')\n",
"print('After: ' + remove_punctuation(wocka['body'][0]))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"stopwords = nltk.corpus.stopwords.words('english')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def remove_stop_words(text):\n",
" text = ''.join([word for word in text if word not in string.punctuation])\n",
" tokens = re.split('\\W+', text)\n",
" text = [word for word in tokens if word not in stopwords]\n",
" return text"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"wocka['body_no_stop'] = wocka['body'].apply(lambda x: remove_stop_words(x.lower()))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" <th>body_no_stop</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" <td>[whats, black, white, red, newspaper]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" <td>[guy, walks, bar, says, ouch]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" <td>[opposite, pro, con, isnt, opposite, progress,...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \\\n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... \n",
"\n",
" body_no_stop \n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [whats, black, white, red, newspaper] \n",
"3 [guy, walks, bar, says, ouch] \n",
"4 [opposite, pro, con, isnt, opposite, progress,... "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Расширенные аббревиатуры"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"abbreviation_map = {\n",
" 'lol': 'laugh out loud',\n",
" 'brb': 'be right back',\n",
" 'serp': 'search engine results page'\n",
"}\n",
"\n",
"\n",
"def expand_abbreviations(text, abbreviations):\n",
" for word in sorted(abbreviations.keys(), key = len, reverse=True):\n",
" text = re.sub(word, abbreviations[word], text)\n",
" \n",
" return text\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"sample_sentence = remove_punctuation(\"Hold my bear whie I check the serp one moment. brb.\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Before: hold my bear whie i check the serp one moment brb\n",
"After: hold my bear whie i check the search engine results page one moment be right back\n"
]
}
],
"source": [
"print('Before: ' + sample_sentence)\n",
"print('After: ' + expand_abbreviations(sample_sentence, abbreviation_map))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Стемминг"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"ps = nltk.PorterStemmer()\n",
"\n",
"def stemming(text):\n",
" text = [ps.stem(word) for word in text]\n",
" return text\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['studi', 'studi', 'studi', 'studi']\n"
]
}
],
"source": [
"word_form = ['study', 'studies', 'studied', 'studying']\n",
"\n",
"print(stemming(word_form))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"wocka['body_no_stop_stemmed'] = wocka['body_no_stop'].apply(lambda x: stemming(x))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" <th>body_no_stop</th>\n",
" <th>body_no_stop_stemmed</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[call, cow, leg, ground, beef]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[call, cow, jump, barb, wire, fenc, utter, des...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" <td>[whats, black, white, red, newspaper]</td>\n",
" <td>[what, black, white, red, newspap]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" <td>[guy, walks, bar, says, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" <td>[opposite, pro, con, isnt, opposite, progress,...</td>\n",
" <td>[opposit, pro, con, isnt, opposit, progress, c...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \\\n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... \n",
"\n",
" body_no_stop \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [whats, black, white, red, newspaper] \n",
"3 [guy, walks, bar, says, ouch] \n",
"4 [opposite, pro, con, isnt, opposite, progress,... \n",
"\n",
" body_no_stop_stemmed \n",
"0 [call, cow, leg, ground, beef] \n",
"1 [call, cow, jump, barb, wire, fenc, utter, des... \n",
"2 [what, black, white, red, newspap] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposit, pro, con, isnt, opposit, progress, c... "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 'можно', 'при', 'наконец', 'два', 'об', 'другой', 'хоть', 'после', 'над', 'больше', 'тот', 'через', 'эти', 'нас', 'про', 'всего', 'них', 'какая', 'много', 'разве', 'три', 'эту', 'моя', 'впрочем', 'хорошо', 'свою', 'этой', 'перед', 'иногда', 'лучше', 'чуть', 'том', 'нельзя', 'такой', 'им', 'более', 'всегда', 'конечно', 'всю', 'между']\n"
]
}
],
"source": [
"from nltk.corpus import stopwords\n",
"print(stopwords.words(\"russian\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Лемматизация"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"from nltk.stem import WordNetLemmatizer\n",
"\n",
"lemmatize = WordNetLemmatizer()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"better: good\n"
]
}
],
"source": [
"print('better: ' + lemmatize.lemmatize(\"better\", pos='a'))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"nlp = en_core_web_sm.load()\n",
"\n",
"def lemmatization(text):\n",
" lemma = [word.lemma_ for word in nlp(' '.join(text))]\n",
" return lemma"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"wocka['body_no_stop_lemma'] = wocka['body_no_stop'].apply(lambda x: lemmatization(x))"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" <th>body_no_stop</th>\n",
" <th>body_no_stop_stemmed</th>\n",
" <th>body_no_stop_lemma</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[call, cow, leg, ground, beef]</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[call, cow, jump, barb, wire, fenc, utter, des...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" <td>[whats, black, white, red, newspaper]</td>\n",
" <td>[what, black, white, red, newspap]</td>\n",
" <td>[what, s, black, white, red, newspaper]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" <td>[guy, walks, bar, says, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" <td>[opposite, pro, con, isnt, opposite, progress,...</td>\n",
" <td>[opposit, pro, con, isnt, opposit, progress, c...</td>\n",
" <td>[opposite, pro, con, be, not, opposite, progre...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \\\n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... \n",
"\n",
" body_no_stop \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [whats, black, white, red, newspaper] \n",
"3 [guy, walks, bar, says, ouch] \n",
"4 [opposite, pro, con, isnt, opposite, progress,... \n",
"\n",
" body_no_stop_stemmed \\\n",
"0 [call, cow, leg, ground, beef] \n",
"1 [call, cow, jump, barb, wire, fenc, utter, des... \n",
"2 [what, black, white, red, newspap] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposit, pro, con, isnt, opposit, progress, c... \n",
"\n",
" body_no_stop_lemma \n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [what, s, black, white, red, newspaper] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposite, pro, con, be, not, opposite, progre... "
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## N-Grams"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"from nltk.util import ngrams"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"def create_ngrams(text, n):\n",
" ngrams_obj = ngrams(text, n)\n",
" return list(ngrams_obj)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:3: DeprecationWarning: generator 'ngrams' raised StopIteration\n",
" This is separate from the ipykernel package so we can avoid doing imports until\n"
]
}
],
"source": [
"wocka['bigrams'] = wocka['tokenize'].apply(lambda x: create_ngrams(x, 2))\n",
"wocka['trigrams'] = wocka['tokenize'].apply(lambda x: create_ngrams(x, 3))"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" <th>body_no_stop</th>\n",
" <th>body_no_stop_stemmed</th>\n",
" <th>body_no_stop_lemma</th>\n",
" <th>bigrams</th>\n",
" <th>trigrams</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[call, cow, leg, ground, beef]</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[call, cow, jump, barb, wire, fenc, utter, des...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" <td>[whats, black, white, red, newspaper]</td>\n",
" <td>[what, black, white, red, newspap]</td>\n",
" <td>[what, s, black, white, red, newspaper]</td>\n",
" <td>[(what, s), (s, black), (black, and), (and, wh...</td>\n",
" <td>[(what, s, black), (s, black, and), (black, an...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" <td>[guy, walks, bar, says, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[(so, this), (this, guy), (guy, walks), (walks...</td>\n",
" <td>[(so, this, guy), (this, guy, walks), (guy, wa...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" <td>[opposite, pro, con, isnt, opposite, progress,...</td>\n",
" <td>[opposit, pro, con, isnt, opposit, progress, c...</td>\n",
" <td>[opposite, pro, con, be, not, opposite, progre...</td>\n",
" <td>[(if, the), (the, opposite), (opposite, of), (...</td>\n",
" <td>[(if, the, opposite), (the, opposite, of), (op...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \\\n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... \n",
"\n",
" body_no_stop \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [whats, black, white, red, newspaper] \n",
"3 [guy, walks, bar, says, ouch] \n",
"4 [opposite, pro, con, isnt, opposite, progress,... \n",
"\n",
" body_no_stop_stemmed \\\n",
"0 [call, cow, leg, ground, beef] \n",
"1 [call, cow, jump, barb, wire, fenc, utter, des... \n",
"2 [what, black, white, red, newspap] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposit, pro, con, isnt, opposit, progress, c... \n",
"\n",
" body_no_stop_lemma \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [what, s, black, white, red, newspaper] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposite, pro, con, be, not, opposite, progre... \n",
"\n",
" bigrams \\\n",
"0 [(what, do), (do, you), (you, call), (call, a)... \n",
"1 [(what, do), (do, you), (you, call), (call, a)... \n",
"2 [(what, s), (s, black), (black, and), (and, wh... \n",
"3 [(so, this), (this, guy), (guy, walks), (walks... \n",
"4 [(if, the), (the, opposite), (opposite, of), (... \n",
"\n",
" trigrams \n",
"0 [(what, do, you), (do, you, call), (you, call,... \n",
"1 [(what, do, you), (do, you, call), (you, call,... \n",
"2 [(what, s, black), (s, black, and), (black, an... \n",
"3 [(so, this, guy), (this, guy, walks), (guy, wa... \n",
"4 [(if, the, opposite), (the, opposite, of), (op... "
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tagging"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"nlp = en_core_web_sm.load()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"def pos(text):\n",
" tokens= [word.text for word in nlp(\" \".join([word for word in text]))]\n",
" speech_parts = [word.pos_ for word in nlp(\" \".join([word for word in text]))]\n",
" combined = list(zip(tokens, speech_parts))\n",
" return combined"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"wocka['parts_of_speech'] = wocka['tokenize'].apply(lambda x: pos(x))"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" <th>body_no_stop</th>\n",
" <th>body_no_stop_stemmed</th>\n",
" <th>body_no_stop_lemma</th>\n",
" <th>bigrams</th>\n",
" <th>trigrams</th>\n",
" <th>parts_of_speech</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[call, cow, leg, ground, beef]</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" <td>[(what, PRON), (do, AUX), (you, PRON), (call, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[call, cow, jump, barb, wire, fenc, utter, des...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" <td>[(what, PRON), (do, AUX), (you, PRON), (call, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" <td>[whats, black, white, red, newspaper]</td>\n",
" <td>[what, black, white, red, newspap]</td>\n",
" <td>[what, s, black, white, red, newspaper]</td>\n",
" <td>[(what, s), (s, black), (black, and), (and, wh...</td>\n",
" <td>[(what, s, black), (s, black, and), (black, an...</td>\n",
" <td>[(what, PRON), (s, VERB), (black, ADJ), (and, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" <td>[guy, walks, bar, says, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[(so, this), (this, guy), (guy, walks), (walks...</td>\n",
" <td>[(so, this, guy), (this, guy, walks), (guy, wa...</td>\n",
" <td>[(so, ADV), (this, DET), (guy, NOUN), (walks, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" <td>[opposite, pro, con, isnt, opposite, progress,...</td>\n",
" <td>[opposit, pro, con, isnt, opposit, progress, c...</td>\n",
" <td>[opposite, pro, con, be, not, opposite, progre...</td>\n",
" <td>[(if, the), (the, opposite), (opposite, of), (...</td>\n",
" <td>[(if, the, opposite), (the, opposite, of), (op...</td>\n",
" <td>[(if, SCONJ), (the, DET), (opposite, NOUN), (o...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \\\n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... \n",
"\n",
" body_no_stop \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [whats, black, white, red, newspaper] \n",
"3 [guy, walks, bar, says, ouch] \n",
"4 [opposite, pro, con, isnt, opposite, progress,... \n",
"\n",
" body_no_stop_stemmed \\\n",
"0 [call, cow, leg, ground, beef] \n",
"1 [call, cow, jump, barb, wire, fenc, utter, des... \n",
"2 [what, black, white, red, newspap] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposit, pro, con, isnt, opposit, progress, c... \n",
"\n",
" body_no_stop_lemma \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [what, s, black, white, red, newspaper] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposite, pro, con, be, not, opposite, progre... \n",
"\n",
" bigrams \\\n",
"0 [(what, do), (do, you), (you, call), (call, a)... \n",
"1 [(what, do), (do, you), (you, call), (call, a)... \n",
"2 [(what, s), (s, black), (black, and), (and, wh... \n",
"3 [(so, this), (this, guy), (guy, walks), (walks... \n",
"4 [(if, the), (the, opposite), (opposite, of), (... \n",
"\n",
" trigrams \\\n",
"0 [(what, do, you), (do, you, call), (you, call,... \n",
"1 [(what, do, you), (do, you, call), (you, call,... \n",
"2 [(what, s, black), (s, black, and), (black, an... \n",
"3 [(so, this, guy), (this, guy, walks), (guy, wa... \n",
"4 [(if, the, opposite), (the, opposite, of), (op... \n",
"\n",
" parts_of_speech \n",
"0 [(what, PRON), (do, AUX), (you, PRON), (call, ... \n",
"1 [(what, PRON), (do, AUX), (you, PRON), (call, ... \n",
"2 [(what, PRON), (s, VERB), (black, ADJ), (and, ... \n",
"3 [(so, ADV), (this, DET), (guy, NOUN), (walks, ... \n",
"4 [(if, SCONJ), (the, DET), (opposite, NOUN), (o... "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"from nltk.tokenize import sent_tokenize\n",
"from spacy import displacy"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"def remove_line_breaks(text):\n",
"    \"\"\"Collapse every whitespace run into one space and lowercase the result.\n",
"\n",
"    Args:\n",
"        text: raw string, possibly containing carriage returns and newlines.\n",
"\n",
"    Returns:\n",
"        The lowercased text with words separated by single spaces and no\n",
"        leading or trailing whitespace.\n",
"    \"\"\"\n",
"    # str.split() without arguments already splits on any whitespace run,\n",
"    # carriage returns and newlines included, so no per-word strip is needed.\n",
"    return ' '.join(text.split()).lower()\n",
"\n",
"def tokenize_sentences(text):\n",
"    \"\"\"Split raw text into a list of sentence strings.\n",
"\n",
"    The text is first normalized with remove_line_breaks and the result is\n",
"    passed to NLTK's sent_tokenize.\n",
"    \"\"\"\n",
"    normalized = remove_line_breaks(text)\n",
"    return sent_tokenize(normalized)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# Reuse the spaCy pipeline loaded at the top of this section instead of\n",
"# loading en_core_web_sm a second time.\n",
"# NOTE: despite its name, doc is a plain list of sentence strings.\n",
"doc = tokenize_sentences(wocka['body'][0])"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"068bfe01e5794e56bb73dd81eeac2cf4-0\" class=\"displacy\" width=\"950\" height=\"287.0\" direction=\"ltr\" style=\"max-width: none; height: 287.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">what</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">PRON</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"150\">do</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"150\">AUX</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"250\">you</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"250\">PRON</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"350\">call</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"350\">VERB</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"450\">a</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"450\">DET</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"550\">cow</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"550\">NOUN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"650\">with</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"650\">ADP</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"750\">no</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"750\">DET</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"850\">legs?</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"850\">NOUN</tspan>\n",
"</text>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-0\" stroke-width=\"2px\" d=\"M70,152.0 C70,2.0 350.0,2.0 350.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">dobj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M70,154.0 L62,142.0 78,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-1\" stroke-width=\"2px\" d=\"M170,152.0 C170,52.0 345.0,52.0 345.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">aux</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M170,154.0 L162,142.0 178,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-2\" stroke-width=\"2px\" d=\"M270,152.0 C270,102.0 340.0,102.0 340.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M270,154.0 L262,142.0 278,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-3\" stroke-width=\"2px\" d=\"M470,152.0 C470,102.0 540.0,102.0 540.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M470,154.0 L462,142.0 478,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-4\" stroke-width=\"2px\" d=\"M370,152.0 C370,52.0 545.0,52.0 545.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">oprd</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M545.0,154.0 L553.0,142.0 537.0,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-5\" stroke-width=\"2px\" d=\"M370,152.0 C370,2.0 650.0,2.0 650.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-5\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M650.0,154.0 L658.0,142.0 642.0,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-6\" stroke-width=\"2px\" d=\"M770,152.0 C770,102.0 840.0,102.0 840.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-6\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M770,154.0 L762,142.0 778,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-7\" stroke-width=\"2px\" d=\"M670,152.0 C670,52.0 845.0,52.0 845.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-068bfe01e5794e56bb73dd81eeac2cf4-0-7\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M845.0,154.0 L853.0,142.0 837.0,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"</svg>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"d5cdaa947c3e42bc8d9a2451eee1eff9-0\" class=\"displacy\" width=\"250\" height=\"187.0\" direction=\"ltr\" style=\"max-width: none; height: 187.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">ground</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">NOUN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"150\">beef!</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"150\">NOUN</tspan>\n",
"</text>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-d5cdaa947c3e42bc8d9a2451eee1eff9-0-0\" stroke-width=\"2px\" d=\"M70,52.0 C70,2.0 150.0,2.0 150.0,52.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-d5cdaa947c3e42bc8d9a2451eee1eff9-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M70,54.0 L62,42.0 78,42.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"</svg>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Draw one dependency-parse diagram per sentence held in doc.\n",
"for x in doc:\n",
"    displacy.render(\n",
"        nlp(x),\n",
"        style='dep',\n",
"        jupyter=True,\n",
"        options={'compact': False, 'distance': 100},\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"doc = tokenize_sentences('мама мыть раму, а я учил урок')"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"844a18e83c0640abb4e861f0def39b6b-0\" class=\"displacy\" width=\"750\" height=\"187.0\" direction=\"ltr\" style=\"max-width: none; height: 187.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">мама</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">PROPN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"150\">мыть</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"150\">PROPN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"250\">раму,</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"250\">PROPN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"350\">а</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"350\">PROPN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"450\">я</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"450\">PROPN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"550\">учил</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"550\">NOUN</tspan>\n",
"</text>\n",
"\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"97.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"650\">урок</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"650\">PROPN</tspan>\n",
"</text>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-844a18e83c0640abb4e861f0def39b6b-0-0\" stroke-width=\"2px\" d=\"M70,52.0 C70,2.0 150.0,2.0 150.0,52.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-844a18e83c0640abb4e861f0def39b6b-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M70,54.0 L62,42.0 78,42.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-844a18e83c0640abb4e861f0def39b6b-0-1\" stroke-width=\"2px\" d=\"M170,52.0 C170,2.0 250.0,2.0 250.0,52.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-844a18e83c0640abb4e861f0def39b6b-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M170,54.0 L162,42.0 178,42.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-844a18e83c0640abb4e861f0def39b6b-0-2\" stroke-width=\"2px\" d=\"M270,52.0 C270,2.0 350.0,2.0 350.0,52.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-844a18e83c0640abb4e861f0def39b6b-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">intj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M350.0,54.0 L358.0,42.0 342.0,42.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-844a18e83c0640abb4e861f0def39b6b-0-3\" stroke-width=\"2px\" d=\"M470,52.0 C470,2.0 550.0,2.0 550.0,52.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-844a18e83c0640abb4e861f0def39b6b-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M470,54.0 L462,42.0 478,42.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-844a18e83c0640abb4e861f0def39b6b-0-4\" stroke-width=\"2px\" d=\"M570,52.0 C570,2.0 650.0,2.0 650.0,52.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-844a18e83c0640abb4e861f0def39b6b-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">dobj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M650.0,54.0 L658.0,42.0 642.0,42.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"</svg>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Draw one dependency-parse diagram per sentence held in doc.\n",
"for x in doc:\n",
"    displacy.render(\n",
"        nlp(x),\n",
"        style='dep',\n",
"        jupyter=True,\n",
"        options={'compact': False, 'distance': 100},\n",
"    )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## NER"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"nlp = en_core_web_sm.load()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"def ner(text):\n",
"    \"\"\"Extract named entities from raw text with the module-level spaCy pipeline.\n",
"\n",
"    Args:\n",
"        text: string to analyse.\n",
"\n",
"    Returns:\n",
"        list of ``(entity_text, entity_label)`` tuples, in the order the\n",
"        entities appear in ``doc.ents``.\n",
"    \"\"\"\n",
"    # Only the entity text and label are part of the result; character\n",
"    # offsets are not collected.\n",
"    return [(ent.text, ent.label_) for ent in nlp(text).ents]"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"[('Saddam Hussein', 'PERSON'),\n",
" ('Hussein', 'PERSON'),\n",
" ('Paddy', 'LOC'),\n",
" ('County Cavan', 'GPE'),\n",
" ('Ireland', 'GPE'),\n",
" ('Paddy', 'WORK_OF_ART'),\n",
" ('Saddam', 'PERSON'),\n",
" ('Paddy', 'PERSON'),\n",
" ('Sean', 'PERSON'),\n",
" ('Gerry', 'PERSON'),\n",
" ('8', 'CARDINAL'),\n",
" ('Saddam', 'PERSON'),\n",
" ('Paddy', 'PERSON'),\n",
" ('1 million', 'CARDINAL'),\n",
" ('Begorra', 'WORK_OF_ART'),\n",
" ('Paddy', 'PERSON'),\n",
" ('the next day', 'DATE'),\n",
" ('Paddy', 'PERSON'),\n",
" ('Hussein', 'PERSON'),\n",
" ('Paddy', 'PERSON'),\n",
" ('Saddam', 'PERSON'),\n",
" ('2', 'CARDINAL'),\n",
" ('Murphy', 'PERSON'),\n",
" ('Saddam', 'PERSON'),\n",
" ('Paddy', 'PERSON'),\n",
" ('16 thousand', 'CARDINAL'),\n",
" ('14 thousand', 'CARDINAL'),\n",
" ('1 and a half million', 'CARDINAL'),\n",
" ('Paddy', 'PERSON'),\n",
" ('Paddy', 'LOC'),\n",
" ('the next day', 'DATE'),\n",
" ('Hussein', 'PERSON'),\n",
" ('Ted', 'PERSON'),\n",
" ('Saddam', 'PERSON'),\n",
" ('a minute', 'TIME'),\n",
" ('Paddy', 'PERSON'),\n",
" ('500', 'CARDINAL'),\n",
" ('19', 'CARDINAL'),\n",
" ('2 million', 'CARDINAL'),\n",
" ('Paddy', 'PERSON'),\n",
" ('Paddy', 'LOC'),\n",
" ('the next day', 'DATE'),\n",
" ('Hussein', 'PERSON'),\n",
" ('Saddam', 'PERSON'),\n",
" ('Paddy', 'PERSON'),\n",
" ('2 million', 'CARDINAL')]"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ner(wocka['body'][13])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Word Embeddings: word2vec, GloVe"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import nltk\n",
"import gensim\n",
"from gensim import corpora, models, similarities"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"# All titles first, then all joke bodies, as one flat list of texts.\n",
"corpus = [*wocka['title'].values.tolist(), *wocka['body'].values.tolist()]"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# Tokenize every text of the corpus into a list of word tokens.\n",
"tok_corp = list(map(nltk.word_tokenize, corpus))"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['Cow', 'With', 'No', 'Legs'],\n",
" ['Jumping', 'Cow'],\n",
" ['Black', ',', 'White', 'and', 'Red'],\n",
" ['Guy', 'in', 'a', 'Bar'],\n",
" ['Progress']]"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tok_corp[:5]"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"model = gensim.models.Word2Vec(tok_corp, min_count=1, size=32)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('guy', 0.9541391134262085),\n",
" ('woman', 0.9270364046096802),\n",
" ('boy', 0.9067033529281616),\n",
" ('doctor', 0.9010869264602661),\n",
" ('priest', 0.881646990776062),\n",
" ('girl', 0.8726887702941895),\n",
" ('lady', 0.8652868270874023),\n",
" ('lawyer', 0.8572322130203247),\n",
" ('farmer', 0.8566977977752686),\n",
" ('bartender', 0.848757266998291)]"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.wv.most_similar('man'.lower())"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9270364"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.wv.similarity('man', 'woman')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example: Joke Lines and Length"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"def split_punchline(joke):\n",
"    \"\"\"Return the number of segments in a joke when split on CRLF line breaks.\n",
"\n",
"    A blank line (two consecutive CRLF pairs) counts as a single separator.\n",
"    \"\"\"\n",
"    segments = re.split('\\r\\n\\r\\n|\\r\\n', joke)\n",
"    return len(segments)\n",
"\n",
"def character_count(joke):\n",
"    \"\"\"Return the length of the joke text in characters.\"\"\"\n",
"    return len(joke)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"# Per-joke features derived from the raw body text.\n",
"wocka['count_lines'] = wocka['body'].apply(split_punchline)\n",
"wocka['joke_length'] = wocka['body'].apply(character_count)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"# Download the file https://github.com/zacanger/profane-words/blob/master/words.json"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"# Read the profanity word list. The encoding is pinned so the result does not\n",
"# depend on the platform's default locale encoding.\n",
"with open('words.json', encoding='utf-8') as json_file:\n",
"    badwords_list = json.load(json_file)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"def detect_bad_words(list):\n",
"    \"\"\"Report whether any word of the given sequence is in badwords_list.\n",
"\n",
"    Args:\n",
"        list: sequence of word strings. The name shadows the builtin and is\n",
"            kept only so that existing callers keep working.\n",
"\n",
"    Returns:\n",
"        True if at least one word is found in the module-level\n",
"        badwords_list, otherwise False.\n",
"    \"\"\"\n",
"    # any() stops at the first hit instead of filtering the whole sequence.\n",
"    return any(word in badwords_list for word in list)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"# Flag jokes whose stop-word-free tokens include an entry of the profanity list.\n",
"wocka['dirty'] = wocka['body_no_stop'].apply(detect_bad_words)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" <th>body_no_stop</th>\n",
" <th>body_no_stop_stemmed</th>\n",
" <th>body_no_stop_lemma</th>\n",
" <th>bigrams</th>\n",
" <th>trigrams</th>\n",
" <th>parts_of_speech</th>\n",
" <th>count_lines</th>\n",
" <th>joke_length</th>\n",
" <th>dirty</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[call, cow, leg, ground, beef]</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" <td>[(what, PRON), (do, AUX), (you, PRON), (call, ...</td>\n",
" <td>2</td>\n",
" <td>52</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[call, cow, jump, barb, wire, fenc, utter, des...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" <td>[(what, PRON), (do, AUX), (you, PRON), (call, ...</td>\n",
" <td>2</td>\n",
" <td>78</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" <td>[whats, black, white, red, newspaper]</td>\n",
" <td>[what, black, white, red, newspap]</td>\n",
" <td>[what, s, black, white, red, newspaper]</td>\n",
" <td>[(what, s), (s, black), (black, and), (and, wh...</td>\n",
" <td>[(what, s, black), (s, black, and), (black, an...</td>\n",
" <td>[(what, PRON), (s, VERB), (black, ADJ), (and, ...</td>\n",
" <td>2</td>\n",
" <td>56</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" <td>[guy, walks, bar, says, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[(so, this), (this, guy), (guy, walks), (walks...</td>\n",
" <td>[(so, this, guy), (this, guy, walks), (guy, wa...</td>\n",
" <td>[(so, ADV), (this, DET), (guy, NOUN), (walks, ...</td>\n",
" <td>2</td>\n",
" <td>51</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" <td>[opposite, pro, con, isnt, opposite, progress,...</td>\n",
" <td>[opposit, pro, con, isnt, opposit, progress, c...</td>\n",
" <td>[opposite, pro, con, be, not, opposite, progre...</td>\n",
" <td>[(if, the), (the, opposite), (opposite, of), (...</td>\n",
" <td>[(if, the, opposite), (the, opposite, of), (op...</td>\n",
" <td>[(if, SCONJ), (the, DET), (opposite, NOUN), (o...</td>\n",
" <td>1</td>\n",
" <td>72</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \\\n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... \n",
"\n",
" body_no_stop \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [whats, black, white, red, newspaper] \n",
"3 [guy, walks, bar, says, ouch] \n",
"4 [opposite, pro, con, isnt, opposite, progress,... \n",
"\n",
" body_no_stop_stemmed \\\n",
"0 [call, cow, leg, ground, beef] \n",
"1 [call, cow, jump, barb, wire, fenc, utter, des... \n",
"2 [what, black, white, red, newspap] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposit, pro, con, isnt, opposit, progress, c... \n",
"\n",
" body_no_stop_lemma \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [what, s, black, white, red, newspaper] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposite, pro, con, be, not, opposite, progre... \n",
"\n",
" bigrams \\\n",
"0 [(what, do), (do, you), (you, call), (call, a)... \n",
"1 [(what, do), (do, you), (you, call), (call, a)... \n",
"2 [(what, s), (s, black), (black, and), (and, wh... \n",
"3 [(so, this), (this, guy), (guy, walks), (walks... \n",
"4 [(if, the), (the, opposite), (opposite, of), (... \n",
"\n",
" trigrams \\\n",
"0 [(what, do, you), (do, you, call), (you, call,... \n",
"1 [(what, do, you), (do, you, call), (you, call,... \n",
"2 [(what, s, black), (s, black, and), (black, an... \n",
"3 [(so, this, guy), (this, guy, walks), (guy, wa... \n",
"4 [(if, the, opposite), (the, opposite, of), (op... \n",
"\n",
" parts_of_speech count_lines \\\n",
"0 [(what, PRON), (do, AUX), (you, PRON), (call, ... 2 \n",
"1 [(what, PRON), (do, AUX), (you, PRON), (call, ... 2 \n",
"2 [(what, PRON), (s, VERB), (black, ADJ), (and, ... 2 \n",
"3 [(so, ADV), (this, DET), (guy, NOUN), (walks, ... 2 \n",
"4 [(if, SCONJ), (the, DET), (opposite, NOUN), (o... 1 \n",
"\n",
" joke_length dirty \n",
"0 52 False \n",
"1 78 False \n",
"2 56 False \n",
"3 51 False \n",
"4 72 False "
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"bins = np.linspace(0, 200, 40)\n",
"pyplot.hist(wocka['joke_length'], bins)\n",
"pyplot.title('Joke Length Distribution')\n",
"pyplot.show()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"bins = np.linspace(0, 200, 40)\n",
"pyplot.hist(wocka['count_lines'], bins)\n",
"pyplot.title('Number of Joke Distribution')\n",
"pyplot.show()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYEAAAD8CAYAAACRkhiPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFCNJREFUeJzt3X+MZeV93/H3JxjTpI4LDmOX7C5ZHK2jYpQseISpLFtUpLDgxoubuF3+MJi4WtsB1VbyR7ErFdcWEm1jW6VJcNdhBVQ2mBi7bGKos6ZuUCSDGQjmh9eEBRMz3hU7MQkQEREt/vaPe4a9zN6ZvTv3Mnd2n/dLuppzn/Occ773zu79zPl1n1QVkqQ2/dSkC5AkTY4hIEkNMwQkqWGGgCQ1zBCQpIYZApLUMENAkhpmCEhSwwwBSWrYayZdwKGceOKJtX79+kmXIUlHjPvuu++vq2pqmL6rPgTWr1/PzMzMpMuQpCNGkr8atq+HgySpYYaAJDXMEJCkhhkCktQwQ0CSGmYISFLDDAFJapghIEkNMwQkqWGr/o7hUay/4uvLXvbJq989xkokaXVyT0CSGmYISFLDDAFJapghIEkNMwQkqWGGgCQ17JAhkGRdkm8l2ZXkkSQf7drfkGRnkse6nyd07UlyTZLdSR5Mckbfui7p+j+W5JJX72VJkoYxzJ7AfuB3quqfAWcBlyU5FbgCuLOqNgB3ds8Bzgc2dI+twLXQCw3gSuDtwJnAlfPBIUmajEOGQFXtrar7u+nngV3AGmAzcEPX7Qbgwm56M3Bj9dwNHJ/kJOA8YGdVPVNVfwPsBDaN9dVIkg7LYZ0TSLIeOB24B3hTVe2FXlAAb+y6rQGe6ltstmtbrF2SNCFDh0CS1wG3Ah+rqueW6jqgrZZoH7StrUlmkszMzc0NW6Ik6TANFQJJjqUXAF+sqq92zU93h3nofu7r2meBdX2LrwX2LNF+kKraVlXTVTU9NTU17GuRJB2mYa4OCnAdsKuqPts3awcwf4XPJcBtfe0Xd1cJnQU82x0u+gZwbpITuhPC53ZtkqQJGeZbRN8BvB94KMkDXdsngKuBW5J8EPgh8L5u3u3ABcBu4AXgUoCqeibJp4F7u36fqqpnxvIqJEnLcsgQqKo/Z/DxfIBzBvQv4LJF1rUd2H44BUqSXj3eMSxJDTuqB5U5Eo0yEA44GI6kw+OegCQ1zBCQpIYZApLUMENAkhpmCEhSwwwBSWqYISBJDTMEJKlhhoAkNcwQkKSGGQKS1DBDQJIaZghIUsMMAUlq2DDDS25Psi/Jw31tX07yQPd4cn7EsSTrk/x937zP9y3ztiQPJdmd5Jpu2EpJ0gQNM57A9cDvATfON1TVv52fTvIZ4Nm+/o9X1cYB67kW2ArcTW8Iyk3AHYdfsiRpXA65J1BVdwEDxwLu/pr/N8BNS60jyUnA66vq293wkzcCFx5+uZKkcRr1nMA7gaer6rG+tlOS/EWSP0vyzq5tDTDb12e2axsoydYkM0lm5ubmRixRkrSYUUPgIl65F7AXOLmqTgd+G/hSktczeKD6WmylVbWtqqaranpqamrEEiVJi1n2GMNJXgP8a+Bt821V9SLwYjd9X5LHgbfQ+8t/bd/ia4E9y922JGk8RtkT+FXg+1X18mGeJFNJjumm3wxsAJ6oqr3A80nO6s4jXAzcNsK2JUljMMwlojcB3wZ+Kclskg92s7Zw8AnhdwEPJvku8BXgw1U1f1L5I8AfAruBx/HKIEmauEMeDqqqixZp/8CAtluBWxfpPwOcdpj1SZJeRd4xLEkNMwQkqWGGgCQ1zBCQpIYZApLUMENAkhpmCEhSwwwBSWqYISBJDTMEJKlhhoAkNcwQkKSGGQKS1DBDQJIaZghIUsOGGVRme5J9SR7ua/tkkh8leaB7XNA37+NJdid5NMl5fe2burbdSa4Y/0uRJB2uYfYErgc2DWj/XFVt7B
63AyQ5ld6IY2/tlvmDJMd0Q07+PnA+cCpwUddXkjRBw4wsdleS9UOubzNwczfg/A+S7AbO7ObtrqonAJLc3PX93mFXLEkam1HOCVye5MHucNEJXdsa4Km+PrNd22LtkqQJWm4IXAv8IrAR2At8pmvPgL61RPtASbYmmUkyMzc3t8wSJUmHsqwQqKqnq+qlqvoJ8AUOHPKZBdb1dV0L7FmifbH1b6uq6aqanpqaWk6JkqQhLCsEkpzU9/S9wPyVQzuALUmOS3IKsAH4DnAvsCHJKUleS+/k8Y7lly1JGodDnhhOchNwNnBiklngSuDsJBvpHdJ5EvgQQFU9kuQWeid89wOXVdVL3XouB74BHANsr6pHxv5qJEmHZZirgy4a0HzdEv2vAq4a0H47cPthVSdJelV5x7AkNcwQkKSGGQKS1DBDQJIaZghIUsMMAUlqmCEgSQ0zBCSpYYaAJDXMEJCkhhkCktQwQ0CSGmYISFLDDAFJapghIEkNMwQkqWHDjCy2HfhXwL6qOq1r+2/ArwH/ADwOXFpVf5tkPbALeLRb/O6q+nC3zNuA64Gfpje4zEeratHB5qVhrb/i68te9smr3z3GSqQjzzB7AtcDmxa07QROq6pfBv4S+HjfvMeramP3+HBf+7XAVnrjDm8YsE5J0go7ZAhU1V3AMwva/rSq9ndP7wbWLrWObmD611fVt7u//m8ELlxeyZKkcRnHOYHfBO7oe35Kkr9I8mdJ3tm1rQFm+/rMdm0DJdmaZCbJzNzc3BhKlCQNMlIIJPmPwH7gi13TXuDkqjod+G3gS0leD2TA4oueD6iqbVU1XVXTU1NTo5QoSVrCIU8MLybJJfROGJ8zf4K3ql4EXuym70vyOPAWen/59x8yWgvsWe62JUnjsaw9gSSbgP8AvKeqXuhrn0pyTDf9ZnongJ+oqr3A80nOShLgYuC2kauXJI1kmEtEbwLOBk5MMgtcSe9qoOOAnb3P9JcvBX0X8Kkk+4GXgA9X1fxJ5Y9w4BLRO3jleQRJ0gQcMgSq6qIBzdct0vdW4NZF5s0Apx1WdZKkV5V3DEtSwwwBSWqYISBJDTMEJKlhhoAkNcwQkKSGGQKS1LBlf22Ejj5+L7/UHvcEJKlhhoAkNcwQkKSGGQKS1DBDQJIaZghIUsMMAUlq2FAhkGR7kn1JHu5re0OSnUke636e0LUnyTVJdid5MMkZfctc0vV/rBueUpI0QcPuCVwPbFrQdgVwZ1VtAO7sngOcT29YyQ3AVuBa6IUGvVHJ3g6cCVw5HxySpMkYKgSq6i7gmQXNm4EbuukbgAv72m+snruB45OcBJwH7KyqZ6rqb4CdHBwskqQVNMo5gTd1A8jT/Xxj174GeKqv32zXtli7JGlCXo0TwxnQVku0H7yCZGuSmSQzc3NzYy1OknTAKCHwdHeYh+7nvq59FljX128tsGeJ9oNU1baqmq6q6ampqRFKlCQtZZQQ2AHMX+FzCXBbX/vF3VVCZwHPdoeLvgGcm+SE7oTwuV2bJGlChvoq6SQ3AWcDJyaZpXeVz9XALUk+CPwQeF/X/XbgAmA38AJwKUBVPZPk08C9Xb9PVdXCk82SpBU0VAhU1UWLzDpnQN8CLltkPduB7UNXJ0l6VXnHsCQ1zJHFNHGjjGgmaTTuCUhSwwwBSWqYISBJDTMEJKlhhoAkNcwQkKSGGQKS1DBDQJIaZghIUsMMAUlqmCEgSQ0zBCSpYYaAJDXMEJCkhi07BJL8UpIH+h7PJflYkk8m+VFf+wV9y3w8ye4kjyY5bzwvQZK0XMseT6CqHgU2AiQ5BvgR8DV6w0l+rqp+t79/klOBLcBbgZ8HvpnkLVX10nJrkCSNZlyHg84BHq+qv1qiz2bg5qp6sap+QG8M4jPHtH1J0jKMKwS2ADf1Pb88yYNJtic5oWtbAzzV12e2aztIkq1JZpLMzM3NjalESdJCI4dAktcC7wH+qGu6FvhFeoeK9gKfme86YPEatM6q2l
ZV01U1PTU1NWqJkqRFjGNP4Hzg/qp6GqCqnq6ql6rqJ8AXOHDIZxZY17fcWmDPGLYvSVqmcYTARfQdCkpyUt+89wIPd9M7gC1JjktyCrAB+M4Yti9JWqZlXx0EkORngH8JfKiv+b8m2UjvUM+T8/Oq6pEktwDfA/YDl3llkCRN1kghUFUvAD+3oO39S/S/CrhqlG1KksbHO4YlqWGGgCQ1zBCQpIYZApLUMENAkho20tVB0pFu/RVfX/ayT1797jFWIk2GewKS1DBDQJIaZghIUsMMAUlqmCEgSQ0zBCSpYYaAJDXMEJCkhhkCktSwcYwx/GSSh5I8kGSma3tDkp1JHut+ntC1J8k1SXZ3A9GfMer2JUnLN649gX9RVRurarp7fgVwZ1VtAO7snkNvPOIN3WMrvUHpJUkT8modDtoM3NBN3wBc2Nd+Y/XcDRy/YExiSdIKGscXyBXwp0kK+J9VtQ14U1XtBaiqvUne2PVdAzzVt+xs17Z3DHVIRwy/uE6rxThC4B1Vtaf7oN+Z5PtL9M2AtjqoU7KV3uEiTj755DGUKEkaZOTDQVW1p/u5D/gacCbw9Pxhnu7nvq77LLCub/G1wJ4B69xWVdNVNT01NTVqiZKkRYwUAkn+cZKfnZ8GzgUeBnYAl3TdLgFu66Z3ABd3VwmdBTw7f9hIkrTyRj0c9Cbga0nm1/Wlqvo/Se4FbknyQeCHwPu6/rcDFwC7gReAS0fcviRpBCOFQFU9AfzKgPYfA+cMaC/gslG2Ka0Wo5zclVYL7xiWpIY5xrB0hBl1D8RLTNXPPQFJapghIEkNMwQkqWGGgCQ1zBCQpIYZApLUMENAkhpmCEhSwwwBSWqYISBJDTMEJKlhhoAkNcwQkKSGGQKS1LBlh0CSdUm+lWRXkkeSfLRr/2SSHyV5oHtc0LfMx5PsTvJokvPG8QIkScs3yngC+4Hfqar7u3GG70uys5v3uar63f7OSU4FtgBvBX4e+GaSt1TVSyPUIEkawbL3BKpqb1Xd300/D+wC1iyxyGbg5qp6sap+QG+c4TOXu31J0ujGck4gyXrgdOCerunyJA8m2Z7khK5tDfBU32KzLBIaSbYmmUkyMzc3N44SJUkDjDy8ZJLXAbcCH6uq55JcC3waqO7nZ4DfBDJg8Rq0zqraBmwDmJ6eHthH0vKMMjylQ1MefUbaE0hyLL0A+GJVfRWgqp6uqpeq6ifAFzhwyGcWWNe3+FpgzyjblySNZpSrgwJcB+yqqs/2tZ/U1+29wMPd9A5gS5LjkpwCbAC+s9ztS5JGN8rhoHcA7wceSvJA1/YJ4KIkG+kd6nkS+BBAVT2S5Bbge/SuLLrMK4MkabKWHQJV9ecMPs5/+xLLXAVctdxtSpLGyzuGJalhhoAkNWzkS0QltcPLS48+7glIUsMMAUlqmCEgSQ0zBCSpYYaAJDXMq4MkrQivLFqd3BOQpIYZApLUMENAkhpmCEhSwwwBSWqYISBJDVvxS0STbAL+O3AM8IdVdfVK1yDpyDLK5aXgJaZLWdE9gSTHAL8PnA+cSm8UslNXsgZJ0gErvSdwJrC7qp4ASHIzsJnekJOS9KrwRrXFrXQIrAGe6ns+C7x9hWuQpKEd7QGy0iEwaEziOqhTshXY2j39uySPLnN7JwJ/vZwF81+WucXxOuz6J1X3gO0u+71fJax/so6K+if4OfILw3Zc6RCYBdb1PV8L7FnYqaq2AdtG3ViSmaqaHnU9k3Ik138k1w7WP2nWv3JW+hLRe4ENSU5J8lpgC7BjhWuQJHVWdE+gqvYnuRz4Br1LRLdX1SMrWYMk6YAVv0+gqm4Hbl+hzY18SGnCjuT6j+TawfonzfpXSKoOOi8rSWqEXxshSQ07KkIgyaYkjybZneSKAfOPS/Llbv49SdavfJWDDVH7B5LMJXmge/y7SdS5mCTbk+xL8vAi85Pkmu71PZjkjJWucTFD1H52kmf73vv/tNI1LiXJuiTfSrIrySNJPjqgz2
p+/4epf1X+DpL8oyTfSfLdrvb/PKDPqv3ceYWqOqIf9E4wPw68GXgt8F3g1AV9fgv4fDe9BfjypOs+jNo/APzepGtd4jW8CzgDeHiR+RcAd9C7R+Qs4J5J13wYtZ8N/Mmk61yi/pOAM7rpnwX+csC/n9X8/g9T/6r8HXTv5+u66WOBe4CzFvRZlZ87Cx9Hw57Ay19FUVX/AMx/FUW/zcAN3fRXgHOSDLpxbaUNU/uqVlV3Ac8s0WUzcGP13A0cn+SklaluaUPUvqpV1d6qur+bfh7YRe+u/H6r+f0fpv5VqXs//657emz3WHiCdbV+7rzC0RACg76KYuE/pJf7VNV+4Fng51akuqUNUzvAr3e78l9Jsm7A/NVs2Ne4Wv3zbpf/jiRvnXQxi+kONZxO7y/SfkfE+79E/bBKfwdJjknyALAP2FlVi773q+xz5xWOhhAY5qsohvq6igkYpq4/BtZX1S8D3+TAXxZHitX63g/jfuAXqupXgP8B/O8J1zNQktcBtwIfq6rnFs4esMiqev8PUf+q/R1U1UtVtZHeNx+cmeS0BV1W/XsPR0cIDPNVFC/3SfIa4J+wOg4DHLL2qvpxVb3YPf0C8LYVqm1chvqqkNWoqp6b3+Wv3v0txyY5ccJlvUKSY+l9gH6xqr46oMuqfv8PVf+R8Duoqr8F/h+wacGs1fq58wpHQwgM81UUO4BLuunfAP5vdWdrJuyQtS84fvseesdNjyQ7gIu7q1TOAp6tqr2TLmoYSf7p/DHcJGfS+//y48lWdUBX23XArqr67CLdVu37P0z9q/V3kGQqyfHd9E8Dvwp8f0G31fq58worfsfwuNUiX0WR5FPATFXtoPcP7X8l2U0vibdMruIDhqz93yd5D7CfXu0fmFjBAyS5id4VHCcmmQWupHeSjKr6PL27wy8AdgMvAJdOptKDDVH7bwAfSbIf+Htgyyr7T/wO4P3AQ92xaYBPACfD6n//Ga7+1fo7OAm4Ib2Bsn4KuKWq/uRI+NxZyDuGJalhR8PhIEnSMhkCktQwQ0CSGmYISFLDDAFJapghIEkNMwQkqWGGgCQ17P8D2KYN9QmVDiQAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"transform = np.asarray(wocka[['count_lines']].values)\n",
"dtf = stats.boxcox(transform)[0]\n",
"pyplot.hist(dtf, bins=20)\n",
"pyplot.show()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"nlp = en_core_web_sm.load()"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"stopwords = nltk.corpus.stopwords.words('english')"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"def lemmatization(text):\n",
" text = re.sub(\"[0-9]*|\\_\", \"\", text)\n",
" tokens = re.split('\\W+', text)\n",
" text = [word.lower() for word in tokens]\n",
" lemma = \" \".join([word.lemma_ for word in nlp(\" \".join([word for word in text if word not in stopwords]))])\n",
" return lemma"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"wocka['wocka_no_stop_lemma'] = wocka['body'].apply(lambda x: lemmatization(x))"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"wocka['title_no_stop_lemma'] = wocka['title'].apply(lambda x: lemmatization(x))"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>tokenize</th>\n",
" <th>body_no_stop</th>\n",
" <th>body_no_stop_stemmed</th>\n",
" <th>body_no_stop_lemma</th>\n",
" <th>bigrams</th>\n",
" <th>trigrams</th>\n",
" <th>parts_of_speech</th>\n",
" <th>count_lines</th>\n",
" <th>joke_length</th>\n",
" <th>dirty</th>\n",
" <th>wocka_no_stop_lemma</th>\n",
" <th>title_no_stop_lemma</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" <td>[what, do, you, call, a, cow, with, no, legs, ...</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[call, cow, leg, ground, beef]</td>\n",
" <td>[call, cow, legs, ground, beef]</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" <td>[(what, PRON), (do, AUX), (you, PRON), (call, ...</td>\n",
" <td>2</td>\n",
" <td>52</td>\n",
" <td>False</td>\n",
" <td>call cow legs ground beef</td>\n",
" <td>cow legs</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" <td>[what, do, you, call, a, cow, jumping, over, a...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[call, cow, jump, barb, wire, fenc, utter, des...</td>\n",
" <td>[call, cow, jumping, barbed, wire, fence, utte...</td>\n",
" <td>[(what, do), (do, you), (you, call), (call, a)...</td>\n",
" <td>[(what, do, you), (do, you, call), (you, call,...</td>\n",
" <td>[(what, PRON), (do, AUX), (you, PRON), (call, ...</td>\n",
" <td>2</td>\n",
" <td>78</td>\n",
" <td>False</td>\n",
" <td>call cow jumping barbed wire fence utter destr...</td>\n",
" <td>jump cow</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" <td>[what, s, black, and, white, and, red, all, ov...</td>\n",
" <td>[whats, black, white, red, newspaper]</td>\n",
" <td>[what, black, white, red, newspap]</td>\n",
" <td>[what, s, black, white, red, newspaper]</td>\n",
" <td>[(what, s), (s, black), (black, and), (and, wh...</td>\n",
" <td>[(what, s, black), (s, black, and), (black, an...</td>\n",
" <td>[(what, PRON), (s, VERB), (black, ADJ), (and, ...</td>\n",
" <td>2</td>\n",
" <td>56</td>\n",
" <td>False</td>\n",
" <td>black white red newspaper</td>\n",
" <td>black white red</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" <td>[so, this, guy, walks, into, a, bar, and, says...</td>\n",
" <td>[guy, walks, bar, says, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[guy, walk, bar, say, ouch]</td>\n",
" <td>[(so, this), (this, guy), (guy, walks), (walks...</td>\n",
" <td>[(so, this, guy), (this, guy, walks), (guy, wa...</td>\n",
" <td>[(so, ADV), (this, DET), (guy, NOUN), (walks, ...</td>\n",
" <td>2</td>\n",
" <td>51</td>\n",
" <td>False</td>\n",
" <td>guy walk bar say ouch</td>\n",
" <td>guy bar</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" <td>[if, the, opposite, of, pro, is, con, isn, t, ...</td>\n",
" <td>[opposite, pro, con, isnt, opposite, progress,...</td>\n",
" <td>[opposit, pro, con, isnt, opposit, progress, c...</td>\n",
" <td>[opposite, pro, con, be, not, opposite, progre...</td>\n",
" <td>[(if, the), (the, opposite), (opposite, of), (...</td>\n",
" <td>[(if, the, opposite), (the, opposite, of), (op...</td>\n",
" <td>[(if, SCONJ), (the, DET), (opposite, NOUN), (o...</td>\n",
" <td>1</td>\n",
" <td>72</td>\n",
" <td>False</td>\n",
" <td>opposite pro con opposite progress congress</td>\n",
" <td>progress</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title tokenize \\\n",
"0 Cow With No Legs [what, do, you, call, a, cow, with, no, legs, ... \n",
"1 Jumping Cow [what, do, you, call, a, cow, jumping, over, a... \n",
"2 Black, White and Red [what, s, black, and, white, and, red, all, ov... \n",
"3 Guy in a Bar [so, this, guy, walks, into, a, bar, and, says... \n",
"4 Progress [if, the, opposite, of, pro, is, con, isn, t, ... \n",
"\n",
" body_no_stop \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [whats, black, white, red, newspaper] \n",
"3 [guy, walks, bar, says, ouch] \n",
"4 [opposite, pro, con, isnt, opposite, progress,... \n",
"\n",
" body_no_stop_stemmed \\\n",
"0 [call, cow, leg, ground, beef] \n",
"1 [call, cow, jump, barb, wire, fenc, utter, des... \n",
"2 [what, black, white, red, newspap] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposit, pro, con, isnt, opposit, progress, c... \n",
"\n",
" body_no_stop_lemma \\\n",
"0 [call, cow, legs, ground, beef] \n",
"1 [call, cow, jumping, barbed, wire, fence, utte... \n",
"2 [what, s, black, white, red, newspaper] \n",
"3 [guy, walk, bar, say, ouch] \n",
"4 [opposite, pro, con, be, not, opposite, progre... \n",
"\n",
" bigrams \\\n",
"0 [(what, do), (do, you), (you, call), (call, a)... \n",
"1 [(what, do), (do, you), (you, call), (call, a)... \n",
"2 [(what, s), (s, black), (black, and), (and, wh... \n",
"3 [(so, this), (this, guy), (guy, walks), (walks... \n",
"4 [(if, the), (the, opposite), (opposite, of), (... \n",
"\n",
" trigrams \\\n",
"0 [(what, do, you), (do, you, call), (you, call,... \n",
"1 [(what, do, you), (do, you, call), (you, call,... \n",
"2 [(what, s, black), (s, black, and), (black, an... \n",
"3 [(so, this, guy), (this, guy, walks), (guy, wa... \n",
"4 [(if, the, opposite), (the, opposite, of), (op... \n",
"\n",
" parts_of_speech count_lines \\\n",
"0 [(what, PRON), (do, AUX), (you, PRON), (call, ... 2 \n",
"1 [(what, PRON), (do, AUX), (you, PRON), (call, ... 2 \n",
"2 [(what, PRON), (s, VERB), (black, ADJ), (and, ... 2 \n",
"3 [(so, ADV), (this, DET), (guy, NOUN), (walks, ... 2 \n",
"4 [(if, SCONJ), (the, DET), (opposite, NOUN), (o... 1 \n",
"\n",
" joke_length dirty wocka_no_stop_lemma \\\n",
"0 52 False call cow legs ground beef \n",
"1 78 False call cow jumping barbed wire fence utter destr... \n",
"2 56 False black white red newspaper \n",
"3 51 False guy walk bar say ouch \n",
"4 72 False opposite pro con opposite progress congress \n",
"\n",
" title_no_stop_lemma \n",
"0 cow legs \n",
"1 jump cow \n",
"2 black white red \n",
"3 guy bar \n",
"4 progress "
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wocka.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Count Vectorizer"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>abdomen</th>\n",
" <th>abdominal</th>\n",
" <th>abhor</th>\n",
" <th>abraham</th>\n",
" <th>accent</th>\n",
" <th>accident</th>\n",
" <th>account</th>\n",
" <th>achieve</th>\n",
" <th>acid</th>\n",
" <th>acquire</th>\n",
" <th>...</th>\n",
" <th>yes</th>\n",
" <th>yet</th>\n",
" <th>yew</th>\n",
" <th>yield</th>\n",
" <th>yo</th>\n",
" <th>yokels</th>\n",
" <th>york</th>\n",
" <th>young</th>\n",
" <th>yup</th>\n",
" <th>zealand</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 2175 columns</p>\n",
"</div>"
],
"text/plain": [
" abdomen abdominal abhor abraham accent accident account achieve \\\n",
"0 0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 0 0 0 \n",
"3 0 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 0 0 \n",
"\n",
" acid acquire ... yes yet yew yield yo yokels york young yup \\\n",
"0 0 0 ... 0 0 0 0 0 0 0 0 0 \n",
"1 0 0 ... 0 0 0 0 0 0 0 0 0 \n",
"2 0 0 ... 0 0 0 0 0 0 0 0 0 \n",
"3 0 0 ... 0 0 0 0 0 0 0 0 0 \n",
"4 0 0 ... 0 0 0 0 0 0 0 0 0 \n",
"\n",
" zealand \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
"[5 rows x 2175 columns]"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"count_vect = CountVectorizer()\n",
"X_counts = count_vect.fit_transform(wocka['wocka_no_stop_lemma'][0:100])\n",
"X_counts_df = pd.DataFrame(X_counts.toarray())\n",
"X_counts_df.columns = count_vect.get_feature_names()\n",
"X_counts_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## N-Gram Vectorizer"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"ngram_vector = CountVectorizer(ngram_range=(2,3))\n",
"X_counts = ngram_vector.fit_transform(wocka['wocka_no_stop_lemma'][0:100])\n",
"X_counts_df = pd.DataFrame(X_counts.toarray())\n",
"X_counts_df.columns = ngram_vector.get_feature_names()\n",
"# X_counts_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TF-IDF Vectorizer"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"tfidf_vector = TfidfVectorizer()"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>abdomen</th>\n",
" <th>abdominal</th>\n",
" <th>abhor</th>\n",
" <th>abraham</th>\n",
" <th>accent</th>\n",
" <th>accident</th>\n",
" <th>account</th>\n",
" <th>achieve</th>\n",
" <th>acid</th>\n",
" <th>acquire</th>\n",
" <th>...</th>\n",
" <th>yes</th>\n",
" <th>yet</th>\n",
" <th>yew</th>\n",
" <th>yield</th>\n",
" <th>yo</th>\n",
" <th>yokels</th>\n",
" <th>york</th>\n",
" <th>young</th>\n",
" <th>yup</th>\n",
" <th>zealand</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>0.065774</td>\n",
" <td>0.131548</td>\n",
" <td>0.065774</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>86</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.027508</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.025242</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>89</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.088984</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.088896</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.043038</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.129113</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.059502</td>\n",
" <td>0.054600</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.232210</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.103903</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.048936</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.095581</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.106920</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>16 rows × 2175 columns</p>\n",
"</div>"
],
"text/plain": [
" abdomen abdominal abhor abraham accent accident account \\\n",
"84 0.065774 0.131548 0.065774 0.000000 0.0 0.000000 0.0 \n",
"85 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"86 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"87 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"88 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"89 0.000000 0.000000 0.000000 0.088984 0.0 0.000000 0.0 \n",
"90 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"91 0.000000 0.000000 0.000000 0.000000 0.0 0.043038 0.0 \n",
"92 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"93 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"94 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"95 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"96 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"97 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"98 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"99 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.0 \n",
"\n",
" achieve acid acquire ... yes yet yew yield yo \\\n",
"84 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"85 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"86 0.000000 0.000000 0.0 ... 0.000000 0.0 0.027508 0.0 0.0 \n",
"87 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"88 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"89 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"90 0.088896 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"91 0.000000 0.129113 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"92 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"93 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"94 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"95 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"96 0.000000 0.000000 0.0 ... 0.000000 0.0 0.000000 0.0 0.0 \n",
"97 0.000000 0.000000 0.0 ... 0.232210 0.0 0.000000 0.0 0.0 \n",
"98 0.000000 0.000000 0.0 ... 0.048936 0.0 0.000000 0.0 0.0 \n",
"99 0.000000 0.000000 0.0 ... 0.095581 0.0 0.000000 0.0 0.0 \n",
"\n",
" yokels york young yup zealand \n",
"84 0.000000 0.000000 0.000000 0.0 0.0 \n",
"85 0.000000 0.000000 0.000000 0.0 0.0 \n",
"86 0.000000 0.025242 0.000000 0.0 0.0 \n",
"87 0.000000 0.000000 0.000000 0.0 0.0 \n",
"88 0.000000 0.000000 0.000000 0.0 0.0 \n",
"89 0.000000 0.000000 0.000000 0.0 0.0 \n",
"90 0.000000 0.000000 0.000000 0.0 0.0 \n",
"91 0.000000 0.000000 0.000000 0.0 0.0 \n",
"92 0.000000 0.000000 0.000000 0.0 0.0 \n",
"93 0.000000 0.000000 0.000000 0.0 0.0 \n",
"94 0.000000 0.000000 0.000000 0.0 0.0 \n",
"95 0.000000 0.000000 0.000000 0.0 0.0 \n",
"96 0.059502 0.054600 0.000000 0.0 0.0 \n",
"97 0.000000 0.000000 0.103903 0.0 0.0 \n",
"98 0.000000 0.000000 0.000000 0.0 0.0 \n",
"99 0.000000 0.000000 0.106920 0.0 0.0 \n",
"\n",
"[16 rows x 2175 columns]"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the vectorizer on the first 100 lemmatized wocka jokes and show rows 84 onward.\n",
"X_tfidf = tfidf_vector.fit_transform(wocka['wocka_no_stop_lemma'][0:100])\n",
"# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available\n",
"# and fall back to the old name on older installs.\n",
"if hasattr(tfidf_vector, 'get_feature_names_out'):\n",
"    feature_names = tfidf_vector.get_feature_names_out()\n",
"else:\n",
"    feature_names = tfidf_vector.get_feature_names()\n",
"X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)\n",
"X_tfidf_df[84:]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Basic Machine Learning"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"# Read the Reddit jokes dump. The encoding is given explicitly so the result does not\n",
"# depend on the platform's default locale encoding.\n",
"with open('reddit_jokes.json', 'r', encoding='utf-8') as json_file:\n",
"    data_reddit = json.load(json_file)\n",
"\n",
"# Work with the first 10,000 rows only.\n",
"reddit = pd.DataFrame.from_dict(data_reddit)\n",
"reddit = reddit[:10000]"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
"# Lower-case every joke body and pass it through tokenize_words.\n",
"reddit['tokenize'] = reddit['body'].apply(lambda body: tokenize_words(body.lower()))"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"# Lower-case every joke body and pass it through remove_stop_words.\n",
"reddit['body_no_stop'] = reddit['body'].apply(lambda body: remove_stop_words(body.lower()))"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"# Apply the stemming helper to the stop-word-filtered bodies.\n",
"reddit['body_no_stop_stemmed'] = reddit['body_no_stop'].apply(stemming)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"# Per-joke features computed from the raw body text by the two helper functions.\n",
"reddit['count_lines'] = reddit['body'].apply(split_punchline)\n",
"reddit['joke_length'] = reddit['body'].apply(character_count)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"# Run detect_bad_words over the stop-word-filtered bodies and store the result per joke.\n",
"reddit['dirty'] = reddit['body_no_stop'].apply(detect_bad_words)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"# Lemmatize the joke bodies.\n",
"# NOTE(review): the input is the raw 'body' column, not 'body_no_stop', although the target\n",
"# name suggests stop words are removed - confirm that lemmatization() handles this itself.\n",
"reddit['body_no_stop_lemma'] = reddit['body'].apply(lemmatization)"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"# Lemmatize the joke titles with the same helper that is used for the bodies.\n",
"reddit['title_no_stop_lemma'] = reddit['title'].apply(lemmatization)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# Fresh vectorizer fitted on the lemmatized joke bodies; the sparse result is\n",
"# densified into a DataFrame whose columns are the positional feature indices.\n",
"tfidf_vector = TfidfVectorizer()\n",
"X_tfidf = tfidf_vector.fit_transform(reddit['body_no_stop_lemma'])\n",
"\n",
"body_matrix = X_tfidf.toarray()\n",
"X_tfidf_df = pd.DataFrame(body_matrix)"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
"# TF-IDF features for the lemmatized titles. This refits tfidf_vector, so after this cell\n",
"# the vectorizer holds the title vocabulary rather than the body vocabulary.\n",
"X_tfidf_title = tfidf_vector.fit_transform(reddit['title_no_stop_lemma'])\n",
"# Build the frame from the title matrix. Using X_tfidf here would silently duplicate the\n",
"# body features instead of adding the title features.\n",
"X_tfidf_df_title = pd.DataFrame(X_tfidf_title.toarray())"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"# Assemble the model inputs column-wise: the three hand-crafted features first,\n",
"# followed by the body TF-IDF columns and then the title TF-IDF columns.\n",
"X_features = pd.concat(\n",
"    [reddit[['dirty', 'joke_length', 'count_lines']], X_tfidf_df, X_tfidf_df_title],\n",
"    axis=1,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>dirty</th>\n",
" <th>joke_length</th>\n",
" <th>count_lines</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>...</th>\n",
" <th>14397</th>\n",
" <th>14398</th>\n",
" <th>14399</th>\n",
" <th>14400</th>\n",
" <th>14401</th>\n",
" <th>14402</th>\n",
" <th>14403</th>\n",
" <th>14404</th>\n",
" <th>14405</th>\n",
" <th>14406</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>57</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>65</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>174</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>False</td>\n",
" <td>528</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>57</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 28817 columns</p>\n",
"</div>"
],
"text/plain": [
" dirty joke_length count_lines 0 1 2 3 4 5 6 ... \\\n",
"0 False 57 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n",
"1 False 65 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n",
"2 False 174 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n",
"3 False 528 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n",
"4 False 57 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n",
"\n",
" 14397 14398 14399 14400 14401 14402 14403 14404 14405 14406 \n",
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
"[5 rows x 28817 columns]"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the combined feature matrix.\n",
"X_features.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Random forest"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import KFold, cross_val_score"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [],
"source": [
"# Seed the forest so that repeated runs of the notebook produce the same scores;\n",
"# n_jobs=-1 lets it use every available core.\n",
"rf = RandomForestClassifier(n_jobs=-1, random_state=42)\n",
"# Five folds taken in row order (KFold does not shuffle unless asked to).\n",
"k_fold = KFold(n_splits=5)"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([0.1305, 0.14 , 0.1215, 0.148 , 0.164 ])"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cross-validated accuracy of the random forest, one value per fold of k_fold.\n",
"# NOTE(review): the label is the raw reddit 'score' column; if that is a numeric vote count,\n",
"# every distinct value becomes its own class - confirm a classifier is what is intended.\n",
"cross_val_score(rf, X_features, y=reddit['score'], scoring='accuracy', cv=k_fold)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment