Skip to content

Instantly share code, notes, and snippets.

@drpedrazas
Created February 11, 2024 01:21
Show Gist options
  • Save drpedrazas/dbe4dfffef01c88d91dc52685754f67f to your computer and use it in GitHub Desktop.
Save drpedrazas/dbe4dfffef01c88d91dc52685754f67f to your computer and use it in GitHub Desktop.
Naive Bayes - Vanilla
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "3691532f",
"metadata": {},
"source": [
"# Naive Bayes for IMDB reviews"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "654d57c7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /home/david/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Importación de librerías, no olviden ejecutar esta celda.\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
"import matplotlib.pyplot as plt\n",
"from nltk.corpus import stopwords\n",
"import seaborn as sns\n",
"import re\n",
"import nltk \n",
"import numpy as np\n",
"import pandas as pd\n",
"# IMDB reviews are written in English, so stem with the English Snowball stemmer;\n",
"# the Spanish stemmer mangles English words and leaves English suffixes in place.\n",
"stemmer = nltk.stem.SnowballStemmer('english')\n",
"# Fetch the NLTK stop-word corpus needed by stopwords.words().\n",
"nltk.download('stopwords')"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "716828a9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>7217</th>\n",
" <td>I kept waiting for the film to move me, inspir...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40682</th>\n",
" <td>Well, Anne is way way too old. Wentworth looks...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48836</th>\n",
" <td>As a \"rebuttle\" of sorts to the AFI's top 100 ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35207</th>\n",
" <td>I like the movie. Twisted Desire had Jeremy Jo...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45407</th>\n",
" <td>Getting lost in space frozen for 15 years, tha...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34375</th>\n",
" <td>I watched this show and i simply didn't find i...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35057</th>\n",
" <td>This is a very intriguing short movie by David...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8554</th>\n",
" <td>\"The Cellar\" is an intolerably dull and overly...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22387</th>\n",
" <td>Dr. Paul Flanner (Richard Gere), a successful ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33915</th>\n",
" <td>I believe that war films should try to convey ...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2000 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" review sentiment\n",
"7217 I kept waiting for the film to move me, inspir... negative\n",
"40682 Well, Anne is way way too old. Wentworth looks... negative\n",
"48836 As a \"rebuttle\" of sorts to the AFI's top 100 ... positive\n",
"35207 I like the movie. Twisted Desire had Jeremy Jo... positive\n",
"45407 Getting lost in space frozen for 15 years, tha... negative\n",
"... ... ...\n",
"34375 I watched this show and i simply didn't find i... negative\n",
"35057 This is a very intriguing short movie by David... positive\n",
"8554 \"The Cellar\" is an intolerably dull and overly... negative\n",
"22387 Dr. Paul Flanner (Richard Gere), a successful ... positive\n",
"33915 I believe that war films should try to convey ... negative\n",
"\n",
"[2000 rows x 2 columns]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# NOTE(review): absolute, machine-specific path; adjust it to wherever imdb.csv lives locally.\n",
"# A fixed random_state makes the 2000-review subsample (and every result derived from it) repeatable.\n",
"original_dataset = pd.read_csv('/home/david/Downloads/imdb/imdb.csv').sample(n=2000, random_state=55)\n",
"original_dataset"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "20b45fbe",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"Text(-2.700000000000003, 0.5, 'Número de Ejemplos')"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 360x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Bar chart of how many sampled reviews fall in each sentiment class.\n",
"sns.catplot(\n",
"    data=original_dataset,\n",
"    x='sentiment',\n",
"    kind='count',\n",
"    color='r',\n",
")\n",
"# Label the figure; ylabel stays last so the cell's displayed value is unchanged.\n",
"plt.xlabel('Sentiment')\n",
"plt.title('Distribución de Ejemplos')\n",
"plt.ylabel('Número de Ejemplos')"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "26eb3c40",
"metadata": {},
"outputs": [],
"source": [
"def processing_text(texto):\n",
"    \"\"\"Normalize a raw review into a lowercase, space-separated string of stems.\n",
"\n",
"    Steps, in order: replace every non-word character with a space, drop\n",
"    isolated single letters, drop digit runs, collapse repeated spaces,\n",
"    lowercase, and stem each whitespace-separated token with the\n",
"    module-level ``stemmer``.\n",
"\n",
"    Args:\n",
"        texto: The review text; it is passed through ``str()`` first, so\n",
"            non-string values are accepted.\n",
"\n",
"    Returns:\n",
"        The processed text as one string with tokens joined by single spaces.\n",
"    \"\"\"\n",
"    processed_feature = re.sub(r'\\W', ' ', str(texto))\n",
"    # Drop single letters that are surrounded by whitespace.\n",
"    processed_feature = re.sub(r'\\s+[a-zA-Z]\\s+', ' ', processed_feature)\n",
"    # Drop a single letter at the very start of the text. The anchor must be an\n",
"    # unescaped '^': the earlier pattern r'\\^...' looked for a literal caret,\n",
"    # which can never be present after the \\W substitution above.\n",
"    processed_feature = re.sub(r'^[a-zA-Z]\\s+', ' ', processed_feature)\n",
"    processed_feature = re.sub(r'[0-9]+', ' ', processed_feature)\n",
"    processed_feature = re.sub(' +', ' ', processed_feature)\n",
"    processed_feature = processed_feature.lower()\n",
"    processed_feature = \" \".join([stemmer.stem(i) for i in processed_feature.split()])\n",
"    return processed_feature"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "09fdea43",
"metadata": {},
"outputs": [],
"source": [
"# Pull the raw review texts (model input) and sentiment labels (target) out as arrays.\n",
"text_for_processing = original_dataset['review'].values\n",
"labels = original_dataset['sentiment'].values"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "66b443e8",
"metadata": {},
"outputs": [],
"source": [
"# Element-wise wrapper so processing_text can be applied to a whole array of reviews.\n",
"# The lambda keeps the lookup of processing_text deferred until call time.\n",
"processinc_text_vectorized = np.vectorize(lambda review: processing_text(review))"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "a2ac865a",
"metadata": {},
"outputs": [],
"source": [
"texto_procesado = processinc_text_vectorized(text_for_processing)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "3f704b3d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'i kept waiting for the film to mov me inspir me shock me sadd me in som way but it stirr non of my emotions it just meander along to the end non of the characters seem very uniqu or complex they just seem lik actors reciting the lin think it could hav been bett movi if the characters express mor emotion the only one who did and was believ was the veter and he probably committ suicid just to get out of the movi as soon as he could it was wast of talent film the tim and min if ther is messag or meaning or genius in this story it certainly is well hidd or am very dens which doubt'"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"texto_procesado[0]"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "49e5a22c",
"metadata": {},
"outputs": [],
"source": [
"# Bag-of-words counts, capped at 2000 features, with NLTK's English stop words removed.\n",
"# NOTE(review): the input is already stemmed while the stop-word list is not, so stop\n",
"# words whose stem differs from their dictionary form are not filtered - confirm intended.\n",
"vectorizer = CountVectorizer(\n",
"    max_features=2000,\n",
"    stop_words=stopwords.words('english'),\n",
")\n",
"# The vocabulary is fit on every processed review; .toarray() makes the sparse result dense.\n",
"text_features = vectorizer.fit_transform(texto_procesado).toarray()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "e9faffb1",
"metadata": {},
"outputs": [],
"source": [
"# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split repeatable.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    text_features,\n",
"    labels,\n",
"    test_size=0.2,\n",
"    random_state=55,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "95b87b18",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"MultinomialNB()"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nb = MultinomialNB()\n",
"nb.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "baa3f4a3",
"metadata": {},
"outputs": [],
"source": [
"# Classify the held-out reviews, then score the predictions against the true labels.\n",
"predictions = nb.predict(X_test)\n",
"accuracy_result = accuracy_score(y_test, predictions)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "fb4619bd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8275"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"accuracy_result"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "f179759b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Crítica test: This film was an absolute waste of my time and i doubt i will ever see this shameful display again\n",
"Crítica test: ['negative']\n"
]
}
],
"source": [
"test = \"This film was an absolute waste of my time and i doubt i will ever see this shameful display again\"\n",
"# Apply the same cleaning and the already-fitted vocabulary that were used for training.\n",
"test_procesado = processing_text(test)\n",
"test_bow = vectorizer.transform([test_procesado])\n",
"clase_test = nb.predict(test_bow)\n",
"print(\"Critique test:\", test)\n",
"# Label the model output as a prediction instead of repeating the review label.\n",
"print(\"Predicted sentiment:\", clase_test)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "16df5e6d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Crítica test: Extremely disturbing, just what I was looking for. Great performances but lackluster direction.\n",
"Crítica test: ['positive']\n"
]
}
],
"source": [
"test = \"Extremely disturbing, just what I was looking for. Great performances but lackluster direction.\"\n",
"# Apply the same cleaning and the already-fitted vocabulary that were used for training.\n",
"test_procesado = processing_text(test)\n",
"test_bow = vectorizer.transform([test_procesado])\n",
"clase_test = nb.predict(test_bow)\n",
"print(\"Critique test:\", test)\n",
"# Label the model output as a prediction instead of repeating the review label.\n",
"print(\"Predicted sentiment:\", clase_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08e356f2",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment