Skip to content

Instantly share code, notes, and snippets.

@sudhanshu456
Created August 15, 2020 17:40
Show Gist options
  • Save sudhanshu456/192a9d380339ce4152a101fa414e28df to your computer and use it in GitHub Desktop.
Save sudhanshu456/192a9d380339ce4152a101fa414e28df to your computer and use it in GitHub Desktop.
tcsrio_project.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
},
"colab": {
"name": "tcsrio_project.ipynb",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/sudhanshu456/192a9d380339ce4152a101fa414e28df/tcsrio_project.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sLMjaXaP6BdW",
"colab_type": "text"
},
"source": [
"# Browse Dir"
]
},
{
"cell_type": "code",
"metadata": {
"scrolled": true,
"id": "PPYyhJ006BdX",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "5a1003b2-d2f3-4d01-a490-6aba3f20dfce"
},
"source": [
"!ls"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"sample_data\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wU_5r4vM6Bdb",
"colab_type": "text"
},
"source": [
"# Load Library"
]
},
{
"cell_type": "code",
"metadata": {
"id": "u--o4F546Bdb",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 306
},
"outputId": "b684f3a8-959a-44cc-8789-ebcbe410203f"
},
"source": [
"!pip install emoji\n",
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import nltk\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"import inflect\n",
"import string\n",
"from textblob import TextBlob\n",
"import emoji\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from nltk.stem.lancaster import LancasterStemmer\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"import seaborn as sns\n",
"sns.set(font_scale=1.3)\n",
"import warnings\n",
"import collections\n",
"warnings.filterwarnings('ignore')\n",
"from collections import Counter"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"Collecting emoji\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/ff/1c/1f1457fe52d0b30cbeebfd578483cedb3e3619108d2d5a21380dfecf8ffd/emoji-0.6.0.tar.gz (51kB)\n",
"\r\u001b[K |██████▍ | 10kB 16.3MB/s eta 0:00:01\r\u001b[K |████████████▉ | 20kB 21.2MB/s eta 0:00:01\r\u001b[K |███████████████████▎ | 30kB 25.5MB/s eta 0:00:01\r\u001b[K |█████████████████████████▊ | 40kB 28.0MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 51kB 5.3MB/s \n",
"\u001b[?25hBuilding wheels for collected packages: emoji\n",
" Building wheel for emoji (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for emoji: filename=emoji-0.6.0-cp36-none-any.whl size=49714 sha256=d2711935abb6ca26b973f5c2de238873e29160f742c21057c6b3851f0960bbd1\n",
" Stored in directory: /root/.cache/pip/wheels/46/2c/8b/9dcf5216ca68e14e0320e283692dce8ae321cdc01e73e17796\n",
"Successfully built emoji\n",
"Installing collected packages: emoji\n",
"Successfully installed emoji-0.6.0\n",
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt.zip.\n",
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Unzipping corpora/stopwords.zip.\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n",
" import pandas.util.testing as tm\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "olwBfDqF6Bdd",
"colab_type": "text"
},
"source": [
"# Read Data"
]
},
{
"cell_type": "code",
"metadata": {
"id": "2x6c2Fq06kq9",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 121
},
"outputId": "a9fb5242-4a26-4a2c-be3f-1e9a4933c3d9"
},
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
"Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code\n",
"\n",
"Enter your authorization code:\n",
"··········\n",
"Mounted at /content/drive\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "bMfOI6j_6Bde",
"colab_type": "code",
"colab": {}
},
"source": [
"# dataset from https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews\n",
"#contain two rows positive and negative\n",
"data=pd.read_csv(\"/content/drive/My Drive/resources/IMDB Dataset.csv\")"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "Q66-oHxV6Bdg",
"colab_type": "text"
},
"source": [
"# Dataset Exploration "
]
},
{
"cell_type": "code",
"metadata": {
"scrolled": true,
"id": "dh-PG5876Bdg",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "8b8ac849-bf76-4abd-9ba7-b92f169f2a5e"
},
"source": [
"data[\"sentiment\"].unique()"
],
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array(['positive', 'negative'], dtype=object)"
]
},
"metadata": {
"tags": []
},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "EsEqGR936Bdi",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 166
},
"outputId": "3f92856b-5354-4d57-88ad-766747188cd7"
},
"source": [
"data.describe(include='all')\n"
],
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>50000</td>\n",
" <td>50000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>49582</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>Loved today's show!!! It was a variety and not...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>5</td>\n",
" <td>25000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" review sentiment\n",
"count 50000 50000\n",
"unique 49582 2\n",
"top Loved today's show!!! It was a variety and not... positive\n",
"freq 5 25000"
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
},
{
"cell_type": "code",
"metadata": {
"scrolled": true,
"id": "Kh450VvS6Bdk",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 336
},
"outputId": "006fab17-18d5-4630-96cd-eaf1c4b8fd6e"
},
"source": [
"data['sentiment'].value_counts().plot(kind='bar', rot=60)"
],
"execution_count": 7,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7fd5757d9cf8>"
]
},
"metadata": {
"tags": []
},
"execution_count": 7
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "code",
"metadata": {
"scrolled": true,
"id": "OFe0XO9o6Bdm",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 67
},
"outputId": "f7c5c06f-a03e-4b07-f57c-a4ccc5017978"
},
"source": [
"data.isnull().sum()\n"
],
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"review 0\n",
"sentiment 0\n",
"dtype: int64"
]
},
"metadata": {
"tags": []
},
"execution_count": 8
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZPj9r9IY6Bdp",
"colab_type": "text"
},
"source": [
"## Getting first review to see"
]
},
{
"cell_type": "code",
"metadata": {
"scrolled": true,
"id": "yrBhWm8e6Bdq",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 104
},
"outputId": "2706cbd6-51f9-44eb-a7e4-c3b225108eb9"
},
"source": [
"print(data[\"review\"][0])\n",
"print()\n",
"print(data[\"sentiment\"][0])\n",
"len(data[\"review\"][0])"
],
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"text": [
"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fact that it goes where other shows wouldn't dare. Forget pretty pictures painted for mainstream audiences, forget charm, forget romance...OZ doesn't mess around. The first episode I ever saw struck me as so nasty it was surreal, I couldn't say I was ready for it, but as I watched more, I developed a taste for Oz, and got accustomed to the high levels of graphic violence. Not just violence, but injustice (crooked guards who'll be sold out for a nickel, inmates who'll kill on order and get away with it, well mannered, middle class inmates being turned into prison bitches due to their lack of street skills or prison experience) Watching Oz, you may become comfortable with what is uncomfortable viewing....thats if you can get in touch with your darker side.\n",
"\n",
"positive\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"1761"
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "8lCjFC-l6Bds",
"colab_type": "text"
},
"source": [
"# Data Cleaning and Preprocessing"
]
},
{
"cell_type": "code",
"metadata": {
"id": "j8edNd7r6Bds",
"colab_type": "code",
"colab": {}
},
"source": [
"class CleanText(BaseEstimator, TransformerMixin):\n",
" \n",
" def emoji_oneword(self, input_text):\n",
" # By compressing the underscore, the emoji is kept as one word\n",
" return input_text.replace('_','')\n",
" \n",
" def remove_punctuation(self, input_text):\n",
" # Make translation table\n",
" punct = string.punctuation\n",
" trantab = str.maketrans(punct, len(punct)*' ') # Every punctuation symbol will be replaced by a space\n",
" return input_text.translate(trantab) \n",
" \n",
" def remove_html(self,input_text):\n",
" table = str.maketrans('', '', string.punctuation)\n",
" return input_text.replace(\"<br />\",\"\").translate(table)\n",
" \n",
" def number_to_text(self,input_text):\n",
" p = inflect.engine()\n",
" return ' '.join([str(p.number_to_words(((x)))) if x.isnumeric() else x for x in input_text.split()])\n",
" \n",
" \n",
" def to_lower(self, input_text):\n",
" return input_text.lower()\n",
" \n",
" def remove_stopwords(self, input_text):\n",
" stopwords_list = stopwords.words('english')\n",
" # Some words which might indicate a certain sentiment are kept via a whitelist\n",
" whitelist = [\"n't\", \"not\", \"no\"]\n",
" words = input_text.split() \n",
" clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] \n",
" return \" \".join(clean_words) \n",
" \n",
" def stemming(self, input_text):\n",
" stemmer = LancasterStemmer()\n",
" words = input_text.split() \n",
" stemmed_words = [stemmer.stem(word) for word in words]\n",
" return \" \".join(stemmed_words)\n",
" \n",
" def fit(self, X, y=None, **fit_params):\n",
" return self\n",
" \n",
" def transform(self, X, **transform_params):\n",
" clean_X = X.apply(self.emoji_oneword).apply(self.remove_html).apply(self.remove_punctuation).apply(self.number_to_text).apply(self.to_lower).apply(self.remove_stopwords)\n",
" return clean_X"
],
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "no1JWhO76Bdu",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "0c249801-8e9c-41af-89c9-afdb37b02a54"
},
"source": [
"#cleaning text & preprocessing \n",
"ct = CleanText()\n",
"sr_clean = ct.fit_transform(data.review)\n",
"#check is there any review with no text in it after the preprocessing \n",
"empty_clean = sr_clean == ''\n",
"print('{} records have no words left after text cleaning'.format(sr_clean[empty_clean].count()))\n",
"sr_clean.loc[empty_clean] = '[no_text]'"
],
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"text": [
"0 records have no words left after text cleaning\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IRSA3X3V6Bdw",
"colab_type": "text"
},
"source": [
"### Checking Most Common values"
]
},
{
"cell_type": "code",
"metadata": {
"scrolled": true,
"id": "rCxL0Z0g6Bdw",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 617
},
"outputId": "64e387e8-95c5-41d2-bae6-5cfbe4dca007"
},
"source": [
"#plot the bar char to see the most common from the features \n",
"cv = CountVectorizer()\n",
"bow = cv.fit_transform(sr_clean)\n",
"word_freq = dict(zip(cv.get_feature_names(), np.asarray(bow.sum(axis=0)).ravel()))\n",
"word_counter = collections.Counter(word_freq)\n",
"word_counter_df = pd.DataFrame(word_counter.most_common(20), columns = ['word', 'freq'])\n",
"fig, ax = plt.subplots(figsize=(12, 10))\n",
"sns.barplot(x=\"word\", y=\"freq\", data=word_counter_df, palette=\"PuBuGn_d\", ax=ax)\n",
"plt.show();"
],
"execution_count": 12,
"outputs": [
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 864x720 with 1 Axes>"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "v2QZHCJ86Bdy",
"colab_type": "text"
},
"source": [
"### Customizing dataset with three labels as netural, positive & negative"
]
},
{
"cell_type": "code",
"metadata": {
"id": "TMFJ05Ro6Bdz",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 402
},
"outputId": "dfea5550-c63c-4b89-c99f-8f41eece69c3"
},
"source": [
"temp=[]\n",
"for i in range(len(sr_clean)):\n",
" blob=TextBlob(sr_clean[i])\n",
" polarity=blob.sentiment.polarity\n",
" if polarity>0.005 and polarity <= 0.04:\n",
" temp.append(\"neutral\")\n",
" elif polarity > 0.04:\n",
" temp.append('positive')\n",
" else:\n",
" temp.append('negative')\n",
"\n",
"labels = pd.DataFrame({'sentiment_polarity':temp})\n",
"labels"
],
"execution_count": 13,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sentiment_polarity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>neutral</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49995</th>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49996</th>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49997</th>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49998</th>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49999</th>\n",
" <td>positive</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>50000 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" sentiment_polarity\n",
"0 neutral\n",
"1 positive\n",
"2 positive\n",
"3 negative\n",
"4 positive\n",
"... ...\n",
"49995 positive\n",
"49996 negative\n",
"49997 positive\n",
"49998 negative\n",
"49999 positive\n",
"\n",
"[50000 rows x 1 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 13
}
]
},
{
"cell_type": "code",
"metadata": {
"scrolled": false,
"id": "NdO5YjsJ6Bd0",
"colab_type": "code",
"colab": {}
},
"source": [
"data['clean_data']=sr_clean\n",
"data[\"sentiment_polarity\"]=labels"
],
"execution_count": 14,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "zYkIJi8F6Bd2",
"colab_type": "text"
},
"source": [
"### Data After the cleaning and feature engineering"
]
},
{
"cell_type": "code",
"metadata": {
"id": "s5P0dFIj6Bd2",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 343
},
"outputId": "7d162f11-f741-4676-df8c-e047cd62cffb"
},
"source": [
"data.head(10)"
],
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" <th>clean_data</th>\n",
" <th>sentiment_polarity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>One of the other reviewers has mentioned that ...</td>\n",
" <td>positive</td>\n",
" <td>one reviewers mentioned watching one oz episod...</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A wonderful little production. &lt;br /&gt;&lt;br /&gt;The...</td>\n",
" <td>positive</td>\n",
" <td>wonderful little production filming technique ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>I thought this was a wonderful way to spend ti...</td>\n",
" <td>positive</td>\n",
" <td>thought wonderful way spend time hot summer we...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Basically there's a family where a little boy ...</td>\n",
" <td>negative</td>\n",
" <td>basically theres family little boy jake thinks...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Petter Mattei's \"Love in the Time of Money\" is...</td>\n",
" <td>positive</td>\n",
" <td>petter matteis love time money visually stunni...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Probably my all-time favorite movie, a story o...</td>\n",
" <td>positive</td>\n",
" <td>probably alltime favorite movie story selfless...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>I sure would like to see a resurrection of a u...</td>\n",
" <td>positive</td>\n",
" <td>sure would like see resurrection dated seahunt...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>This show was an amazing, fresh &amp; innovative i...</td>\n",
" <td>negative</td>\n",
" <td>show amazing fresh innovative idea 70s first a...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Encouraged by the positive comments about this...</td>\n",
" <td>negative</td>\n",
" <td>encouraged positive comments film looking forw...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>If you like original gut wrenching laughter yo...</td>\n",
" <td>positive</td>\n",
" <td>like original gut wrenching laughter like movi...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" review ... sentiment_polarity\n",
"0 One of the other reviewers has mentioned that ... ... neutral\n",
"1 A wonderful little production. <br /><br />The... ... positive\n",
"2 I thought this was a wonderful way to spend ti... ... positive\n",
"3 Basically there's a family where a little boy ... ... negative\n",
"4 Petter Mattei's \"Love in the Time of Money\" is... ... positive\n",
"5 Probably my all-time favorite movie, a story o... ... positive\n",
"6 I sure would like to see a resurrection of a u... ... positive\n",
"7 This show was an amazing, fresh & innovative i... ... positive\n",
"8 Encouraged by the positive comments about this... ... negative\n",
"9 If you like original gut wrenching laughter yo... ... positive\n",
"\n",
"[10 rows x 4 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 15
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7452rV-m6Bd8",
"colab_type": "text"
},
"source": [
"Checking stopwords and how tokenization will happen "
]
},
{
"cell_type": "code",
"metadata": {
"id": "fbG9RX176Bd9",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 54
},
"outputId": "3981fbca-9adf-486f-8d36-fd235d361488"
},
"source": [
"stop_words = set(stopwords.words('english'))\n",
"print(stop_words)"
],
"execution_count": 89,
"outputs": [
{
"output_type": "stream",
"text": [
"{'on', \"shouldn't\", 'by', 'this', 'before', 'couldn', 'because', \"couldn't\", 'hadn', 'up', 'not', 'mustn', 'themselves', 'do', 'our', 'and', 'those', 'in', 'here', 'only', 'who', 'yourselves', 'does', 'off', 'doesn', 'through', \"mightn't\", \"weren't\", 'd', 'itself', 'theirs', 'so', 'hasn', 'mightn', \"needn't\", 'about', 'down', 'under', 'below', 'we', \"you'd\", 'what', 'didn', 'his', 'few', 'such', 'will', 'why', 'shouldn', 'ain', 'whom', 'now', 'her', 'i', 'if', 'most', 'your', 'am', 'above', 'again', 'no', 'll', 'yours', 'he', 'is', 'some', 'being', 'with', 'nor', 'all', 'for', 'are', 'were', 'any', 'myself', \"should've\", \"it's\", \"don't\", \"you're\", 'very', 'it', 'more', 'you', 'won', 'hers', 'isn', 'than', \"didn't\", 'which', 'their', 'when', \"won't\", 'm', 'just', \"aren't\", 't', 'wouldn', 'until', 'each', 'did', 'had', 'at', 'an', \"doesn't\", 'too', 're', \"hadn't\", 'haven', 'needn', 'between', 'o', 'doing', \"you've\", 'has', 'these', 'while', 'to', 'weren', \"wouldn't\", 'have', 'aren', 'wasn', 'or', 'out', 'y', 'having', 'that', 'once', 'him', 'them', 'don', 'further', \"mustn't\", 'me', 'yourself', 'into', 'its', 'they', 'was', 'after', 'ma', 'she', 's', 'a', \"isn't\", 'same', \"shan't\", 've', \"she's\", 'ourselves', 'as', 'where', 'should', 'over', \"that'll\", 'against', \"haven't\", 'the', 'during', \"you'll\", 'herself', 'own', 'shan', 'but', \"hasn't\", 'how', \"wasn't\", 'then', 'himself', 'ours', 'other', 'both', 'of', 'from', 'can', 'been', 'my', 'there', 'be'}\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "c3F9jskE6Bd_",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 54
},
"outputId": "32500c3d-d7bb-435a-bb92-688c06e50fc7"
},
"source": [
"word_tokens = word_tokenize(data[\"clean_data\"][0])\n",
"filteres_review= [w for w in word_tokens if not w in stop_words]\n",
"print(filteres_review)"
],
"execution_count": 90,
"outputs": [
{
"output_type": "stream",
"text": [
"['one', 'reviewers', 'mentioned', 'watching', 'one', 'oz', 'episode', 'youll', 'hooked', 'right', 'exactly', 'happened', 'methe', 'first', 'thing', 'struck', 'oz', 'brutality', 'unflinching', 'scenes', 'violence', 'set', 'right', 'word', 'go', 'trust', 'show', 'faint', 'hearted', 'timid', 'show', 'pulls', 'punches', 'regards', 'drugs', 'sex', 'violence', 'hardcore', 'classic', 'use', 'wordit', 'called', 'oz', 'nickname', 'given', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'focuses', 'mainly', 'emerald', 'city', 'experimental', 'section', 'prison', 'cells', 'glass', 'fronts', 'face', 'inwards', 'privacy', 'high', 'agenda', 'em', 'city', 'home', 'manyaryans', 'muslims', 'gangstas', 'latinos', 'christians', 'italians', 'irish', 'moreso', 'scuffles', 'death', 'stares', 'dodgy', 'dealings', 'shady', 'agreements', 'never', 'far', 'awayi', 'would', 'say', 'main', 'appeal', 'show', 'due', 'fact', 'goes', 'shows', 'wouldnt', 'dare', 'forget', 'pretty', 'pictures', 'painted', 'mainstream', 'audiences', 'forget', 'charm', 'forget', 'romanceoz', 'doesnt', 'mess', 'around', 'first', 'episode', 'ever', 'saw', 'struck', 'nasty', 'surreal', 'couldnt', 'say', 'ready', 'watched', 'developed', 'taste', 'oz', 'got', 'accustomed', 'high', 'levels', 'graphic', 'violence', 'violence', 'injustice', 'crooked', 'guards', 'wholl', 'sold', 'nickel', 'inmates', 'wholl', 'kill', 'order', 'get', 'away', 'well', 'mannered', 'middle', 'class', 'inmates', 'turned', 'prison', 'bitches', 'due', 'lack', 'street', 'skills', 'prison', 'experience', 'watching', 'oz', 'may', 'become', 'comfortable', 'uncomfortable', 'viewingthats', 'get', 'touch', 'darker', 'side']\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7rtHQN0mMQb8",
"colab_type": "text"
},
"source": [
"Common word in one review "
]
},
{
"cell_type": "code",
"metadata": {
"id": "4u0wiNsB6BeA",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 218
},
"outputId": "b3cad440-c1aa-4e64-fe79-dfb7c8506806"
},
"source": [
"from nltk.probability import FreqDist\n",
"fdist = FreqDist(filteres_review)\n",
"print('Sampling ', fdist)\n",
"print('The first 3 frequently used tokens are')\n",
"fdist.most_common(10)\n"
],
"execution_count": 91,
"outputs": [
{
"output_type": "stream",
"text": [
"Sampling <FreqDist with 142 samples and 168 outcomes>\n",
"The first 3 frequently used tokens are\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[('oz', 5),\n",
" ('violence', 4),\n",
" ('show', 3),\n",
" ('prison', 3),\n",
" ('forget', 3),\n",
" ('one', 2),\n",
" ('watching', 2),\n",
" ('episode', 2),\n",
" ('right', 2),\n",
" ('first', 2)]"
]
},
"metadata": {
"tags": []
},
"execution_count": 91
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "40kLUWCR6BeC",
"colab_type": "text"
},
"source": [
"## Checking Outliers"
]
},
{
"cell_type": "code",
"metadata": {
"id": "tniZYpyd6BeC",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 84
},
"outputId": "10799e77-a80f-44b6-f159-52db5ae51208"
},
"source": [
"review_lens = Counter([len(x) for x in data[\"clean_data\"].values])\n",
"print(\"Zero-length reviews: {}\".format(review_lens[0]))\n",
"print(\"Maximum review length: {}\".format(max(review_lens)))\n",
"print('Number of reviews before removing outliers: ', len(data['clean_data']))\n",
"zero_idx = [ii for ii, review in enumerate(data.clean_data) if len(review)==0]\n",
"print(zero_idx,\"index of review with 0 length\")"
],
"execution_count": 34,
"outputs": [
{
"output_type": "stream",
"text": [
"Zero-length reviews: 0\n",
"Maximum review length: 9306\n",
"Number of reviews before removing outliers: 50000\n",
"[] index of review with 0 length\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"scrolled": false,
"id": "vhfstPLs6BeE",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 195
},
"outputId": "e0b38675-24a8-42eb-a351-4902fbcad184"
},
"source": [
"data.head()"
],
"execution_count": 47,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" <th>clean_data</th>\n",
" <th>sentiment_polarity</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>One of the other reviewers has mentioned that ...</td>\n",
" <td>positive</td>\n",
" <td>one reviewers mentioned watching one oz episod...</td>\n",
" <td>neutral</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A wonderful little production. &lt;br /&gt;&lt;br /&gt;The...</td>\n",
" <td>positive</td>\n",
" <td>wonderful little production filming technique ...</td>\n",
" <td>positive</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>I thought this was a wonderful way to spend ti...</td>\n",
" <td>positive</td>\n",
" <td>thought wonderful way spend time hot summer we...</td>\n",
" <td>positive</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Basically there's a family where a little boy ...</td>\n",
" <td>negative</td>\n",
" <td>basically theres family little boy jake thinks...</td>\n",
" <td>negative</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Petter Mattei's \"Love in the Time of Money\" is...</td>\n",
" <td>positive</td>\n",
" <td>petter matteis love time money visually stunni...</td>\n",
" <td>positive</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" review ... label\n",
"0 One of the other reviewers has mentioned that ... ... 0\n",
"1 A wonderful little production. <br /><br />The... ... 0\n",
"2 I thought this was a wonderful way to spend ti... ... 0\n",
"3 Basically there's a family where a little boy ... ... 1\n",
"4 Petter Mattei's \"Love in the Time of Money\" is... ... 0\n",
"\n",
"[5 rows x 5 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 47
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0wAim9Y86BeG",
"colab_type": "text"
},
"source": [
"Unique values in label after feature enginnering"
]
},
{
"cell_type": "code",
"metadata": {
"scrolled": true,
"id": "uBUBOmCd6BeG",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "cdb922d8-cbf4-432c-8e43-36e9f6b352a9"
},
"source": [
"data[\"sentiment_polarity\"].unique()\n"
],
"execution_count": 36,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array(['neutral', 'positive', 'negative'], dtype=object)"
]
},
"metadata": {
"tags": []
},
"execution_count": 36
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "e3BzWjof6BeK",
"colab_type": "text"
},
"source": [
"Encoding Labels"
]
},
{
"cell_type": "code",
"metadata": {
"id": "pJ1Vn_UM6BeK",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "35539965-fd7a-44a1-81af-37593dda57d0"
},
"source": [
"# encodeedd = {\"sentiment_polarity\":{\"positive\": 1, \"negative\":2,\"neutral\":3 }}\n",
"labels = {value: idx for idx, value in enumerate(data['sentiment'].unique())}\n",
"labels"
],
"execution_count": 37,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'negative': 1, 'positive': 0}"
]
},
"metadata": {
"tags": []
},
"execution_count": 37
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "hZlfk0bC6BeM",
"colab_type": "code",
"colab": {}
},
"source": [
"# data.replace(encodeedd, inplace=True)\n",
"data['label'] = data['sentiment'].apply(lambda x: labels[x])\n"
],
"execution_count": 38,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "gb0GE9b26BeO",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 67
},
"outputId": "e14a7131-2e69-4c4c-bad8-b505e1639474"
},
"source": [
"data['label'].value_counts()"
],
"execution_count": 39,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"1 25000\n",
"0 25000\n",
"Name: label, dtype: int64"
]
},
"metadata": {
"tags": []
},
"execution_count": 39
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZFmnhK7A6BeR",
"colab_type": "text"
},
"source": [
"# Modeling,Training & Testing"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "cCfw5Zbl6BeS",
"colab_type": "text"
},
"source": [
"Import the required models and metrics from scikit-learn"
]
},
{
"cell_type": "code",
"metadata": {
"id": "QJ-Y3mRd6BeS",
"colab_type": "code",
"colab": {}
},
"source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.model_selection import RandomizedSearchCV\n",
"from sklearn.model_selection import GridSearchCV\n",
"from scipy.stats import uniform\n"
],
"execution_count": 40,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "q1H7Nm7e6BeU",
"colab_type": "text"
},
"source": [
"Splitting the dataset into train and test sets with a 90:10 ratio"
]
},
{
"cell_type": "code",
"metadata": {
"id": "b2EFza0J6BeU",
"colab_type": "code",
"colab": {}
},
"source": [
"#first trainig and spliting of data and using count vectorizer and tfidtransformer\n",
"X_train, X_test, y_train, y_test = train_test_split(data[\"clean_data\"], data['label'],test_size=0.1, random_state=42)"
],
"execution_count": 48,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "JrsdG9Dq6BeX",
"colab_type": "text"
},
"source": [
"Count Vectorizer"
]
},
{
"cell_type": "code",
"metadata": {
"id": "HQbvqy7w6BeY",
"colab_type": "code",
"colab": {}
},
"source": [
"# Extracting Count Vectors Parameters\n",
"count_vect = CountVectorizer(analyzer='word')\n",
"count_vect.fit(data['clean_data'])\n",
"X_train_count = count_vect.transform(X_train)\n",
"X_test_count = count_vect.transform(X_test)"
],
"execution_count": 49,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "vx6owq196BeZ",
"colab_type": "text"
},
"source": [
"TFIDF Vectorizer"
]
},
{
"cell_type": "code",
"metadata": {
"id": "LB-2FC0R6BeZ",
"colab_type": "code",
"colab": {}
},
"source": [
"# Extracting TF-IDF parameters\n",
"tfidf = TfidfVectorizer(max_features=1000, analyzer='word',sublinear_tf=True, min_df=2000, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')\n",
"tfidf.fit(data['clean_data'])\n",
"X_train_tfidf = tfidf.transform(X_train)\n",
"X_test_tfidf = tfidf.transform(X_test)"
],
"execution_count": 50,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "dzb2tf0-6Bec",
"colab_type": "text"
},
"source": [
"MultiNomialNB Using TFIDF vectorizer"
]
},
{
"cell_type": "code",
"metadata": {
"scrolled": true,
"id": "GesB8IAi6Bed",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 218
},
"outputId": "a91bbdd6-43c1-4940-a534-e8a687992f70"
},
"source": [
"#model used here is MultinomailNB TFIDF \n",
"clf_tfidf_MNB = MultinomialNB().fit(X_train_tfidf, y_train)\n",
"print(confusion_matrix(y_test,clf_tfidf_MNB.predict(X_test_tfidf)))\n",
"print(classification_report(y_test,clf_tfidf_MNB.predict(X_test_tfidf)))\n",
"print(accuracy_score(y_test,clf_tfidf_MNB.predict(X_test_tfidf),normalize=True))"
],
"execution_count": 51,
"outputs": [
{
"output_type": "stream",
"text": [
"[[2072 447]\n",
" [ 545 1936]]\n",
" precision recall f1-score support\n",
"\n",
" 0 0.79 0.82 0.81 2519\n",
" 1 0.81 0.78 0.80 2481\n",
"\n",
" accuracy 0.80 5000\n",
" macro avg 0.80 0.80 0.80 5000\n",
"weighted avg 0.80 0.80 0.80 5000\n",
"\n",
"0.8016\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "rwCI0DpN6Bee",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "740199fc-6125-47db-a9f6-ac06dd0b3263"
},
"source": [
"test_text=tfidf.transform([\"you're very bad\"])\n",
"x=clf_tfidf_MNB.predict(test_text)\n",
"x"
],
"execution_count": 53,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([1])"
]
},
"metadata": {
"tags": []
},
"execution_count": 53
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9NOQt4b66Beg",
"colab_type": "text"
},
"source": [
"MultinomialNB using the Count Vectorizer"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Smw_dg3E6Beg",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 218
},
"outputId": "85a7f7b9-a8ee-4ab0-ec29-a46d5f7b1740"
},
"source": [
"#model used here is MultinomailNB TFIDF \n",
"clf_count_vect_MNB = MultinomialNB().fit(X_train_count, y_train)\n",
"print(confusion_matrix(y_test,clf_count_vect_MNB.predict(X_test_count)))\n",
"print(classification_report(y_test,clf_count_vect_MNB.predict(X_test_count)))\n",
"print(accuracy_score(y_test,clf_count_vect_MNB.predict(X_test_count),normalize=True))"
],
"execution_count": 54,
"outputs": [
{
"output_type": "stream",
"text": [
"[[2135 384]\n",
" [ 304 2177]]\n",
" precision recall f1-score support\n",
"\n",
" 0 0.88 0.85 0.86 2519\n",
" 1 0.85 0.88 0.86 2481\n",
"\n",
" accuracy 0.86 5000\n",
" macro avg 0.86 0.86 0.86 5000\n",
"weighted avg 0.86 0.86 0.86 5000\n",
"\n",
"0.8624\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Uz3HeSVT6Bei",
"colab_type": "text"
},
"source": [
"Random Forest Using TFIDF"
]
},
{
"cell_type": "code",
"metadata": {
"scrolled": true,
"id": "-yapE5tL6Bei",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 218
},
"outputId": "343b322f-e72c-4016-a378-6b1dc0431a4e"
},
"source": [
"#Second model used Random Forest upon the same dataset\n",
"clf_tfidf_RondomFC = RandomForestClassifier().fit(X_train_tfidf, y_train)\n",
"#prediction from Random Forest classifer\n",
"# metrics of evaluation of performance of model\n",
"print(confusion_matrix(y_test,clf_tfidf_RondomFC.predict(X_test_tfidf)))\n",
"print(classification_report(y_test,clf_tfidf_RondomFC.predict(X_test_tfidf)))\n",
"print(accuracy_score(y_test,clf_tfidf_RondomFC.predict(X_test_tfidf)))"
],
"execution_count": 55,
"outputs": [
{
"output_type": "stream",
"text": [
"[[2110 409]\n",
" [ 534 1947]]\n",
" precision recall f1-score support\n",
"\n",
" 0 0.80 0.84 0.82 2519\n",
" 1 0.83 0.78 0.81 2481\n",
"\n",
" accuracy 0.81 5000\n",
" macro avg 0.81 0.81 0.81 5000\n",
"weighted avg 0.81 0.81 0.81 5000\n",
"\n",
"0.8114\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hqHo1kaM6Bel",
"colab_type": "text"
},
"source": [
"Random Forest Using Count Vectorizer"
]
},
{
"cell_type": "code",
"metadata": {
"id": "AL4goO2U6Bel",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 218
},
"outputId": "5ec55c45-cf94-40cf-ff8e-11c8c6e326b4"
},
"source": [
"#Second model used Random Forest upon the same dataset\n",
"clf_count_RondomFC = RandomForestClassifier().fit(X_train_count, y_train)\n",
"#prediction from Random Forest classifer\n",
"# metrics of evaluation of performance of model\n",
"print(confusion_matrix(y_test,clf_count_RondomFC.predict(X_test_count)))\n",
"print(classification_report(y_test,clf_count_RondomFC.predict(X_test_count)))\n",
"print(accuracy_score(y_test,clf_count_RondomFC.predict(X_test_count)))"
],
"execution_count": 56,
"outputs": [
{
"output_type": "stream",
"text": [
"[[2200 319]\n",
" [ 381 2100]]\n",
" precision recall f1-score support\n",
"\n",
" 0 0.85 0.87 0.86 2519\n",
" 1 0.87 0.85 0.86 2481\n",
"\n",
" accuracy 0.86 5000\n",
" macro avg 0.86 0.86 0.86 5000\n",
"weighted avg 0.86 0.86 0.86 5000\n",
"\n",
"0.86\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "xwnES4aM6Ben",
"colab_type": "text"
},
"source": [
"SGDClassifier tuned with RandomizedSearchCV on the TF-IDF features"
]
},
{
"cell_type": "code",
"metadata": {
"id": "rbvDqY_j6Ben",
"colab_type": "code",
"colab": {}
},
"source": [
"#using randomSearchCV and SGDclassifer here\n",
"clfSGD = SGDClassifier()\n",
"distributions = dict(\n",
" loss=['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],\n",
" learning_rate=['optimal', 'invscaling', 'adaptive'],\n",
" eta0=uniform(loc=1e-7, scale=1e-2)\n",
")\n",
"\n",
"random_search_TFIDF_cv = RandomizedSearchCV(\n",
" estimator=clfSGD,\n",
" param_distributions=distributions,\n",
" cv=5,\n",
" n_iter=50\n",
")\n",
"random_search_COUNT_cv = RandomizedSearchCV(\n",
" estimator=clfSGD,\n",
" param_distributions=distributions,\n",
" cv=5,\n",
" n_iter=50\n",
")"
],
"execution_count": 57,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "YcaoxndS6Beq",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 101
},
"outputId": "e01a9006-5c4a-4098-b69f-8849eb7cc695"
},
"source": [
"random_search_TFIDF_cv.fit(X_train_tfidf, y_train)\n",
"print(f'Best params: {random_search_TFIDF_cv.best_params_}')\n",
"print(f'Best score: {random_search_TFIDF_cv.best_score_}')\n",
"train_score = random_search_TFIDF_cv.score(X_train_tfidf, y_train)\n",
"valid_score = random_search_TFIDF_cv.score(X_test_tfidf,y_test)\n",
"print(f'\\nTrain score: {round(train_score, 2)} ; Test score: {round(valid_score, 2)}\\n')"
],
"execution_count": 58,
"outputs": [
{
"output_type": "stream",
"text": [
"Best params: {'eta0': 0.0036105064457279228, 'learning_rate': 'adaptive', 'loss': 'squared_hinge'}\n",
"Best score: 0.8196666666666665\n",
"\n",
"Train score: 0.82 ; Test score: 0.82\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "xGgqwGrB6Bes",
"colab_type": "text"
},
"source": [
"SGDClassifier tuned with RandomizedSearchCV on the Count Vectorizer features"
]
},
{
"cell_type": "code",
"metadata": {
"id": "3viBvCj36Bes",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 101
},
"outputId": "f6b4680a-8106-4cba-d1e8-69d606ba94e6"
},
"source": [
"random_search_COUNT_cv.fit(X_train_count, y_train)\n",
"print(f'Best params: {random_search_COUNT_cv.best_params_}')\n",
"print(f'Best score: {random_search_COUNT_cv.best_score_}')\n",
"train_score = random_search_COUNT_cv.score(X_train_count, y_train)\n",
"valid_score = random_search_COUNT_cv.score(X_test_count,y_test)\n",
"print(f'\\nTrain score: {round(train_score, 2)} ; Test score: {round(valid_score, 2)}\\n')"
],
"execution_count": 59,
"outputs": [
{
"output_type": "stream",
"text": [
"Best params: {'eta0': 0.002336132788972425, 'learning_rate': 'adaptive', 'loss': 'log'}\n",
"Best score: 0.8900222222222223\n",
"\n",
"Train score: 0.97 ; Test score: 0.89\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HGSrzQEV6Bev",
"colab_type": "text"
},
"source": [
"Logistic Regression TFIDF Vectorizer"
]
},
{
"cell_type": "code",
"metadata": {
"scrolled": true,
"id": "bTOl3WE66Bev",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 218
},
"outputId": "91b96308-99cd-4a5d-b771-aaf07f42c5ea"
},
"source": [
"logst_tfidf=LogisticRegression()\n",
"logst_tfidf.fit(X_train_tfidf, y_train) \n",
"# metrics of evaluation of performance of model\n",
"print(confusion_matrix(y_test,logst_tfidf.predict(X_test_tfidf)))\n",
"print(classification_report(y_test,logst_tfidf.predict(X_test_tfidf)))\n",
"print(accuracy_score(y_test, logst_tfidf.predict(X_test_tfidf)))"
],
"execution_count": 60,
"outputs": [
{
"output_type": "stream",
"text": [
"[[2120 399]\n",
" [ 501 1980]]\n",
" precision recall f1-score support\n",
"\n",
" 0 0.81 0.84 0.82 2519\n",
" 1 0.83 0.80 0.81 2481\n",
"\n",
" accuracy 0.82 5000\n",
" macro avg 0.82 0.82 0.82 5000\n",
"weighted avg 0.82 0.82 0.82 5000\n",
"\n",
"0.82\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Hp89Cs9R6Bey",
"colab_type": "text"
},
"source": [
"Logistic Regression Count Vectorizer "
]
},
{
"cell_type": "code",
"metadata": {
"id": "LF6L8Oya6Bez",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 218
},
"outputId": "83e0fdb6-038f-49a2-ff7f-30d850d28274"
},
"source": [
"logst_count=LogisticRegression()\n",
"logst_count.fit(X_train_count, y_train) \n",
"# metrics of evaluation of performance of model\n",
"print(confusion_matrix(y_test,logst_count.predict(X_test_count)))\n",
"print(classification_report(y_test,logst_count.predict(X_test_count)))\n",
"print(accuracy_score(y_test, logst_count.predict(X_test_count)))"
],
"execution_count": 61,
"outputs": [
{
"output_type": "stream",
"text": [
"[[2250 269]\n",
" [ 293 2188]]\n",
" precision recall f1-score support\n",
"\n",
" 0 0.88 0.89 0.89 2519\n",
" 1 0.89 0.88 0.89 2481\n",
"\n",
" accuracy 0.89 5000\n",
" macro avg 0.89 0.89 0.89 5000\n",
"weighted avg 0.89 0.89 0.89 5000\n",
"\n",
"0.8876\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "cYCSze6U6Be0",
"colab_type": "text"
},
"source": [
"SVM using Grid SearchCV TFIDF Vectorizer"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Z-H1Tqc8MYZX",
"colab_type": "text"
},
"source": [
"**Tensorflow Modeling**"
]
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "XYYDvoskkE61",
"colab": {}
},
"source": [
"import tensorflow as tf\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences"
],
"execution_count": 62,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "mB_sbF7aMeAf",
"colab_type": "text"
},
"source": [
"Params"
]
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "0eJSTTYnkJQd",
"colab": {}
},
"source": [
"vocab_size = 10000\n",
"embedding_dim = 200\n",
"max_length = 500\n",
"trunc_type='post'\n",
"padding_type='post'\n",
"oov_tok = \"<OOV>\"\n"
],
"execution_count": 63,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "wKdseZ6eMgAp",
"colab_type": "text"
},
"source": [
"Recheck dataset"
]
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "oaLaaqhNkUPd",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 823
},
"outputId": "cfd529b2-79d3-456c-a40c-a5c398e5955e"
},
"source": [
"X_train, X_test, y_train, y_test"
],
"execution_count": 64,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(40877 recently started watching show say really made...\n",
" 18057 return jedi often remembered wrong rather righ...\n",
" 19066 remember loved movie came twelve years old com...\n",
" 20525 dont know last reviewer talking show pure ente...\n",
" 5847 beginning excited see movie poster possibly fu...\n",
" ... \n",
" 11284 shadow magic recaptures joy amazement first mo...\n",
" 44732 found movie quite enjoyable fairly entertainin...\n",
" 38158 avoid one terrible movie exciting pointless mu...\n",
" 860 production quite surprise absolutely love obsc...\n",
" 15795 decent movie although little bit short time pa...\n",
" Name: clean_data, Length: 45000, dtype: object,\n",
" 33553 really liked summerslam due look arena curtain...\n",
" 9427 not many television shows appeal quite many di...\n",
" 199 film quickly gets major chase scene ever incre...\n",
" 12447 jane austen would definitely approve onegwynet...\n",
" 39489 expectations somewhat high went see movie thou...\n",
" ... \n",
" 39885 one eastwoods best movies separated westerns g...\n",
" 17566 blurred childhood memories kept echo cult seri...\n",
" 16062 love zombiemovies love amateurproductions meat...\n",
" 48445 chan new york gets involved attempt sabotage n...\n",
" 20382 wife thought film watereddown madefortv bbc ve...\n",
" Name: clean_data, Length: 5000, dtype: object,\n",
" 40877 0\n",
" 18057 0\n",
" 19066 1\n",
" 20525 0\n",
" 5847 0\n",
" ..\n",
" 11284 0\n",
" 44732 0\n",
" 38158 1\n",
" 860 0\n",
" 15795 0\n",
" Name: label, Length: 45000, dtype: int64,\n",
" 33553 0\n",
" 9427 0\n",
" 199 1\n",
" 12447 0\n",
" 39489 1\n",
" ..\n",
" 39885 0\n",
" 17566 1\n",
" 16062 1\n",
" 48445 0\n",
" 20382 1\n",
" Name: label, Length: 5000, dtype: int64)"
]
},
"metadata": {
"tags": []
},
"execution_count": 64
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "6CbUg_q5MjP5",
"colab_type": "text"
},
"source": [
"Assign data"
]
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "S1sD-7v0kYWk",
"colab": {}
},
"source": [
"training_sentences = X_train\n",
"testing_sentences = X_test\n",
"training_labels = y_train\n",
"testing_labels = y_test"
],
"execution_count": 65,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "sLpzKE5PMmS4",
"colab_type": "text"
},
"source": [
"Tokenizer from TensorFlow and padding of sentences"
]
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "3u8UB0MCkZ5N",
"colab": {}
},
"source": [
"tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)\n",
"tokenizer.fit_on_texts(training_sentences)\n",
"\n",
"word_index = tokenizer.word_index\n",
"\n",
"training_sequences = tokenizer.texts_to_sequences(training_sentences)\n",
"training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)\n",
"\n",
"testing_sequences = tokenizer.texts_to_sequences(testing_sentences)\n",
"testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)"
],
"execution_count": 66,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "-YkDzlzDMrnv",
"colab_type": "text"
},
"source": [
"Need this block to get it to work with TensorFlow 2.x\n"
]
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "GrAlWBKf99Ya",
"colab": {}
},
"source": [
"\n",
"training_padded = np.array(training_padded)\n",
"training_labels = np.array(training_labels)\n",
"testing_padded = np.array(testing_padded)\n",
"testing_labels = np.array(testing_labels)"
],
"execution_count": 67,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "WVrcp_wtMu1c",
"colab_type": "text"
},
"source": [
"Defining model structure "
]
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "FufaT4vlkiDE",
"colab": {}
},
"source": [
"model = tf.keras.Sequential([\n",
" tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),\n",
" tf.keras.layers.GlobalAveragePooling1D(),\n",
" tf.keras.layers.Dense(24, activation='relu'),\n",
" tf.keras.layers.Dense(1, activation='sigmoid')\n",
"])\n",
"model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])"
],
"execution_count": 68,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "OJZvwe9GMyS5",
"colab_type": "text"
},
"source": [
"Print the Summary "
]
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "XfDt1hmYkiys",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 286
},
"outputId": "ad7368b1-89ca-4e44-8100-74b713397715"
},
"source": [
"model.summary()\n"
],
"execution_count": 69,
"outputs": [
{
"output_type": "stream",
"text": [
"Model: \"sequential\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"embedding (Embedding) (None, 500, 200) 2000000 \n",
"_________________________________________________________________\n",
"global_average_pooling1d (Gl (None, 200) 0 \n",
"_________________________________________________________________\n",
"dense (Dense) (None, 24) 4824 \n",
"_________________________________________________________________\n",
"dense_1 (Dense) (None, 1) 25 \n",
"=================================================================\n",
"Total params: 2,004,849\n",
"Trainable params: 2,004,849\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "rekv18MNM0f9",
"colab_type": "text"
},
"source": [
"Train the model and Epoch = 5"
]
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "2DTKQFf1kkyc",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 185
},
"outputId": "50b236bc-db9e-4bd3-a572-171cbd8dd810"
},
"source": [
"num_epochs = 5\n",
"history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)"
],
"execution_count": 70,
"outputs": [
{
"output_type": "stream",
"text": [
"Epoch 1/5\n",
"1407/1407 - 42s - loss: 0.3887 - accuracy: 0.8249 - val_loss: 0.2737 - val_accuracy: 0.8882\n",
"Epoch 2/5\n",
"1407/1407 - 42s - loss: 0.2322 - accuracy: 0.9096 - val_loss: 0.2598 - val_accuracy: 0.8986\n",
"Epoch 3/5\n",
"1407/1407 - 42s - loss: 0.2024 - accuracy: 0.9222 - val_loss: 0.2665 - val_accuracy: 0.8972\n",
"Epoch 4/5\n",
"1407/1407 - 42s - loss: 0.1827 - accuracy: 0.9304 - val_loss: 0.2826 - val_accuracy: 0.8930\n",
"Epoch 5/5\n",
"1407/1407 - 42s - loss: 0.1695 - accuracy: 0.9360 - val_loss: 0.2873 - val_accuracy: 0.8926\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "v6q8o4jpM3gp",
"colab_type": "text"
},
"source": [
"Plot training vs. validation curves for accuracy and for loss"
]
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "2HYfBKXjkmU8",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 568
},
"outputId": "8f7c4372-f2c9-4801-deac-c5be0db7c4a3"
},
"source": [
"\n",
"def plot_graphs(history, string):\n",
" plt.plot(history.history[string])\n",
" plt.plot(history.history['val_'+string])\n",
" plt.xlabel(\"Epochs\")\n",
" plt.ylabel(string)\n",
" plt.legend([string, 'val_'+string])\n",
" plt.show()\n",
" \n",
"plot_graphs(history, \"accuracy\")\n",
"plot_graphs(history, \"loss\")"
],
"execution_count": 71,
"outputs": [
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2Xzd823gNBKR",
"colab_type": "text"
},
"source": [
"Map the model's output score to a sentiment label (positive / neutral / negative) with a custom helper function"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4ig-LGZFLG-N",
"colab_type": "code",
"colab": {}
},
"source": [
"def predict_fun(scores):\n",
" if scores>=.4 and scores<0.6:\n",
" print(\"neutral\")\n",
" elif scores>0.6:\n",
" print(\"negative\")\n",
" elif scores<0.4:\n",
" print(\"positive\")"
],
"execution_count": 83,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ghUR_yk7NIV2",
"colab_type": "text"
},
"source": [
"Try it yourself: examples"
]
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "cG8-ArY-qDcz",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 67
},
"outputId": "ad6cb9a3-27a5-44c6-b00f-e5d7302882b9"
},
"source": [
"sentence = [\"granny starting bad movie this movie spiders in the garden might be real\", \"game showing this sunday night\",\"Awesome movie that was i will watch again\"]\n",
"sequences = tokenizer.texts_to_sequences(sentence)\n",
"padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)\n",
"\n",
"\n",
"predict_fun(model.predict(padded)[0])\n",
"predict_fun(model.predict(padded)[1])\n",
"predict_fun(model.predict(padded)[2])"
],
"execution_count": 88,
"outputs": [
{
"output_type": "stream",
"text": [
"negative\n",
"neutral\n",
"positive\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "MEDghh3KDJRy",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 88,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Urc--hNwLWO0",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment