Skip to content

Instantly share code, notes, and snippets.

@akhildaphara
Created December 16, 2020 07:15
Show Gist options
  • Save akhildaphara/9c0e8fd50def1b6eee6f3eb977584417 to your computer and use it in GitHub Desktop.
Save akhildaphara/9c0e8fd50def1b6eee6f3eb977584417 to your computer and use it in GitHub Desktop.
NLP_test.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "NLP_test.ipynb",
"provenance": [],
"toc_visible": true,
"mount_file_id": "1SKwoeAgvJx-alJaS1HG3JqJ0i6vGjZYp",
"authorship_tag": "ABX9TyOaIJylIUt5sYqjE1dnrbvA",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/akhildaphara/9c0e8fd50def1b6eee6f3eb977584417/nlp_test.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "viU2VDxie3z6"
},
"source": [
"import pandas as pd"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "1Bal3HEqfUE3",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 419
},
"outputId": "f328b6ad-5f75-4044-84be-028e508eb3d5"
},
"source": [
"dataset = pd.read_csv(\"/content/drive/My Drive/Codes/NLP/Restaurant_Reviews.tsv\", delimiter='\\t')\n",
"dataset"
],
"execution_count": 2,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Review</th>\n",
" <th>Liked</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Wow... Loved this place.</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Crust is not good.</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Not tasty and the texture was just nasty.</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Stopped by during the late May bank holiday of...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>The selection on the menu was great and so wer...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>995</th>\n",
" <td>I think food should have flavor and texture an...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>996</th>\n",
" <td>Appetite instantly gone.</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>997</th>\n",
" <td>Overall I was not impressed and would not go b...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>998</th>\n",
" <td>The whole experience was underwhelming, and I ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>999</th>\n",
" <td>Then, as if I hadn't wasted enough of my life ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1000 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" Review Liked\n",
"0 Wow... Loved this place. 1\n",
"1 Crust is not good. 0\n",
"2 Not tasty and the texture was just nasty. 0\n",
"3 Stopped by during the late May bank holiday of... 1\n",
"4 The selection on the menu was great and so wer... 1\n",
".. ... ...\n",
"995 I think food should have flavor and texture an... 0\n",
"996 Appetite instantly gone. 0\n",
"997 Overall I was not impressed and would not go b... 0\n",
"998 The whole experience was underwhelming, and I ... 0\n",
"999 Then, as if I hadn't wasted enough of my life ... 0\n",
"\n",
"[1000 rows x 2 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 2
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "qb6W3rYihJSP",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "9969bc71-cdfc-4748-a255-dff5b4f8260e"
},
"source": [
"import re\n",
"import nltk\n",
"nltk.download('stopwords')\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem.porter import PorterStemmer\n",
"\n",
"corpus = []\n",
"for i in range(0,1000):\n",
" review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])\n",
" review = review.lower()\n",
" review = review.split()\n",
" ps = PorterStemmer()\n",
" review = [ps.stem(word) for word in review if not word in set(stopwords.words('english')) or word=='not' ]\n",
" review = ' '.join(review)\n",
" corpus.append(review)"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Unzipping corpora/stopwords.zip.\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Ncp_nWMBjBMu"
},
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"cv = CountVectorizer(max_features = 1500)\n",
"X = cv.fit_transform(corpus).toarray()\n",
"y = dataset.iloc[:, 1].values"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "GTw-AMqkjFqG"
},
"source": [
"from sklearn.model_selection import train_test_split\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"
],
"execution_count": 5,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "XX9KmF_OsJ1u"
},
"source": [
"Random Forest"
]
},
{
"cell_type": "code",
"metadata": {
"id": "XuJH1p0FrWO7",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "5abe0419-3ba6-447c-bc32-8a3fe3d6d3da"
},
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"classifier = RandomForestClassifier(n_estimators=1000)\n",
"classifier.fit(X_train, y_train)\n",
"y_pred = classifier.predict(X_test)\n",
"classifier.score(X_test, y_test)"
],
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.765"
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "FasIwD4NsIIi"
},
"source": [
"SVC"
]
},
{
"cell_type": "code",
"metadata": {
"id": "204f6yKokJx4"
},
"source": [
"from sklearn.svm import SVC\n",
"classifier = SVC(C = 2, kernel = 'linear', random_state = 0)\n",
"classifier.fit(X_train, y_train)\n",
"y_pred = classifier.predict(X_test)"
],
"execution_count": 7,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "fVKgW8bBh2XR"
},
"source": [
"Prediction"
]
},
{
"cell_type": "code",
"metadata": {
"id": "sDDhC3Jyh42N"
},
"source": [
"def predict(new_review): \n",
" new_review = re.sub(\"[^a-zA-Z]\", \" \", new_review) \n",
" new_review = new_review.lower().split()\n",
" new_review = [ps.stem(word) for word in new_review if word not in set(stopwords.words(\"english\")) or word=='not'] \n",
" new_review = \" \".join(new_review) \n",
" new_review = [new_review] \n",
" new_review = cv.transform(new_review).toarray() \n",
" if classifier.predict(new_review)[0] == 1:\n",
" return \"Positive\" \n",
" else: \n",
" return \"Negative\""
],
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "MEVjVt2Xh_ju",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "9818d1fe-4e0c-48a8-eb59-1326c777bc0c"
},
"source": [
"predict(\"Not Good food\")"
],
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
},
"text/plain": [
"'Negative'"
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "52LWNdgkctrY"
},
"source": [
"End of Code\n",
"---\n",
"\n",
"\n",
"Model Evaluation"
]
},
{
"cell_type": "code",
"metadata": {
"id": "IWeoae8KkbLd",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "e95a23f5-fcf9-4475-ed2b-5efe46d61925"
},
"source": [
"from sklearn.metrics import confusion_matrix\n",
"cm = confusion_matrix(y_test, y_pred)\n",
"print(cm)\n",
"print(\"Accuracy= \"+str((cm[0][0]+cm[1][1])/200))"
],
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"text": [
"[[78 19]\n",
" [23 80]]\n",
"Accuracy= 0.79\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "bNcJVPMxm58k",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "034e4e75-47f9-40c9-fd2f-8fc7d637250d"
},
"source": [
"classifier.score(X_test, y_test)"
],
"execution_count": 11,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.79"
]
},
"metadata": {
"tags": []
},
"execution_count": 11
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "FoK7ue1Kqh7G",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "5d4e57ec-fa61-4a5c-827f-b6d4110737b1"
},
"source": [
"classifier.score(X_train, y_train)"
],
"execution_count": 12,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.9875"
]
},
"metadata": {
"tags": []
},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "MjWvXrO4pXqO",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "14b612c7-b554-40e0-e3ec-c9be0a7b5e5c"
},
"source": [
"from sklearn.model_selection import cross_val_score\n",
"accuracy = cross_val_score(estimator = classifier, X= X_train, y = y_train, cv= 10)\n",
"print(accuracy.mean())\n",
"accuracy.std()"
],
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"text": [
"0.8\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.04330127018922194"
]
},
"metadata": {
"tags": []
},
"execution_count": 13
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "TStqeQCZqCqq",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "39e4066a-fc7e-4c59-9011-9715bf7f1d14"
},
"source": [
"from sklearn.model_selection import GridSearchCV\n",
"parameters = [{'C' : [1, 2, 2.5, 3, 4], 'kernel' : ['linear']}]\n",
" \n",
"grid_search = GridSearchCV(estimator= classifier, param_grid=parameters, scoring= 'accuracy', cv = 10, n_jobs = -1)\n",
"grid_search = grid_search.fit(X_train, y_train)\n",
"best_accuracy = grid_search.best_score_\n",
"print(best_accuracy)"
],
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"text": [
"0.8\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "HZ_lnDy1vy8-",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "e168df7d-d14c-404f-8803-0a34d1eeb6ae"
},
"source": [
"best_param = grid_search.best_params_\n",
"print(best_param)"
],
"execution_count": 15,
"outputs": [
{
"output_type": "stream",
"text": [
"{'C': 2, 'kernel': 'linear'}\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "0VlKsg6c1XXy",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "912454d0-21fa-4078-f1cc-a75caa7bb2bf"
},
"source": [
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test, y_pred))"
],
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.77 0.80 0.79 97\n",
" 1 0.81 0.78 0.79 103\n",
"\n",
" accuracy 0.79 200\n",
" macro avg 0.79 0.79 0.79 200\n",
"weighted avg 0.79 0.79 0.79 200\n",
"\n"
],
"name": "stdout"
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment