Skip to content

Instantly share code, notes, and snippets.

@kmehant
Last active July 16, 2021 16:50
Show Gist options
  • Save kmehant/fb29dcfb54dd19a9736c058dae0a56b4 to your computer and use it in GitHub Desktop.
Save kmehant/fb29dcfb54dd19a9736c058dae0a56b4 to your computer and use it in GitHub Desktop.
Mehant_LogisticRegression.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "411843_Mehant_LogisticRegression.ipynb",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/kmehant/fb29dcfb54dd19a9736c058dae0a56b4/411843_mehant_logisticregression.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "EephMyPHGbnD",
"outputId": "03311919-8df5-4998-f751-4448d8b6c6e3"
},
"source": [
"\n",
"\"\"\"\n",
"Import python modules\n",
"\"\"\"\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import confusion_matrix, classification_report\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import os\n",
"import re\n",
"\n",
"\n",
"\"\"\"\n",
"Create a pandas dataframe\n",
"\"\"\"\n",
"\n",
"columns = ['target', 'review']\n",
"df = pd.DataFrame(columns=columns)\n",
"\n",
"\n",
"\"\"\"\n",
"Import data into dataframes and perform cleaning\n",
"\"\"\"\n",
"\n",
"for pos_file in os.listdir('/content/drive/MyDrive/data/pos'):\n",
" f = open('/content/drive/MyDrive/data/pos/' + pos_file, \"r\")\n",
" file_data = f.read()\n",
" # clean data\n",
" file_data = re.sub(r'[^a-zA-Z0-9_\\s]+', '', file_data)\n",
" file_data = file_data.strip()\n",
" # append at the end of the dataframe\n",
" df.loc[len(df.index)] = [1, file_data]\n",
"\n",
"\n",
"for neg_file in os.listdir('/content/drive/MyDrive/data/neg'):\n",
" f = open('/content/drive/MyDrive/data/neg/' + neg_file, \"r\")\n",
" file_data = f.read()\n",
" # clean data\n",
" file_data = re.sub(r'[^a-zA-Z0-9_\\s]+', '', file_data)\n",
" file_data = file_data.strip()\n",
" # append at the end of the dataframe\n",
" df.loc[len(df.index)] = [0, file_data]\n",
"\n",
"\n",
"\"\"\"\n",
"Split the data into test and train data\n",
"\"\"\"\n",
"\n",
"x = df.review.values\n",
"y = df.target.values\n",
"x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=20)\n",
"\n",
"y_train = y_train.astype(int)\n",
"y_test = y_test.astype(int)\n",
"\n",
"\n",
"\"\"\"\n",
"Create a count vectorizer\n",
"\"\"\"\n",
"\n",
"vectorizer = CountVectorizer(stop_words=['is','are','and','in','the'])\n",
"vectorizer.fit(x_train)\n",
"\n",
"\n",
"\"\"\"\n",
"Transform text (independent feature) into numerical type using count vectorizer\n",
"\"\"\"\n",
"\n",
"X_train = vectorizer.transform(x_train)\n",
"X_test = vectorizer.transform(x_test)\n",
"\n",
"\n",
"\"\"\"\n",
"Create a logistic regression model\n",
"\"\"\"\n",
"\n",
"classifier = LogisticRegression(max_iter=100)\n",
"classifier.fit(X_train, y_train)\n",
"\n",
"\n",
"\"\"\"\n",
"Compute testing accuracy\n",
"\"\"\"\n",
"\n",
"score = classifier.score(X_test, y_test)\n",
"\n",
"print(\"Accuracy: \", (score*100), '%', sep='')\n"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Accuracy: 84.0%\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "9jNX44DCPcDr",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "b5a4d6f3-0137-4110-8296-417e70b196cc"
},
"source": [
"\"\"\"\n",
"Logistic regression vectorized implementation\n",
"\"\"\"\n",
"\n",
"def h_theta(z):\n",
" return 1/ (1 + np.exp(-z))\n",
"\n",
"def costFn(theta, x, y):\n",
" h_theta_x = h_theta(np.dot(x, theta))\n",
" cost = (-y * np.log(h_theta_x)) - ((1 - y) * np.log(1 - h_theta_x))\n",
" j_theta = 1/m * sum(cost)\n",
" deviation = 1 / m * np.dot(x.transpose(), (h_theta_x - y))\n",
" return j_theta[0], deviation\n",
"\n",
"def gradientDescent(x,y,theta,alpha,num_iters):\n",
" for i in range(num_iters):\n",
" cost, dev = costFn(theta, x, y)\n",
" theta = theta - (alpha * dev)\n",
" return theta\n",
"\n",
"def predictClass(theta,x):\n",
" predictions = x.dot(theta)\n",
" return predictions > 0\n",
"\n",
"m = X_train.shape[0]\n",
"n = df.shape[1] - 1\n",
"\n",
"X_train = X_train.toarray()\n",
"X_train = np.array(X_train)\n",
"\n",
"x = []\n",
"maxx = 0\n",
"for i in range(X_train.shape[0]):\n",
" maxx = max(maxx, np.sum(X_train[i]))\n",
"for i in range(X_train.shape[0]):\n",
" x.append([1, (np.sum(X_train[i]) / maxx)])\n",
"x = np.asarray(x)\n",
"\n",
"y = np.asarray(y_train)\n",
"y = y.reshape(m, 1)\n",
"x = x.reshape(m, 2)\n",
"\n",
"theta = np.zeros((n + 1,1))\n",
"cost, deviation = costFn(theta,x,y)\n",
"\n",
"theta = gradientDescent(x, y, theta, 0.1, 1000)\n",
"\n",
"y_pred = predictClass(theta, x)\n",
"\n",
"print(\"training Accuracy: \", (sum(y_pred == y)[0] / m) * 100,\"%\", sep='')\n"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"training Accuracy: 55.875%\n"
],
"name": "stdout"
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment