kmehant/mehant_logisticregression.ipynb

## mehant_logisticregression.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "411843_Mehant_LogisticRegression.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/kmehant/fb29dcfb54dd19a9736c058dae0a56b4/411843_mehant_logisticregression.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "EephMyPHGbnD",
        "outputId": "03311919-8df5-4998-f751-4448d8b6c6e3"
      },
      "source": [
        "\n",
        "\"\"\"\n",
        "Import python modules\n",
        "\"\"\"\n",
        "\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "\n",
        "from sklearn.feature_extraction.text import CountVectorizer\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.metrics import confusion_matrix, classification_report\n",
        "from sklearn.preprocessing import LabelEncoder\n",
        "\n",
        "import matplotlib.pyplot as plt\n",
        "import os\n",
        "import re\n",
        "\n",
        "\n",
        "\"\"\"\n",
        "Create a pandas dataframe\n",
        "\"\"\"\n",
        "\n",
        "columns = ['target', 'review']\n",
        "df = pd.DataFrame(columns=columns)\n",
        "\n",
        "\n",
        "\"\"\"\n",
        "Import data into dataframes and perform cleaning\n",
        "\"\"\"\n",
        "\n",
        "for pos_file in os.listdir('/content/drive/MyDrive/data/pos'):\n",
        "     f = open('/content/drive/MyDrive/data/pos/' + pos_file, \"r\")\n",
        "     file_data = f.read()\n",
        "     # clean data\n",
        "     file_data = re.sub(r'[^a-zA-Z0-9_\\s]+', '', file_data)\n",
        "     file_data = file_data.strip()\n",
        "     # append at the end of the dataframe\n",
        "     df.loc[len(df.index)] = [1, file_data]\n",
        "\n",
        "\n",
        "for neg_file in os.listdir('/content/drive/MyDrive/data/neg'):\n",
        "     f = open('/content/drive/MyDrive/data/neg/' + neg_file, \"r\")\n",
        "     file_data = f.read()\n",
        "     # clean data\n",
        "     file_data = re.sub(r'[^a-zA-Z0-9_\\s]+', '', file_data)\n",
        "     file_data = file_data.strip()\n",
        "     # append at the end of the dataframe\n",
        "     df.loc[len(df.index)] = [0, file_data]\n",
        "\n",
        "\n",
        "\"\"\"\n",
        "Split the data into test and train data\n",
        "\"\"\"\n",
        "\n",
        "x = df.review.values\n",
        "y = df.target.values\n",
        "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=20)\n",
        "\n",
        "y_train = y_train.astype(int)\n",
        "y_test = y_test.astype(int)\n",
        "\n",
        "\n",
        "\"\"\"\n",
        "Create a count vectorizer\n",
        "\"\"\"\n",
        "\n",
        "vectorizer = CountVectorizer(stop_words=['is','are','and','in','the'])\n",
        "vectorizer.fit(x_train)\n",
        "\n",
        "\n",
        "\"\"\"\n",
        "Transform text (independent feature) into numerical type using count vectorizer\n",
        "\"\"\"\n",
        "\n",
        "X_train = vectorizer.transform(x_train)\n",
        "X_test = vectorizer.transform(x_test)\n",
        "\n",
        "\n",
        "\"\"\"\n",
        "Create a logistic regression model\n",
        "\"\"\"\n",
        "\n",
        "classifier = LogisticRegression(max_iter=100)\n",
        "classifier.fit(X_train, y_train)\n",
        "\n",
        "\n",
        "\"\"\"\n",
        "Compute testing accuracy\n",
        "\"\"\"\n",
        "\n",
        "score = classifier.score(X_test, y_test)\n",
        "\n",
        "print(\"Accuracy: \", (score*100), '%', sep='')\n"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Accuracy: 84.0%\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
            "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
            "\n",
            "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
            "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
            "Please also refer to the documentation for alternative solver options:\n",
            "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
            "  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "9jNX44DCPcDr",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "b5a4d6f3-0137-4110-8296-417e70b196cc"
      },
      "source": [
        "\"\"\"\n",
        "Logistic regression vectorized implementation\n",
        "\"\"\"\n",
        "\n",
        "def h_theta(z):\n",
        "    return 1/ (1 + np.exp(-z))\n",
        "\n",
        "def costFn(theta, x, y):\n",
        "    h_theta_x = h_theta(np.dot(x, theta))\n",
        "    cost = (-y * np.log(h_theta_x)) - ((1 - y) * np.log(1 - h_theta_x))\n",
        "    j_theta = 1/m * sum(cost)\n",
        "    deviation = 1 / m * np.dot(x.transpose(), (h_theta_x - y))\n",
        "    return j_theta[0], deviation\n",
        "\n",
        "def gradientDescent(x,y,theta,alpha,num_iters):\n",
        "    for i in range(num_iters):\n",
        "        cost, dev = costFn(theta, x, y)\n",
        "        theta = theta - (alpha * dev)\n",
        "    return theta\n",
        "\n",
        "def predictClass(theta,x):\n",
        "    predictions = x.dot(theta)\n",
        "    return predictions > 0\n",
        "\n",
        "m = X_train.shape[0]\n",
        "n = df.shape[1] - 1\n",
        "\n",
        "X_train = X_train.toarray()\n",
        "X_train = np.array(X_train)\n",
        "\n",
        "x = []\n",
        "maxx = 0\n",
        "for i in range(X_train.shape[0]):\n",
        "    maxx = max(maxx, np.sum(X_train[i]))\n",
        "for i in range(X_train.shape[0]):\n",
        "    x.append([1, (np.sum(X_train[i]) / maxx)])\n",
        "x = np.asarray(x)\n",
        "\n",
        "y = np.asarray(y_train)\n",
        "y = y.reshape(m, 1)\n",
        "x = x.reshape(m, 2)\n",
        "\n",
        "theta = np.zeros((n + 1,1))\n",
        "cost, deviation = costFn(theta,x,y)\n",
        "\n",
        "theta = gradientDescent(x, y, theta, 0.1, 1000)\n",
        "\n",
        "y_pred = predictClass(theta, x)\n",
        "\n",
        "print(\"training Accuracy: \", (sum(y_pred == y)[0] / m) * 100,\"%\", sep='')\n"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "training Accuracy: 55.875%\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "411843_Mehant_LogisticRegression.ipynb",
	"provenance": [],
	"collapsed_sections": [],
	"include_colab_link": true
	},
	"kernelspec": {
	"display_name": "Python 3",
	"name": "python3"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/kmehant/fb29dcfb54dd19a9736c058dae0a56b4/411843_mehant_logisticregression.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "EephMyPHGbnD",
	"outputId": "03311919-8df5-4998-f751-4448d8b6c6e3"
	},
	"source": [
	"\n",
	"\"\"\"\n",
	"Import python modules\n",
	"\"\"\"\n",
	"\n",
	"import pandas as pd\n",
	"import numpy as np\n",
	"\n",
	"from sklearn.feature_extraction.text import CountVectorizer\n",
	"from sklearn.model_selection import train_test_split\n",
	"from sklearn.linear_model import LogisticRegression\n",
	"from sklearn.metrics import confusion_matrix, classification_report\n",
	"from sklearn.preprocessing import LabelEncoder\n",
	"\n",
	"import matplotlib.pyplot as plt\n",
	"import os\n",
	"import re\n",
	"\n",
	"\n",
	"\"\"\"\n",
	"Create a pandas dataframe\n",
	"\"\"\"\n",
	"\n",
	"columns = ['target', 'review']\n",
	"df = pd.DataFrame(columns=columns)\n",
	"\n",
	"\n",
	"\"\"\"\n",
	"Import data into dataframes and perform cleaning\n",
	"\"\"\"\n",
	"\n",
	"for pos_file in os.listdir('/content/drive/MyDrive/data/pos'):\n",
	" f = open('/content/drive/MyDrive/data/pos/' + pos_file, \"r\")\n",
	" file_data = f.read()\n",
	" # clean data\n",
	" file_data = re.sub(r'[^a-zA-Z0-9_\\s]+', '', file_data)\n",
	" file_data = file_data.strip()\n",
	" # append at the end of the dataframe\n",
	" df.loc[len(df.index)] = [1, file_data]\n",
	"\n",
	"\n",
	"for neg_file in os.listdir('/content/drive/MyDrive/data/neg'):\n",
	" f = open('/content/drive/MyDrive/data/neg/' + neg_file, \"r\")\n",
	" file_data = f.read()\n",
	" # clean data\n",
	" file_data = re.sub(r'[^a-zA-Z0-9_\\s]+', '', file_data)\n",
	" file_data = file_data.strip()\n",
	" # append at the end of the dataframe\n",
	" df.loc[len(df.index)] = [0, file_data]\n",
	"\n",
	"\n",
	"\"\"\"\n",
	"Split the data into test and train data\n",
	"\"\"\"\n",
	"\n",
	"x = df.review.values\n",
	"y = df.target.values\n",
	"x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=20)\n",
	"\n",
	"y_train = y_train.astype(int)\n",
	"y_test = y_test.astype(int)\n",
	"\n",
	"\n",
	"\"\"\"\n",
	"Create a count vectorizer\n",
	"\"\"\"\n",
	"\n",
	"vectorizer = CountVectorizer(stop_words=['is','are','and','in','the'])\n",
	"vectorizer.fit(x_train)\n",
	"\n",
	"\n",
	"\"\"\"\n",
	"Transform text (independent feature) into numerical type using count vectorizer\n",
	"\"\"\"\n",
	"\n",
	"X_train = vectorizer.transform(x_train)\n",
	"X_test = vectorizer.transform(x_test)\n",
	"\n",
	"\n",
	"\"\"\"\n",
	"Create a logistic regression model\n",
	"\"\"\"\n",
	"\n",
	"classifier = LogisticRegression(max_iter=100)\n",
	"classifier.fit(X_train, y_train)\n",
	"\n",
	"\n",
	"\"\"\"\n",
	"Compute testing accuracy\n",
	"\"\"\"\n",
	"\n",
	"score = classifier.score(X_test, y_test)\n",
	"\n",
	"print(\"Accuracy: \", (score*100), '%', sep='')\n"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Accuracy: 84.0%\n"
	],
	"name": "stdout"
	},
	{
	"output_type": "stream",
	"text": [
	"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
	"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
	"\n",
	"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
	" https://scikit-learn.org/stable/modules/preprocessing.html\n",
	"Please also refer to the documentation for alternative solver options:\n",
	" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
	" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
	],
	"name": "stderr"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "9jNX44DCPcDr",
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"outputId": "b5a4d6f3-0137-4110-8296-417e70b196cc"
	},
	"source": [
	"\"\"\"\n",
	"Logistic regression vectorized implementation\n",
	"\"\"\"\n",
	"\n",
	"def h_theta(z):\n",
	" return 1/ (1 + np.exp(-z))\n",
	"\n",
	"def costFn(theta, x, y):\n",
	" h_theta_x = h_theta(np.dot(x, theta))\n",
	" cost = (-y * np.log(h_theta_x)) - ((1 - y) * np.log(1 - h_theta_x))\n",
	" j_theta = 1/m * sum(cost)\n",
	" deviation = 1 / m * np.dot(x.transpose(), (h_theta_x - y))\n",
	" return j_theta[0], deviation\n",
	"\n",
	"def gradientDescent(x,y,theta,alpha,num_iters):\n",
	" for i in range(num_iters):\n",
	" cost, dev = costFn(theta, x, y)\n",
	" theta = theta - (alpha * dev)\n",
	" return theta\n",
	"\n",
	"def predictClass(theta,x):\n",
	" predictions = x.dot(theta)\n",
	" return predictions > 0\n",
	"\n",
	"m = X_train.shape[0]\n",
	"n = df.shape[1] - 1\n",
	"\n",
	"X_train = X_train.toarray()\n",
	"X_train = np.array(X_train)\n",
	"\n",
	"x = []\n",
	"maxx = 0\n",
	"for i in range(X_train.shape[0]):\n",
	" maxx = max(maxx, np.sum(X_train[i]))\n",
	"for i in range(X_train.shape[0]):\n",
	" x.append([1, (np.sum(X_train[i]) / maxx)])\n",
	"x = np.asarray(x)\n",
	"\n",
	"y = np.asarray(y_train)\n",
	"y = y.reshape(m, 1)\n",
	"x = x.reshape(m, 2)\n",
	"\n",
	"theta = np.zeros((n + 1,1))\n",
	"cost, deviation = costFn(theta,x,y)\n",
	"\n",
	"theta = gradientDescent(x, y, theta, 0.1, 1000)\n",
	"\n",
	"y_pred = predictClass(theta, x)\n",
	"\n",
	"print(\"training Accuracy: \", (sum(y_pred == y)[0] / m) * 100,\"%\", sep='')\n"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"training Accuracy: 55.875%\n"
	],
	"name": "stdout"
	}
	]
	}
	]
	}