seanbenhur/scripts.ipynb

## scripts.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Scripts .ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "authorship_tag": "ABX9TyOM2qIA/w+PTB03InC0DIdd",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/seanbenhur/c4693800501aa05a82b06d22329f7d3e/scripts.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-eFyc-G7O3nL",
        "outputId": "db33f12f-79b4-46c4-ea52-40724bb87ee0"
      },
      "source": [
        "from sklearn.datasets import fetch_20newsgroups\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.naive_bayes import MultinomialNB\n",
        "from sklearn.pipeline import make_pipeline\n",
        "from sklearn.metrics import classification_report\n",
        "from imblearn.under_sampling import RandomUnderSampler\n",
        "from imblearn.pipeline import make_pipeline as make_pipeline_imb\n",
        "from collections import Counter\n",
        "\n",
        "categories = [\n",
        "    \"alt.atheism\",\n",
        "    \"talk.religion.misc\",\n",
        "    \"comp.graphics\",\n",
        "    \"sci.space\",\n",
        "]\n",
        "newsgroups_train = fetch_20newsgroups(subset=\"train\", categories=categories)\n",
        "newsgroups_test = fetch_20newsgroups(subset=\"test\", categories=categories)\n",
        "\n",
        "X_train = newsgroups_train.data\n",
        "X_test = newsgroups_test.data\n",
        "\n",
        "y_train = newsgroups_train.target\n",
        "y_test = newsgroups_test.target"
      ],
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Downloading 20news dataset. This may take a few minutes.\n",
            "Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "SHVU7LqkPrmA",
        "outputId": "21e97173-ef36-4c64-a91a-9a93e9f5199a"
      },
      "source": [
        "print(f\"Training class distributions summary: {Counter(y_train)}\")\n",
        "print(f\"Test class distributions summary: {Counter(y_test)}\")"
      ],
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Training class distributions summary: Counter({2: 593, 1: 584, 0: 480, 3: 377})\n",
            "Test class distributions summary: Counter({2: 394, 1: 389, 0: 319, 3: 251})\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vA3Iw6FURO9Z"
      },
      "source": [
        "model = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",
        "model.fit(X_train, y_train)\n",
        "y_pred = model.predict(X_test)"
      ],
      "execution_count": 15,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "w_T-Ieb4QKic",
        "outputId": "ac21e6e0-82a7-49f4-bf70-b8ec8bdf7191"
      },
      "source": [
        "print(classification_report(y_test,y_pred))"
      ],
      "execution_count": 16,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0       0.67      0.94      0.79       319\n",
            "           1       0.96      0.92      0.94       389\n",
            "           2       0.87      0.98      0.92       394\n",
            "           3       0.97      0.36      0.52       251\n",
            "\n",
            "    accuracy                           0.84      1353\n",
            "   macro avg       0.87      0.80      0.79      1353\n",
            "weighted avg       0.87      0.84      0.82      1353\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YDFPmdKqQE8q",
        "outputId": "a99461a4-ccc2-43c3-bf9a-70013ef06120"
      },
      "source": [
        "model = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(), MultinomialNB())\n",
        "\n",
        "model.fit(X_train, y_train)\n",
        "y_pred = model.predict(X_test)"
      ],
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.\n",
            "  warnings.warn(msg, category=FutureWarning)\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "RpLrlyEBQe55",
        "outputId": "5f846bb9-e9fb-42f3-bf5e-a88b18573c80"
      },
      "source": [
        "print(classification_report(y_test,y_pred))"
      ],
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0       0.73      0.87      0.79       319\n",
            "           1       0.97      0.86      0.91       389\n",
            "           2       0.95      0.90      0.93       394\n",
            "           3       0.76      0.76      0.76       251\n",
            "\n",
            "    accuracy                           0.86      1353\n",
            "   macro avg       0.85      0.85      0.85      1353\n",
            "weighted avg       0.87      0.86      0.86      1353\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "b5VD7SU0QlbC"
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "Scripts .ipynb",
	"provenance": [],
	"collapsed_sections": [],
	"authorship_tag": "ABX9TyOM2qIA/w+PTB03InC0DIdd",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/seanbenhur/c4693800501aa05a82b06d22329f7d3e/scripts.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "-eFyc-G7O3nL",
	"outputId": "db33f12f-79b4-46c4-ea52-40724bb87ee0"
	},
	"source": [
	"from sklearn.datasets import fetch_20newsgroups\n",
	"from sklearn.feature_extraction.text import TfidfVectorizer\n",
	"from sklearn.naive_bayes import MultinomialNB\n",
	"from sklearn.pipeline import make_pipeline\n",
	"from sklearn.metrics import classification_report\n",
	"from imblearn.under_sampling import RandomUnderSampler\n",
	"from imblearn.pipeline import make_pipeline as make_pipeline_imb\n",
	"from collections import Counter\n",
	"\n",
	"categories = [\n",
	" \"alt.atheism\",\n",
	" \"talk.religion.misc\",\n",
	" \"comp.graphics\",\n",
	" \"sci.space\",\n",
	"]\n",
	"newsgroups_train = fetch_20newsgroups(subset=\"train\", categories=categories)\n",
	"newsgroups_test = fetch_20newsgroups(subset=\"test\", categories=categories)\n",
	"\n",
	"X_train = newsgroups_train.data\n",
	"X_test = newsgroups_test.data\n",
	"\n",
	"y_train = newsgroups_train.target\n",
	"y_test = newsgroups_test.target"
	],
	"execution_count": 2,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Downloading 20news dataset. This may take a few minutes.\n",
	"Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)\n"
	],
	"name": "stderr"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "SHVU7LqkPrmA",
	"outputId": "21e97173-ef36-4c64-a91a-9a93e9f5199a"
	},
	"source": [
	"print(f\"Training class distributions summary: {Counter(y_train)}\")\n",
	"print(f\"Test class distributions summary: {Counter(y_test)}\")"
	],
	"execution_count": 14,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Training class distributions summary: Counter({2: 593, 1: 584, 0: 480, 3: 377})\n",
	"Test class distributions summary: Counter({2: 394, 1: 389, 0: 319, 3: 251})\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "vA3Iw6FURO9Z"
	},
	"source": [
	"model = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",
	"model.fit(X_train, y_train)\n",
	"y_pred = model.predict(X_test)"
	],
	"execution_count": 15,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "w_T-Ieb4QKic",
	"outputId": "ac21e6e0-82a7-49f4-bf70-b8ec8bdf7191"
	},
	"source": [
	"print(classification_report(y_test,y_pred))"
	],
	"execution_count": 16,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	" precision recall f1-score support\n",
	"\n",
	" 0 0.67 0.94 0.79 319\n",
	" 1 0.96 0.92 0.94 389\n",
	" 2 0.87 0.98 0.92 394\n",
	" 3 0.97 0.36 0.52 251\n",
	"\n",
	" accuracy 0.84 1353\n",
	" macro avg 0.87 0.80 0.79 1353\n",
	"weighted avg 0.87 0.84 0.82 1353\n",
	"\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "YDFPmdKqQE8q",
	"outputId": "a99461a4-ccc2-43c3-bf9a-70013ef06120"
	},
	"source": [
	"model = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(), MultinomialNB())\n",
	"\n",
	"model.fit(X_train, y_train)\n",
	"y_pred = model.predict(X_test)"
	],
	"execution_count": 12,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.\n",
	" warnings.warn(msg, category=FutureWarning)\n"
	],
	"name": "stderr"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "RpLrlyEBQe55",
	"outputId": "5f846bb9-e9fb-42f3-bf5e-a88b18573c80"
	},
	"source": [
	"print(classification_report(y_test,y_pred))"
	],
	"execution_count": 13,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	" precision recall f1-score support\n",
	"\n",
	" 0 0.73 0.87 0.79 319\n",
	" 1 0.97 0.86 0.91 389\n",
	" 2 0.95 0.90 0.93 394\n",
	" 3 0.76 0.76 0.76 251\n",
	"\n",
	" accuracy 0.86 1353\n",
	" macro avg 0.85 0.85 0.85 1353\n",
	"weighted avg 0.87 0.86 0.86 1353\n",
	"\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "b5VD7SU0QlbC"
	},
	"source": [
	""
	],
	"execution_count": null,
	"outputs": []
	}
	]
	}