Skip to content

Instantly share code, notes, and snippets.

@seanbenhur
Last active April 22, 2021 04:12
Show Gist options
  • Save seanbenhur/c4693800501aa05a82b06d22329f7d3e to your computer and use it in GitHub Desktop.
Save seanbenhur/c4693800501aa05a82b06d22329f7d3e to your computer and use it in GitHub Desktop.
Scripts .ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Scripts .ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyOM2qIA/w+PTB03InC0DIdd",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/seanbenhur/c4693800501aa05a82b06d22329f7d3e/scripts.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-eFyc-G7O3nL",
"outputId": "db33f12f-79b4-46c4-ea52-40724bb87ee0"
},
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.metrics import classification_report\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"from imblearn.pipeline import make_pipeline as make_pipeline_imb\n",
"from collections import Counter\n",
"\n",
"# Restrict 20 Newsgroups to four topics. The resulting class sizes are\n",
"# uneven, which is what the resampling comparison further down relies on.\n",
"categories = [\"alt.atheism\", \"talk.religion.misc\", \"comp.graphics\", \"sci.space\"]\n",
"\n",
"# Fetch the predefined train and test subsets for those categories.\n",
"newsgroups_train, newsgroups_test = (\n",
"    fetch_20newsgroups(subset=split, categories=categories)\n",
"    for split in (\"train\", \"test\")\n",
")\n",
"\n",
"# Raw documents are the features; integer category indices are the labels.\n",
"X_train, y_train = newsgroups_train.data, newsgroups_train.target\n",
"X_test, y_test = newsgroups_test.data, newsgroups_test.target"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"Downloading 20news dataset. This may take a few minutes.\n",
"Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "SHVU7LqkPrmA",
"outputId": "21e97173-ef36-4c64-a91a-9a93e9f5199a"
},
"source": [
"# Count documents per class in each split and print one summary line per split.\n",
"summary_lines = (\n",
"    f\"Training class distributions summary: {Counter(y_train)}\",\n",
"    f\"Test class distributions summary: {Counter(y_test)}\",\n",
")\n",
"print(*summary_lines, sep=\"\\n\")"
],
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"text": [
"Training class distributions summary: Counter({2: 593, 1: 584, 0: 480, 3: 377})\n",
"Test class distributions summary: Counter({2: 394, 1: 389, 0: 319, 3: 251})\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "vA3Iw6FURO9Z"
},
"source": [
"# Baseline: TF-IDF features fed into multinomial naive Bayes, trained on the\n",
"# training split as-is (no resampling step). fit() returns the pipeline itself.\n",
"model = make_pipeline(\n",
"    TfidfVectorizer(),\n",
"    MultinomialNB(),\n",
").fit(X_train, y_train)\n",
"\n",
"# Predictions on the held-out test documents, consumed by the report cell.\n",
"y_pred = model.predict(X_test)"
],
"execution_count": 15,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "w_T-Ieb4QKic",
"outputId": "ac21e6e0-82a7-49f4-bf70-b8ec8bdf7191"
},
"source": [
"# Per-class precision, recall and F1 for the current test-set predictions.\n",
"report = classification_report(y_test, y_pred)\n",
"print(report)"
],
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.67 0.94 0.79 319\n",
" 1 0.96 0.92 0.94 389\n",
" 2 0.87 0.98 0.92 394\n",
" 3 0.97 0.36 0.52 251\n",
"\n",
" accuracy 0.84 1353\n",
" macro avg 0.87 0.80 0.79 1353\n",
"weighted avg 0.87 0.84 0.82 1353\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YDFPmdKqQE8q",
"outputId": "a99461a4-ccc2-43c3-bf9a-70013ef06120"
},
"source": [
"# Same TF-IDF + naive Bayes pipeline, with random under-sampling inserted\n",
"# between the two; imblearn's pipeline is used because it accepts sampler steps.\n",
"# The sampler picks rows at random, so its seed is fixed to keep the fitted\n",
"# model and the metrics reported from it stable across re-runs.\n",
"model = make_pipeline_imb(\n",
"    TfidfVectorizer(),\n",
"    RandomUnderSampler(random_state=42),\n",
"    MultinomialNB(),\n",
")\n",
"\n",
"model.fit(X_train, y_train)\n",
"y_pred = model.predict(X_test)"
],
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.\n",
" warnings.warn(msg, category=FutureWarning)\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RpLrlyEBQe55",
"outputId": "5f846bb9-e9fb-42f3-bf5e-a88b18573c80"
},
"source": [
"# Score the most recently computed y_pred against the held-out test labels.\n",
"print(classification_report(y_true=y_test, y_pred=y_pred))"
],
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.73 0.87 0.79 319\n",
" 1 0.97 0.86 0.91 389\n",
" 2 0.95 0.90 0.93 394\n",
" 3 0.76 0.76 0.76 251\n",
"\n",
" accuracy 0.86 1353\n",
" macro avg 0.85 0.85 0.85 1353\n",
"weighted avg 0.87 0.86 0.86 1353\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "b5VD7SU0QlbC"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment