Skip to content

Instantly share code, notes, and snippets.

@herokyar
Created May 27, 2015 23:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save herokyar/5ffb653df69139a3e2b2 to your computer and use it in GitHub Desktop.
Save herokyar/5ffb653df69139a3e2b2 to your computer and use it in GitHub Desktop.
Crowdpac document classification
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:5d2e7d234e6e98bdca13d3d067a55a6515efbda6f41684fc30252ce9d5296d68"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Crowdpac homework -> by hasan erokyar 5/26/2015\n",
"\n"
]
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"predicting document categories with Support Vector Machines (multi label classification)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.metrics import jaccard_similarity_score\n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.linear_model.logistic import LogisticRegression\n",
"\n",
"from sklearn.svm import SVC\n",
"\n",
"from IPython.display import Image"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 144
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"#create training dataframe"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#read data into dataframe (text column)\n",
"df_training_text = pd.read_table('crowdpac/training_set_text.csv')\n",
"df_training_text.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 4/26/2001--Introduced. Marriage IRA Fairness ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 2/13/2003--Introduced. RAFT (Restore Access t...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 6/26/2003--Introduced. RAFT (Restore Access t...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> To suspend temporarily the duty on 9,10-Anthra...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> To amend the Transportation Equity Act for the...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows \u00d7 1 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 38,
"text": [
" text\n",
"0 4/26/2001--Introduced. Marriage IRA Fairness ...\n",
"1 2/13/2003--Introduced. RAFT (Restore Access t...\n",
"2 6/26/2003--Introduced. RAFT (Restore Access t...\n",
"3 To suspend temporarily the duty on 9,10-Anthra...\n",
"4 To amend the Transportation Equity Act for the...\n",
"\n",
"[5 rows x 1 columns]"
]
}
],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#read data into dataframe (labels column)\n",
"df_training_label = pd.read_table('crowdpac/training_set_label.csv')\n",
"df_training_label.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>labels</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> economy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> economy energy transportation</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> economy energy transportation</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> economy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> transportation economy</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows \u00d7 1 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 53,
"text": [
" labels\n",
"0 economy\n",
"1 economy energy transportation\n",
"2 economy energy transportation\n",
"3 economy\n",
"4 transportation economy\n",
"\n",
"[5 rows x 1 columns]"
]
}
],
"prompt_number": 53
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#determine the unique categories\n",
"categories = set()\n",
"for m in df_training_label.labels:\n",
" categories.update(g for g in m.split(' '))\n",
"categories = sorted(categories)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 43
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print type(categories)\n",
"print categories"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"<type 'list'>\n",
"['agriculture', 'banking_and_finance', 'civil_rights', 'congress_and_procedural', 'crime', 'defense_and_foreign_policy', 'economy', 'education', 'emergency', 'energy', 'environment', 'guns', 'healthcare', 'immigration', 'labor', 'law_courts_and_judges', 'parks_and_recreation', 'sports', 'transportation', 'womens_issues']\n"
]
}
],
"prompt_number": 45
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#make a column for each category (don't use this part: it produces boolean True/False values, whereas scikit-learn needs numeric labels)\n",
"\n",
"#for category in categories:\n",
"# df_training_label[category] = [category in c.split(' ') for c in df_training_label.labels]\n",
" \n",
"#df_training_label.head()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#make a column for each category\n",
"for category in categories:\n",
" df_training_label[category] = [1 if category in c.split(' ') else 0 for c in df_training_label.labels] #list comprehension with if/else\n",
" \n",
"df_training_label.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>labels</th>\n",
" <th>agriculture</th>\n",
" <th>banking_and_finance</th>\n",
" <th>civil_rights</th>\n",
" <th>congress_and_procedural</th>\n",
" <th>crime</th>\n",
" <th>defense_and_foreign_policy</th>\n",
" <th>economy</th>\n",
" <th>education</th>\n",
" <th>emergency</th>\n",
" <th>energy</th>\n",
" <th>environment</th>\n",
" <th>guns</th>\n",
" <th>healthcare</th>\n",
" <th>immigration</th>\n",
" <th>labor</th>\n",
" <th>law_courts_and_judges</th>\n",
" <th>parks_and_recreation</th>\n",
" <th>sports</th>\n",
" <th>transportation</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> economy</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> economy energy transportation</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> economy energy transportation</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> economy</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> transportation economy</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td>...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows \u00d7 21 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 57,
"text": [
" labels agriculture banking_and_finance \\\n",
"0 economy 0 0 \n",
"1 economy energy transportation 0 0 \n",
"2 economy energy transportation 0 0 \n",
"3 economy 0 0 \n",
"4 transportation economy 0 0 \n",
"\n",
" civil_rights congress_and_procedural crime defense_and_foreign_policy \\\n",
"0 0 0 0 0 \n",
"1 0 0 0 0 \n",
"2 0 0 0 0 \n",
"3 0 0 0 0 \n",
"4 0 0 0 0 \n",
"\n",
" economy education emergency energy environment guns healthcare \\\n",
"0 1 0 0 0 0 0 0 \n",
"1 1 0 0 1 0 0 0 \n",
"2 1 0 0 1 0 0 0 \n",
"3 1 0 0 0 0 0 0 \n",
"4 1 0 0 0 0 0 0 \n",
"\n",
" immigration labor law_courts_and_judges parks_and_recreation sports \\\n",
"0 0 0 0 0 0 \n",
"1 0 0 0 0 0 \n",
"2 0 0 0 0 0 \n",
"3 0 0 0 0 0 \n",
"4 0 0 0 0 0 \n",
"\n",
" transportation \n",
"0 0 ... \n",
"1 1 ... \n",
"2 1 ... \n",
"3 0 ... \n",
"4 1 ... \n",
"\n",
"[5 rows x 21 columns]"
]
}
],
"prompt_number": 57
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#concatenate dataframes\n",
"df_training = pd.concat([df_training_text, df_training_label], axis=1)\n",
"df_training.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>labels</th>\n",
" <th>agriculture</th>\n",
" <th>banking_and_finance</th>\n",
" <th>civil_rights</th>\n",
" <th>congress_and_procedural</th>\n",
" <th>crime</th>\n",
" <th>defense_and_foreign_policy</th>\n",
" <th>economy</th>\n",
" <th>education</th>\n",
" <th>emergency</th>\n",
" <th>energy</th>\n",
" <th>environment</th>\n",
" <th>guns</th>\n",
" <th>healthcare</th>\n",
" <th>immigration</th>\n",
" <th>labor</th>\n",
" <th>law_courts_and_judges</th>\n",
" <th>parks_and_recreation</th>\n",
" <th>sports</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 4/26/2001--Introduced. Marriage IRA Fairness ...</td>\n",
" <td> economy</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 2/13/2003--Introduced. RAFT (Restore Access t...</td>\n",
" <td> economy energy transportation</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 6/26/2003--Introduced. RAFT (Restore Access t...</td>\n",
" <td> economy energy transportation</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> To suspend temporarily the duty on 9,10-Anthra...</td>\n",
" <td> economy</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> To amend the Transportation Equity Act for the...</td>\n",
" <td> transportation economy</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td>...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows \u00d7 22 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 190,
"text": [
" text \\\n",
"0 4/26/2001--Introduced. Marriage IRA Fairness ... \n",
"1 2/13/2003--Introduced. RAFT (Restore Access t... \n",
"2 6/26/2003--Introduced. RAFT (Restore Access t... \n",
"3 To suspend temporarily the duty on 9,10-Anthra... \n",
"4 To amend the Transportation Equity Act for the... \n",
"\n",
" labels agriculture banking_and_finance \\\n",
"0 economy 0 0 \n",
"1 economy energy transportation 0 0 \n",
"2 economy energy transportation 0 0 \n",
"3 economy 0 0 \n",
"4 transportation economy 0 0 \n",
"\n",
" civil_rights congress_and_procedural crime defense_and_foreign_policy \\\n",
"0 0 0 0 0 \n",
"1 0 0 0 0 \n",
"2 0 0 0 0 \n",
"3 0 0 0 0 \n",
"4 0 0 0 0 \n",
"\n",
" economy education emergency energy environment guns healthcare \\\n",
"0 1 0 0 0 0 0 0 \n",
"1 1 0 0 1 0 0 0 \n",
"2 1 0 0 1 0 0 0 \n",
"3 1 0 0 0 0 0 0 \n",
"4 1 0 0 0 0 0 0 \n",
"\n",
" immigration labor law_courts_and_judges parks_and_recreation sports \n",
"0 0 0 0 0 0 ... \n",
"1 0 0 0 0 0 ... \n",
"2 0 0 0 0 0 ... \n",
"3 0 0 0 0 0 ... \n",
"4 0 0 0 0 0 ... \n",
"\n",
"[5 rows x 22 columns]"
]
}
],
"prompt_number": 190
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"#create test dataframe"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#read data into dataframe (text column)\n",
"df_test_text = pd.read_table('crowdpac/test_set_text.csv')\n",
"df_test_text.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1/7/2003--Introduced. Investment Tax Incentiv...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 7/19/2001--Introduced. Customs Business Fairn...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 4/11/2002--Introduced. Senior Nutrition Act o...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 7/11/2002--Introduced. Senior Nutrition Act o...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> mr. president, i ask unanimous consent that wh...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows \u00d7 1 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 79,
"text": [
" text\n",
"0 1/7/2003--Introduced. Investment Tax Incentiv...\n",
"1 7/19/2001--Introduced. Customs Business Fairn...\n",
"2 4/11/2002--Introduced. Senior Nutrition Act o...\n",
"3 7/11/2002--Introduced. Senior Nutrition Act o...\n",
"4 mr. president, i ask unanimous consent that wh...\n",
"\n",
"[5 rows x 1 columns]"
]
}
],
"prompt_number": 79
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#read data into dataframe (labels column)\n",
"df_test_label = pd.read_table('crowdpac/test_set_label.csv')\n",
"df_test_label.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>labels</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> economy banking_and_finance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> banking_and_finance economy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> agriculture economy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> agriculture economy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> congress_and_procedural</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows \u00d7 1 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 80,
"text": [
" labels\n",
"0 economy banking_and_finance\n",
"1 banking_and_finance economy\n",
"2 agriculture economy\n",
"3 agriculture economy\n",
"4 congress_and_procedural\n",
"\n",
"[5 rows x 1 columns]"
]
}
],
"prompt_number": 80
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#determine the unique categories\n",
"categories = set()\n",
"for m in df_test_label.labels:\n",
" categories.update(g for g in m.split(' '))\n",
"categories = sorted(categories)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 81
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print type(categories)\n",
"print categories"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"<type 'list'>\n",
"['agriculture', 'banking_and_finance', 'civil_rights', 'congress_and_procedural', 'crime', 'doc_labels', 'economy', 'education', 'emergency', 'energy', 'environment', 'guns', 'healthcare', 'immigration', 'labor', 'law_courts_and_judges', 'parks_and_recreation', 'sports', 'transportation', 'womens_issues']\n"
]
}
],
"prompt_number": 82
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"one observation: the \"defense_and_foreign_policy\" column of the training set must correspond to the column named \"doc_labels\" in the test set"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df_test = pd.concat([df_test_text, df_test_label], axis=1)\n",
"df_test.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>labels</th>\n",
" <th>agriculture</th>\n",
" <th>banking_and_finance</th>\n",
" <th>civil_rights</th>\n",
" <th>congress_and_procedural</th>\n",
" <th>crime</th>\n",
" <th>doc_labels</th>\n",
" <th>economy</th>\n",
" <th>education</th>\n",
" <th>emergency</th>\n",
" <th>energy</th>\n",
" <th>environment</th>\n",
" <th>guns</th>\n",
" <th>healthcare</th>\n",
" <th>immigration</th>\n",
" <th>labor</th>\n",
" <th>law_courts_and_judges</th>\n",
" <th>parks_and_recreation</th>\n",
" <th>sports</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1/7/2003--Introduced. Investment Tax Incentiv...</td>\n",
" <td> economy banking_and_finance</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 7/19/2001--Introduced. Customs Business Fairn...</td>\n",
" <td> banking_and_finance economy</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 4/11/2002--Introduced. Senior Nutrition Act o...</td>\n",
" <td> agriculture economy</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 7/11/2002--Introduced. Senior Nutrition Act o...</td>\n",
" <td> agriculture economy</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> mr. president, i ask unanimous consent that wh...</td>\n",
" <td> congress_and_procedural</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td>...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows \u00d7 22 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 84,
"text": [
" text \\\n",
"0 1/7/2003--Introduced. Investment Tax Incentiv... \n",
"1 7/19/2001--Introduced. Customs Business Fairn... \n",
"2 4/11/2002--Introduced. Senior Nutrition Act o... \n",
"3 7/11/2002--Introduced. Senior Nutrition Act o... \n",
"4 mr. president, i ask unanimous consent that wh... \n",
"\n",
" labels agriculture banking_and_finance \\\n",
"0 economy banking_and_finance 0 1 \n",
"1 banking_and_finance economy 0 1 \n",
"2 agriculture economy 1 0 \n",
"3 agriculture economy 1 0 \n",
"4 congress_and_procedural 0 0 \n",
"\n",
" civil_rights congress_and_procedural crime doc_labels economy \\\n",
"0 0 0 0 0 1 \n",
"1 0 0 0 0 1 \n",
"2 0 0 0 0 1 \n",
"3 0 0 0 0 1 \n",
"4 0 1 0 0 0 \n",
"\n",
" education emergency energy environment guns healthcare immigration \\\n",
"0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 0 0 \n",
"3 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 0 \n",
"\n",
" labor law_courts_and_judges parks_and_recreation sports \n",
"0 0 0 0 0 ... \n",
"1 0 0 0 0 ... \n",
"2 0 0 0 0 ... \n",
"3 0 0 0 0 ... \n",
"4 0 0 0 0 ... \n",
"\n",
"[5 rows x 22 columns]"
]
}
],
"prompt_number": 84
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"#classification with Support Vector Machines"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#create a TfidfVectorizer (this will transform training and test set so that we can train an SVC model)\n",
"vectorizer = TfidfVectorizer()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 89
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"X_train = vectorizer.fit_transform(df_training.text)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 92
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"X_test = vectorizer.transform(df_test.text)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 93
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#make prediction with scikit-learn\n",
"#train the classifier (binary classification in this case)\n",
"#classifier = LogisticRegression()\n",
"\n",
"classifier = SVC(kernel='linear')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 177
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"classifier.fit(X_train, df_training.economy) #fit the model"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 179,
"text": [
"SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n",
" kernel='linear', max_iter=-1, probability=False, random_state=None,\n",
" shrinking=True, tol=0.001, verbose=False)"
]
}
],
"prompt_number": 179
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"predictions = classifier.predict(X_test) #make prediction on the test set -> predictions is a numpy array"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 180
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print type(predictions)\n",
"print len(predictions)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"<type 'numpy.ndarray'>\n",
"248\n"
]
}
],
"prompt_number": 181
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"predictions[:20]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 182,
"text": [
"array([1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0], dtype=int64)"
]
}
],
"prompt_number": 182
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for i in df_training.ix[:, 'agriculture':]: #this is how to select sub set of a dataframe\n",
" print i"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"agriculture\n",
"banking_and_finance\n",
"civil_rights\n",
"congress_and_procedural\n",
"crime\n",
"defense_and_foreign_policy\n",
"economy\n",
"education\n",
"emergency\n",
"energy\n",
"environment\n",
"guns\n",
"healthcare\n",
"immigration\n",
"labor\n",
"law_courts_and_judges\n",
"parks_and_recreation\n",
"sports\n",
"transportation\n",
"womens_issues\n"
]
}
],
"prompt_number": 183
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"#generalize the prediction for all the categories"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"all_predictions = [] #store all the predictions for each category\n",
"\n",
"for i in df_training.ix[:, 'agriculture':]:\n",
" \n",
" classifier.fit(X_train, df_training[i]) #fit the model\n",
" \n",
" predictions = classifier.predict(X_test) #make prediction on the test set -> predictions is a numpy array\n",
" \n",
" all_predictions.append(predictions)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 184
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print len(all_predictions)\n",
"print all_predictions[0] #agriculture\n",
"print all_predictions[1] #banking_and_finance"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"20\n",
"[0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0\n",
" 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
"[1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0\n",
" 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 1 1 1\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]\n"
]
}
],
"prompt_number": 185
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"adict = {} #create an empty dictionary to store category names and also its predictions\n",
"\n",
"for i,category in enumerate(df_training.ix[:, 'agriculture':]): #this is how to select sub set of a dataframe\n",
" #print i\n",
" \n",
" adict[category] = all_predictions[i]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 186
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df_all_predictions = pd.DataFrame(adict)\n",
"df_all_predictions.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>agriculture</th>\n",
" <th>banking_and_finance</th>\n",
" <th>civil_rights</th>\n",
" <th>congress_and_procedural</th>\n",
" <th>crime</th>\n",
" <th>defense_and_foreign_policy</th>\n",
" <th>economy</th>\n",
" <th>education</th>\n",
" <th>emergency</th>\n",
" <th>energy</th>\n",
" <th>environment</th>\n",
" <th>guns</th>\n",
" <th>healthcare</th>\n",
" <th>immigration</th>\n",
" <th>labor</th>\n",
" <th>law_courts_and_judges</th>\n",
" <th>parks_and_recreation</th>\n",
" <th>sports</th>\n",
" <th>transportation</th>\n",
" <th>womens_issues</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows \u00d7 20 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 187,
"text": [
" agriculture banking_and_finance civil_rights congress_and_procedural \\\n",
"0 0 1 0 0 \n",
"1 0 1 0 0 \n",
"2 1 0 0 0 \n",
"3 1 0 0 0 \n",
"4 0 0 0 1 \n",
"\n",
" crime defense_and_foreign_policy economy education emergency energy \\\n",
"0 0 0 1 0 0 0 \n",
"1 0 0 1 0 0 0 \n",
"2 0 0 1 0 0 0 \n",
"3 0 0 1 0 0 0 \n",
"4 0 0 0 0 0 0 \n",
"\n",
" environment guns healthcare immigration labor law_courts_and_judges \\\n",
"0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 0 \n",
"3 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 \n",
"\n",
" parks_and_recreation sports transportation womens_issues \n",
"0 0 0 0 0 \n",
"1 0 0 0 0 \n",
"2 0 0 0 0 \n",
"3 0 0 0 0 \n",
"4 0 0 0 0 \n",
"\n",
"[5 rows x 20 columns]"
]
}
],
"prompt_number": 187
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 161
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"#measure the accuracy of the model"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df_test.head(2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>labels</th>\n",
" <th>agriculture</th>\n",
" <th>banking_and_finance</th>\n",
" <th>civil_rights</th>\n",
" <th>congress_and_procedural</th>\n",
" <th>crime</th>\n",
" <th>doc_labels</th>\n",
" <th>economy</th>\n",
" <th>education</th>\n",
" <th>emergency</th>\n",
" <th>energy</th>\n",
" <th>environment</th>\n",
" <th>guns</th>\n",
" <th>healthcare</th>\n",
" <th>immigration</th>\n",
" <th>labor</th>\n",
" <th>law_courts_and_judges</th>\n",
" <th>parks_and_recreation</th>\n",
" <th>sports</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1/7/2003--Introduced. Investment Tax Incentiv...</td>\n",
" <td> economy banking_and_finance</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 7/19/2001--Introduced. Customs Business Fairn...</td>\n",
" <td> banking_and_finance economy</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td>...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows \u00d7 22 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 188,
"text": [
" text \\\n",
"0 1/7/2003--Introduced. Investment Tax Incentiv... \n",
"1 7/19/2001--Introduced. Customs Business Fairn... \n",
"\n",
" labels agriculture banking_and_finance \\\n",
"0 economy banking_and_finance 0 1 \n",
"1 banking_and_finance economy 0 1 \n",
"\n",
" civil_rights congress_and_procedural crime doc_labels economy \\\n",
"0 0 0 0 0 1 \n",
"1 0 0 0 0 1 \n",
"\n",
" education emergency energy environment guns healthcare immigration \\\n",
"0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 \n",
"\n",
" labor law_courts_and_judges parks_and_recreation sports \n",
"0 0 0 0 0 ... \n",
"1 0 0 0 0 ... \n",
"\n",
"[2 rows x 22 columns]"
]
}
],
"prompt_number": 188
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"#measure the accucacy of the model with Jaccard Similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#calculate the jaccard similarity score\n",
"jaccard_similarity_score(df_all_predictions, df_test.ix[:, 'agriculture':])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 189,
"text": [
"0.78897849462365599"
]
}
],
"prompt_number": 189
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"# overall accuracy of the model is about %79 percent. the accuracy can also be improved with the cross validation and grid search"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment