Created
May 27, 2015 23:44
-
-
Save herokyar/5ffb653df69139a3e2b2 to your computer and use it in GitHub Desktop.
Crowdpac document classification
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:5d2e7d234e6e98bdca13d3d067a55a6515efbda6f41684fc30252ce9d5296d68" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Crowdpac homework -> by hasan erokyar 5/26/2015\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Predicting document categories with Support Vector Machines (multi-label classification)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"from sklearn.metrics import jaccard_similarity_score\n", | |
"\n", | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
"from sklearn.linear_model.logistic import LogisticRegression\n", | |
"\n", | |
"from sklearn.svm import SVC\n", | |
"\n", | |
"from IPython.display import Image" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 144 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Create training dataframe" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#read data into dataframe (text column)\n", | |
"df_training_text = pd.read_table('crowdpac/training_set_text.csv')\n", | |
"df_training_text.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>text</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 4/26/2001--Introduced. Marriage IRA Fairness ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 2/13/2003--Introduced. RAFT (Restore Access t...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 6/26/2003--Introduced. RAFT (Restore Access t...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> To suspend temporarily the duty on 9,10-Anthra...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> To amend the Transportation Equity Act for the...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows \u00d7 1 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 38, | |
"text": [ | |
" text\n", | |
"0 4/26/2001--Introduced. Marriage IRA Fairness ...\n", | |
"1 2/13/2003--Introduced. RAFT (Restore Access t...\n", | |
"2 6/26/2003--Introduced. RAFT (Restore Access t...\n", | |
"3 To suspend temporarily the duty on 9,10-Anthra...\n", | |
"4 To amend the Transportation Equity Act for the...\n", | |
"\n", | |
"[5 rows x 1 columns]" | |
] | |
} | |
], | |
"prompt_number": 38 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#read data into dataframe (labels column)\n", | |
"df_training_label = pd.read_table('crowdpac/training_set_label.csv')\n", | |
"df_training_label.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>labels</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> economy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> economy energy transportation</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> economy energy transportation</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> economy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> transportation economy</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows \u00d7 1 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 53, | |
"text": [ | |
" labels\n", | |
"0 economy\n", | |
"1 economy energy transportation\n", | |
"2 economy energy transportation\n", | |
"3 economy\n", | |
"4 transportation economy\n", | |
"\n", | |
"[5 rows x 1 columns]" | |
] | |
} | |
], | |
"prompt_number": 53 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#determine the unique categories\n", | |
"categories = set()\n", | |
"for m in df_training_label.labels:\n", | |
" categories.update(g for g in m.split(' '))\n", | |
"categories = sorted(categories)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 43 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print type(categories)\n", | |
"print categories" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<type 'list'>\n", | |
"['agriculture', 'banking_and_finance', 'civil_rights', 'congress_and_procedural', 'crime', 'defense_and_foreign_policy', 'economy', 'education', 'emergency', 'energy', 'environment', 'guns', 'healthcare', 'immigration', 'labor', 'law_courts_and_judges', 'parks_and_recreation', 'sports', 'transportation', 'womens_issues']\n" | |
] | |
} | |
], | |
"prompt_number": 45 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#make a column for each category (don't use this part: it produces boolean True/False values, whereas the 0/1 integer version further below is what gets passed to scikit-learn)\n", | |
"\n", | |
"#for category in categories:\n", | |
"# df_training_label[category] = [category in c.split(' ') for c in df_training_label.labels]\n", | |
" \n", | |
"#df_training_label.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#make a column for each category\n", | |
"for category in categories:\n", | |
" df_training_label[category] = [1 if category in c.split(' ') else 0 for c in df_training_label.labels] #list comprehension with if/else\n", | |
" \n", | |
"df_training_label.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>labels</th>\n", | |
" <th>agriculture</th>\n", | |
" <th>banking_and_finance</th>\n", | |
" <th>civil_rights</th>\n", | |
" <th>congress_and_procedural</th>\n", | |
" <th>crime</th>\n", | |
" <th>defense_and_foreign_policy</th>\n", | |
" <th>economy</th>\n", | |
" <th>education</th>\n", | |
" <th>emergency</th>\n", | |
" <th>energy</th>\n", | |
" <th>environment</th>\n", | |
" <th>guns</th>\n", | |
" <th>healthcare</th>\n", | |
" <th>immigration</th>\n", | |
" <th>labor</th>\n", | |
" <th>law_courts_and_judges</th>\n", | |
" <th>parks_and_recreation</th>\n", | |
" <th>sports</th>\n", | |
" <th>transportation</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> economy</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> economy energy transportation</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> economy energy transportation</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> economy</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> transportation economy</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows \u00d7 21 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 57, | |
"text": [ | |
" labels agriculture banking_and_finance \\\n", | |
"0 economy 0 0 \n", | |
"1 economy energy transportation 0 0 \n", | |
"2 economy energy transportation 0 0 \n", | |
"3 economy 0 0 \n", | |
"4 transportation economy 0 0 \n", | |
"\n", | |
" civil_rights congress_and_procedural crime defense_and_foreign_policy \\\n", | |
"0 0 0 0 0 \n", | |
"1 0 0 0 0 \n", | |
"2 0 0 0 0 \n", | |
"3 0 0 0 0 \n", | |
"4 0 0 0 0 \n", | |
"\n", | |
" economy education emergency energy environment guns healthcare \\\n", | |
"0 1 0 0 0 0 0 0 \n", | |
"1 1 0 0 1 0 0 0 \n", | |
"2 1 0 0 1 0 0 0 \n", | |
"3 1 0 0 0 0 0 0 \n", | |
"4 1 0 0 0 0 0 0 \n", | |
"\n", | |
" immigration labor law_courts_and_judges parks_and_recreation sports \\\n", | |
"0 0 0 0 0 0 \n", | |
"1 0 0 0 0 0 \n", | |
"2 0 0 0 0 0 \n", | |
"3 0 0 0 0 0 \n", | |
"4 0 0 0 0 0 \n", | |
"\n", | |
" transportation \n", | |
"0 0 ... \n", | |
"1 1 ... \n", | |
"2 1 ... \n", | |
"3 0 ... \n", | |
"4 1 ... \n", | |
"\n", | |
"[5 rows x 21 columns]" | |
] | |
} | |
], | |
"prompt_number": 57 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#concatenate dataframes\n", | |
"df_training = pd.concat([df_training_text, df_training_label], axis=1)\n", | |
"df_training.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>text</th>\n", | |
" <th>labels</th>\n", | |
" <th>agriculture</th>\n", | |
" <th>banking_and_finance</th>\n", | |
" <th>civil_rights</th>\n", | |
" <th>congress_and_procedural</th>\n", | |
" <th>crime</th>\n", | |
" <th>defense_and_foreign_policy</th>\n", | |
" <th>economy</th>\n", | |
" <th>education</th>\n", | |
" <th>emergency</th>\n", | |
" <th>energy</th>\n", | |
" <th>environment</th>\n", | |
" <th>guns</th>\n", | |
" <th>healthcare</th>\n", | |
" <th>immigration</th>\n", | |
" <th>labor</th>\n", | |
" <th>law_courts_and_judges</th>\n", | |
" <th>parks_and_recreation</th>\n", | |
" <th>sports</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 4/26/2001--Introduced. Marriage IRA Fairness ...</td>\n", | |
" <td> economy</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 2/13/2003--Introduced. RAFT (Restore Access t...</td>\n", | |
" <td> economy energy transportation</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 6/26/2003--Introduced. RAFT (Restore Access t...</td>\n", | |
" <td> economy energy transportation</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> To suspend temporarily the duty on 9,10-Anthra...</td>\n", | |
" <td> economy</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> To amend the Transportation Equity Act for the...</td>\n", | |
" <td> transportation economy</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows \u00d7 22 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 190, | |
"text": [ | |
" text \\\n", | |
"0 4/26/2001--Introduced. Marriage IRA Fairness ... \n", | |
"1 2/13/2003--Introduced. RAFT (Restore Access t... \n", | |
"2 6/26/2003--Introduced. RAFT (Restore Access t... \n", | |
"3 To suspend temporarily the duty on 9,10-Anthra... \n", | |
"4 To amend the Transportation Equity Act for the... \n", | |
"\n", | |
" labels agriculture banking_and_finance \\\n", | |
"0 economy 0 0 \n", | |
"1 economy energy transportation 0 0 \n", | |
"2 economy energy transportation 0 0 \n", | |
"3 economy 0 0 \n", | |
"4 transportation economy 0 0 \n", | |
"\n", | |
" civil_rights congress_and_procedural crime defense_and_foreign_policy \\\n", | |
"0 0 0 0 0 \n", | |
"1 0 0 0 0 \n", | |
"2 0 0 0 0 \n", | |
"3 0 0 0 0 \n", | |
"4 0 0 0 0 \n", | |
"\n", | |
" economy education emergency energy environment guns healthcare \\\n", | |
"0 1 0 0 0 0 0 0 \n", | |
"1 1 0 0 1 0 0 0 \n", | |
"2 1 0 0 1 0 0 0 \n", | |
"3 1 0 0 0 0 0 0 \n", | |
"4 1 0 0 0 0 0 0 \n", | |
"\n", | |
" immigration labor law_courts_and_judges parks_and_recreation sports \n", | |
"0 0 0 0 0 0 ... \n", | |
"1 0 0 0 0 0 ... \n", | |
"2 0 0 0 0 0 ... \n", | |
"3 0 0 0 0 0 ... \n", | |
"4 0 0 0 0 0 ... \n", | |
"\n", | |
"[5 rows x 22 columns]" | |
] | |
} | |
], | |
"prompt_number": 190 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Create test dataframe" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#read data into dataframe (text column)\n", | |
"df_test_text = pd.read_table('crowdpac/test_set_text.csv')\n", | |
"df_test_text.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>text</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 1/7/2003--Introduced. Investment Tax Incentiv...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 7/19/2001--Introduced. Customs Business Fairn...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 4/11/2002--Introduced. Senior Nutrition Act o...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 7/11/2002--Introduced. Senior Nutrition Act o...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> mr. president, i ask unanimous consent that wh...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows \u00d7 1 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 79, | |
"text": [ | |
" text\n", | |
"0 1/7/2003--Introduced. Investment Tax Incentiv...\n", | |
"1 7/19/2001--Introduced. Customs Business Fairn...\n", | |
"2 4/11/2002--Introduced. Senior Nutrition Act o...\n", | |
"3 7/11/2002--Introduced. Senior Nutrition Act o...\n", | |
"4 mr. president, i ask unanimous consent that wh...\n", | |
"\n", | |
"[5 rows x 1 columns]" | |
] | |
} | |
], | |
"prompt_number": 79 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#read data into dataframe (labels column)\n", | |
"df_test_label = pd.read_table('crowdpac/test_set_label.csv')\n", | |
"df_test_label.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>labels</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> economy banking_and_finance</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> banking_and_finance economy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> agriculture economy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> agriculture economy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> congress_and_procedural</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows \u00d7 1 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 80, | |
"text": [ | |
" labels\n", | |
"0 economy banking_and_finance\n", | |
"1 banking_and_finance economy\n", | |
"2 agriculture economy\n", | |
"3 agriculture economy\n", | |
"4 congress_and_procedural\n", | |
"\n", | |
"[5 rows x 1 columns]" | |
] | |
} | |
], | |
"prompt_number": 80 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#determine the unique categories\n", | |
"categories = set()\n", | |
"for m in df_test_label.labels:\n", | |
" categories.update(g for g in m.split(' '))\n", | |
"categories = sorted(categories)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 81 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print type(categories)\n", | |
"print categories" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<type 'list'>\n", | |
"['agriculture', 'banking_and_finance', 'civil_rights', 'congress_and_procedural', 'crime', 'doc_labels', 'economy', 'education', 'emergency', 'energy', 'environment', 'guns', 'healthcare', 'immigration', 'labor', 'law_courts_and_judges', 'parks_and_recreation', 'sports', 'transportation', 'womens_issues']\n" | |
] | |
} | |
], | |
"prompt_number": 82 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"One observation: the \"defense_and_foreign_policy\" column of the training set must correspond to the column named \"doc_labels\" in the test set" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df_test = pd.concat([df_test_text, df_test_label], axis=1)\n", | |
"df_test.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>text</th>\n", | |
" <th>labels</th>\n", | |
" <th>agriculture</th>\n", | |
" <th>banking_and_finance</th>\n", | |
" <th>civil_rights</th>\n", | |
" <th>congress_and_procedural</th>\n", | |
" <th>crime</th>\n", | |
" <th>doc_labels</th>\n", | |
" <th>economy</th>\n", | |
" <th>education</th>\n", | |
" <th>emergency</th>\n", | |
" <th>energy</th>\n", | |
" <th>environment</th>\n", | |
" <th>guns</th>\n", | |
" <th>healthcare</th>\n", | |
" <th>immigration</th>\n", | |
" <th>labor</th>\n", | |
" <th>law_courts_and_judges</th>\n", | |
" <th>parks_and_recreation</th>\n", | |
" <th>sports</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 1/7/2003--Introduced. Investment Tax Incentiv...</td>\n", | |
" <td> economy banking_and_finance</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 7/19/2001--Introduced. Customs Business Fairn...</td>\n", | |
" <td> banking_and_finance economy</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 4/11/2002--Introduced. Senior Nutrition Act o...</td>\n", | |
" <td> agriculture economy</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 7/11/2002--Introduced. Senior Nutrition Act o...</td>\n", | |
" <td> agriculture economy</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> mr. president, i ask unanimous consent that wh...</td>\n", | |
" <td> congress_and_procedural</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows \u00d7 22 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 84, | |
"text": [ | |
" text \\\n", | |
"0 1/7/2003--Introduced. Investment Tax Incentiv... \n", | |
"1 7/19/2001--Introduced. Customs Business Fairn... \n", | |
"2 4/11/2002--Introduced. Senior Nutrition Act o... \n", | |
"3 7/11/2002--Introduced. Senior Nutrition Act o... \n", | |
"4 mr. president, i ask unanimous consent that wh... \n", | |
"\n", | |
" labels agriculture banking_and_finance \\\n", | |
"0 economy banking_and_finance 0 1 \n", | |
"1 banking_and_finance economy 0 1 \n", | |
"2 agriculture economy 1 0 \n", | |
"3 agriculture economy 1 0 \n", | |
"4 congress_and_procedural 0 0 \n", | |
"\n", | |
" civil_rights congress_and_procedural crime doc_labels economy \\\n", | |
"0 0 0 0 0 1 \n", | |
"1 0 0 0 0 1 \n", | |
"2 0 0 0 0 1 \n", | |
"3 0 0 0 0 1 \n", | |
"4 0 1 0 0 0 \n", | |
"\n", | |
" education emergency energy environment guns healthcare immigration \\\n", | |
"0 0 0 0 0 0 0 0 \n", | |
"1 0 0 0 0 0 0 0 \n", | |
"2 0 0 0 0 0 0 0 \n", | |
"3 0 0 0 0 0 0 0 \n", | |
"4 0 0 0 0 0 0 0 \n", | |
"\n", | |
" labor law_courts_and_judges parks_and_recreation sports \n", | |
"0 0 0 0 0 ... \n", | |
"1 0 0 0 0 ... \n", | |
"2 0 0 0 0 ... \n", | |
"3 0 0 0 0 ... \n", | |
"4 0 0 0 0 ... \n", | |
"\n", | |
"[5 rows x 22 columns]" | |
] | |
} | |
], | |
"prompt_number": 84 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Classification with Support Vector Machines" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#create a TfidfVectorizer (this will transform the training and test sets into TF-IDF features so that we can train a classifier on them; an SVC is used below instead of the earlier LogisticRegression)\n", | |
"vectorizer = TfidfVectorizer()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 89 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"X_train = vectorizer.fit_transform(df_training.text)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 92 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"X_test = vectorizer.transform(df_test.text)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 93 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#set up the scikit-learn classifier used for prediction\n", | |
"#(binary classification in this case; the model is trained in the next cell)\n", | |
"#classifier = LogisticRegression()\n", | |
"\n", | |
"classifier = SVC(kernel='linear')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 177 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"classifier.fit(X_train, df_training.economy) #fit the model" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 179, | |
"text": [ | |
"SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n", | |
" kernel='linear', max_iter=-1, probability=False, random_state=None,\n", | |
" shrinking=True, tol=0.001, verbose=False)" | |
] | |
} | |
], | |
"prompt_number": 179 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"predictions = classifier.predict(X_test) #make prediction on the test set -> predictions is a numpy array" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 180 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print type(predictions)\n", | |
"print len(predictions)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<type 'numpy.ndarray'>\n", | |
"248\n" | |
] | |
} | |
], | |
"prompt_number": 181 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"predictions[:20]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 182, | |
"text": [ | |
"array([1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0], dtype=int64)" | |
] | |
} | |
], | |
"prompt_number": 182 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for i in df_training.ix[:, 'agriculture':]: #this is how to select sub set of a dataframe\n", | |
" print i" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"agriculture\n", | |
"banking_and_finance\n", | |
"civil_rights\n", | |
"congress_and_procedural\n", | |
"crime\n", | |
"defense_and_foreign_policy\n", | |
"economy\n", | |
"education\n", | |
"emergency\n", | |
"energy\n", | |
"environment\n", | |
"guns\n", | |
"healthcare\n", | |
"immigration\n", | |
"labor\n", | |
"law_courts_and_judges\n", | |
"parks_and_recreation\n", | |
"sports\n", | |
"transportation\n", | |
"womens_issues\n" | |
] | |
} | |
], | |
"prompt_number": 183 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Generalize the prediction to all the categories" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"all_predictions = [] #store all the predictions for each category\n", | |
"\n", | |
"for i in df_training.ix[:, 'agriculture':]:\n", | |
" \n", | |
" classifier.fit(X_train, df_training[i]) #fit the model\n", | |
" \n", | |
" predictions = classifier.predict(X_test) #make prediction on the test set -> predictions is a numpy array\n", | |
" \n", | |
" all_predictions.append(predictions)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 184 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print len(all_predictions)\n", | |
"print all_predictions[0] #agriculture\n", | |
"print all_predictions[1] #banking_and_finance" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"20\n", | |
"[0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", | |
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0\n", | |
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0\n", | |
" 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", | |
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", | |
" 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", | |
" 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", | |
"[1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0\n", | |
" 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", | |
" 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1\n", | |
" 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", | |
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", | |
" 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 1 1 1\n", | |
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]\n" | |
] | |
} | |
], | |
"prompt_number": 185 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"adict = {} #create an empty dictionary to store category names and also its predictions\n", | |
"\n", | |
"for i,category in enumerate(df_training.ix[:, 'agriculture':]): #this is how to select sub set of a dataframe\n", | |
" #print i\n", | |
" \n", | |
" adict[category] = all_predictions[i]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 186 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df_all_predictions = pd.DataFrame(adict)\n", | |
"df_all_predictions.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>agriculture</th>\n", | |
" <th>banking_and_finance</th>\n", | |
" <th>civil_rights</th>\n", | |
" <th>congress_and_procedural</th>\n", | |
" <th>crime</th>\n", | |
" <th>defense_and_foreign_policy</th>\n", | |
" <th>economy</th>\n", | |
" <th>education</th>\n", | |
" <th>emergency</th>\n", | |
" <th>energy</th>\n", | |
" <th>environment</th>\n", | |
" <th>guns</th>\n", | |
" <th>healthcare</th>\n", | |
" <th>immigration</th>\n", | |
" <th>labor</th>\n", | |
" <th>law_courts_and_judges</th>\n", | |
" <th>parks_and_recreation</th>\n", | |
" <th>sports</th>\n", | |
" <th>transportation</th>\n", | |
" <th>womens_issues</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows \u00d7 20 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 187, | |
"text": [ | |
" agriculture banking_and_finance civil_rights congress_and_procedural \\\n", | |
"0 0 1 0 0 \n", | |
"1 0 1 0 0 \n", | |
"2 1 0 0 0 \n", | |
"3 1 0 0 0 \n", | |
"4 0 0 0 1 \n", | |
"\n", | |
" crime defense_and_foreign_policy economy education emergency energy \\\n", | |
"0 0 0 1 0 0 0 \n", | |
"1 0 0 1 0 0 0 \n", | |
"2 0 0 1 0 0 0 \n", | |
"3 0 0 1 0 0 0 \n", | |
"4 0 0 0 0 0 0 \n", | |
"\n", | |
" environment guns healthcare immigration labor law_courts_and_judges \\\n", | |
"0 0 0 0 0 0 0 \n", | |
"1 0 0 0 0 0 0 \n", | |
"2 0 0 0 0 0 0 \n", | |
"3 0 0 0 0 0 0 \n", | |
"4 0 0 0 0 0 0 \n", | |
"\n", | |
" parks_and_recreation sports transportation womens_issues \n", | |
"0 0 0 0 0 \n", | |
"1 0 0 0 0 \n", | |
"2 0 0 0 0 \n", | |
"3 0 0 0 0 \n", | |
"4 0 0 0 0 \n", | |
"\n", | |
"[5 rows x 20 columns]" | |
] | |
} | |
], | |
"prompt_number": 187 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 161 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"#measure the accuracy of the model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df_test.head(2)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>text</th>\n", | |
" <th>labels</th>\n", | |
" <th>agriculture</th>\n", | |
" <th>banking_and_finance</th>\n", | |
" <th>civil_rights</th>\n", | |
" <th>congress_and_procedural</th>\n", | |
" <th>crime</th>\n", | |
" <th>doc_labels</th>\n", | |
" <th>economy</th>\n", | |
" <th>education</th>\n", | |
" <th>emergency</th>\n", | |
" <th>energy</th>\n", | |
" <th>environment</th>\n", | |
" <th>guns</th>\n", | |
" <th>healthcare</th>\n", | |
" <th>immigration</th>\n", | |
" <th>labor</th>\n", | |
" <th>law_courts_and_judges</th>\n", | |
" <th>parks_and_recreation</th>\n", | |
" <th>sports</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 1/7/2003--Introduced. Investment Tax Incentiv...</td>\n", | |
" <td> economy banking_and_finance</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 7/19/2001--Introduced. Customs Business Fairn...</td>\n", | |
" <td> banking_and_finance economy</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>2 rows \u00d7 22 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 188, | |
"text": [ | |
" text \\\n", | |
"0 1/7/2003--Introduced. Investment Tax Incentiv... \n", | |
"1 7/19/2001--Introduced. Customs Business Fairn... \n", | |
"\n", | |
" labels agriculture banking_and_finance \\\n", | |
"0 economy banking_and_finance 0 1 \n", | |
"1 banking_and_finance economy 0 1 \n", | |
"\n", | |
" civil_rights congress_and_procedural crime doc_labels economy \\\n", | |
"0 0 0 0 0 1 \n", | |
"1 0 0 0 0 1 \n", | |
"\n", | |
" education emergency energy environment guns healthcare immigration \\\n", | |
"0 0 0 0 0 0 0 0 \n", | |
"1 0 0 0 0 0 0 0 \n", | |
"\n", | |
" labor law_courts_and_judges parks_and_recreation sports \n", | |
"0 0 0 0 0 ... \n", | |
"1 0 0 0 0 ... \n", | |
"\n", | |
"[2 rows x 22 columns]" | |
] | |
} | |
], | |
"prompt_number": 188 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"#measure the accuracy of the model with Jaccard Similarity" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#calculate the jaccard similarity score\n", | |
"jaccard_similarity_score(df_all_predictions, df_test.ix[:, 'agriculture':])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 189, | |
"text": [ | |
"0.78897849462365599" | |
] | |
} | |
], | |
"prompt_number": 189 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"# Overall accuracy of the model is about 79%. The accuracy could also be improved with cross-validation and grid search." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment