herokyar/Crowdpac_homework_hasan.ipynb

## Crowdpac_homework_hasan.ipynb
{
 "metadata": {
  "name": "",
  "signature": "sha256:5d2e7d234e6e98bdca13d3d067a55a6515efbda6f41684fc30252ce9d5296d68"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Crowdpac homework -> by hasan erokyar 5/26/2015\n",
      "\n"
     ]
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Predicting document categories with Support Vector Machines (multi-label classification)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "import pandas as pd\n",
      "\n",
      "import matplotlib.pyplot as plt\n",
      "\n",
      "from sklearn.metrics import jaccard_similarity_score\n",
      "\n",
      "from sklearn.feature_extraction.text import TfidfVectorizer\n",
      "from sklearn.linear_model.logistic import LogisticRegression\n",
      "\n",
      "from sklearn.svm import SVC\n",
      "\n",
      "from IPython.display import Image"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 144
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Create training dataframe"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#read data into dataframe (text column)\n",
      "df_training_text = pd.read_table('crowdpac/training_set_text.csv')\n",
      "df_training_text.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>text</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>  4/26/2001--Introduced. Marriage IRA Fairness ...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>  2/13/2003--Introduced. RAFT (Restore Access t...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>  6/26/2003--Introduced. RAFT (Restore Access t...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> To suspend temporarily the duty on 9,10-Anthra...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> To amend the Transportation Equity Act for the...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 1 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 38,
       "text": [
        "                                                text\n",
        "0   4/26/2001--Introduced. Marriage IRA Fairness ...\n",
        "1   2/13/2003--Introduced. RAFT (Restore Access t...\n",
        "2   6/26/2003--Introduced. RAFT (Restore Access t...\n",
        "3  To suspend temporarily the duty on 9,10-Anthra...\n",
        "4  To amend the Transportation Equity Act for the...\n",
        "\n",
        "[5 rows x 1 columns]"
       ]
      }
     ],
     "prompt_number": 38
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#read data into dataframe (labels column)\n",
      "df_training_label = pd.read_table('crowdpac/training_set_label.csv')\n",
      "df_training_label.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>labels</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>                       economy</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> economy energy transportation</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> economy energy transportation</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>                       economy</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>        transportation economy</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 1 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 53,
       "text": [
        "                          labels\n",
        "0                        economy\n",
        "1  economy energy transportation\n",
        "2  economy energy transportation\n",
        "3                        economy\n",
        "4         transportation economy\n",
        "\n",
        "[5 rows x 1 columns]"
       ]
      }
     ],
     "prompt_number": 53
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#determine the unique categories\n",
      "categories = set()\n",
      "for m in df_training_label.labels:\n",
      "    categories.update(g for g in m.split(' '))\n",
      "categories = sorted(categories)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 43
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print type(categories)\n",
      "print categories"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "<type 'list'>\n",
        "['agriculture', 'banking_and_finance', 'civil_rights', 'congress_and_procedural', 'crime', 'defense_and_foreign_policy', 'economy', 'education', 'emergency', 'energy', 'environment', 'guns', 'healthcare', 'immigration', 'labor', 'law_courts_and_judges', 'parks_and_recreation', 'sports', 'transportation', 'womens_issues']\n"
       ]
      }
     ],
     "prompt_number": 45
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Unused alternative: make a column for each category. Do not use this version: it produces boolean True/False values, whereas scikit-learn needs numbers.\n",
      "\n",
      "#for category in categories:\n",
      "#    df_training_label[category] = [category in c.split(' ') for c in df_training_label.labels]\n",
      "    \n",
      "#df_training_label.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#make a column for each category\n",
      "for category in categories:\n",
      "    df_training_label[category] = [1 if category in c.split(' ') else 0 for c in df_training_label.labels] #list comprehension with if/else\n",
      "    \n",
      "df_training_label.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>labels</th>\n",
        "      <th>agriculture</th>\n",
        "      <th>banking_and_finance</th>\n",
        "      <th>civil_rights</th>\n",
        "      <th>congress_and_procedural</th>\n",
        "      <th>crime</th>\n",
        "      <th>defense_and_foreign_policy</th>\n",
        "      <th>economy</th>\n",
        "      <th>education</th>\n",
        "      <th>emergency</th>\n",
        "      <th>energy</th>\n",
        "      <th>environment</th>\n",
        "      <th>guns</th>\n",
        "      <th>healthcare</th>\n",
        "      <th>immigration</th>\n",
        "      <th>labor</th>\n",
        "      <th>law_courts_and_judges</th>\n",
        "      <th>parks_and_recreation</th>\n",
        "      <th>sports</th>\n",
        "      <th>transportation</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>                       economy</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> economy energy transportation</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> economy energy transportation</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>                       economy</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>        transportation economy</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 21 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 57,
       "text": [
        "                          labels  agriculture  banking_and_finance  \\\n",
        "0                        economy            0                    0   \n",
        "1  economy energy transportation            0                    0   \n",
        "2  economy energy transportation            0                    0   \n",
        "3                        economy            0                    0   \n",
        "4         transportation economy            0                    0   \n",
        "\n",
        "   civil_rights  congress_and_procedural  crime  defense_and_foreign_policy  \\\n",
        "0             0                        0      0                           0   \n",
        "1             0                        0      0                           0   \n",
        "2             0                        0      0                           0   \n",
        "3             0                        0      0                           0   \n",
        "4             0                        0      0                           0   \n",
        "\n",
        "   economy  education  emergency  energy  environment  guns  healthcare  \\\n",
        "0        1          0          0       0            0     0           0   \n",
        "1        1          0          0       1            0     0           0   \n",
        "2        1          0          0       1            0     0           0   \n",
        "3        1          0          0       0            0     0           0   \n",
        "4        1          0          0       0            0     0           0   \n",
        "\n",
        "   immigration  labor  law_courts_and_judges  parks_and_recreation  sports  \\\n",
        "0            0      0                      0                     0       0   \n",
        "1            0      0                      0                     0       0   \n",
        "2            0      0                      0                     0       0   \n",
        "3            0      0                      0                     0       0   \n",
        "4            0      0                      0                     0       0   \n",
        "\n",
        "   transportation      \n",
        "0               0 ...  \n",
        "1               1 ...  \n",
        "2               1 ...  \n",
        "3               0 ...  \n",
        "4               1 ...  \n",
        "\n",
        "[5 rows x 21 columns]"
       ]
      }
     ],
     "prompt_number": 57
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#concatenate dataframes\n",
      "df_training = pd.concat([df_training_text, df_training_label], axis=1)\n",
      "df_training.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>text</th>\n",
        "      <th>labels</th>\n",
        "      <th>agriculture</th>\n",
        "      <th>banking_and_finance</th>\n",
        "      <th>civil_rights</th>\n",
        "      <th>congress_and_procedural</th>\n",
        "      <th>crime</th>\n",
        "      <th>defense_and_foreign_policy</th>\n",
        "      <th>economy</th>\n",
        "      <th>education</th>\n",
        "      <th>emergency</th>\n",
        "      <th>energy</th>\n",
        "      <th>environment</th>\n",
        "      <th>guns</th>\n",
        "      <th>healthcare</th>\n",
        "      <th>immigration</th>\n",
        "      <th>labor</th>\n",
        "      <th>law_courts_and_judges</th>\n",
        "      <th>parks_and_recreation</th>\n",
        "      <th>sports</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>  4/26/2001--Introduced. Marriage IRA Fairness ...</td>\n",
        "      <td>                       economy</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>  2/13/2003--Introduced. RAFT (Restore Access t...</td>\n",
        "      <td> economy energy transportation</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>  6/26/2003--Introduced. RAFT (Restore Access t...</td>\n",
        "      <td> economy energy transportation</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> To suspend temporarily the duty on 9,10-Anthra...</td>\n",
        "      <td>                       economy</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> To amend the Transportation Equity Act for the...</td>\n",
        "      <td>        transportation economy</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 22 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 190,
       "text": [
        "                                                text  \\\n",
        "0   4/26/2001--Introduced. Marriage IRA Fairness ...   \n",
        "1   2/13/2003--Introduced. RAFT (Restore Access t...   \n",
        "2   6/26/2003--Introduced. RAFT (Restore Access t...   \n",
        "3  To suspend temporarily the duty on 9,10-Anthra...   \n",
        "4  To amend the Transportation Equity Act for the...   \n",
        "\n",
        "                          labels  agriculture  banking_and_finance  \\\n",
        "0                        economy            0                    0   \n",
        "1  economy energy transportation            0                    0   \n",
        "2  economy energy transportation            0                    0   \n",
        "3                        economy            0                    0   \n",
        "4         transportation economy            0                    0   \n",
        "\n",
        "   civil_rights  congress_and_procedural  crime  defense_and_foreign_policy  \\\n",
        "0             0                        0      0                           0   \n",
        "1             0                        0      0                           0   \n",
        "2             0                        0      0                           0   \n",
        "3             0                        0      0                           0   \n",
        "4             0                        0      0                           0   \n",
        "\n",
        "   economy  education  emergency  energy  environment  guns  healthcare  \\\n",
        "0        1          0          0       0            0     0           0   \n",
        "1        1          0          0       1            0     0           0   \n",
        "2        1          0          0       1            0     0           0   \n",
        "3        1          0          0       0            0     0           0   \n",
        "4        1          0          0       0            0     0           0   \n",
        "\n",
        "   immigration  labor  law_courts_and_judges  parks_and_recreation  sports      \n",
        "0            0      0                      0                     0       0 ...  \n",
        "1            0      0                      0                     0       0 ...  \n",
        "2            0      0                      0                     0       0 ...  \n",
        "3            0      0                      0                     0       0 ...  \n",
        "4            0      0                      0                     0       0 ...  \n",
        "\n",
        "[5 rows x 22 columns]"
       ]
      }
     ],
     "prompt_number": 190
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Create test dataframe"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#read data into dataframe (text column)\n",
      "df_test_text = pd.read_table('crowdpac/test_set_text.csv')\n",
      "df_test_text.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>text</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>  1/7/2003--Introduced. Investment Tax Incentiv...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>  7/19/2001--Introduced. Customs Business Fairn...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>  4/11/2002--Introduced. Senior Nutrition Act o...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>  7/11/2002--Introduced. Senior Nutrition Act o...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> mr. president, i ask unanimous consent that wh...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 1 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 79,
       "text": [
        "                                                text\n",
        "0   1/7/2003--Introduced. Investment Tax Incentiv...\n",
        "1   7/19/2001--Introduced. Customs Business Fairn...\n",
        "2   4/11/2002--Introduced. Senior Nutrition Act o...\n",
        "3   7/11/2002--Introduced. Senior Nutrition Act o...\n",
        "4  mr. president, i ask unanimous consent that wh...\n",
        "\n",
        "[5 rows x 1 columns]"
       ]
      }
     ],
     "prompt_number": 79
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#read data into dataframe (labels column)\n",
      "df_test_label = pd.read_table('crowdpac/test_set_label.csv')\n",
      "df_test_label.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>labels</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> economy banking_and_finance</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> banking_and_finance economy</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>         agriculture economy</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>         agriculture economy</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>     congress_and_procedural</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 1 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 80,
       "text": [
        "                        labels\n",
        "0  economy banking_and_finance\n",
        "1  banking_and_finance economy\n",
        "2          agriculture economy\n",
        "3          agriculture economy\n",
        "4      congress_and_procedural\n",
        "\n",
        "[5 rows x 1 columns]"
       ]
      }
     ],
     "prompt_number": 80
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#determine the unique categories\n",
      "categories = set()\n",
      "for m in df_test_label.labels:\n",
      "    categories.update(g for g in m.split(' '))\n",
      "categories = sorted(categories)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 81
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print type(categories)\n",
      "print categories"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "<type 'list'>\n",
        "['agriculture', 'banking_and_finance', 'civil_rights', 'congress_and_procedural', 'crime', 'doc_labels', 'economy', 'education', 'emergency', 'energy', 'environment', 'guns', 'healthcare', 'immigration', 'labor', 'law_courts_and_judges', 'parks_and_recreation', 'sports', 'transportation', 'womens_issues']\n"
       ]
      }
     ],
     "prompt_number": 82
    },
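    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Observation: the test set has no \"defense_and_foreign_policy\" category; \"doc_labels\" appears in its place, so the \"doc_labels\" column in the test set is assumed to correspond to \"defense_and_foreign_policy\" in the training set"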
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "df_test = pd.concat([df_test_text, df_test_label], axis=1)\n",
      "df_test.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>text</th>\n",
        "      <th>labels</th>\n",
        "      <th>agriculture</th>\n",
        "      <th>banking_and_finance</th>\n",
        "      <th>civil_rights</th>\n",
        "      <th>congress_and_procedural</th>\n",
        "      <th>crime</th>\n",
        "      <th>doc_labels</th>\n",
        "      <th>economy</th>\n",
        "      <th>education</th>\n",
        "      <th>emergency</th>\n",
        "      <th>energy</th>\n",
        "      <th>environment</th>\n",
        "      <th>guns</th>\n",
        "      <th>healthcare</th>\n",
        "      <th>immigration</th>\n",
        "      <th>labor</th>\n",
        "      <th>law_courts_and_judges</th>\n",
        "      <th>parks_and_recreation</th>\n",
        "      <th>sports</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>  1/7/2003--Introduced. Investment Tax Incentiv...</td>\n",
        "      <td> economy banking_and_finance</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>  7/19/2001--Introduced. Customs Business Fairn...</td>\n",
        "      <td> banking_and_finance economy</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>  4/11/2002--Introduced. Senior Nutrition Act o...</td>\n",
        "      <td>         agriculture economy</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>  7/11/2002--Introduced. Senior Nutrition Act o...</td>\n",
        "      <td>         agriculture economy</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> mr. president, i ask unanimous consent that wh...</td>\n",
        "      <td>     congress_and_procedural</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 22 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 84,
       "text": [
        "                                                text  \\\n",
        "0   1/7/2003--Introduced. Investment Tax Incentiv...   \n",
        "1   7/19/2001--Introduced. Customs Business Fairn...   \n",
        "2   4/11/2002--Introduced. Senior Nutrition Act o...   \n",
        "3   7/11/2002--Introduced. Senior Nutrition Act o...   \n",
        "4  mr. president, i ask unanimous consent that wh...   \n",
        "\n",
        "                        labels  agriculture  banking_and_finance  \\\n",
        "0  economy banking_and_finance            0                    1   \n",
        "1  banking_and_finance economy            0                    1   \n",
        "2          agriculture economy            1                    0   \n",
        "3          agriculture economy            1                    0   \n",
        "4      congress_and_procedural            0                    0   \n",
        "\n",
        "   civil_rights  congress_and_procedural  crime  doc_labels  economy  \\\n",
        "0             0                        0      0           0        1   \n",
        "1             0                        0      0           0        1   \n",
        "2             0                        0      0           0        1   \n",
        "3             0                        0      0           0        1   \n",
        "4             0                        1      0           0        0   \n",
        "\n",
        "   education  emergency  energy  environment  guns  healthcare  immigration  \\\n",
        "0          0          0       0            0     0           0            0   \n",
        "1          0          0       0            0     0           0            0   \n",
        "2          0          0       0            0     0           0            0   \n",
        "3          0          0       0            0     0           0            0   \n",
        "4          0          0       0            0     0           0            0   \n",
        "\n",
        "   labor  law_courts_and_judges  parks_and_recreation  sports      \n",
        "0      0                      0                     0       0 ...  \n",
        "1      0                      0                     0       0 ...  \n",
        "2      0                      0                     0       0 ...  \n",
        "3      0                      0                     0       0 ...  \n",
        "4      0                      0                     0       0 ...  \n",
        "\n",
        "[5 rows x 22 columns]"
       ]
      }
     ],
     "prompt_number": 84
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "#classification with Support Vector Machines"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#create a TfidfVectorizer (this will transform the training and test sets into TF-IDF features so that we can train a classifier; a linear SVM is used below)\n",
      "vectorizer = TfidfVectorizer()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 89
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "X_train = vectorizer.fit_transform(df_training.text)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 92
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "X_test = vectorizer.transform(df_test.text)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 93
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#set up the scikit-learn classifier that will be used to make predictions\n",
      "#the classifier is trained as a binary classifier (one category at a time)\n",
      "#classifier = LogisticRegression()\n",
      "\n",
      "classifier = SVC(kernel='linear')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 177
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "classifier.fit(X_train, df_training.economy) #fit the model"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 179,
       "text": [
        "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n",
        "  kernel='linear', max_iter=-1, probability=False, random_state=None,\n",
        "  shrinking=True, tol=0.001, verbose=False)"
       ]
      }
     ],
     "prompt_number": 179
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "predictions = classifier.predict(X_test)  #make prediction on the test set -> predictions is a numpy array"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 180
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print type(predictions)\n",
      "print len(predictions)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "<type 'numpy.ndarray'>\n",
        "248\n"
       ]
      }
     ],
     "prompt_number": 181
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "predictions[:20]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 182,
       "text": [
        "array([1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0], dtype=int64)"
       ]
      }
     ],
     "prompt_number": 182
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "for i in df_training.ix[:, 'agriculture':]: #this is how to select sub set of a dataframe\n",
      "    print i"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "agriculture\n",
        "banking_and_finance\n",
        "civil_rights\n",
        "congress_and_procedural\n",
        "crime\n",
        "defense_and_foreign_policy\n",
        "economy\n",
        "education\n",
        "emergency\n",
        "energy\n",
        "environment\n",
        "guns\n",
        "healthcare\n",
        "immigration\n",
        "labor\n",
        "law_courts_and_judges\n",
        "parks_and_recreation\n",
        "sports\n",
        "transportation\n",
        "womens_issues\n"
       ]
      }
     ],
     "prompt_number": 183
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "#generalize the prediction for all the categories"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "all_predictions = []   #store all the predictions for each category\n",
      "\n",
      "for i in df_training.ix[:, 'agriculture':]:\n",
      "    \n",
      "    classifier.fit(X_train, df_training[i]) #fit the model\n",
      "    \n",
      "    predictions = classifier.predict(X_test)  #make prediction on the test set -> predictions is a numpy array\n",
      "    \n",
      "    all_predictions.append(predictions)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 184
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print len(all_predictions)\n",
      "print all_predictions[0] #agriculture\n",
      "print all_predictions[1] #banking_and_finance"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "20\n",
        "[0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
        " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0\n",
        " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0\n",
        " 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
        " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
        " 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
        " 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
        "[1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0\n",
        " 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
        " 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1\n",
        " 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
        " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
        " 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 1 1 1\n",
        " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]\n"
       ]
      }
     ],
     "prompt_number": 185
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "adict = {} #create an empty dictionary to store category names and also its predictions\n",
      "\n",
      "for i,category in enumerate(df_training.ix[:, 'agriculture':]): #this is how to select sub set of a dataframe\n",
      "    #print i\n",
      "    \n",
      "    adict[category] = all_predictions[i]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 186
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "df_all_predictions = pd.DataFrame(adict)\n",
      "df_all_predictions.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>agriculture</th>\n",
        "      <th>banking_and_finance</th>\n",
        "      <th>civil_rights</th>\n",
        "      <th>congress_and_procedural</th>\n",
        "      <th>crime</th>\n",
        "      <th>defense_and_foreign_policy</th>\n",
        "      <th>economy</th>\n",
        "      <th>education</th>\n",
        "      <th>emergency</th>\n",
        "      <th>energy</th>\n",
        "      <th>environment</th>\n",
        "      <th>guns</th>\n",
        "      <th>healthcare</th>\n",
        "      <th>immigration</th>\n",
        "      <th>labor</th>\n",
        "      <th>law_courts_and_judges</th>\n",
        "      <th>parks_and_recreation</th>\n",
        "      <th>sports</th>\n",
        "      <th>transportation</th>\n",
        "      <th>womens_issues</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 20 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 187,
       "text": [
        "   agriculture  banking_and_finance  civil_rights  congress_and_procedural  \\\n",
        "0            0                    1             0                        0   \n",
        "1            0                    1             0                        0   \n",
        "2            1                    0             0                        0   \n",
        "3            1                    0             0                        0   \n",
        "4            0                    0             0                        1   \n",
        "\n",
        "   crime  defense_and_foreign_policy  economy  education  emergency  energy  \\\n",
        "0      0                           0        1          0          0       0   \n",
        "1      0                           0        1          0          0       0   \n",
        "2      0                           0        1          0          0       0   \n",
        "3      0                           0        1          0          0       0   \n",
        "4      0                           0        0          0          0       0   \n",
        "\n",
        "   environment  guns  healthcare  immigration  labor  law_courts_and_judges  \\\n",
        "0            0     0           0            0      0                      0   \n",
        "1            0     0           0            0      0                      0   \n",
        "2            0     0           0            0      0                      0   \n",
        "3            0     0           0            0      0                      0   \n",
        "4            0     0           0            0      0                      0   \n",
        "\n",
        "   parks_and_recreation  sports  transportation  womens_issues  \n",
        "0                     0       0               0              0  \n",
        "1                     0       0               0              0  \n",
        "2                     0       0               0              0  \n",
        "3                     0       0               0              0  \n",
        "4                     0       0               0              0  \n",
        "\n",
        "[5 rows x 20 columns]"
       ]
      }
     ],
     "prompt_number": 187
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 161
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "#measure the accuracy of the model"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "df_test.head(2)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>text</th>\n",
        "      <th>labels</th>\n",
        "      <th>agriculture</th>\n",
        "      <th>banking_and_finance</th>\n",
        "      <th>civil_rights</th>\n",
        "      <th>congress_and_procedural</th>\n",
        "      <th>crime</th>\n",
        "      <th>doc_labels</th>\n",
        "      <th>economy</th>\n",
        "      <th>education</th>\n",
        "      <th>emergency</th>\n",
        "      <th>energy</th>\n",
        "      <th>environment</th>\n",
        "      <th>guns</th>\n",
        "      <th>healthcare</th>\n",
        "      <th>immigration</th>\n",
        "      <th>labor</th>\n",
        "      <th>law_courts_and_judges</th>\n",
        "      <th>parks_and_recreation</th>\n",
        "      <th>sports</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>  1/7/2003--Introduced. Investment Tax Incentiv...</td>\n",
        "      <td> economy banking_and_finance</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>  7/19/2001--Introduced. Customs Business Fairn...</td>\n",
        "      <td> banking_and_finance economy</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>2 rows \u00d7 22 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 188,
       "text": [
        "                                                text  \\\n",
        "0   1/7/2003--Introduced. Investment Tax Incentiv...   \n",
        "1   7/19/2001--Introduced. Customs Business Fairn...   \n",
        "\n",
        "                        labels  agriculture  banking_and_finance  \\\n",
        "0  economy banking_and_finance            0                    1   \n",
        "1  banking_and_finance economy            0                    1   \n",
        "\n",
        "   civil_rights  congress_and_procedural  crime  doc_labels  economy  \\\n",
        "0             0                        0      0           0        1   \n",
        "1             0                        0      0           0        1   \n",
        "\n",
        "   education  emergency  energy  environment  guns  healthcare  immigration  \\\n",
        "0          0          0       0            0     0           0            0   \n",
        "1          0          0       0            0     0           0            0   \n",
        "\n",
        "   labor  law_courts_and_judges  parks_and_recreation  sports      \n",
        "0      0                      0                     0       0 ...  \n",
        "1      0                      0                     0       0 ...  \n",
        "\n",
        "[2 rows x 22 columns]"
       ]
      }
     ],
     "prompt_number": 188
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "#measure the accuracy of the model with Jaccard Similarity"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#calculate the jaccard similarity score\n",
      "jaccard_similarity_score(df_all_predictions, df_test.ix[:, 'agriculture':])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 189,
       "text": [
        "0.78897849462365599"
       ]
      }
     ],
     "prompt_number": 189
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "# The overall accuracy of the model (Jaccard similarity score) is about 79%. The accuracy could be improved further with cross-validation and grid search."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}