Skip to content

Instantly share code, notes, and snippets.

@mickaellegal
Created January 27, 2014 17:38
Show Gist options
  • Save mickaellegal/8653451 to your computer and use it in GitHub Desktop.
Save mickaellegal/8653451 to your computer and use it in GitHub Desktop.
iPython Notebook: 50onRed test
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Importing the libaries"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Importing the libraries\n",
"import pandas as pd\n",
"import numpy as np"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 55
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import warnings\n",
"warnings.filterwarnings(\"ignore\")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 39
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Loading and formatting the data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Loading the training set \n",
"train_data = pd.read_csv(\"training-data-set.csv\", sep=\" \")\n",
"\n",
"train_data.head(10)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>cat1</th>\n",
" <th>cat2</th>\n",
" <th>cat3</th>\n",
" <th>cat4</th>\n",
" <th>cat5</th>\n",
" <th>cat6</th>\n",
" <th>cat7</th>\n",
" <th>num1</th>\n",
" <th>num2</th>\n",
" <th>num3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> b</td>\n",
" <td> 1.053900</td>\n",
" <td>-0.062460</td>\n",
" <td> 0.508648</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> y</td>\n",
" <td> y</td>\n",
" <td> a</td>\n",
" <td> d</td>\n",
" <td>-0.575898</td>\n",
" <td> 1.053315</td>\n",
" <td> 2.100263</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> y</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> a</td>\n",
" <td> 0.392731</td>\n",
" <td>-0.395918</td>\n",
" <td> 1.813869</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> n</td>\n",
" <td> y</td>\n",
" <td> c</td>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> d</td>\n",
" <td> 1.255048</td>\n",
" <td> 0.812365</td>\n",
" <td> 0.115558</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> n</td>\n",
" <td> y</td>\n",
" <td> a</td>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> a</td>\n",
" <td>-0.848028</td>\n",
" <td> 1.575932</td>\n",
" <td> 0.407990</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> n</td>\n",
" <td> y</td>\n",
" <td> a</td>\n",
" <td> c</td>\n",
" <td>-2.000425</td>\n",
" <td> 0.168658</td>\n",
" <td> 1.089865</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td> n</td>\n",
" <td> y</td>\n",
" <td> c</td>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> d</td>\n",
" <td> 1.986990</td>\n",
" <td> 0.100123</td>\n",
" <td>-0.156572</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> c</td>\n",
" <td> a</td>\n",
" <td> 0.179694</td>\n",
" <td>-0.207595</td>\n",
" <td> 0.150446</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td> n</td>\n",
" <td> y</td>\n",
" <td> b</td>\n",
" <td> n</td>\n",
" <td> y</td>\n",
" <td> a</td>\n",
" <td> a</td>\n",
" <td>-0.287543</td>\n",
" <td> 1.227005</td>\n",
" <td> 1.037588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td> n</td>\n",
" <td> y</td>\n",
" <td> a</td>\n",
" <td> n</td>\n",
" <td> y</td>\n",
" <td> b</td>\n",
" <td> a</td>\n",
" <td> 0.018208</td>\n",
" <td>-0.942384</td>\n",
" <td>-0.494788</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 12,
"text": [
" cat1 cat2 cat3 cat4 cat5 cat6 cat7 num1 num2 num3\n",
"0 n n a n n a b 1.053900 -0.062460 0.508648\n",
"1 n n a y y a d -0.575898 1.053315 2.100263\n",
"2 y n a n n a a 0.392731 -0.395918 1.813869\n",
"3 n y c n n a d 1.255048 0.812365 0.115558\n",
"4 n y a n n a a -0.848028 1.575932 0.407990\n",
"5 n n a n y a c -2.000425 0.168658 1.089865\n",
"6 n y c n n a d 1.986990 0.100123 -0.156572\n",
"7 n n a n n c a 0.179694 -0.207595 0.150446\n",
"8 n y b n y a a -0.287543 1.227005 1.037588\n",
"9 n y a n y b a 0.018208 -0.942384 -0.494788"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Converting the categorical variables into numercial variables\n",
"categorical_values = set()\n",
"for i in train_data['cat1']:\n",
" categorical_values.add(i)\n",
"for j in train_data['cat7']:\n",
" categorical_values.add(j)\n",
" \n",
"print categorical_values "
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"set(['a', 'c', 'b', 'd', 'n', 'y'])\n"
]
}
],
"prompt_number": 30
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# We assign a numerical value to each of the categorical values\n",
"train_data = train_data.replace(['a','b','c','d','n','y'], [1,2,3,4,5,6])\n",
"\n",
"train_data.head(10)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>cat1</th>\n",
" <th>cat2</th>\n",
" <th>cat3</th>\n",
" <th>cat4</th>\n",
" <th>cat5</th>\n",
" <th>cat6</th>\n",
" <th>cat7</th>\n",
" <th>num1</th>\n",
" <th>num2</th>\n",
" <th>num3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 1.053900</td>\n",
" <td>-0.062460</td>\n",
" <td> 0.508648</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 6</td>\n",
" <td> 6</td>\n",
" <td> 1</td>\n",
" <td> 4</td>\n",
" <td>-0.575898</td>\n",
" <td> 1.053315</td>\n",
" <td> 2.100263</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 6</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 0.392731</td>\n",
" <td>-0.395918</td>\n",
" <td> 1.813869</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 3</td>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 4</td>\n",
" <td> 1.255048</td>\n",
" <td> 0.812365</td>\n",
" <td> 0.115558</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 1</td>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td>-0.848028</td>\n",
" <td> 1.575932</td>\n",
" <td> 0.407990</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 1</td>\n",
" <td> 3</td>\n",
" <td>-2.000425</td>\n",
" <td> 0.168658</td>\n",
" <td> 1.089865</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 3</td>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 4</td>\n",
" <td> 1.986990</td>\n",
" <td> 0.100123</td>\n",
" <td>-0.156572</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 3</td>\n",
" <td> 1</td>\n",
" <td> 0.179694</td>\n",
" <td>-0.207595</td>\n",
" <td> 0.150446</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 2</td>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td>-0.287543</td>\n",
" <td> 1.227005</td>\n",
" <td> 1.037588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 1</td>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 2</td>\n",
" <td> 1</td>\n",
" <td> 0.018208</td>\n",
" <td>-0.942384</td>\n",
" <td>-0.494788</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 31,
"text": [
" cat1 cat2 cat3 cat4 cat5 cat6 cat7 num1 num2 num3\n",
"0 5 5 1 5 5 1 2 1.053900 -0.062460 0.508648\n",
"1 5 5 1 6 6 1 4 -0.575898 1.053315 2.100263\n",
"2 6 5 1 5 5 1 1 0.392731 -0.395918 1.813869\n",
"3 5 6 3 5 5 1 4 1.255048 0.812365 0.115558\n",
"4 5 6 1 5 5 1 1 -0.848028 1.575932 0.407990\n",
"5 5 5 1 5 6 1 3 -2.000425 0.168658 1.089865\n",
"6 5 6 3 5 5 1 4 1.986990 0.100123 -0.156572\n",
"7 5 5 1 5 5 3 1 0.179694 -0.207595 0.150446\n",
"8 5 6 2 5 6 1 1 -0.287543 1.227005 1.037588\n",
"9 5 6 1 5 6 2 1 0.018208 -0.942384 -0.494788"
]
}
],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Loading the training labels\n",
"\n",
"train_labels = pd.read_csv(\"training-data-labels.csv\")\n",
"\n",
"train_labels.head(5)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
" label\n",
"0 0\n",
"1 1\n",
"2 1\n",
"3 0\n",
"4 1"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Loading the test set\n",
"\n",
"test_data = pd.read_csv(\"test-data-set.csv\", sep=\" \")\n",
"\n",
"test_data.head(5)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>cat1</th>\n",
" <th>cat2</th>\n",
" <th>cat3</th>\n",
" <th>cat4</th>\n",
" <th>cat5</th>\n",
" <th>cat6</th>\n",
" <th>cat7</th>\n",
" <th>num1</th>\n",
" <th>num2</th>\n",
" <th>num3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> a</td>\n",
" <td> 0.171982</td>\n",
" <td>-0.022455</td>\n",
" <td> 0.668533</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> n</td>\n",
" <td> y</td>\n",
" <td> a</td>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> a</td>\n",
" <td> 0.301511</td>\n",
" <td> 0.119037</td>\n",
" <td>-0.292068</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> y</td>\n",
" <td> y</td>\n",
" <td> a</td>\n",
" <td> a</td>\n",
" <td>-0.441025</td>\n",
" <td> 1.052455</td>\n",
" <td> 0.820292</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> c</td>\n",
" <td> a</td>\n",
" <td> 0.421350</td>\n",
" <td> 0.223962</td>\n",
" <td>-0.187951</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> n</td>\n",
" <td> n</td>\n",
" <td> a</td>\n",
" <td> y</td>\n",
" <td> y</td>\n",
" <td> a</td>\n",
" <td> a</td>\n",
" <td>-0.390083</td>\n",
" <td> 0.556335</td>\n",
" <td>-1.434217</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
" cat1 cat2 cat3 cat4 cat5 cat6 cat7 num1 num2 num3\n",
"0 n n a n n a a 0.171982 -0.022455 0.668533\n",
"1 n y a n n a a 0.301511 0.119037 -0.292068\n",
"2 n n a y y a a -0.441025 1.052455 0.820292\n",
"3 n n a n n c a 0.421350 0.223962 -0.187951\n",
"4 n n a y y a a -0.390083 0.556335 -1.434217"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# We assign a numerical value to each of the categorical values\n",
"test_data = test_data.replace(['a','b','c','d','n','y'], [1,2,3,4,5,6])\n",
"\n",
"test_data.head(10)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>cat1</th>\n",
" <th>cat2</th>\n",
" <th>cat3</th>\n",
" <th>cat4</th>\n",
" <th>cat5</th>\n",
" <th>cat6</th>\n",
" <th>cat7</th>\n",
" <th>num1</th>\n",
" <th>num2</th>\n",
" <th>num3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 0.171982</td>\n",
" <td>-0.022455</td>\n",
" <td> 0.668533</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 1</td>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 0.301511</td>\n",
" <td> 0.119037</td>\n",
" <td>-0.292068</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 6</td>\n",
" <td> 6</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td>-0.441025</td>\n",
" <td> 1.052455</td>\n",
" <td> 0.820292</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 3</td>\n",
" <td> 1</td>\n",
" <td> 0.421350</td>\n",
" <td> 0.223962</td>\n",
" <td>-0.187951</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 6</td>\n",
" <td> 6</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td>-0.390083</td>\n",
" <td> 0.556335</td>\n",
" <td>-1.434217</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 6</td>\n",
" <td> 6</td>\n",
" <td> 2</td>\n",
" <td> 1</td>\n",
" <td>-0.207254</td>\n",
" <td> 0.405312</td>\n",
" <td> 0.185214</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 1</td>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 0.320936</td>\n",
" <td> 1.232641</td>\n",
" <td>-0.661283</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td>-0.718697</td>\n",
" <td> 0.905296</td>\n",
" <td> 0.838255</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 0.391449</td>\n",
" <td> 0.013134</td>\n",
" <td> 0.559273</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 3</td>\n",
" <td> 5</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 1.173640</td>\n",
" <td> 0.860782</td>\n",
" <td>-1.237148</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 74,
"text": [
" cat1 cat2 cat3 cat4 cat5 cat6 cat7 num1 num2 num3\n",
"0 5 5 1 5 5 1 1 0.171982 -0.022455 0.668533\n",
"1 5 6 1 5 5 1 1 0.301511 0.119037 -0.292068\n",
"2 5 5 1 6 6 1 1 -0.441025 1.052455 0.820292\n",
"3 5 5 1 5 5 3 1 0.421350 0.223962 -0.187951\n",
"4 5 5 1 6 6 1 1 -0.390083 0.556335 -1.434217\n",
"5 5 5 1 6 6 2 1 -0.207254 0.405312 0.185214\n",
"6 5 6 1 5 5 1 1 0.320936 1.232641 -0.661283\n",
"7 5 5 1 5 5 1 1 -0.718697 0.905296 0.838255\n",
"8 5 5 1 5 5 1 2 0.391449 0.013134 0.559273\n",
"9 5 5 3 5 5 1 2 1.173640 0.860782 -1.237148"
]
}
],
"prompt_number": 74
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Testing different classification models"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Testing different classifiers from the scikit-learn libraries\n",
"# Importing the different libraries\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.svm import SVC\n",
"from sklearn.metrics import roc_auc_score"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 57
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# I split the training set into a sub training set(75%) et test set (25%)\n",
"\n",
"from sklearn.cross_validation import train_test_split\n",
"x_train, x_test, y_train, y_test = train_test_split(train_data, train_labels, test_size=0.25)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 33
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"1 - SVM Model"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# create and train a classifier\n",
"SVM = SVC(gamma=0.001)\n",
"\n",
"# Fit the model\n",
"SVM.fit(x_train, y_train)\n",
"\n",
"# Return the accuracy of the model \n",
"accuracy = SVM.score(x_test, y_test)\n",
"print \"The accuracy score for the SVM model is:\" \n",
"print accuracy \n",
"\n",
"# Get the prediction\n",
"preds = SVM.predict(x_test)\n",
"\n",
"# Return the ROC AUC score\n",
"print \"The Area Under the Curve is:\" \n",
"roc_auc_score(y_test, preds)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"The accuracy score for the SVM model is:\n",
"0.6232\n",
"The Area Under the Curve is:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
},
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 67,
"text": [
"0.62780636827753056"
]
}
],
"prompt_number": 67
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"2 - Random Forest Model"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# create and train a classifier\n",
"RandomForest = RandomForestClassifier()\n",
"\n",
"# Fit the model\n",
"RandomForest.fit(x_train, y_train)\n",
"\n",
"# Return the accuracy of the model\n",
"accuracy = RandomForest.score(x_test, y_test)\n",
"print \"The accuracy of the Random Forest Model is:\"\n",
"print accuracy\n",
"\n",
"# Get the predictions\n",
"preds = RandomForest.predict(x_test)\n",
"\n",
"# Return the ROC AUC score\n",
"print \"The Area Under the Curve is:\" \n",
"roc_auc_score(y_test, preds)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"The accuracy of the Random Forest Model is:\n",
"0.7992\n",
"The Area Under the Curve is:\n"
]
},
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 72,
"text": [
"0.80056076107682528"
]
}
],
"prompt_number": 72
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"3 - K-Nearest Neighor Model"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# create and train a classifier\n",
"NearestNeighbor = KNeighborsClassifier()\n",
"\n",
"# Fit the model\n",
"NearestNeighbor.fit(x_train, y_train)\n",
"\n",
"# Return the accuracy of the model \n",
"accuracy = NearestNeighbor.score(x_test, y_test)\n",
"print \"The accuracy of the K-Nearest Neighbor Model is:\"\n",
"print accuracy\n",
"\n",
"# Get the predictions\n",
"preds = NearestNeighbor.predict(x_test)\n",
"\n",
"# Return the ROC AUC score\n",
"print \"The Area Under the Curve is:\" \n",
"roc_auc_score(y_test, preds)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"The accuracy of the K-Nearest Neighbor Model is:\n",
"0.7792\n",
"The Area Under the Curve is:\n"
]
},
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 71,
"text": [
"0.77948865150800661"
]
}
],
"prompt_number": 71
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Making predictions on the test set"
]
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"The Random Forest model is the one providing the highest prediction accuracy. \n",
"I will therefore use this model to make the predictions on the test set. "
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Predicitons made on the test set\n",
"final_preds = RandomForest.predict(test_data)\n",
"\n",
"# Dumping the results into a text file\n",
"np.savetxt('predictions_test_set.txt', final_preds, fmt='%i')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 84
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment