Skip to content

Instantly share code, notes, and snippets.

@devashishd12
Last active September 3, 2016 06:07
Show Gist options
  • Save devashishd12/32c4db65f565e9ebbd06ab1be210c05f to your computer and use it in GitHub Desktop.
Save devashishd12/32c4db65f565e9ebbd06ab1be210c05f to your computer and use it in GitHub Desktop.
ZS Data Science challenge notebook
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ZS Data Science Challenge"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from matplotlib import pyplot as plt\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def converttoint(table_name, column_name):\n",
"    \"\"\"\n",
"    Replace labels such as 'District 12' in a column with their integer part, in place.\n",
"\n",
"    Each value is expected to look like '<label> <number>'; the second\n",
"    whitespace-separated token is parsed as an int. Values that cannot be\n",
"    parsed (missing values, labels without a number) become NaN. Values that\n",
"    are already integers are kept, so re-running the cell does not wipe the column.\n",
"\n",
"    Args:\n",
"    ----\n",
"    table_name : DataFrame that owns the column (modified in place)\n",
"    column_name : Name of the column to convert\n",
"    \"\"\"\n",
"    def parse(value):\n",
"        # Already converted by an earlier run: keep the number as it is\n",
"        if isinstance(value, (int, np.integer)):\n",
"            return value\n",
"        try:\n",
"            return int(value.split()[1])\n",
"        except (AttributeError, IndexError, ValueError):\n",
"            # Not a '<label> <number>' string (e.g. no district available): mark as NaN\n",
"            return np.nan\n",
"\n",
"    column = table_name[column_name]\n",
"    # dtype=object keeps the parsed values as ints even when NaNs are present\n",
"    table_name[column_name] = pd.Series([parse(value) for value in column],\n",
"                                        index=column.index, dtype=object)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Hospital Profiling"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"hospital_profiling = pd.read_csv('/home/devashish/datasets/ZS/HospitalProfiling.csv')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>District_ID</th>\n",
" <th>Hospital_employees</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Hospital 1</td>\n",
" <td>District 12</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Hospital 1</td>\n",
" <td>District 13</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Hospital 1</td>\n",
" <td>District 15</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Hospital 1</td>\n",
" <td>District 16</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Hospital 1</td>\n",
" <td>District 19</td>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID District_ID Hospital_employees\n",
"0 Hospital 1 District 12 3\n",
"1 Hospital 1 District 13 6\n",
"2 Hospital 1 District 15 2\n",
"3 Hospital 1 District 16 3\n",
"4 Hospital 1 District 19 5"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospital_profiling.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"converttoint(hospital_profiling, 'Hospital_ID')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"converttoint(hospital_profiling, 'District_ID')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>District_ID</th>\n",
" <th>Hospital_employees</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>13</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>15</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>16</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>19</td>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID District_ID Hospital_employees\n",
"0 1 12 3\n",
"1 1 13 6\n",
"2 1 15 2\n",
"3 1 16 3\n",
"4 1 19 5"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospital_profiling.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Starting off with hospital revenue"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"hospital_revenue = pd.read_csv('/home/devashish/datasets/ZS/HospitalRevenue.csv')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>Region_ID</th>\n",
" <th>District_ID</th>\n",
" <th>Instrument_ID</th>\n",
" <th>Month 1</th>\n",
" <th>Month 2</th>\n",
" <th>Month 3</th>\n",
" <th>Month 4</th>\n",
" <th>Month 5</th>\n",
" <th>Month 6</th>\n",
" <th>Month 7</th>\n",
" <th>Month 8</th>\n",
" <th>Month 9</th>\n",
" <th>Month 10</th>\n",
" <th>Month 11</th>\n",
" <th>Month 12</th>\n",
" <th>Year Total</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Hospital 1</td>\n",
" <td>Region 1</td>\n",
" <td>District 12</td>\n",
" <td>Instrument 2</td>\n",
" <td>8534</td>\n",
" <td>9917</td>\n",
" <td>7825</td>\n",
" <td>11702</td>\n",
" <td>8776</td>\n",
" <td>7755</td>\n",
" <td>9289</td>\n",
" <td>7796</td>\n",
" <td>7595</td>\n",
" <td>8292</td>\n",
" <td>7787</td>\n",
" <td>8282</td>\n",
" <td>103550</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Hospital 1</td>\n",
" <td>Region 1</td>\n",
" <td>District 12</td>\n",
" <td>Instrument 3</td>\n",
" <td>298</td>\n",
" <td>298</td>\n",
" <td>214</td>\n",
" <td>311</td>\n",
" <td>261</td>\n",
" <td>223</td>\n",
" <td>237</td>\n",
" <td>171</td>\n",
" <td>173</td>\n",
" <td>183</td>\n",
" <td>193</td>\n",
" <td>0</td>\n",
" <td>2562</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Hospital 1</td>\n",
" <td>Region 1</td>\n",
" <td>District 13</td>\n",
" <td>Instrument 1</td>\n",
" <td>37</td>\n",
" <td>40</td>\n",
" <td>38</td>\n",
" <td>43</td>\n",
" <td>29</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Hospital 1</td>\n",
" <td>Region 1</td>\n",
" <td>District 13</td>\n",
" <td>Instrument 2</td>\n",
" <td>2486</td>\n",
" <td>3332</td>\n",
" <td>3193</td>\n",
" <td>2556</td>\n",
" <td>2108</td>\n",
" <td>2757</td>\n",
" <td>2639</td>\n",
" <td>2531</td>\n",
" <td>2771</td>\n",
" <td>2682</td>\n",
" <td>12317</td>\n",
" <td>1369</td>\n",
" <td>40741</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Hospital 1</td>\n",
" <td>Region 1</td>\n",
" <td>District 13</td>\n",
" <td>Instrument 3</td>\n",
" <td>857</td>\n",
" <td>892</td>\n",
" <td>739</td>\n",
" <td>759</td>\n",
" <td>736</td>\n",
" <td>415</td>\n",
" <td>1203</td>\n",
" <td>434</td>\n",
" <td>448</td>\n",
" <td>113</td>\n",
" <td>829</td>\n",
" <td>1124</td>\n",
" <td>8549</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID Region_ID District_ID Instrument_ID Month 1 Month 2 Month 3 \\\n",
"0 Hospital 1 Region 1 District 12 Instrument 2 8534 9917 7825 \n",
"1 Hospital 1 Region 1 District 12 Instrument 3 298 298 214 \n",
"2 Hospital 1 Region 1 District 13 Instrument 1 37 40 38 \n",
"3 Hospital 1 Region 1 District 13 Instrument 2 2486 3332 3193 \n",
"4 Hospital 1 Region 1 District 13 Instrument 3 857 892 739 \n",
"\n",
" Month 4 Month 5 Month 6 Month 7 Month 8 Month 9 Month 10 Month 11 \\\n",
"0 11702 8776 7755 9289 7796 7595 8292 7787 \n",
"1 311 261 223 237 171 173 183 193 \n",
"2 43 29 0 0 0 0 0 0 \n",
"3 2556 2108 2757 2639 2531 2771 2682 12317 \n",
"4 759 736 415 1203 434 448 113 829 \n",
"\n",
" Month 12 Year Total \n",
"0 8282 103550 \n",
"1 0 2562 \n",
"2 0 187 \n",
"3 1369 40741 \n",
"4 1124 8549 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospital_revenue.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"converttoint(hospital_revenue, 'Hospital_ID')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"converttoint(hospital_revenue, 'Region_ID')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"converttoint(hospital_revenue, 'District_ID')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"converttoint(hospital_revenue, 'Instrument_ID')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"hospital_revenue = hospital_revenue.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Hospital_ID 43846\n",
"Region_ID 43846\n",
"District_ID 43846\n",
"Instrument_ID 43846\n",
"Month 1 43846\n",
"Month 2 43846\n",
"Month 3 43846\n",
"Month 4 43846\n",
"Month 5 43846\n",
"Month 6 43846\n",
"Month 7 43846\n",
"Month 8 43846\n",
"Month 9 43846\n",
"Month 10 43846\n",
"Month 11 43846\n",
"Month 12 43846\n",
"Year Total 43846\n",
"dtype: int64"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospital_revenue.count()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"projected_revenue = pd.read_csv('/home/devashish/datasets/ZS/ProjectedRevenue.csv')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>District_ID</th>\n",
" <th>Instrument_ID</th>\n",
" <th>Annual_Projected_Revenue</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Hospital 1</td>\n",
" <td>District 13</td>\n",
" <td>Instrument 2</td>\n",
" <td>17164</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Hospital 1</td>\n",
" <td>District 13</td>\n",
" <td>Instrument 3</td>\n",
" <td>655645</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Hospital 1</td>\n",
" <td>District 13</td>\n",
" <td>Instrument 4</td>\n",
" <td>281452</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Hospital 1</td>\n",
" <td>District 2</td>\n",
" <td>Instrument 2</td>\n",
" <td>12199</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Hospital 1</td>\n",
" <td>District 20</td>\n",
" <td>Instrument 2</td>\n",
" <td>178128</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID District_ID Instrument_ID Annual_Projected_Revenue\n",
"0 Hospital 1 District 13 Instrument 2 17164\n",
"1 Hospital 1 District 13 Instrument 3 655645\n",
"2 Hospital 1 District 13 Instrument 4 281452\n",
"3 Hospital 1 District 2 Instrument 2 12199\n",
"4 Hospital 1 District 20 Instrument 2 178128"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"projected_revenue.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"converttoint(projected_revenue, 'Hospital_ID')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"converttoint(projected_revenue, 'District_ID')"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"converttoint(projected_revenue, 'Instrument_ID')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Hospital_ID 11410\n",
"District_ID 11320\n",
"Instrument_ID 11410\n",
"Annual_Projected_Revenue 11410\n",
"dtype: int64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"projected_revenue.count()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"projected_revenue = projected_revenue.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Hospital_ID 11320\n",
"District_ID 11320\n",
"Instrument_ID 11320\n",
"Annual_Projected_Revenue 11320\n",
"dtype: int64"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"projected_revenue.count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Unique Instrument_IDs"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([2, 3, 1, 4, 5, 6, 7, 8, 10, 11, 15, 20, 13], dtype=object)"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospital_revenue['Instrument_ID'].unique()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Unique District_IDs"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([12, 13, 16, 18, 19, 2, 20, 28, 3, 34, 35, 37, 39, 4, 41, 5, 50, 52,\n",
" 32, 45, 44, 29, 10, 14, 21, 27, 33, 38, 42, 46, 47, 48, 49, 51, 6,\n",
" 9, 17, 7, 15, 25, 30, 31, 43, 8, 40, 22, 23, 11, 24, 26, 1, 36], dtype=object)"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospital_revenue['District_ID'].unique()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As we can see below, no hospital uses all of the instruments."
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"Hospital_ID\n",
"1 [2, 3, 1, 4, 5, 6, 7, 8, 10, 11]\n",
"2 [1, 2, 3, 4, 6, 7, 5]\n",
"3 [1, 2, 3, 6, 7, 4, 5, 8]\n",
"4 [2, 3, 1, 4, 5, 8, 6, 7]\n",
"5 [1, 2, 3, 4, 6, 7, 5, 8]\n",
"Name: Instrument_ID, dtype: object"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospital_revenue.groupby(['Hospital_ID'])['Instrument_ID'].unique().head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Similarly, no hospital is present in all of the districts."
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"Hospital_ID\n",
"1 [12, 13, 16, 18, 19, 2, 20, 28, 3, 34, 35, 37,...\n",
"2 [13, 16, 20, 21, 28, 34, 41, 50]\n",
"3 [10, 13, 18, 29, 3, 32, 34, 35, 37, 4, 41, 44,...\n",
"4 [12, 13, 20, 28, 3, 34, 47, 8]\n",
"5 [10, 13, 16, 17, 18, 19, 20, 21, 24, 25, 27, 2...\n",
"Name: District_ID, dtype: object"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospital_revenue.groupby(['Hospital_ID'])['District_ID'].unique().head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Making final train dataframe by feature engineering"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train = pd.DataFrame()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train['Hospital_ID'] = hospital_revenue['Hospital_ID']"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train = train.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52</th>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60</th>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73</th>\n",
" <td>1000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>1001</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID\n",
"0 1\n",
"52 10\n",
"60 100\n",
"73 1000\n",
"100 1001"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. Adding total hospital employees into the mix"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train = pd.merge(train, hospital_profiling.groupby('Hospital_ID', as_index=False).sum(), on='Hospital_ID')"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Rename second column\n",
"train = train.rename(columns={'Hospital_employees':'Total_Hospital_employees'})"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>Total_Hospital_employees</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>1076</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100</td>\n",
" <td>4925</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1000</td>\n",
" <td>5756</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1001</td>\n",
" <td>4002</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID Total_Hospital_employees\n",
"0 1 13088\n",
"1 10 1076\n",
"2 100 4925\n",
"3 1000 5756\n",
"4 1001 4002"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. Add district ID with hospital employees in that district"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train['key'] = 1"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.DataFrame(hospital_profiling['District_ID'].unique())"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df['key'] = 1"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.columns = ['District_ID', 'key']"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train = pd.merge(train, df, on='key')"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"del train['key']"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Hospital_ID 74382\n",
"Total_Hospital_employees 74382\n",
"District_ID 74382\n",
"dtype: int64"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.count()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train = pd.merge(train, hospital_profiling.groupby(['Hospital_ID', 'District_ID'], as_index=False).sum(),\n",
" on=['Hospital_ID', 'District_ID'], how='left')"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train = train.fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>Total_Hospital_employees</th>\n",
" <th>District_ID</th>\n",
" <th>Hospital_employees</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>13</td>\n",
" <td>6.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>15</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>16</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>19</td>\n",
" <td>10.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID Total_Hospital_employees District_ID Hospital_employees\n",
"0 1 13088 12 3.0\n",
"1 1 13088 13 6.0\n",
"2 1 13088 15 2.0\n",
"3 1 13088 16 3.0\n",
"4 1 13088 19 10.0"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train = train.rename(columns={'Hospital_employees':'Hospital_employees_in_district'})"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>Total_Hospital_employees</th>\n",
" <th>District_ID</th>\n",
" <th>Hospital_employees_in_district</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>13</td>\n",
" <td>6.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>15</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>16</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>19</td>\n",
" <td>10.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID Total_Hospital_employees District_ID \\\n",
"0 1 13088 12 \n",
"1 1 13088 13 \n",
"2 1 13088 15 \n",
"3 1 13088 16 \n",
"4 1 13088 19 \n",
"\n",
" Hospital_employees_in_district \n",
"0 3.0 \n",
"1 6.0 \n",
"2 2.0 \n",
"3 3.0 \n",
"4 10.0 "
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add total number of hospitals in district"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train = pd.merge(train, hospital_profiling.groupby('District_ID', as_index=False).agg({'Hospital_ID' : np.count_nonzero}),\n",
" on='District_ID')"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train = train.rename(columns={'Hospital_ID_x':'Hospital_ID', 'Hospital_ID_y': 'Hospitals_in_District'})"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Hospital_ID 74382\n",
"Total_Hospital_employees 74382\n",
"District_ID 74382\n",
"Hospital_employees_in_district 74382\n",
"Hospitals_in_District 74382\n",
"dtype: int64"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add an Instrument_ID column (one row per unique Instrument_ID)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame(hospital_revenue['Instrument_ID'].unique())"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.columns = ['Instrument_ID']"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>Total_Hospital_employees</th>\n",
" <th>District_ID</th>\n",
" <th>Hospital_employees_in_district</th>\n",
" <th>Hospitals_in_District</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>1076</td>\n",
" <td>12</td>\n",
" <td>0.0</td>\n",
" <td>1086</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100</td>\n",
" <td>4925</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1000</td>\n",
" <td>5756</td>\n",
" <td>12</td>\n",
" <td>0.0</td>\n",
" <td>1086</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1001</td>\n",
" <td>4002</td>\n",
" <td>12</td>\n",
" <td>97.0</td>\n",
" <td>1086</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID Total_Hospital_employees District_ID \\\n",
"0 1 13088 12 \n",
"1 10 1076 12 \n",
"2 100 4925 12 \n",
"3 1000 5756 12 \n",
"4 1001 4002 12 \n",
"\n",
" Hospital_employees_in_district Hospitals_in_District \n",
"0 3.0 1086 \n",
"1 0.0 1086 \n",
"2 3.0 1086 \n",
"3 0.0 1086 \n",
"4 97.0 1086 "
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df['key'] = 1"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train['key'] = 1"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train = pd.merge(train, df, on='key')"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"del train['key']"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Hospital_ID 966966\n",
"Total_Hospital_employees 966966\n",
"District_ID 966966\n",
"Hospital_employees_in_district 966966\n",
"Hospitals_in_District 966966\n",
"Instrument_ID 966966\n",
"dtype: int64"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.count()"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>Total_Hospital_employees</th>\n",
" <th>District_ID</th>\n",
" <th>Hospital_employees_in_district</th>\n",
" <th>Hospitals_in_District</th>\n",
" <th>Instrument_ID</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID Total_Hospital_employees District_ID \\\n",
"0 1 13088 12 \n",
"1 1 13088 12 \n",
"2 1 13088 12 \n",
"3 1 13088 12 \n",
"4 1 13088 12 \n",
"\n",
" Hospital_employees_in_district Hospitals_in_District Instrument_ID \n",
"0 3.0 1086 2 \n",
"1 3.0 1086 3 \n",
"2 3.0 1086 1 \n",
"3 3.0 1086 4 \n",
"4 3.0 1086 5 "
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add total instrument demand"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.DataFrame(hospital_revenue[\"Instrument_ID\"].value_counts().reset_index())"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.columns = ['Instrument_ID', 'Total_Instr_Demand']"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train = pd.merge(train, df, on='Instrument_ID')"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>Total_Hospital_employees</th>\n",
" <th>District_ID</th>\n",
" <th>Hospital_employees_in_district</th>\n",
" <th>Hospitals_in_District</th>\n",
" <th>Instrument_ID</th>\n",
" <th>Total_Instr_Demand</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>1076</td>\n",
" <td>12</td>\n",
" <td>0.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100</td>\n",
" <td>4925</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1000</td>\n",
" <td>5756</td>\n",
" <td>12</td>\n",
" <td>0.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1001</td>\n",
" <td>4002</td>\n",
" <td>12</td>\n",
" <td>97.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID Total_Hospital_employees District_ID \\\n",
"0 1 13088 12 \n",
"1 10 1076 12 \n",
"2 100 4925 12 \n",
"3 1000 5756 12 \n",
"4 1001 4002 12 \n",
"\n",
" Hospital_employees_in_district Hospitals_in_District Instrument_ID \\\n",
"0 3.0 1086 2 \n",
"1 0.0 1086 2 \n",
"2 3.0 1086 2 \n",
"3 0.0 1086 2 \n",
"4 97.0 1086 2 \n",
"\n",
" Total_Instr_Demand \n",
"0 13635 \n",
"1 13635 \n",
"2 13635 \n",
"3 13635 \n",
"4 13635 "
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add instrument value"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Instrument_Value: each instrument's 'Year Total' summed over all of its\n",
"# revenue rows, divided by the mean of those per-instrument totals.\n",
"df = hospital_revenue[['Instrument_ID', 'Year Total']].groupby('Instrument_ID', as_index=False).sum()\n",
"mean = df['Year Total'].mean()\n",
"# Vectorised division; same values as applying x / mean element by element.\n",
"df['Instrument_Value'] = df['Year Total'] / mean\n",
"# Keep only the key and the derived feature for the merge into `train`.\n",
"df = df.drop('Year Total', axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Instrument_ID</th>\n",
" <th>Instrument_Value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0.230520</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>7.172774</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>1.250719</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>0.460606</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>1.275721</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Instrument_ID Instrument_Value\n",
"0 1 0.230520\n",
"1 2 7.172774\n",
"2 3 1.250719\n",
"3 4 0.460606\n",
"4 5 1.275721"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Inner join (the pd.merge default): rows of `train` whose Instrument_ID has no\n",
"# entry in `df` are dropped here.\n",
"train = train.merge(df, on='Instrument_ID', how='inner')"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>Total_Hospital_employees</th>\n",
" <th>District_ID</th>\n",
" <th>Hospital_employees_in_district</th>\n",
" <th>Hospitals_in_District</th>\n",
" <th>Instrument_ID</th>\n",
" <th>Total_Instr_Demand</th>\n",
" <th>Instrument_Value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>1076</td>\n",
" <td>12</td>\n",
" <td>0.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100</td>\n",
" <td>4925</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1000</td>\n",
" <td>5756</td>\n",
" <td>12</td>\n",
" <td>0.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1001</td>\n",
" <td>4002</td>\n",
" <td>12</td>\n",
" <td>97.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID Total_Hospital_employees District_ID \\\n",
"0 1 13088 12 \n",
"1 10 1076 12 \n",
"2 100 4925 12 \n",
"3 1000 5756 12 \n",
"4 1001 4002 12 \n",
"\n",
" Hospital_employees_in_district Hospitals_in_District Instrument_ID \\\n",
"0 3.0 1086 2 \n",
"1 0.0 1086 2 \n",
"2 3.0 1086 2 \n",
"3 0.0 1086 2 \n",
"4 97.0 1086 2 \n",
"\n",
" Total_Instr_Demand Instrument_Value \n",
"0 13635 7.172774 \n",
"1 13635 7.172774 \n",
"2 13635 7.172774 \n",
"3 13635 7.172774 \n",
"4 13635 7.172774 "
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add the median revenue for each instrument"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Instrument_Median: median 'Year Total' over the revenue rows of each instrument.\n",
"df = hospital_revenue[['Instrument_ID', 'Year Total']].groupby('Instrument_ID', as_index=False).median()\n",
"# Rename by name rather than by position so the result does not depend on column order.\n",
"df = df.rename(columns={'Year Total': 'Instrument_Median'})\n",
"train = pd.merge(train, df, on='Instrument_ID', how='left')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Join with the previous year's revenue"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"# Left-join the revenue columns (without Region_ID) onto `train` by hospital,\n",
"# district and instrument, then replace every remaining NaN in the frame with 0.\n",
"revenue_keys = ['Hospital_ID', 'District_ID', 'Instrument_ID']\n",
"train = train.merge(hospital_revenue.drop('Region_ID', axis=1),\n",
"                    on=revenue_keys,\n",
"                    how='left').fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>Total_Hospital_employees</th>\n",
" <th>District_ID</th>\n",
" <th>Hospital_employees_in_district</th>\n",
" <th>Hospitals_in_District</th>\n",
" <th>Instrument_ID</th>\n",
" <th>Total_Instr_Demand</th>\n",
" <th>Instrument_Value</th>\n",
" <th>Instrument_Median</th>\n",
" <th>Month 1</th>\n",
" <th>...</th>\n",
" <th>Month 4</th>\n",
" <th>Month 5</th>\n",
" <th>Month 6</th>\n",
" <th>Month 7</th>\n",
" <th>Month 8</th>\n",
" <th>Month 9</th>\n",
" <th>Month 10</th>\n",
" <th>Month 11</th>\n",
" <th>Month 12</th>\n",
" <th>Year Total</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" <td>8656.0</td>\n",
" <td>8534.0</td>\n",
" <td>...</td>\n",
" <td>11702.0</td>\n",
" <td>8776.0</td>\n",
" <td>7755.0</td>\n",
" <td>9289.0</td>\n",
" <td>7796.0</td>\n",
" <td>7595.0</td>\n",
" <td>8292.0</td>\n",
" <td>7787.0</td>\n",
" <td>8282.0</td>\n",
" <td>103550.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>1076</td>\n",
" <td>12</td>\n",
" <td>0.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" <td>8656.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100</td>\n",
" <td>4925</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" <td>8656.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1000</td>\n",
" <td>5756</td>\n",
" <td>12</td>\n",
" <td>0.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" <td>8656.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1001</td>\n",
" <td>4002</td>\n",
" <td>12</td>\n",
" <td>97.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" <td>8656.0</td>\n",
" <td>1047.0</td>\n",
" <td>...</td>\n",
" <td>338.0</td>\n",
" <td>1303.0</td>\n",
" <td>1067.0</td>\n",
" <td>973.0</td>\n",
" <td>761.0</td>\n",
" <td>630.0</td>\n",
" <td>821.0</td>\n",
" <td>666.0</td>\n",
" <td>841.0</td>\n",
" <td>10453.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" Hospital_ID Total_Hospital_employees District_ID \\\n",
"0 1 13088 12 \n",
"1 10 1076 12 \n",
"2 100 4925 12 \n",
"3 1000 5756 12 \n",
"4 1001 4002 12 \n",
"\n",
" Hospital_employees_in_district Hospitals_in_District Instrument_ID \\\n",
"0 3.0 1086 2 \n",
"1 0.0 1086 2 \n",
"2 3.0 1086 2 \n",
"3 0.0 1086 2 \n",
"4 97.0 1086 2 \n",
"\n",
" Total_Instr_Demand Instrument_Value Instrument_Median Month 1 \\\n",
"0 13635 7.172774 8656.0 8534.0 \n",
"1 13635 7.172774 8656.0 0.0 \n",
"2 13635 7.172774 8656.0 0.0 \n",
"3 13635 7.172774 8656.0 0.0 \n",
"4 13635 7.172774 8656.0 1047.0 \n",
"\n",
" ... Month 4 Month 5 Month 6 Month 7 Month 8 Month 9 Month 10 \\\n",
"0 ... 11702.0 8776.0 7755.0 9289.0 7796.0 7595.0 8292.0 \n",
"1 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"3 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"4 ... 338.0 1303.0 1067.0 973.0 761.0 630.0 821.0 \n",
"\n",
" Month 11 Month 12 Year Total \n",
"0 7787.0 8282.0 103550.0 \n",
"1 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 \n",
"4 666.0 841.0 10453.0 \n",
"\n",
"[5 rows x 22 columns]"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Seed the target column with the yearly revenue total.\n",
"train['Buy_or_not'] = train['Year Total']"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>Total_Hospital_employees</th>\n",
" <th>District_ID</th>\n",
" <th>Hospital_employees_in_district</th>\n",
" <th>Hospitals_in_District</th>\n",
" <th>Instrument_ID</th>\n",
" <th>Total_Instr_Demand</th>\n",
" <th>Instrument_Value</th>\n",
" <th>Instrument_Median</th>\n",
" <th>Month 1</th>\n",
" <th>...</th>\n",
" <th>Month 5</th>\n",
" <th>Month 6</th>\n",
" <th>Month 7</th>\n",
" <th>Month 8</th>\n",
" <th>Month 9</th>\n",
" <th>Month 10</th>\n",
" <th>Month 11</th>\n",
" <th>Month 12</th>\n",
" <th>Year Total</th>\n",
" <th>Buy_or_not</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" <td>8656.0</td>\n",
" <td>8534.0</td>\n",
" <td>...</td>\n",
" <td>8776.0</td>\n",
" <td>7755.0</td>\n",
" <td>9289.0</td>\n",
" <td>7796.0</td>\n",
" <td>7595.0</td>\n",
" <td>8292.0</td>\n",
" <td>7787.0</td>\n",
" <td>8282.0</td>\n",
" <td>103550.0</td>\n",
" <td>103550.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>1076</td>\n",
" <td>12</td>\n",
" <td>0.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" <td>8656.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100</td>\n",
" <td>4925</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" <td>8656.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1000</td>\n",
" <td>5756</td>\n",
" <td>12</td>\n",
" <td>0.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" <td>8656.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1001</td>\n",
" <td>4002</td>\n",
" <td>12</td>\n",
" <td>97.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" <td>8656.0</td>\n",
" <td>1047.0</td>\n",
" <td>...</td>\n",
" <td>1303.0</td>\n",
" <td>1067.0</td>\n",
" <td>973.0</td>\n",
" <td>761.0</td>\n",
" <td>630.0</td>\n",
" <td>821.0</td>\n",
" <td>666.0</td>\n",
" <td>841.0</td>\n",
" <td>10453.0</td>\n",
" <td>10453.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" Hospital_ID Total_Hospital_employees District_ID \\\n",
"0 1 13088 12 \n",
"1 10 1076 12 \n",
"2 100 4925 12 \n",
"3 1000 5756 12 \n",
"4 1001 4002 12 \n",
"\n",
" Hospital_employees_in_district Hospitals_in_District Instrument_ID \\\n",
"0 3.0 1086 2 \n",
"1 0.0 1086 2 \n",
"2 3.0 1086 2 \n",
"3 0.0 1086 2 \n",
"4 97.0 1086 2 \n",
"\n",
" Total_Instr_Demand Instrument_Value Instrument_Median Month 1 \\\n",
"0 13635 7.172774 8656.0 8534.0 \n",
"1 13635 7.172774 8656.0 0.0 \n",
"2 13635 7.172774 8656.0 0.0 \n",
"3 13635 7.172774 8656.0 0.0 \n",
"4 13635 7.172774 8656.0 1047.0 \n",
"\n",
" ... Month 5 Month 6 Month 7 Month 8 Month 9 Month 10 \\\n",
"0 ... 8776.0 7755.0 9289.0 7796.0 7595.0 8292.0 \n",
"1 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"3 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"4 ... 1303.0 1067.0 973.0 761.0 630.0 821.0 \n",
"\n",
" Month 11 Month 12 Year Total Buy_or_not \n",
"0 7787.0 8282.0 103550.0 103550.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 \n",
"4 666.0 841.0 10453.0 10453.0 \n",
"\n",
"[5 rows x 23 columns]"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Binarise the target directly on `train`: any non-zero value becomes 1, zeros stay 0.\n",
"# Assigning through .loc replaces the chained assignment on a column slice, which\n",
"# raised SettingWithCopyWarning and is not guaranteed to write back to `train`.\n",
"train.loc[train['Buy_or_not'] != 0, 'Buy_or_not'] = 1\n",
"# Keep `df` bound to the target column, as before.\n",
"df = train['Buy_or_not']"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>Total_Hospital_employees</th>\n",
" <th>District_ID</th>\n",
" <th>Hospital_employees_in_district</th>\n",
" <th>Hospitals_in_District</th>\n",
" <th>Instrument_ID</th>\n",
" <th>Total_Instr_Demand</th>\n",
" <th>Instrument_Value</th>\n",
" <th>Instrument_Median</th>\n",
" <th>Month 1</th>\n",
" <th>...</th>\n",
" <th>Month 5</th>\n",
" <th>Month 6</th>\n",
" <th>Month 7</th>\n",
" <th>Month 8</th>\n",
" <th>Month 9</th>\n",
" <th>Month 10</th>\n",
" <th>Month 11</th>\n",
" <th>Month 12</th>\n",
" <th>Year Total</th>\n",
" <th>Buy_or_not</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>13088</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" <td>8656.0</td>\n",
" <td>8534.0</td>\n",
" <td>...</td>\n",
" <td>8776.0</td>\n",
" <td>7755.0</td>\n",
" <td>9289.0</td>\n",
" <td>7796.0</td>\n",
" <td>7595.0</td>\n",
" <td>8292.0</td>\n",
" <td>7787.0</td>\n",
" <td>8282.0</td>\n",
" <td>103550.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>1076</td>\n",
" <td>12</td>\n",
" <td>0.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" <td>8656.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100</td>\n",
" <td>4925</td>\n",
" <td>12</td>\n",
" <td>3.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" <td>8656.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1000</td>\n",
" <td>5756</td>\n",
" <td>12</td>\n",
" <td>0.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" <td>8656.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1001</td>\n",
" <td>4002</td>\n",
" <td>12</td>\n",
" <td>97.0</td>\n",
" <td>1086</td>\n",
" <td>2</td>\n",
" <td>13635</td>\n",
" <td>7.172774</td>\n",
" <td>8656.0</td>\n",
" <td>1047.0</td>\n",
" <td>...</td>\n",
" <td>1303.0</td>\n",
" <td>1067.0</td>\n",
" <td>973.0</td>\n",
" <td>761.0</td>\n",
" <td>630.0</td>\n",
" <td>821.0</td>\n",
" <td>666.0</td>\n",
" <td>841.0</td>\n",
" <td>10453.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" Hospital_ID Total_Hospital_employees District_ID \\\n",
"0 1 13088 12 \n",
"1 10 1076 12 \n",
"2 100 4925 12 \n",
"3 1000 5756 12 \n",
"4 1001 4002 12 \n",
"\n",
" Hospital_employees_in_district Hospitals_in_District Instrument_ID \\\n",
"0 3.0 1086 2 \n",
"1 0.0 1086 2 \n",
"2 3.0 1086 2 \n",
"3 0.0 1086 2 \n",
"4 97.0 1086 2 \n",
"\n",
" Total_Instr_Demand Instrument_Value Instrument_Median Month 1 \\\n",
"0 13635 7.172774 8656.0 8534.0 \n",
"1 13635 7.172774 8656.0 0.0 \n",
"2 13635 7.172774 8656.0 0.0 \n",
"3 13635 7.172774 8656.0 0.0 \n",
"4 13635 7.172774 8656.0 1047.0 \n",
"\n",
" ... Month 5 Month 6 Month 7 Month 8 Month 9 Month 10 \\\n",
"0 ... 8776.0 7755.0 9289.0 7796.0 7595.0 8292.0 \n",
"1 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"3 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"4 ... 1303.0 1067.0 973.0 761.0 630.0 821.0 \n",
"\n",
" Month 11 Month 12 Year Total Buy_or_not \n",
"0 7787.0 8282.0 103550.0 1.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 \n",
"4 666.0 841.0 10453.0 1.0 \n",
"\n",
"[5 rows x 23 columns]"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train the model"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"predictors = ['Hospital_ID', 'Total_Hospital_employees', 'District_ID', 'Hospital_employees_in_district',\n",
" 'Hospitals_in_District', 'Instrument_ID', 'Total_Instr_Demand', 'Instrument_Value', 'Instrument_Median']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Machine Learning"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/devashish/EXPERIMENTATION/scikit-learn/sklearn/cross_validation.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
" \"This module will be removed in 0.20.\", DeprecationWarning)\n"
]
}
],
"source": [
"from sklearn.feature_selection import SelectKBest, f_classif, f_regression\n",
"from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n",
"from sklearn.linear_model import LogisticRegression, LinearRegression\n",
"from sklearn import cross_validation\n",
"from sklearn import metrics"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"selector = SelectKBest(f_classif, k=5)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"SelectKBest(k=5, score_func=<function f_classif at 0x7f27ab209c08>)"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"selector.fit(train[predictors], train['Buy_or_not'])"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"scores = -np.log10(selector.pvalues_)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXkAAAGQCAYAAABLSBB3AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3WmYZFWV7vH/WyDKLI7VAjIoItoIKoMDrdUqOF5BbWlR\nHHBoJxTx3m5Ar1LS2qjdjqDtVREBRxAV6FYElUQGlbGYRQRRRBlEhkJapeS9H/aJqsgkMiIyqjLO\niVPv73niqYwTEeQiMnPFPvvsvZZsExER7bSg7gAiImL+JMlHRLRYknxERIslyUdEtFiSfEREiyXJ\nR0S02MAkL2kTST+UdJmkSyS9rTp+sKTfSLqguj2n6zUHSbpK0hWSdpvP/4GIiJidBq2Tl7QQWGh7\niaT1gPOB3YF/BJba/uiM528DfAXYEdgE+D6wlbMgPyJi7AaO5G3fYHtJ9fWdwBXAxtXD6vGS3YGv\n2V5m+1rgKmCnVRNuRETMxZzm5CVtDmwP/LQ6tK+kJZI+L2nD6tjGwHVdL7ueFR8KERExRkMn+Wqq\n5hvAftWI/tPAlra3B24APjI/IUZExKjWHOZJktakJPhjbJ8AYPvmrqd8Djip+vp6YNOuxzapjs38\nb2aOPiJiBLZ7TZX3NOxI/gvA5bY/0TlQXZDteDFwafX1icDLJK0laQvgkcA5swTa+NvBBx9cewyJ\nM3FOcpyTEOMkxTlXA0fykp4KvAK4RNKFgIF3AS+XtD1wD3At8MYqcV8u6VjgcuBu4C0eJbKIiFhp\nA5O87bOANXo8dHKf1xwKHLoScUVExCqQHa8DLFq0qO4QhpI4V63EuepMQowwOXHO1cDNUPP2jaXM\n4kREzJEkPA8XXiMiYgIlyUdEtFiSfEREiyXJR0S0WJJ8RESLJclHRLRYknxERIslyUdEtFiSfERE\niyXJR0S0WJJ8RESLJclHRLRYknxERIslyUdEtFiSfEREiyXJR0S0WJJ8RESLJclHRLRYknxERIsl\nyUdEtFiS/AALF26OpNpvCxduXvdbERETSLbr+caS6/recyEJaEKcYhLer4iYX5KwrWGfn5F8RESL\nJclHRLRYknxERIslyUdEtFiSfEREiyXJR0S0WJJ8RESLJclHRLRYknxERIslyUdEtFiSfEREiyXJ\nR0S02MAkL2kTST+UdJmkSyS9vTq+kaRTJF0p6XuSNux6zUGSrpJ0haTd5vN/ICIiZjewCqWkhcBC\n20skrQecD+wO7APcYvvDkg4ANrJ9oKTHAF8GdgQ2Ab4PbDWz5GSqUM5VqlBGxDxUobR9g+0l1dd3\nAldQkvfuwFHV044C9qi+fiHwNdvLbF8LXAXsNPT/QURErDJzmpOXtDmwPfAT4KG2b4TyQQA8pHra\nxsB1XS+7vjoWERFjtuawT6ymar4B7Gf7Tkkz5w7mPJewePHi5V8vWrSIRYsWzfU/ERHRalNTU0xN\nTY38+qE6Q0laE/gv4Lu2P1EduwJYZPvGat7+NNvbSDoQsO0PVc87GTjY9k9n/DczJz8nmZOPiPnr\nDPUF4PJOgq+cCLym+vrVwAldx18maS1JWwCPBM4ZNqCIiFh1hlld81TgR8AllCGtgXdREvexwKbA\nr4A9bd9WveYg4HXA3ZTpnVN6/Hczkp+TjOQjYu4j+TTyHiBJPiKaJI28IyJiuST5iIgWS5KPiGix\nJPmIiBZLko+IaLEk+YiIFkuSj4hosST5iIgWS5KPiGixJPmIiBZLko+IaLEk+YiIFkuSj4hosST5\niIgWS5KPiGixJPmIiBZLko+IaLEk+YiIFkuSj4hosST5iIgWS5KPiGixJPmIiBZLko+IaLEk+YiI\nFkuSj4hosST5iIgWS5KPiGixJPmIiBZLko+IaLEk+YiIFkuSj4hosST5iIgWS5KPiGixJPmIiBZL\nko+IaLGBSV7SEZJulHRx17GDJf1G0gXV7Tld
jx0k6SpJV0jabb4Cj4iIwYYZyR8JPLvH8Y/afkJ1\nOxlA0jbAnsA2wHOBT0vSKos2IiLmZGCSt30mcGuPh3ol792Br9leZvta4Cpgp5WKMCIiRrYyc/L7\nSloi6fOSNqyObQxc1/Wc66tjERFRg1GT/KeBLW1vD9wAfGTVhRQREavKmqO8yPbNXXc/B5xUfX09\nsGnXY5tUx3pavHjx8q8XLVrEokWLRgknIqK1pqammJqaGvn1sj34SdLmwEm2t63uL7R9Q/X1/sCO\ntl8u6THAl4GdKdM0pwJbucc3kdTrcOOU68ZNiFNMwvsVEfNLEraHXtAycCQv6SvAIuCBkn4NHAz8\nvaTtgXuAa4E3Ati+XNKxwOXA3cBbJiKTR0S01FAj+Xn5xhnJz1FG8hEx95F8drxGRLRYknxERIsl\nyUdEtFiSfEREiyXJR0S0WJJ8RESLJclHRLRYknxERIslyUdEtFiSfEREiyXJR0S0WJJ8RESLJclH\nRLRYknxERIslyUdEtFiSfEREiyXJR0S0WJJ8RESLJclHRLRYknxERIslyUdEtFiSfEREiyXJR0S0\nWJJ8RESLJclHRLRYknxERIslyUdEtFiSfEREiyXJR0S0WJJ8RESLJclHRLRYknxERIslyUdEtFiS\nfEREiyXJR0S0WJJ8RESLDUzyko6QdKOki7uObSTpFElXSvqepA27HjtI0lWSrpC023wFHhERgw0z\nkj8SePaMYwcC37e9NfBD4CAASY8B9gS2AZ4LfFqSVl24ERExFwOTvO0zgVtnHN4dOKr6+ihgj+rr\nFwJfs73M9rXAVcBOqybUiIiYq1Hn5B9i+0YA2zcAD6mObwxc1/W866tjERFRg1V14dWr6L8TERGr\n0Jojvu5GSQ+1faOkhcBN1fHrgU27nrdJdaynxYsXL/960aJFLFq0aMRwIiLaaWpqiqmpqZFfL3vw\nIFzS5sBJtret7n8I+IPtD0k6ANjI9oHVhdcvAztTpmlOBbZyj28iqdfhxinXjZsQp5iE9ysi5pck\nbA+9oGXgSF7SV4BFwAMl/Ro4GPggcJyk1wK/oqyowfblko4FLgfuBt4yEZk8IqKlhhrJz8s3zkh+\njjKSj4i5j+Sz4zUiosWS5CMiWixJPiKixZLkIyJaLEk+IqLFkuQjIlosST4iosWS5CMiWixJPiKi\nxZLkIyJaLEk+IqLFkuQjIlosST4iosWS5CMiWixJPiKixZLkIyJaLEk+IqLFkuQjIlosST4iosWS\n5CMiWixJPiKixZLkIyJaLEk+IqLFkuQjIlosST4iosWS5CMiWixJPiKixZLkIyJaLEk+IqLFkuQj\nIlosST4iosWS5CMiWixJPiKixZLkIyJaLEk+IqLFkuQjIlpszZV5saRrgduBe4C7be8kaSPg68Bm\nwLXAnrZvX8k4IyJiBCs7kr8HWGT78bZ3qo4dCHzf9tbAD4GDVvJ7RETEiFY2yavHf2N34Kjq66OA\nPVbye0RExIhWNskbOFXSuZJeXx17qO0bAWzfADxkJb9HRESMaKXm5IGn2v6dpAcDp0i6kpL4u828\nv9zixYuXf71o0SIWLVq0kuFERLTL1NQUU1NTI79e9qw5eG7/Ielg4E7g9ZR5+hslLQROs71Nj+d7\nVX3v+SSJPp9TYyQm4f2KiPklCdsa9vkjT9dIWkfSetXX6wK7AZcAJwKvqZ72auCEUb9HRESsnJFH\n8pK2AL5FGeauCXzZ9gclPQA4FtgU+BVlCeVtPV6fkfycZCQfEXMfya+y6Zq5SpKfqyT5iBjjdE1E\nRDRfknxERIslyUdEtFiSfEREiyXJR0S0WJJ8RESLJclHRLRYknxLLFy4OZJqvy1cuHndb0VEdMlm\nqAEmZTPUpMQZESsnm6EiImK5JPmIiBZLko+IaLEk+YiIFkuSj4hosST5iIgWS5KPiGixJPmIiBZL\nko+IaLEk
+YiIFkuSj4hosST5iIgWS5KPiGixJPmIiBZLko+xakLd+9S8j9VJ6skPMCl12hPnXKTm\nfUyu1JOPiIjlkuQjIlosST4iosWS5CMiWixJPiKixZLkIyJaLEk+IqLFkuQjIlosST4iosWS5CMi\nWmzekryk50j6maSfSzpgvr5PRETMbl6SvKQFwOHAs4HHAntJevR8fK+I+TA1NVV3CEOZhDgnIUaY\nnDjnar5G8jsBV9n+le27ga8Bu8/T94pY5SblD34S4pyEGGFy4pyr+UryGwPXdd3/TXUsIiLGaM06\nv3kpO1uvhz50M2644dq6w4iImBfzUk9e0pOAxbafU90/ELDtD3U9JwW9IyJGMJd68vOV5NcArgSe\nCfwOOAfYy/YVq/ybRUTErOZlusb2XyXtC5xCmfc/Igk+ImL8amv/FxER8y87XiMiWixJvgdJa6ta\n+iPpEZKeJ6nWlUgREaPIdE0Pks4DngZsCPwEuABYavtVtQbWg6QHA9i+ue5YZiPpGNuvHHSsbpK2\nsP3LQcdidpIuBGZNKrafMMZwgjGvk5e0NfBPQKfEwRXA52xfOc44hrDA9l2SXgv8p+0PSlpSd1Ad\n1VnGwcC+lLMxSVoGHGb7kFqD6+2x3Xeq1VdPrCmWfo4HZiahb9CgWCX9PfA2YOvq0BXA4banagtq\nun+o/n0TsAZwTHX/FcBfa4loAEnrAP8beLjtN0jaCtja9n/VHNo0ku4LvATYnK7cPehvfmzTNZKe\nDEwBS4HPAp8D/gicVq2rb5IFknak/GJ2ftBr1BjPTPsDTwV2tP0A2xsBOwNPlbR/vaGtIOkgSUuB\nx0m6o7otBW4CTqg5vOUkPVrSS4ANJb246/Ya4H41h7ecpOcDXwBOAl5O+f38DvAFSc+rM7YO21fb\nvhp4pu132r6wuv0fYNe645vFkcCfgSdX968H3l9fOLM6gVIeZhkld3Zu/dkeyw34LrCox/GnA98d\nVxxDxvoMyh/Pu6v7WwKfrjuurvguBB7U4/iDgQvrjq9HXIfWHcOA+Han/KHfUv3buX0SeErd8XXF\nOQVs1+P444DT645vRkwXAU/qur8zcFHdcc0S63nVvxd2HWtcrMClo7xunNM1j3CPU0rbp0v67Bjj\nGMj2D4EfVqdH2L4GeEu9UU1zH9u/n3nQ9s2S7lNHQAOcI2lD27cDSLo/5QP/2zXHBYDtE4ATJD3Z\n9o/rjqePhbYvmnnQ9sWSHlpHQH28HjhS0v0AAXcBr603pFn9RdLaVNcSJD2CMrJvmrMlbWv7krm8\naJyra5b2eWzwKccYSdpJ0iXAVdX97SQdVnNY3f4y4mN1ObiT4AFs30a5ptA0b6o+gACQtJGkL9QZ\n0Az9/k4a9Tdk+1zbf0sZwe9ke1vb59Yd1ywOBk4GNpX0ZeAHwL/UG1JPuwDnS7pS0sWSLpF08aAX\njXMkv6mkT/Y4LppXofKTwAuAbwPYvqi64NUU20m6o8dx0aA55C69BhNNXJL6uOoDCADbt0p6fJ0B\nzfAISSf2OC7KlGJjSFoL2IPqImGnGKHtf6sxrJ5snyrpAuBJlPdyv15nyg3w3FFeNM4/tH/u89h5\nY4tiOAts/2pGlczGrAyw3aSLwMM4T9JHgU9V998KnF9jPLNZIGkj27cCSHoAzfow6teT4T/GFsVw\nvgX8ifJzbszfTi+SnlZ92ZlteIwkbP+orph6sf0rAEkPYQ6DubH9Ats+alzfaxW4TtJOgKvlfm8D\nfl5zTMtVyWdWtv8wrliG9DbgPcDXq/unUhJ903wE+LGk4ygjun8APlBvSCvYPr3uGOZgs2q6ZhJ0\nD0DvR2l6dD5lAUZjSHoh5Xf0YZQVaptRltA+tu/rqqu2807SSfTfJPHCsQQyhOqT8pPAs6pD3wf2\nbcopnKRfUt7LXuVGbbtRp+6TRNJjWPHH/UPbl9cZT7fqOlG/v6HHjTGcvi
R9Hvhok96/YUnaFPi4\n7ZfUHUs3SRdRfje/b/vx1RTy3rZf1/d1Y0zyT+/3+ISNUiaCpMfavqzG7/9x2++Y7QO+KR/skjaw\nfcdsZ0hNOTOStFm/xzun801QfSA9CvgFZaWKKAOQxu94rTYbXmb7MXXH0k3SebZ3qJL9423fI+ki\n29v1e904p2uGSuKSjq/7E1TSIynzxwttbyfpccDzbR9aZ1wjOIZ77+Ac9/eH5s0Xz/QVyoX285n+\nYaTqfiPOjIZN4pJ+bPvJg585r/ao+fsPrVo51/m5LwC2p5QyaZrbJK0H/Aj4sqSbGGJVVeNq10i6\n0HatKxokTQHvAj5VnRaJshGh79xX0zTkvVwDONr2K+qMY5DqZ7yp7V/XHcvKasLPvaM6O1p+kdD2\nb2sMpydJr+66uwy41vZZdcUzG0nrUi5mi7LbeUPgy7Zv6fe6Jq0c6GjCp866ts/uWvZlSXfXHNMo\nan8vXRrIbCZpLdtNXMMPLP8Z/zewbd2xrAK1/9yrEgwfAzah7CR+GGXfyaP7va4Ok7IoxHb3qH3o\nmJuY5JvgFklbsGIH3B7ADfWGNNGuAc6q1ngv/0W1/dH6QurpAkk7NnjTziT5AKW+0inV2fCuwJ41\nxzRNnwvZnesHjbiQLelM27tUdZ/uNZ1oe4N+r29ikh+6Qe082hc4Ani0pF9R+tS+rN6QRtKUkfPV\n1W0BsH51rPbRZg87A6+ofuZ/pGF/7HPQhL+hZVWZjQWSVG04atq1mRfUHcAwbO9S/bv+oOf20sQk\nf0DdAQDX2H6GpA0p1y1uG/iKGkj6ge1nznbMdlOqe15u+7juA5JeWlcwfTy77gCGIelDtg/oc6wJ\ndfpvry4SngkcXV0k/J+aY5qmSauR+lnZfTHjXEI5EadGAJKuBo4FjrTdmE1QHVXRp3WA04BFrBi5\nbQCcbLtR856SLpi5dK7Xsbppcpqb9Ho/L27Y39D6lKQu4FWUi4THuIHNbVRKnR8GbAOsRSkr/sdB\n0yDjMmNfzMOBW6uv7w/82vYW/V4/zpH8RJwaVZ4I7AV8SdJfKDW8j7V9Z71hLfdG4B2Ui1nnsyLJ\n3wEcXldQM0l6LvA8YOMZdYs2oKxiaJpGNzeR9GZKNdQtZxSmWh9o1GoQ20theUOO4wY8vW6HU6Zj\njwN2oHwoParWiLp0krikzwHfsv2d6v5zGWKpauOWUDaNpEXAlymJ6Vjg/W5IOzhJb7PdpOqY00ja\njrLm+BDgvV0PLQVO69SIqZukgyhLZtemlMSF8sH5F+Cztg+qK7Zu1fThRsChwIFdDy1tyoatDkmv\nB/6VUrfmHlacsT+81sB66NpktPxsqEnLUDskXWJ720HH7vW6cSf5pp8aAUhaADwH2Ifyif7l6vZ3\nwPtsb93n5WMj6a2UdbK3Vfc3Avay/el6I5tO0n1s3119vRFlPfrAEqnjJunQpiT0QaqzjIcyvQ1c\nY9b4S7oKeKrtm+qOZRBJP6KUMPk8ZRXd74DXDNpJOm6SvgecAXypOvQK4Gm2+15LGmc9+Y7DKVMh\nV1FGTq9nRXXCprgK+EdKz9TtbH/Y9vW2v0apNd0Ub/CM0rjAG2qMZzanStqguoB0AfA5SR+rO6ge\n/qvacIKkvSV9dFApgTpI2he4kVLo7b+rW6P6kVKWzfYqh91Er6Tkwn0pq6o2pfRSbZq9KN3fvgV8\ns/p6r0EvqmMk3/hTo04tk7rjGKS6mP04Vz/EanR3cdN25nZ+vtUp/Ka2D27ahUIoFy+B7Sjt9L5I\nGdntabtv3aVxk/QLYOdBOx3rpFKH//PAT+jqsmT7nbUFNYOkfwa+avs3dccyF5LWnbExqq86RvJ3\nqTQUWCLpwyqNp+uIo5/1JB0n6QZJv5P0dUkPqzuoHk4Gvi7pmZKeCXy1OtY0a0r6G8pmmKaNOLst\nqz4wdwcOt/0pVqzrb5LrgNsHPqten6
FcDF4CXNZ1a5KHUUpLnyHpLZIeXHdA/Uh6iqTLKeWFOx3r\nBk7N1jGS34xyqrkWsD9ladWnXDq8N0I19/UN4Ojq0CuBlw6a+xq36trBG4HOWvlTgc/bblSThmpN\n/HuAM22/RdKWwL/XXYhuJkmnUz4k9wGeRqnZfdGgC1vjJukIYGvKNE33KLkxO4glLbG9fd1xDFLV\nLHoaZXXNHpQG5F8FvtlZIdQUkn5K6XFwYmfmQ9KlHlC3v44kv5/tTww6Vqdev6CT8ksbo5O0EHg5\ncK7tMyQ9nNJw/OgBLx0rST3749p+37hjmY2kD1B2OZ/E9A+ixk6DVtOdzwI+CGxte52aQ5pG0k9t\n79w9va0hSg3XkeR7beRo2pz8D4HPsqKT0Z7AG203olOMpGNt7znbBrOmzHVL+hfbH9b0Uq7L2X57\nDWG1hqR1bN81+JnjJ+m6HocbuYQSQNK2lNH8PwK/p8zVN2bgCSDpG8BHKYtXdgb2A3aw3bfkytg2\nQ0naizJK2kLTmxGvDzRqjS/wWuDTlFU/plw8em2tEU23X/Vv0zeYXVH927QevtNoJQtAjZukJ1Nq\nK60HPLzaj/BG22+pN7IVbG9adwyDSNqKkthfRlnP/zVgN9vX1BrY7N4EfALYGLgeOIUh2miOs6zB\nZsAW9NjIQVkR0sQdkI1VnVp+3/bf1x1LjNeoc7PjJGltymBkM9tvVmnEs5Xt79Yc2nIq5Uu+CnzN\n9qV1xzNfxtkZ6lfAr4C6O9bMqlq73a+HZmOWf7nUab9H0oa2G7nSQhPU1xeWn7J36v5c7hpbJw5i\n+zppWrHJRl1sp5QCuYSygRDgt5SyAY1J8rYfMczzVHOnrRklQe5l0LTnOKdrJuGUeNI+ze8ELpF0\nKtPrtDdlrrtTWvbFwEJW7NTbi7LCqhGqcgEnUIo/XUT5ndxW0q+B3Rt4sfA6SU8BLOk+lBHzFQNe\nM25b2d6rWlmF7bs041Npgtxv8FPm1ZsouelYyoflnN7HcY7kV6om8jjYPqL7vkpxJdtuVInULt+s\nbt0aU4zIVV9fSR+xvUPXQydJatI8/b9Srhs8w/Y9sHx56gcpzS/eVmNsvYw0Nztmf1GpltrZqLcF\nzelvMFd1/039DfBSykXhZZQFId/wkCXQa6knL+kJwC6UN+9M2xfWEcdsqviOoGwbRtKNwOubFidw\n/17LUesKpo91JW3ZuaBV/cGvW3NM3Z5F2Tl8T+eA7XskvYsy5dAotn9PqVvSZIdQ9hxsIuko4OnA\n6+oNaTJVO5s/A3xG0iaUC8WXSzrA9jGDXj/2JC/pvZRPpc4I9IuSjrP9/nHH0seRwDtsnwbLK1F+\nkbLlvUleTRnRdXtNj2N12x+YknQN5VRzM+Cf6g1pmr/0uvBve5mkP/d6QZ2qD8m3AZszvUBZY65x\n2D5Z0vnAUyg/83+ehGJls2jENFM1+NwL2JVybeP8oV5Xwzr5K4HtbP+pur82sMQNqewIvdftN2kt\nf9dy1F0oVek6NgD+6hndoppA0n1ZcVHzZ7b/3PXYrrZPrScykPQzyh/PzD9mAV+yvc34o5qdpIso\nZ5qXUMr4Aiumx+pWrfzajRU/7yuAU5u2E7tDAzptSfrbOlffSDoEeD7lffwapTHQ0KsR60jypwEv\n8oryuPenbCFuxEYjWL7KZi3K8ipT5sLupuqQ7prL5LZtOWqvDXJj/v6n9Xu8actUOzsf646jl6pG\n0Q+BW4ALKR+U2wMPoFzzuKHG8HqaZYNmYwroSboH+CUreh10kvZQXfXqSPLfBnak1Fkx5dTjHOA3\n0IyVIZLO6POwbT9tbMH0oVIW93+q+eNHUUZO33VVu31SNOksqZ+6zzi64ng5sBXlgmt3yYALaguq\nIulI4FLbH5lxfH/KdY996ons3tTVaYtSgqFjfeAs23vXEtgMGlDu2gN61daR5F/d73HbR40rlklX\nzX
n+HaVb0FnAuZT55aZflJum7pH8sJoSp6RDKUXzrmbFdI2bcDYs6Weepcdwv8fqoAnqtDWM2dbz\nj/3Ca3cSV0O7BEnaANibe1/YasxmqIqq9cevAz5d1YlZUndQLdaIC3CUhQtb2m7iksR+y40btRS5\n2kR4O7CXpnfaWk/Sem5Qp60h9VzPX8fqminghdX3Ph+4SdJZDUug36F0MJp2YauBVNUxeQUrlqet\nUWM8o7q27gCGVPd66Y5LgftTSiE3zYaSeq3yEWVhQOOodNpaTNmgt/zMiNI8ZpL0/P2sY538hrbv\nUOkSdLSrLkE1xNHPOk24NjCEdwAHUTq4X6ZSp73vRcS6VDs0N2f6mdHR1b8vrimsSXV/4GeSzmX6\nnHwTllCeRTnT6OXscQYyB++glBZubKetlVFHku/uEvTuGr7/ML4iaR9KF6PG1sKulsyd3nX/GqBx\nH06SjgEeQekS1FlGZ1Y0ZZkU19YdQKVnPfkmsP3KYZ4naW/bXxr8zLGYhE5bw+g5nVhHkj8E+B7l\n6vW51ejzqhri6OdO4OOU7e6dUyBTapvUTtLHbb9jtgJgDRnRddsBeIzHfZV/BJNwxtGU9fAr6Z2s\nqGVUt2som/Ua22kLBq/np1yMv/frJuDvbuwk/ZLSKLmJc55IeqLt8yX1bDDdtCQg6Tjg7bZ/V3cs\n/cx2xtG0qbsZRf7WAu4D/LEhRf6G0qRls5qATlsw+nr+Oi68bgIcBjy1OnQGsJ+b1TH9F0Cjpma6\n2T6/+vd0Vc2Hbd9cb1R9PYhSa+McmjeH3G0izji6i/xVlR13B55UX0Qjacx73Enmaminre71/DOu\nX65PuQbS//U1rJM/FfgK0CmsszfwCtu7jjWQPiQdDzyGsnOvOyk1ZgWQpMXAvsACylzcMuAw24fU\nGVcvOeOYf00aGQ+jSfGqq9OW7cZ12lrZ9fx1zMk/2PaRXfe/KOkdNcTRz3eqWyNJeiflTGhH27+s\njm0J/Kek/W1/rNYAZ2haMu9jIs44JHVfG1hAOQP5U03h3Eu15nwP28f3edpPxhXPED4OPBs4EcD2\nRZIasasdVn49fx1J/hZJe1PqwkApDNWopUu2j5C0FvBw27+oO54eXgns6lJyFigra6r39RSgEUle\nk9EoptviugMY0v/q+noZZdXP7vWEcm8uXcveBcya5G2/eYwhDeTmd9oaeT1/HUn+tZQ5+U6rvbOB\nxtSzAJD0fEpX9LUojce3Bw62/aJ6I1vuPt0JvsP2zSqdghrBQzaKkbSR7VvHE9XsJuGMoxrJXdy0\ns7UeTqmi1xwvAAAR+ElEQVTO0L/O9K5lTbzWNQmdtmDE9fxZXdNDVRPmmcBpXtEo+RLb29YbWdGv\nhkpT6qvMRd0xT9oZh6RzbO9Udxz9SLqu665Z8V42YhlyN0kPovRgeBYlzlMoi0EaNcNQVUvd1XOs\nMjvOHq+H0b+pc5OWqd1t+7YZp29N+jTcTlKvEZGovx/lKGqtCTNpZxzAWZIO596j5NqrUHbZ0jOq\noTbpLLObJ6PTFoy4nn+c0zXdPT3fR4N37QFXSNoTWKDSheftNOhCke1JrE/TT5M+QPv5AdCEs6Tt\nq3+7V1IZqL0KZZefcu/3qtex2mkCOm1Vfl3d1qpuQxlnI+/u6pPvcLNLCu8LvJdyceOblB26TS3B\nEOPTlCqUr6tKWCxXra6qnaSHUBpPry1pW1a8ZxsA69QWWH/fpiyhPIkGFyQcdT1/LY28afjIzfYf\ngQOq2710ygqMN6pWa0ryHKQpv7ff4N4j4uOAJ9YQy0zPpyyu2AT4FCt+tkuB99QV1AB/sv3JuoMY\npHs9PzD0ev66kvyka8wa2kkg6RHAb2z/WaUp+uMoFUhvq57SuJ60TSTp0cBjKeV8u9fKb0BDrsVU\ne2COlLSn7WPrjmdIn6hKGzSu09YMI63nH+eF1+6VC+t0XThs5AqG
WKWOB3aQ9Ejgs8AJlF3PzwMY\nZtdeQ9R9xrE18AJKqeHutfJLgTfUEtHsHiJpA5ey4p+hnHkcZPsHdQfWw7aUvSfPYPr68yZd4wBG\nW88/zjn5visXotXusb1M0osopRcOk3Rh3UHN1PQzDtsnACdIerLtH9cZyxD+yfbhknajzNG/AfgC\nzZhSmqnJnba6jbSef8G8h9VOdY/oJs3dkvYCXk2p0Q+lcmLTHA/8teuMY1PKGQfQqDOOF0naQNJ9\nJP1A0s3Vbucm6Zy1P4/yQXkRzc03nU5bTfcm4K3AxsD1lFVWbx30oszJj+bwugOYMPtQfkE/YPuX\n1ZK1Ywa8pg4TccYB7Gb7X6o4rwVeDPyI5tRnB7hI0neARwHvkrQezblwPVOTO20tN+p6/iT5LpK+\nRf8NWy+u/j1ibEG1gO3LJR1A1XSlKqr2oXqj6qn7jKMz593EM45OTM8HjrN9+4x52ibYhzI18wuX\nZvMPYkUf4qZp8p6d5UZdz58kP11G6PNA0v8C/oPptYAOadpIick54zhJ0s+A/wHeXPUUaEwVSlhe\npOzXwCMlNTrPTELNospI6/lTuybmXVUL6BnAVFctoEtt/229kU0uSQ8Abq+S6TrABrZvqDuuDkn/\nRukV8TOmd9l6Xn1R9aYJ6bQl6ae2d57r6xr9CVuXapXFByiNQ5avP7b9qNqCmmx395hSaMzOQkmX\n0H+arm8p15o8Gth8xii5SY3RXwI8ynajzjB68eR02hppPX+SfG9fBN5PmWJ4LuU0Pqc8o7tM0suB\nNSRtRakFdHbNMXV7Qd0BzIVm6UVLs5L8L4GJq7HkMrXx7SqZHjjo+WM20nr+TNf0IOl820/sLi8s\n6TzbO9Qd2ySqphPeDexWHfoe8P5JGOU1kaQraHgvWpVWio8Dvk9DW2h2zNJp6+m2n1xTSD1J+gXl\n5z6n9fwZyff2Z0kLgKslvYmyJjWbuUZUFVN6t6QPzKWw0rhJehKloc02lLnZNWjg3CxlXfdCoMm9\naE+ubpOg0Z22unTW8980lxclyfe2P7AuZVrhA5TaII3qXjVJql16n2eOhZVqcDjwMkqxrx2AV1HW\neTdN43vRTsoy4wnqtAUjrufPdE0Pkl5s+5uDjsVwJP0U+AfgxCavrulMyUm6uHOxVdKFnZibQtLT\nex1vwlLAavNYv4vYTawn3/hOWzD6zz0j+d7+L6WOfLd39zgWQxqlsFIN7lJp4L5E0ocp0yGN24rf\nhGTexz/UHcAIJqHT1sg/9yT5LpKeDTwH2FhSd0utDWjQkr8JNCmNkl9JSer7UqbsNqWUDGiEHj1o\nlz9EQyq52r56mOep6qs73/EMaRI6bY28nj9JfrqbKBc3/gRc1nV8Kc1bTjVJ3kRplNwprHQKQxRW\nqsEetj9B+fl3uvDsR4m9di2r5Lpu3QF0aWynrW6jrufPnHwPku5r+8+DnxnDkPRAN6zzfS+SLpg5\nZ9zEOfk26PVe12WWn/v5tptYFnmaYX4/M5LvIumrtvcCfiLpXp9+TfmlnEA/kbSEUk/85Kat766K\nkr2cUlfnxK6H1geaUl44VrFJ6LTVbZb1/AP3miTJT/fP1b+TePGoyR4FPIvS+/MwSccCX7T983rD\nWu5sykXWBwEf6Tq+FLi4lojarwllMyep0xaMuJ4/0zWzqCr77Ui50HGe7ZtrDqkVJP09pe75usBF\nwIET0OUoVjFJ21WNRGo3CZ22qvX8bx9lPX/jloc1gaR9gAsop/B7A+dJenW9UU0uSQ+UtJ+k84D/\nQ6mJ/SDgf9PVeakuks6s/l0q6Y6u21Kt6EUcQ5B0q6Q/9LjdKmn51FdTEnyl8Z22bP8V2GuU12Yk\n34OkK4FdOqP3alR/pu2t641sMkn6OaUu+5G2fzPjsQNsN7GBSIygGnHOqkpWjSJpie3tVTptvQB4\nJ/Aj29vVHNo0kj5GWTY5p/X8
mZPv7Q/AbV33byMX4FbG1rYtaT1J69m+s/NA0xK8pCcAu1Cm6c60\n3cT2f401M4lXde+7L2L+drwRDWUSOm3BiOv5k+R7uxL4saRvU97EPYBLJb0dwPYn6wxuAj22Ko/7\nAMoS35uBV9u+tOa4ppH0XuClrNjZ/EVJx9l+f41hTSRJzwc+BmwC3ELZI/FzSh38pml8p63KSOv5\nM13Tg6R/7fe47feMK5Y2kHQ28G7bp1X3FwH/ZvsptQY2QzVNt12nBLKktYElmaabu2rJ7K7AKbYf\nL2lXYE/bTVy10vhOWzD6ev6M5HvoTuLVzrJ1bP+xz0uiv3U7CR7A9pSkJu147PgtZWqhM4q7L2WH\nbszdMts3S1ogSbZPlfQfdQfVR2M7ba3sev4k+R4kHU2pX7IMOAd4oKR/t/3R/q+MWVwj6T2saIq9\nN3BNn+fX5XZKF6tTKdN0uwLnSPokgO231xnchLld0nrAmcDRkm6iTIc0jprfaWul1vNnuqaHrqvt\nL6eslT+Asla+ib0+G0/SRpRaMJ2CVGcAi23fWl9U9zZomazto8YVy6STtD5wF2WZ9quADYGjbf++\n1sB60AR02oLR1/NnJN/bfarTtt2B/7T9F0mpQjmiKpk3fhTcncSrD6ZNbWfH62gOsv0uysj4CABJ\n/wa8q9aoepuETltQ1vNfRjkjOpnSXnF/21/q96Ik+d4+D/ya8sM/XdLDgTv7vyRmknQS/RtINKaT\nEYCkKeCFlL+L84GbJJ3VxL6kE+A53DuhP7/HsSZofKetym62/6Vaz38tpQz2jyg7yGeVJN9DtXV4\n+fZhSdfRsNrSE6LJF9p62dD2HZJeT5laOFhSRvJzIOmNlNLSj5LUvUlnfcoHZxMtrjuAIY20nj9J\nvodqPvE9wNOqQ6cD7wfm1CV9ddfdyUal49KjKSP7K+facX5M1pT0N8CelE5gMXfHAj8ADmV6D4al\ntufUgHpcGt5pq9tI6/lz4bUHScdRNm505mhfCWxjO9UpR1BtjPkMcDWl+uAWlEbe3601sBkkvZTy\n4X6W7TdXG03+3fZLag5tIkl6LPB31d0zbF/W7/njpgnotDXTKOv5k+R76KyuGXQshlONPl5g+xfV\n/UcA/227ibsfYxWQ9FZK969vV4d2Bz5l+9P1RTX5VNpobk7XLIztvks9M13T258kPcn2TwAkPYlm\nbnOeFEs7Cb5yDWWNb6NI2gQ4DHhqdegMYL+ZRdViKG8EdurUKapW1pwNJMmPaNT1/Enyvb0FOEbS\nfSmnbndR1vrGaM6T9B3KfK0p9WHO7ezes/3Nfi8eoyMppY9fWt3fuzq2a20RTS4x/RrW3TSjUcgk\n24ER1vMnyfdQle58bDX/he1UoFw59wNuBJ5e3b8ZWJuye8+sKAhWtwfbPrLr/hclvaO2aCaQpDVt\nL6Psbv6ppOOrh17EimtcMZqR1vNnTr5Lp8rkbFJ9st0k/YAycv9qdWgvYB/bz6wvqsnSXURL0k50\n7XK2fW59kU0+SadRyg3PaT1/RvLTPbjr69dR7dSLlSNpC0o3qM2ZfsGoaZtNXkuZk/8Y5QzjbGCf\nWiOaPMunZGyfQ0lIsWosHuVFGcnPQtKFth9fdxxtIOkiygfmJcDy8hATtD45hiTpN8CshfxS5G/8\nMpKfXT79Vp0/NXmqS9Jh9C+/0Pi6Ow2yBrAeuci6yqzsev4k+RiHT0g6GDiF6XOJfXtTjtF5XV+/\nDzi4rkBa4He2Dxn8tBiW7fVX5vWZruki6UJWfGI+GvhZ5yHKJ+YTer4w+pJ0KGXX8NWsmK6x7cbV\nA8o03crJ+9c8GclPl7IF8+OlwJYNrVczU0Y9KycrkRomSb6L7auHeZ6kM23vMviZUbmU0tWmkQWq\nYtXJnpLmSZIfTRP7kzbZ/YGfSTqXBtbrnnFhax1Jd3QeoqGFqiKGlSQ/mpzSz02jL2Su7IWtiC
ZL\nko95Z/t0SZsBW9n+flUidY2644pYHSyoO4AJlTXAcyDpDcA3gP9XHdqYFSVoI2IeJcmP5jV1BzBh\n3kop33sHgO2rgIfUGlHEaiLTNV0k3Ur/nWWdqpQXjTWwyfdn23/p9KOUtCa5rhExFkny0z2o7gBa\n6nRJ7wLWlrQrpV7/STXHFLFayI7XPqp68vfr3Lf92xrDmViSFlCqeu5GOSv6HvD5uTY/iIi5S5Lv\noWo8/TFgE+AWyoXCn6cn6fyQdHyaZUfMj1x47e0DlAuFV9reFHg2pd9nzI8t6w4goq2S5HtbZvtm\nYIEk2T4V2KnuoFosp5MR8yQXXnu7XdJ6wJnA0ZJuAv6n5pgiIuYsc/I9SFofuItypvMqYEPgaNu/\nrzWwlkp52oj5k+ma3g6y/Vfbd9s+ompZ9s66g2qxA+oOIKKtMpLvobvjfNexi2xvV1dMk0jSJfTf\nXPa4MYcUsdrJnHwXSW8E3gQ8SlJ3a7r1gfPriWqivaDuACJWdxnJd5G0EfBA4FDgwK6HltpOw4uI\nmDhJ8rOQ9Fjg76q7Z9i+rM54JpmkJwGHAdsAa1HKDP8xzTgi5l8uvPYg6a3AccDDq9uxkt5Sb1QT\n7XBgL+AqYG3g9cCnao0oYjWRkXwPki4GnmL7zur+esDZuVA4Gknn2d5B0sWd9zDLJiPGIxdeexPw\nl677d5NGISvjLklrAUskfRj4HTmLjBiLJPkukta0vQw4BvippOOrh14EHFVfZBPvlZSkvi+wP7Ap\n8OJaI4pYTWS6pkv3+nhJOwG7VA+dYfvc+iKbbJL2s/2JQcciYtVLku+SeeL5McvmsrzXEWOQ6Zrp\nHixp1vIFVXmDGJKkvYCXA1tIOrHrofWBP9QTVcTqJUl+ujWA9chF1lXlbMpF1gcBH+k6vhS4uJaI\nIlYzma7p0mtaISJikmUkP11G8KuQpDNt7yJpKdMLlXUKlGXHa8Q8y0i+i6QH2M5ccUS0RpJ8jIWk\nJ1CWpBo40/aFNYcUsVrIrsOYd5LeS9lM9kDKRdgvSvq/9UYVsXrISD7mnaQrge1s/6m6vzawxPbW\n9UYW0X4Zycc4/Ba4X9f9+wLX1xRLxGolI/mYd5K+DewInEqZk98VOAf4DYDtt9cXXUS7JcnHvJP0\n6n6P207xt4h5kiQfY1W1WNzUdna8RoxB5uRj3kmakrSBpAcAFwCfk5Q6QBFjkCQf47Ch7TsoNeSP\ntr0z8KyaY4pYLSTJxzisKelvgD2B/6o7mIjVSZJ8jMMhwPeAq22fK2lLSlPviJhnufAaEdFiGcnH\nvJO0iaRvSbqpuh0vaZO644pYHSTJxzgcCZwIPKy6nVQdi4h5lumamHeSltjeftCxiFj1MpKPcbhF\n0t6S1qhuewO31B1UxOogI/mYd5I2Aw4DnkypXXM28Hbbv641sIjVQJJ8RESLpcdrzBtJhzG9t+s0\nqT4ZMf+S5GM+ndf19fuAg+sKJGJ1lemaGAtJF9p+fN1xRKxusromxiWjiYgaJMlHRLRYpmti3kha\nyooR/DrAXZ2HANveoJbAIlYjSfIRES2W6ZqIiBZLko+IaLEk+YiIFkuSj4hosST5iIgW+/9jejSm\nvCIa5wAAAABJRU5ErkJggg==\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7f279e9759d0>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Draw one bar per predictor and label each tick with the predictor's name.\n",
"plt.bar(np.arange(len(predictors)), scores)\n",
"plt.xticks(np.arange(len(predictors)), predictors, rotation='vertical')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"selector = SelectKBest(f_regression, k=5)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"SelectKBest(k=5, score_func=<function f_regression at 0x7f27ab209d70>)"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"selector.fit(train[predictors], train['Year Total'])"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Turn the selector's p-values into scores (larger = more significant).\n",
"# Floor the p-values at the smallest positive float first: a p-value that\n",
"# underflows to exactly 0 would give -log10(0) = inf, which triggers a\n",
"# divide-by-zero RuntimeWarning and cannot be drawn as a bar height.\n",
"scores = -np.log10(np.maximum(selector.pvalues_, np.finfo(float).tiny))"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAGQCAYAAAC+tpvHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3WmYZFWV7vH/WyDKWOJEKVUyKCDSDCqDA63ZKjheQW1p\nQRRxaBSRwXu7GbxaJa1Na7cjSHtVREAUQVSgWxEQEhlUxoJiFEFGZRAZCmmVknU/7B2VkVmRmZFV\nlWfvPPH+nieejDgRwVlkRq04Z5+911JEYGZmM9+s0gGYmdnK4YRuZtYSTuhmZi3hhG5m1hJO6GZm\nLeGEbmbWEpMmdElzJZ0r6VpJiyR9OG+fL+lOSVfk22u73nOopJskXS9p5+n8HzAzs0STzUOXNAeY\nExELJa0FXA7sAvwDsDgiPjfm9ZsD3wa2A+YC5wCbhCe8m5lNq0mP0CPi7ohYmO8/AlwPrJ+fVo+3\n7AKcFBFLIuJW4CZg+5UTrpmZjWdKY+iSNgS2AX6ZN+0naaGkr0uanbetD9zR9ba7GPkCMDOzadJ3\nQs/DLd8DDshH6kcDG0fENsDdwGenJ0QzM+vHqv28SNKqpGR+QkScBhAR93W95GvAGfn+XcC8rufm\n5m1j/5seUzczWw4R0Wu4u+8j9G8A10XEFzsb8sXSjrcA1+T7pwNvl7SapI2A5wKXjBNUdbf58+cX\nj8ExOaZBjMsx9XebyKRH6JJeBrwDWCTpSiCAw4A9JG0DPA7cCuyTk/R1kk4GrgMeA/aNyaIwM7MV\nNmlCj4iLgFV6PHXmBO85AjhiBeIyM7Mp8krRMYaGhkqHsAzH1B/H1L8a43JMK27ShUXTtmPJIzFm\nZlMkiVjBi6JmZlY5J3Qzs5ZwQjczawkndDOzlnBCNzNrCSd0M7OWcEI3M2sJJ3Qzs5ZwQjczawkn\ndDOzlnBCNzNrCSd0M7OWcEI3M2sJJ3Qzs5ZwQrfWmDNnQyQ1fpszZ8MZFZO1l+uhW2tIInVIbHzP\n4/Z6rDEmm9lcD93MbAA4oZuZtYQTuplZSzihm5m1hBO6mVlLOKGbmbWEE7qZWUs4oZuZtYQTuplZ\nSzihm5m1hBO6mVlLOKGbmbWEE7qZWUs4oZuZtYQTuplZSzihm5m1hBO6mVlLOKGbmbXEpAld0lxJ\n50q6VtIiSfvn7etKOkvSjZJ+Iml213sOlXSTpOsl7Tyd/wNmZpZM2lNU0hxgTkQslLQWcDmwC7A3\ncH9EfEbSwcC6EXGIpOcDJwLbAXOBc4BNxjYQdU9RW9lq7N9ZY0w2s61QT9GIuDsiFub7jwDXkxL1\nLsBx+WXHAbvm+28CToqIJRFxK3ATsP0K/R+YmdmkpjSGLmlDYBvgF8B6EXEPpKQPPCO/bH3gjq63\n3ZW3mZnZNFq13xfm4ZbvAQdExCOSxp7PTfn8bsGCBUvvDw0NMTQ0NNX/hJlZqw0PDzM8PNzXaycd\nQweQtCrwX8CPI+KLedv1wFBE3JPH2c+LiM0lHQJERHw6v+5MYH5E/HLMf9Nj6LZS1TheXWNMNrOt\n0Bh69g3guk4yz04H3p3v7wWc1rX97ZJWk7QR8FzgkilHbWZmU9LPLJeXAT8DFpEONQI4jJSkTwbm\nAbcBu0XEg/k9hwLvBR4jDdGc1eO/6yN0W6lqPBquMSab2SY6Qu9ryGU6OKHbylZj8qwxJpvZVsaQ\ni5mZVc4J3cysJZzQzcxawgndzKwlnNDNzFrCCd3MrCWc0M3MWsIJ3cysJZzQzcxawgndzKwlnNDN\nzFrCCd3MrCWc0M3MWsIJ3cysJZzQzcxawgndzKwlnNDNzFrCCd3MrCWc0M3MWsIJ3cysJZzQzcxa\nwgndzKwlnNDNzFrCCd3MrCWc0M3MWsIJ3cysJZzQzcxawgndzKwlnNDNzFrCCd3MrCWc0M3MWsIJ\n3cysJZzQzcxawgndzKwlnNDNzFpi0oQu6RhJ
90i6umvbfEl3Sroi317b9dyhkm6SdL2knacrcDMz\nG62fI/Rjgdf02P65iHhhvp0JIGlzYDdgc+B1wNGStNKiNTOzcU2a0CPiQuCBHk/1StS7ACdFxJKI\nuBW4Cdh+hSI0M7O+rMgY+n6SFkr6uqTZedv6wB1dr7krbzMzs2m2vAn9aGDjiNgGuBv47MoLyczM\nlseqy/OmiLiv6+HXgDPy/buAeV3Pzc3belqwYMHS+0NDQwwNDS1POGZmrTU8PMzw8HBfr1VETP4i\naUPgjIjYMj+eExF35/sHAdtFxB6Sng+cCOxAGmo5G9gkeuxEUq/NZsstXX8v8ZkS432Wa4zJZjZJ\nRETPySaTHqFL+jYwBDxV0u3AfODvJG0DPA7cCuwDEBHXSToZuA54DNjXWdvMrBl9HaFPy459hG4r\nWY1HwzXGZDPbREfoXilqZtYSTuhmZi3hhG5m1hJO6GZmLeGEbmbWEk7oZmYt4YRuZtYSTuhmZi3h\nhG5m1hJO6GZmLeGEbmbWEk7oZmYt4YRuZtYSTuhmZi3hhG5m1hJO6GZmLeGEbmbWEk7oZmYt4YRu\nZtYSTuhmZi3hhG5m1hJO6GZmLeGEbmbWEk7oZmYt4YRuZtYSTuhmZi3hhG5m1hJO6GZmLeGEbmbW\nEk7oZmYt4YRuZtYSTuhmZi3hhG7LZc6cDZHU+G3OnA1L/6+bVUsRUWbHUpTat604SUCJv58Y73Pj\nmEbtedyYbGaTRESo13M+QjczawkndDOzlpg0oUs6RtI9kq7u2raupLMk3SjpJ5Jmdz13qKSbJF0v\naefpCtzMzEbr5wj9WOA1Y7YdApwTEZsB5wKHAkh6PrAbsDnwOuBopUFEMzObZpMm9Ii4EHhgzOZd\ngOPy/eOAXfP9NwEnRcSSiLgVuAnYfuWEamZmE1neMfRnRMQ9ABFxN/CMvH194I6u192Vt5mZ2TRb\nWRdFPT/KzKywVZfzffdIWi8i7pE0B7g3b78LmNf1url5W08LFixYen9oaIihoaHlDMfMrJ2Gh4cZ\nHh7u67V9LSyStCFwRkRsmR9/GvhDRHxa0sHAuhFxSL4oeiKwA2mo5Wxgk14riLywaGarccGMYxq1\nZy8saqmJFhZNeoQu6dvAEPBUSbcD84F/A06R9B7gNtLMFiLiOkknA9cBjwH7OmubmTXDS/9tudR4\n5OmYRu3ZR+gt5aX/ZmYDwAndzKwlnNDNzFrCCd3MrCWc0M3MWsIJ3cysJZzQzcxawgndzKwlnNDN\nzFrCCd3MrCWc0M3MWsIJ3cysJZzQzcxawgndzKwlnNDNzFrCCd3MrCWc0M3MWsIJ3cysJZzQzcxa\nwgndzKwlnNDNzFrCCd3MrCWc0M3MWsIJ3cysJZzQzcxawgndzKwlnNDNzFrCCd3MrCWc0M3MWsIJ\n3cysJZzQzcxawgndzKwlnNDNzFrCCd3MrCWc0M3MWsIJ3cysJVZdkTdLuhV4CHgceCwitpe0LvBd\nYAPgVmC3iHhoBeM0M7NJrOgR+uPAUES8ICK2z9sOAc6JiM2Ac4FDV3AfZmbWhxVN6Orx39gFOC7f\nPw7YdQX3YWZmfVjRhB7A2ZIulfS+vG29iLgHICLuBp6xgvswM7M+rNAYOvCyiPidpKcDZ0m6kZTk\nu419vNSCBQuW3h8aGmJoaGgFwzEza5fh4WGGh4f7eq0ixs23UyJpPvAI8D7SuPo9kuYA50XE5j1e\nHytr3yvTnDkbcs89tzW6z/XW24C777610X2uKElM8F09nXtmvM+NYxq153FjsplNEhGhXs8t95CL\npDUkrZXvrwnsDCwCTgfenV+2F3Da8u6jhJTMo9Fb018gZtZOKzLksh7wA0mR/zsnRsRZki4DTpb0\nHuA2YLeVEKeZmU1ipQ25THnHlQ65lDlFnnmnxzUOJTimUXuecZ8p68+0DLmYmVldnNDNzFrCCd3M\nrCWc0M3M
WsIJ3cysJZzQzcxawgndzKwlnNDNzFrCCd3MrCWc0M3MWsIJ3cysJZzQzcxawgndzKwl\nnNDNzFrCCd3MrCWc0M3MWsIJ3cysJZzQzcxawgndzKwlnNDNzMYxZ86GSGr8NmfOhssVr5tEj+Em\n0f2psfmxYxq15xn3mapRjX8/N4k2MxsATuhmZi3hhG5m1hJO6GZmLeGEbmbWEk7oZmYt4YRuZtYS\nTuhmZi3hhG5m1hJO6GZmLeGEPgOUqCexvLUkzKwc13IZo8ZaLo5p1J5nVN2UGmOy/tX493MtFzOz\nAeCEbmbWEtOW0CW9VtINkn4l6eDp2o+ZmSXTktAlzQKOAl4DbAHsLul507EvM5u64eHh0iEso8aY\nZprpOkLfHrgpIm6LiMeAk4BdpmlfZjZFNSbPGmOaaaYroa8P3NH1+M68zczMpsmqJXeepgQ1a731\nNuDuu29tfL9mZtNtWuahS3oxsCAiXpsfHwJERHy66zWeJGtmthzGm4c+XQl9FeBG4FXA74BLgN0j\n4vqVvjMzMwOmacglIv4qaT/gLNI4/TFO5mZm06vY0n8zM1u5vFLUzKwlBj6hS1pdebqNpOdIer2k\norN/zMyWx8APuUi6DHg5MBv4BXAFsDgi3lU0MEDS0wEi4r7SsQBIOiEi3jnZtqZJ2igifjPZtkEk\n6UomKBcYES9sMBybZo0eiUraDPhHoFMG4HrgaxFxY5NxjDErIh6V9B7gPyPi3yQtLBVMPluYD+xH\nOoOSpCXAkRFxeKm4si26H+TZTC8qFEu3U4Gxiel7FIpN0t8BHwY2y5uuB46KiOEC4fx9/vkBYBXg\nhPz4HcBfC8QziqQ1gP8NPDsi3i9pE2CziPivgjE9EXgrsCFdObKCf3+TamzIRdJLgGFgMfBV4GvA\nH4Hz8rz1UmZJ2o70Ae98iFYpGM9BwMuA7SLiKRGxLrAD8DJJB5UISNKhkhYDW0l6ON8WA/cCp5WI\nKcf1PElvBWZLekvX7d3AkwrF9AbgG8AZwB6kz9WPgG9Ien3T8UTEzRFxM/CqiPhIRFyZb/8H2Knp\neHo4Fvgz8JL8+C7gk+XCAdJnehdgCSlHdW71i4hGbsCPgaEe218B/LipOHrs/5Wkf3AfzY83Bo4u\nGM+VwNN6bH86cGWpuHIMR5Tcf494diElhPvzz87tS8BLC8U0DGzdY/tWwPkFf1dXAS/uerwDcFUF\nf8PL8s8ru7YVjQu4pvTvZXlvTQ65PCd6nHJGxPmSvtpgHGP3fy5wbj7NIiJuAfYtFQ/whIj4/diN\nEXGfpCeUCKjLJZJmR8RDAJKeTPqS/mGJYCLiNOA0SS+JiJ+XiKGHORFx1diNEXG1pPVKBJS9DzhW\n0pMAAY8C7ykYT8dfJK1OHueX9BzSEXtJF0vaMiIWFY5jypqc5bJ4gueKnc5I2l7SIuCm/HhrSUeW\nigf4y3I+14T5nWQOEBEPksb7S/tA/nIBQNK6kr5RKJaJPsvFPucRcWlE/A3pyHz7iNgyIi4tFU+X\n+cCZwDxJJwI/Bf65bEjsCFwu6UZJV0taJOnqwjH1pckj9HmSvtRjuyhbifFLwBuBHwJExFX5olYp\nW0t6uMd2UWhcuEuvA4Aapnhulb9cAIiIByS9oFAsz5F0eo/tIg3nFSFpNWBX8oW+TmG8iPjXUjHl\n/Z8t6QrgxaTf0QG9zlAb9rrC+19uTf5j/KcJnrussSiWNSsibhtT+bHY1f+IKHlBdjKXSfoc8OX8\n+EPA5QXj6Zglad2IeABA0lMo90UzUd3//2gsimX9APgT6e9VfHZLh6SX57udM/jn5ybIPysVU0Tc\nBiDpGZQ/iJqSxj70EXFcU/uaojskbQ9Enob3YeBXpYLJyWhcEfGHpmLp4cPAx4Dv5sdnk5J6aZ8F\nfi7pFNJR3t8DnyoRSEScX2K/fdggD7nUpvtA70mk5jiXkyYrFCHpTaTP1L
NIM7k2IE093WKi99Wg\nsYVFks5g4gUOb2okkDHyt/CXgFfnTecA+5U67ZP0G9LvqVd5zIiIYqftNZP0fEaSwLkRcV2hOBYx\n8ed8qwbDWUrS14HPlfq99EvSPOALEfHWgjFcRfosnRMRL8hDsHtGxHtLxdSvJhP6KyZ6vuIjmypJ\n2iIirm1oX1+IiAPH+1Iu+GW8TkQ8PN5ZTYmzGUkbTPR853S+afmLZlPg16RZJErh1LVSNC+suzYi\nnl8whssiYtuc2F8QEY9Luioiti4VU7+aHHLpK2FLOrXJb2dJzyWNCc+JiK0lbQW8ISKOaCqG5XQC\ny66OnM59Qdkx4F6+TbqgfTmjv2iUHzd+NtNvwpb084h4yeSvXGl2bXBffcszyjp/u1nANqTyGyU9\nKGkt4GfAiZLuZYYsLKquloukKyOisRkKkoaBw4Av59MrkRYWVD1eVuD3tApwfES8o6l99iP/veZF\nxO2lY5mKpv9+Xft9Cl0X+iLit03H0E3SXl0PlwC3RsRFpeIBkLQm6QKySCt9ZwMnRsT9JePqRw1T\nzsZq+htmzYi4uGsaV0h6rOEYlkejv6dITUs2kLRaRJSeD79U/nv9N7Bl6VimqNG/Xy5J8HlgLmll\n7bNIay+eN9H7pluNkyUiovtovLr4JlJjQm/a/ZI2YmSl2q7A3WVDqtYtwEV5nvXSD31EfK5cSABc\nIWm7ShbK1OpTpBpBZ+Uz0Z2A3UoFM8HF487YfuMXjyVdGBE75jpFywzhRcQ6Tcc0VTUm9J7NT6fR\nfsAxwPMk3Ubqgfr2hmNYHiWOkm/Ot1nA2nlbDWN2OwDvyH+/P1IwKUxB05/zJbl8xCxJygt6Sl4T\neWPBffcUETvmn2tP9tpa1ZjQD254f7dExCslzSZdU3hw0nc0QNJPI+JV422LiBIVKq+LiFPGxPS2\nAnGM9ZrSAYwl6dMRcfAE25quIf9QvtB3IXB8vtD3Pw3HsFSp2T4TqXwNSF+anLZY3SkWgKSbgZOB\nYyOi2IKirnieBKwBnAcMMXIktw5wZkQUG/OUdMXYaW69tjVNFTbeGOd3dXXBz/napAQu4F2kC30n\nROHmKUqls48ENgdWI5Wu/mOJ4Y0xa0CeDTyQ7z8ZuD0iNmo6pqlq8gi9ulOs7EXA7sC3JP2FVMv6\n5Ih4pFA8+wAHki5aXc5IQn8YOKpEQJJeB7weWH9MPZ51SDMTSqum8YakD5KqdW48pqDT2kCx2RsR\nsRiWNpQ4ZZKXN+ko0hDnKcC2pC+bTUsE0knYkr4G/CAifpQfv45Kp32OVd20xZIkDQEnkhLVycAn\no1AbM0kfjoiSVR+XkrQ1aX7w4cDHu55aDJzXqaFSIK5DSVNOVyeVg4X0BfgX4KsRcWiBmGYD6wJH\nAId0PbW45Cm7pPcB/0Kq4/I4I2fGzy4VU46rs4hn6dlLqSmdXTEtiogtJ9tWo8YTek2nWDmeWcBr\ngb1JRwYn5tvfAp+IiM0mePt0xvUh0tzXB/PjdYHdI+LoEvHkGJ4QEY91xTMvIoqXFZV0RInkPZl8\nprAeo9uYFZkvL+km4GURcW+J/Y9H0s9IZTe+Tppd9jvg3SVXZUr6CXAB8K286R3AyyOiums1YzVZ\nD73jKNIQx02kI6v3MVK9r4SbgH8g9ezcOiI+ExF3RcRJpNrMpbw/xpSEBd5fMB6AsyWtky8eXQF8\nTdLnC8cE8F95MQiS9pT0ucmW4E83SfsB95AKmP13vhXrk0mactqrLHNp7yTlof1IM5Tmkfp5lrQ7\nqUPYD4Dv5/u7F42oTyWO0Ks6xerUAymx74nki8hbRf4D5aO9q0uuYO38nfLp+7yImF/yQl9XXFcD\nW5PavH2TdLS3W0RMWD9ommP6NbBDLasLlerDfx34BV0dgSLiI4Xi+SfgOxFxZ4n990PSmmMWGVWv\nxBH6o0rF9hdK+oxS4+MScXSsJekUSX
dL+p2k70p6VsF4Os4EvivpVZJeBXwnbytpVUnPJC1IKXm0\nOdaS/MW3C3BURHyZkXnypdwBPDTpq5rzFdJF2YXAtV23Up5FKnl8gaR9JT29YCyjSHqppOtIJXM7\nXcyKDXVORYkj9A1Ip6KrkTrczybVUbm50UBG4vkJ8D3g+LzpncDbSo+X5bH9fYDOXPSzga9HRLHm\nBHnO+ceACyNiX0kbA//eZDG1ceI6n/RltzfwclIN66tKXsSSdAywGWmopfuIuMiqWkkLI2KbEvse\nT67D83LSLJddSY2svwN8vzMrp1BcvyTV1D+9M3Ig6Zqos578KCUS+gER8cXJtjUYzzIf9Bo//DY+\nSXOAPYBLI+ICSc8mNa8+fpK3TmdMPXutRsQnmo4FQNKnSKt8z2D0F0wVw415SPHVwL8Bm0XEGgVj\n+WVE7NA9FKwZUj63RELvteCi5Bj6ucBXGenCsxuwT0QU6Zgi6eSI2G28hViFalz8c0R8RqNLnXbH\ntH/TMc0UktaIiEcnf+W0x3FHj83Fpy0CSNqSdJT+D8DvSWPrRQ7wcjzfAz5HmsCxA3AAsG1EVF8S\npLGFRZJ2Jx1FbaTRTXTXBkouqX0PcDRppk2QLhq9p2A8B+SfNS3Euj7/LNn7dRmquJiSpJeQagSt\nBTw7z+XfJyL2LRFPRMwrsd/xSNqElMTfTpobfxKwc0TcUjSw5APAF0nN6+8CzqKOVouTanLp/wbA\nRvRYcEGavVHDisMq5NPPcyLi70rHYsuntnFYSauTDhY2iIgPKjV22SQiflwonptJ4+UnRcQ1JWJo\noyY7Ft0G3AY02aVlXHn+9ES9H4tM58r7/qukxyXNjojiMyVUaT/YjnzK3qlxc1001JpvMhFxhzSq\nqGKxC9qkkhaLSAvmAH5LWm5fJKFHxHP6eZ0a7Ow0pqzFMmbC0GKTQy61nR7XflTwCLBI0tmMrj1e\n4kPVKbP6FmAOIyvodifNWCoiL7M/jVRI6SrSZ2lLSbcDuxS+4HeHpJcCIekJpKPj6yd5z3TaJCJ2\nzzOViIhHNebbplJPmvwlK80HSHnhZNIX3kz4/YzS5BF6VbWGI+KY7sdKRYsiIoqVFB3j+/nWrUjh\nncj9YCV9NiK27XrqDEklx9X/hTSu/8qIeByWTvf8N1JDhw8XjK22cdi/KFXy7CxU24gyNfWnqsnP\n/DOBt5Euzi4hTZT4XlRSUrsfReqhS3ohsCPpj3VhRFxZIo6uWI4hLe9F0j3A+0rGlD251/TOUsFk\na0rauHPhKieFNQvG82rSatrHOxsidWg/jDS8UExE/J5UA6QWh5Pm6s+VdBzwCuC9ZUOqS17V+xXg\nK5Lmki7YXifp4Ig4YeJ316HxhC7p46Rvwc7R5zclnRIRn2w6luxY4MCIOC/HN0RaPl56zulepCO8\nbu/usa1JBwHDkm4hnY5uAPxjwXj+0utiekQskfTnXm9oSv6y+zCwIaOLcxW53hARZ0q6HHgp6W/3\nT7UV6hpH48Me+SBvd2An0jWGy5uOYXmVmId+I7B1RPwpP14dWBjlqhouMwe+8Lz4zvTOHUkV3zrW\nAf4aY7oYNU3SExm5AHlDRPy567mdIuLsBmO5gfQPb+w/egHfiojNm4plLElXkc78FpHK1QIjw1cN\nx7IKsDMjf7frgbNLrjru0CSdnST9TVOzYCQdDryB9Ps5idRQZkbNviuR0M8D3hwjZWGfTFrqW2oh\nz+dJZQi+QxoC+gfgMXK372i4POxMnt7Za9HYNO/vvImeLznts7PasNT+u+J4JnAucD9wJenLbhvg\nKaRrD0Uboo+z0LBIwTdJjwO/YaS2fic5zoQetUCZhP5DYDtSbZIgndZcAtwJzc/ikHTBBE9HRLy8\nsWC6KJWD/Z88Jrwp6ejqx5Hrkdeo5JnNRJo+c8j73APYhHQxtHup/RUNx3EscE1EfHbM9oNI1x/2\nbj
Kerv0v7exEKknQsTZwUUTsWSCmCUsuR4V9UMcqkdD3muj5iDiuqVhqlsc7/5bU/eYi4FLSmHFN\nF9pGafoIvV8l4pJ0BKnQ282MDLlE02eikm6IcfrQTvTcdFOlnZ360eTc+Klq/KJod8JWBV1vJK0D\n7MmyF6+KLSzKlOcKvxc4OtdSWVg4ppmqxHzitwEbR0TpqYETTcMtNkU3L5h7CNhdozs7rSVprSjU\n2alPTc6Nn5ISs1yGgTflfV8O3CvpooIJ9Eek7jujLl5VQLkeyDsYmV62SsF4+nFr6QDGUWL+/jWk\nbvGlZ5LMltRrZo1IF9qLUurstIC0QG3pmQypWUmtqm3EXGIe+uyIeFip683xkbveFIijY41Kl/Qe\nCBxK6j5+rVLt8QkvAjYhr37ckNFnM8fnn28pFFaNngzcIOlSRo+hNz1t8SLS2UIvFzcZyDgOJJXL\nraKz00xXIqF3d735aIH9j/VtSXuTOvBUUyc6T287v+vxLUDRLx5JJwDPIXW96Ux5C0aag9Tq1gL7\n7FkPvWkR8c5+Xidpz4j41uSvXOlq6+zUj2pLApRI6IcDPyFdyb40H3neVCCOjkeAL5CWkXdOpYJU\nH6Rxkr4QEQeOVxCrcCGsbYHnR9NX0vtQ25lDifnmK+gjjNToadItpMVqVXR2gsnnxpMudlep8Vku\ntZH0G1Iz39JjnQBIelFEXC6pZ4PjkolC0inA/hHxu1Ix9DLemUPJobQxRehWA54A/LFAEbq+lJpy\nqso6O0Fdc+OnqsRF0bnAkcDL8qYLgAOiXPfvXwNVtOECiIjL88/zlRvnRsR9ZaNa6mmk2haXUHZc\neKzqzhy6i9Dlqoa7AC8uF9GkShV++wTU0dmpe278mOt6a5OuRVSvxDz0s4FvA51iN3sC74iInRoN\nZCSeU4Hnk1bTdSepYtMWJS0A9gNmkcbrlgBHRsThpWLKcVV31gD1njmMVevCKyh6hL60s1NEFO3s\nNJPnxneUGEN/ekQc2/X4m5IOLBBHx4/yrQqSPkI6e9kuIn6Tt20M/KekgyLi86ViK524J1DdmYOk\n7nH7WaSziD8VimUVYNeIOHWCl/2iqXjG+ALwGuB0gIi4SlKR1dkzfG48UCah3y9pT1LtFEjFlYpN\nWYqIYyStBjw7In5dKo4u7wR2ilR+FUgzXPLv7Cyg8YSu+pqTjLWg8P57+V9d95eQZtrsUiKQSB2w\nDgPGTegR8cEGQxq775o6O83UufFAmYT+HtIYeqcF3MVAkXoSAJLeQOrwvRqpgfU2wPyIeHOhkJ7Q\nncw7IuK3XjeLAAAR90lEQVQ+pc43jYs+m5NIWjciHmgmqhG1nTnko7urS55N9XBWPhP+LqM7YJW+\nflRbZyeYwXPjPcsl1Ux5FXBejDTzXRQRWxaKZ9y6I7XWSukoUG2x2jMHSZdExPal9j+WpDu6HgYj\nv6Mi03M7JD2NVOP/1Tmms0iTJIol01zFc6eouLLpeJrsKXokEzcaLjXF7LGIeHDMKV/Jb7mtJfU6\nahIV15DIGl1wUfmZw0WSjmLZI+JGqy122TjGVOosdcbXLerr7AQVzo3vV5NDLt29Jz9BJSvpgOsl\n7QbMUuoysz/lLhAREbXXa5lIrad7PwWaPrPZJv/snpkUQJG6/8AvWfZ30Gtbo1RZZ6fs9nxbLd9m\njCabRHdXWTww6imTux/wcdLFj++TVrHWUJLAVp4SS7Xfm8s1jASRZis1StIzSM2PV5e0JSO/i3WA\nNZqOp4cfkqYtnkElxfFqmhs/VUWaRFPRkVxE/BE4ON+W0VmK32xUM1atNS5KfN6+x7JHv6cAL2o4\njjeQJiLMBb7MyN9oMfCxhmPp5U8R8aXSQXTrnhsPFJ0bP1WlEvpMUmRObI0kPQe4MyL+rNRMeytS\nxcwH80uK9jutgaTnAVuQytZ2z0VfhwLXQPKaj2Ml7RYRJze9/z58
MS//L9rZaYxq5sZPVZMXRbtn\nIqzRdeGv+IwE69upwLaSngt8FTiNtOr39QAVr6Zr8sxhM+CNpPK53XPRFwPvbzCOsZ4haZ1Ipau/\nQjp7ODQiflowJoAtSWsvXsnoOd+lrjWkACqbG9+vJsfQJ5yJYDPC4xGxRNKbSaUIjpR0Zemgajpz\niIjTgNMkvSQift7UfvvwjxFxlKSdSWPq7we+QfNDQGPV0tmpW41z4/syq3QAM0Ct48IlPCZpd2Av\nUv14SFUESzsV+GvXmcM80pkDUOzM4c2S1pH0BEk/lXRfXu1bSufs+PWkL7urqOPff6ezU00+AHwI\nWB+4izRj6UNFI+qTx9And1TpACqyN+nD/qmI+E2ecnbCJO9pQo1nDjtHxD/nmG4F3gL8jDI1xwGu\nkvQjYFPgMElrUcfkhFo6Oy1V6dz4vgxsQpf0AyZe6PSW/POYxoKqXERcJ+lgcvOPXDzs02WjAkaf\nOXTGrUufOXT2/wbglIh4aMyYbNP2Jg2v/DpS8/GnMdKrtqRa1qMsVenc+L4MbELHR95TJul/Af/B\n6Lo3h1fwQa/xzOEMSTcA/wN8MNe2L1JtEZYW6LodeK6kav7d11aHJ6tubny/Br6Wi/Uv1715JTDc\nVffmmoj4m7KR1UnSU4CHcjJdA1gnIu4uFMu/knoP3MDork6vLxFPhyrs7CTplxGxQ6n9r4hqvqlL\nyTMkPkVqcrF0nnBEbFosqHo91mPooNgRjKRFTDxsVrrc6fOADcccEZdqqP1WYNOIKHaW0EvU2dmp\nxrnxfRn4hA58E/gkaSjhdaTTd5+29HatpD2AVSRtQqp7c3HBeN5YcN8T0jh9TimX0H8DVF0nKNJw\nwQ9zMj1kstdPoyrnxvdj4IdcJF0eES/qLpkr6bKI2LZ0bLXJwwYfBXbOm34CfLK2o74aSLqeivqc\nKrXp2wo4h0paLcK4nZ1eEREvKRQSkn5N+tvVNDe+Lz5Chz9LmgXcLOkDpHmnXgTVQy5U9FFJn6qp\naJGkF5OapmxOGoddhcLjsKT51XOAWvqcnplvtamms1OXztz4ewvHMWVO6HAQsCZp+OBTpJobxToo\n1Syvnvs69RUtOgp4O6n41bbAu0jzrUuqqs9pjdNvK+3sBBXOje+Xh1ykt0TE9yfbZunqP/D3wOk1\nzXLpDJFJurpzIVSFuth3xfSKXtubnqaXF1hNdOG4dD30qjo7QT1/u+XhI3T4v6Q66N0+2mObUW3R\nokeVGn0vlPQZ0jBH0WXtFf3j//vSAUyits5ONf3tpmxgE7qk1wCvBdaX1N1aah1m2GKCBtVatOid\npAS+H2kIbR5pqX3jevQ3XfoUBaqKRsTN/bxOuT/rdMfTQ22dnaqcG9+vgU3opAse15BW713btX0x\nZadM1ewDpIa+naJFZ1FH0aJdI+KLpL9lp9vMAaRYGzWDq4quWWi/VXR26lbp3Pi+eAxdemJE/Hny\nV5qkp5bsxj4eSVeMHQsuPYY+0/T6HZbab2cqcdOxTGSmfJ4G9ghd0nciYnfgF5KW+VYrfbGoUr+Q\ntJBUR/vM0nOsc0GuPUh1ZU7vemptoNZmG0Z9nZ26jTM3fkastRjYhA78U/5Z+0WjmmwKvJrUo/JI\nSScD34yIXxWK52LSBdCnAZ/t2r4YuLpIRDNX06Uga+3sBHXOje/LwA+5AORKeNuRLoRcFhH3FQ6p\nepL+jlTbe03gKuCQyjr02BRI2jo3vWh6v1V1dspz4/evcG58X2roWFKUpL2BK0in7nsCl0naq2xU\ndZL0VEkHSLoM+D+kmtFPA/43XR2CGoznwvxzsaSHu26LNdKzdqBJekDSH3rcHpC0dFiqRDLPqurs\nFBF/BXYvtf8VNfBH6JJuBHbsHJXno/ULI2KzspHVR9KvSHXGj42IO8c8d3BE1NDswrrkI85x5QRW\njKSFEbGNUmenNwIfAX4WEVsX
jOnzpKmK1cyN79cgj6F3/AF4sOvxg/iC2ng2i4iQtJaktSLikc4T\npZO5pBcCO5KGzS6MiNIt6KowNmHnGu3dFx1/22xEy6itsxNUODe+X07ocCPwc0k/JP3RdgWukbQ/\nQER8qWRwldkil4V9CmmK7n3AXhFxTcmgJH2c1D2+s7r3m5JOiYhPFgyrKpLeAHwemAvcT1pL8CtS\nzfaSqurslFU3N75fHnKR/mWi5yPiY03FUjtJFwMfjYjz8uMh4F8j4qWF47oR2LpTxlfS6sBCD5uN\nyNNNdwLOiogXSNoJ2C0iSs8oqaqzU45nRsyN72Xgj9C7E3ZeFbZGRPxxgrcMsjU7yRwgIoYllVph\n2O23pGGEzpHdE0krWW3Ekoi4T9IsSYqIsyX9R+mgsio6O9U8N75fA5/QJR1PqgGyBLgEeKqkf4+I\nz038zoF0i6SPMdKAeU/glgle35SHSN2UziYNm+0EXCLpSwARsX/J4CrxkKS1gAuB4yXdSxrmKEp1\ndXaqeW58XzzkMnKVfQ/SXPSDSXPRS/ejrI6kdUm1UjpFnC4AFkTEA+WigsmmmUbEcU3FUitJawOP\nkqYqvwuYDRwfEb8vHFdVnZ2gvrnxUzHwR+jAE/Kp3i7Af0bEXyS52mIPOXFXd7TbnbDzl868iPBK\n0dEOjYjDSEfBxwBI+lfgsKJR1dfZCdLc+GtJZzBnklr3HRQR3yob1uSc0FMHnttJH6zzJT0beGTi\ntwwWSWcwcZOEop1cJA0DbyJ9ni8H7pV0Uel+mZV5Lcsm7zf02Na0qjo7ZTtHxD/nufG3kkox/4y0\nMrpqA5/Q8xLfpct8Jd3BDJhv2rBaLp6NZ3ZEPCzpfaRhhPmSfIQOSNqHVPZ4U0ndC2PWJn35lbag\ndAA91Dg3vi8Dn9Dz2OLHgJfnTecDnwRmXMfv6dLdwUWpM9DzSEfsN1bSGX1VSc8EdiN1m7IRJwM/\nBY5gdJ3/xRFRvAlypd2Bapwb3xdfFJVOIS2w6IzDvhPYPCJchXGMvDjlK8DNpOp8G5GaRP+4cFxv\nI30pXxQRH8yLQP49It5aMq7aSNoC+Nv88IKIuHai109zLFV1dlomiMrmxvfLCT3Pcplsm0E+anlj\nRPw6P34O8N8RUXq1oU1C0odI3aV+mDftAnw5Io4uF1W9lFotbkjXKEZElJhKOSUDP+QC/EnSiyPi\nFwCSXswMOb0qYHEnmWe3kOboFiVpLnAk8LK86QLggLEFxAbcPsD2nfo7eYbLxYAT+hiVzY2fEid0\n2Bc4QdITSad7j5Lm6dqyLpP0I9K4bJDqp1zaWVUXEd+f6M3T6FhS+d635cd75m07FYqnRmL0daHH\naL6pxUyxLZXNje/XwCf0XBJzizxmRkS40uL4ngTcA7wiP74PWJ20qi4YKY7VtKdHxLFdj78p6cBC\nsVRF0qoRsYS0uveXkk7NT72ZketGNlqNc+P7MrBj6J1qiuNxlcWZQ9JPSUfk38mbdgf2johXlYuq\nDt2FpiRtT9cq34i4tFxk9ZJ0HqmEbk1z4/syyEfoT++6/17y6jkbn6SNSF2KNmT0xaLSH/T3kMbQ\nP086U7gY2LtoRPVYOqwSEZeQkpRNbEHpAJbXwB6hd5N0ZUS8oHQctZN0FemLbxGwtDxCpXOJDZB0\nJzBuoTkXoWuXQT5C7+Zvtf78qaahKElHMnFJgurqzhSwCrAWvgA6qdrnxvfDCd2m4ouS5gNnMXps\nsVSvxcu67n8CmF8ojpr9LiIOn/xlFhFrl45hRQ3skIukKxn5Nn4ecEPnKdK38Qt7vnGASTqCtJL2\nZkaGXCIiite+8bBZb/69DJZBPkL30v6pexuwcSX1W8YazCOTyQ38TJ9BMrAJPSJu7ud1ki6MiB0n\nf+VAuIbUzaV4USfrj9dVDJaBTehTUEPPzFo8GbhB0qVUMD93zEWsNSQ93HmKGXIRy2xlckKfnE
/l\nR1R10bENF7HMViYndOtbRJwvaQNgk4g4J5cVXaV0XGaWzCodwAzg+buZpPcD3wP+X960PiPlWM2s\nMCf0yb27dAAV+RCpRO3DABFxE/CMohGZ2VIDO+Qi6QEmXhXWqb54VaOB1e3PEfGXTn9FSaviawxm\n1RjYhE7qNm5Tc76kw4DVJe1EqiV/RuGYzCwb2JWiY+V66E/qPI6I3xYMp0qSZpEqU+5MOpP5CfD1\nmdgIwKyNBj6h58bHnwfmAveTLvT9yn0yp07SqW7MbFaOL4rCp0gX+m6MiHnAa0g9KW3qNi4dgNkg\nc0KHJRFxHzBLkiLibGD70kHNUIN9umdW2CBfFO14SNJawIXA8ZLuBf6ncExmZlPmMXRpbeBR0tnK\nu4DZwPER8fuigc1ALtVqVpaHXODQiPhrRDwWEcfkllwfKR3UDHVw6QDMBpmP0Lu6ondtuyoiti4V\nU20kLWLiRVhbNRySmfUwsGPokvYBPgBsKqm7hdrawOVloqrWG0sHYGaTG9gjdEnrAk8FjgAO6Xpq\ncUS4gYOZzTgDm9C7SdoC+Nv88IKIuLZkPLWS9GLgSGBzYDVS6dw/upGEWR0G/qKopA8BpwDPzreT\nJe1bNqpqHQXsDtwErA68D/hy0YjMbKmBP0KXdDXw0oh4JD9eC7jYF/qWJemyiNhW0tWd34+nKprV\nY2AvinYR0N3F/jHc1GI8j0paDVgo6TPA7/BZnlk1BjahS1o1IpYAJwC/lHRqfurNwHHlIqvaO0kJ\nfD/gIGAe8JaiEZnZUgM75NI9/1zS9sCO+akLIuLScpHVS9IBEfHFybaZWRmDnNA99jtF4yzC8u/R\nrBIDO+QCPF3SuEv8cwkAAyTtDuwBbCTp9K6n1gb+UCYqMxtrkBP6KsBa+AJoPy4mXQB9GvDZru2L\ngauLRGRmyxjkIZdlhg/MzGayQT5C95F5nyRdGBE7SlrM6CJdneJcXilqVoFBPkJ/SkR4/NfMWmNg\nE7otH0kvJE3xDODCiLiycEhmlnmVn/VN0sdJi66eSrpA+k1J/7dsVGbW4SN065ukG4GtI+JP+fHq\nwMKI2KxsZGYGPkK3qfkt8KSux08E7ioUi5mN4SN065ukHwLbAWeTxtB3Ai4B7gSIiP3LRWdmTujW\nN0l7TfR8RLiomVlBTui2XHILv3kR4ZWiZpXwGLr1TdKwpHUkPQW4AviaJNe8MauEE7pNxeyIeJhU\nA/34iNgBeHXhmMwsc0K3qVhV0jOB3YD/Kh2MmY3mhG5TcTjwE+DmiLhU0sakhtFmVgFfFDUzawkf\noVvfJM2V9ANJ9+bbqZLmlo7LzBIndJuKY4HTgWfl2xl5m5lVwEMu1jdJCyNim8m2mVkZPkK3qbhf\n0p6SVsm3PYH7SwdlZomP0K1vkjYAjgReQqrlcjGwf0TcXjQwMwOc0M3MWmOQe4panyQdyeheoqO4\nyqJZHZzQrR+Xdd3/BDC/VCBmNj4PudiUSLoyIl5QOg4zW5ZnudhU+QjArFJO6GZmLeEhF5uUpMWM\nHJmvATzaeQqIiFinSGBmNooTuplZS3jIxcysJZzQzcxawgndzKwlnNDNzFrCCd3MrCX+P7A9oki/\n9/baAAAAAElFTkSuQmCC\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7f279e975250>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Bar chart of the -log10(p-value) scores from the f_regression selector that\n",
"# was fit against 'Year Total'; one bar per predictor.\n",
"plt.bar(np.arange(len(predictors)), scores)\n",
"plt.xticks(np.arange(len(predictors)), predictors, rotation='vertical')\n",
"plt.ylabel('-log10(p-value)')\n",
"plt.title('f_regression score per predictor (target: Year Total)')\n",
"plt.show()"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"alg_c = RandomForestClassifier()"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"alg_r = RandomForestRegressor()"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"ss = cross_validation.ShuffleSplit(train.shape[0], 10, 0.1, 0.3)"
]
},
{
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": [
"scores = cross_validation.cross_val_score(alg_c, train[predictors_revised], train['Buy_or_not'], cv=ss)"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"scores.mean()"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"ground_truth = []\n",
"predictions = []\n",
"for tr, te in ss:\n",
" train_predictors = (train[predictors_revised].iloc[tr, :])\n",
" train_target = train['Buy_or_not'].iloc[tr]\n",
" alg_c.fit(train_predictors, train_target)\n",
" test_truth = train['Buy_or_not'].iloc[te]\n",
" ground_truth.append(test_truth)\n",
" test_predictions = alg_c.predict(train[predictors_revised].iloc[te, :])\n",
" predictions.append(test_predictions)"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"ground_truth = np.concatenate(ground_truth, axis=0)"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"predictions = np.concatenate(predictions, axis=0)"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"predictions[predictions > .5] = 1\n",
"predictions[predictions <= .5] = 0"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"metrics.f1_score(ground_truth, predictions)"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"scores = cross_validation.cross_val_score(alg_r, train[predictors_revised], train['Year Total'], cv=ss)"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"scores.mean()"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"np.count_nonzero(train['Buy_or_not'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Moving on to the test dataset"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"solution = pd.read_csv('/home/devashish/datasets/ZS/Solution.csv')"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Hospital_ID 193810\n",
"District_ID 193810\n",
"Instrument_ID 193810\n",
"Buy_or_not 0\n",
"Revenue 0\n",
"dtype: int64"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"solution.count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Convert ID columns to int"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"converttoint(solution, 'Hospital_ID')"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"converttoint(solution, 'District_ID')"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"converttoint(solution, 'Instrument_ID')"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>District_ID</th>\n",
" <th>Instrument_ID</th>\n",
" <th>Buy_or_not</th>\n",
" <th>Revenue</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>10</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>11</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>13</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>15</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID District_ID Instrument_ID Buy_or_not Revenue\n",
"0 1 12 1 NaN NaN\n",
"1 1 12 10 NaN NaN\n",
"2 1 12 11 NaN NaN\n",
"3 1 12 13 NaN NaN\n",
"4 1 12 15 NaN NaN"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"solution.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add Total_Hospital_employees"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame()"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = train[['Hospital_ID', 'Total_Hospital_employees']]"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Keep exactly one row per hospital. De-duplicating on both columns would\n",
"# leave several rows for a hospital whose totals differ (or are partly NaN),\n",
"# and a left merge on 'Hospital_ID' would then multiply the matching rows.\n",
"df = df.drop_duplicates(subset='Hospital_ID')"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Hospital_ID 193810\n",
"District_ID 193810\n",
"Instrument_ID 193810\n",
"Buy_or_not 0\n",
"Revenue 0\n",
"dtype: int64"
]
},
"execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"solution.count()"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1518"
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct hospitals that have an employee total in the training data.\n",
"df['Hospital_ID'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1580"
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct hospitals to predict for; compare with the training count above to see\n",
"# how many hospitals will need the median fallback for Total_Hospital_employees.\n",
"solution['Hospital_ID'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"med = df['Total_Hospital_employees'].median()"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"solution = pd.merge(solution, df, on='Hospital_ID', how='left')"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"solution['Total_Hospital_employees'] = solution['Total_Hospital_employees'].fillna(med)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add hospital employees in district"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"solution = pd.merge(solution, hospital_profiling.groupby(['Hospital_ID', 'District_ID'], as_index=False).sum(),\n",
" on=['Hospital_ID', 'District_ID'], how='left')"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>District_ID</th>\n",
" <th>Instrument_ID</th>\n",
" <th>Buy_or_not</th>\n",
" <th>Revenue</th>\n",
" <th>Total_Hospital_employees</th>\n",
" <th>Hospital_employees</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>13088.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>10</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>13088.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>11</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>13088.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>13</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>13088.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>15</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>13088.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID District_ID Instrument_ID Buy_or_not Revenue \\\n",
"0 1 12 1 NaN NaN \n",
"1 1 12 10 NaN NaN \n",
"2 1 12 11 NaN NaN \n",
"3 1 12 13 NaN NaN \n",
"4 1 12 15 NaN NaN \n",
"\n",
" Total_Hospital_employees Hospital_employees \n",
"0 13088.0 3.0 \n",
"1 13088.0 3.0 \n",
"2 13088.0 3.0 \n",
"3 13088.0 3.0 \n",
"4 13088.0 3.0 "
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"solution.head()"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"solution = solution.rename(columns={'Hospital_employees':'Hospital_employees_in_district'})"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"solution['Hospital_employees_in_district'] = solution['Hospital_employees_in_district'].fillna(0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add Hospitals in district"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"solution = pd.merge(solution, hospital_profiling.groupby('District_ID', as_index=False).agg({'Hospital_ID' : np.count_nonzero}),\n",
" on='District_ID', how='left')"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"solution = solution.rename(columns={'Hospital_ID_x':'Hospital_ID', 'Hospital_ID_y': 'Hospitals_in_District'})"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"solution['Hospitals_in_District'] = solution['Hospitals_in_District'].fillna(solution['Hospitals_in_District'].median())"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Hospital_ID 193810\n",
"District_ID 193810\n",
"Instrument_ID 193810\n",
"Buy_or_not 0\n",
"Revenue 0\n",
"Total_Hospital_employees 193810\n",
"Hospital_employees_in_district 193810\n",
"Hospitals_in_District 193810\n",
"dtype: int64"
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"solution.count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add total instrument demand"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Count the revenue rows recorded for each instrument; reset_index() already yields a\n",
"# two-column DataFrame, whose columns are renamed in the next cell.\n",
"df = hospital_revenue['Instrument_ID'].value_counts().reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.columns = ['Instrument_ID', 'Total_Instr_Demand']"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"# Instruments that never appear in hospital_revenue get a demand of zero.\n",
"# Buy_or_not and Revenue are still empty placeholders at this point; they are zero-filled here\n",
"# as well because the final df['Revenue'].astype(int) conversion cannot handle NaN for the rows\n",
"# that end up predicted as not buying. Listing the columns makes that dependency explicit.\n",
"solution = pd.merge(solution, df, on='Instrument_ID', how='left').fillna(\n",
"    {'Total_Instr_Demand': 0, 'Buy_or_not': 0, 'Revenue': 0})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add Instrument Value"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame()"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = hospital_revenue[['Instrument_ID', 'Year Total']]"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = df.groupby('Instrument_ID', as_index=False).sum()"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"mean = df['Year Total'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Express each instrument's summed 'Year Total' relative to the mean over all instruments\n",
"# (vectorised division instead of a per-element Python lambda).\n",
"df['Instrument_Value'] = df['Year Total'] / mean"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"del df['Year Total']"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Instrument_ID</th>\n",
" <th>Instrument_Value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0.230520</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>7.172774</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>1.250719</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>0.460606</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>1.275721</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Instrument_ID Instrument_Value\n",
"0 1 0.230520\n",
"1 2 7.172774\n",
"2 3 1.250719\n",
"3 4 0.460606\n",
"4 5 1.275721"
]
},
"execution_count": 128,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"solution = pd.merge(solution, df, on=['Instrument_ID'], how='left')"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"solution['Instrument_Value'] = solution['Instrument_Value'].fillna(solution['Instrument_Value'].median())"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Hospital_ID 193810\n",
"District_ID 193810\n",
"Instrument_ID 193810\n",
"Buy_or_not 193810\n",
"Revenue 193810\n",
"Total_Hospital_employees 193810\n",
"Hospital_employees_in_district 193810\n",
"Hospitals_in_District 193810\n",
"Total_Instr_Demand 193810\n",
"Instrument_Value 193810\n",
"dtype: int64"
]
},
"execution_count": 131,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"solution.count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add instrument median"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame()"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = hospital_revenue[['Instrument_ID', 'Year Total']]"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = df.groupby('Instrument_ID', as_index=False).median()"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.columns = ['Instrument_ID', 'Instrument_Median']"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"solution = pd.merge(solution, df, on='Instrument_ID', how='left')"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>District_ID</th>\n",
" <th>Instrument_ID</th>\n",
" <th>Buy_or_not</th>\n",
" <th>Revenue</th>\n",
" <th>Total_Hospital_employees</th>\n",
" <th>Hospital_employees_in_district</th>\n",
" <th>Hospitals_in_District</th>\n",
" <th>Total_Instr_Demand</th>\n",
" <th>Instrument_Value</th>\n",
" <th>Instrument_Median</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>13088.0</td>\n",
" <td>3.0</td>\n",
" <td>1086.0</td>\n",
" <td>7399.0</td>\n",
" <td>0.230520</td>\n",
" <td>413.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>10</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>13088.0</td>\n",
" <td>3.0</td>\n",
" <td>1086.0</td>\n",
" <td>76.0</td>\n",
" <td>0.201387</td>\n",
" <td>16980.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>11</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>13088.0</td>\n",
" <td>3.0</td>\n",
" <td>1086.0</td>\n",
" <td>137.0</td>\n",
" <td>0.241652</td>\n",
" <td>16862.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>13</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>13088.0</td>\n",
" <td>3.0</td>\n",
" <td>1086.0</td>\n",
" <td>1.0</td>\n",
" <td>0.005706</td>\n",
" <td>583102.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>15</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>13088.0</td>\n",
" <td>3.0</td>\n",
" <td>1086.0</td>\n",
" <td>169.0</td>\n",
" <td>0.814948</td>\n",
" <td>49459.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID District_ID Instrument_ID Buy_or_not Revenue \\\n",
"0 1 12 1 0.0 0.0 \n",
"1 1 12 10 0.0 0.0 \n",
"2 1 12 11 0.0 0.0 \n",
"3 1 12 13 0.0 0.0 \n",
"4 1 12 15 0.0 0.0 \n",
"\n",
" Total_Hospital_employees Hospital_employees_in_district \\\n",
"0 13088.0 3.0 \n",
"1 13088.0 3.0 \n",
"2 13088.0 3.0 \n",
"3 13088.0 3.0 \n",
"4 13088.0 3.0 \n",
"\n",
" Hospitals_in_District Total_Instr_Demand Instrument_Value \\\n",
"0 1086.0 7399.0 0.230520 \n",
"1 1086.0 76.0 0.201387 \n",
"2 1086.0 137.0 0.241652 \n",
"3 1086.0 1.0 0.005706 \n",
"4 1086.0 169.0 0.814948 \n",
"\n",
" Instrument_Median \n",
"0 413.0 \n",
"1 16980.5 \n",
"2 16862.0 \n",
"3 583102.0 \n",
"4 49459.0 "
]
},
"execution_count": 137,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"solution.head()"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"solution['Instrument_Median'] = solution['Instrument_Median'].fillna(solution['Instrument_Median'].median())"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hospital_ID</th>\n",
" <th>District_ID</th>\n",
" <th>Instrument_ID</th>\n",
" <th>Buy_or_not</th>\n",
" <th>Revenue</th>\n",
" <th>Total_Hospital_employees</th>\n",
" <th>Hospital_employees_in_district</th>\n",
" <th>Hospitals_in_District</th>\n",
" <th>Total_Instr_Demand</th>\n",
" <th>Instrument_Value</th>\n",
" <th>Instrument_Median</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>13088.0</td>\n",
" <td>3.0</td>\n",
" <td>1086.0</td>\n",
" <td>7399.0</td>\n",
" <td>0.230520</td>\n",
" <td>413.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>10</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>13088.0</td>\n",
" <td>3.0</td>\n",
" <td>1086.0</td>\n",
" <td>76.0</td>\n",
" <td>0.201387</td>\n",
" <td>16980.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>11</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>13088.0</td>\n",
" <td>3.0</td>\n",
" <td>1086.0</td>\n",
" <td>137.0</td>\n",
" <td>0.241652</td>\n",
" <td>16862.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>13</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>13088.0</td>\n",
" <td>3.0</td>\n",
" <td>1086.0</td>\n",
" <td>1.0</td>\n",
" <td>0.005706</td>\n",
" <td>583102.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>15</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>13088.0</td>\n",
" <td>3.0</td>\n",
" <td>1086.0</td>\n",
" <td>169.0</td>\n",
" <td>0.814948</td>\n",
" <td>49459.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hospital_ID District_ID Instrument_ID Buy_or_not Revenue \\\n",
"0 1 12 1 0.0 0.0 \n",
"1 1 12 10 0.0 0.0 \n",
"2 1 12 11 0.0 0.0 \n",
"3 1 12 13 0.0 0.0 \n",
"4 1 12 15 0.0 0.0 \n",
"\n",
" Total_Hospital_employees Hospital_employees_in_district \\\n",
"0 13088.0 3.0 \n",
"1 13088.0 3.0 \n",
"2 13088.0 3.0 \n",
"3 13088.0 3.0 \n",
"4 13088.0 3.0 \n",
"\n",
" Hospitals_in_District Total_Instr_Demand Instrument_Value \\\n",
"0 1086.0 7399.0 0.230520 \n",
"1 1086.0 76.0 0.201387 \n",
"2 1086.0 137.0 0.241652 \n",
"3 1086.0 1.0 0.005706 \n",
"4 1086.0 169.0 0.814948 \n",
"\n",
" Instrument_Median \n",
"0 413.0 \n",
"1 16980.5 \n",
"2 16862.0 \n",
"3 583102.0 \n",
"4 49459.0 "
]
},
"execution_count": 139,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"solution.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train on full training set"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.linear_model import SGDClassifier, LogisticRegression\n",
"from sklearn.linear_model import SGDRegressor, LinearRegression\n",
"from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Buy / not-buy classifier. The seed is fixed so that Restart & Run All reproduces the same\n",
"# predictions and therefore the same submission file.\n",
"alg_c = AdaBoostClassifier(random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Revenue regressor, fitted further down on the rows with Buy_or_not == 1. AdaBoost's regressor\n",
"# resamples the training set at every boosting round, so the seed is fixed for reproducibility.\n",
"alg_r = AdaBoostRegressor(random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,\n",
" learning_rate=1.0, n_estimators=50, random_state=None)"
]
},
"execution_count": 152,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"alg_c.fit(train[predictors], train['Buy_or_not'])"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"classifications = alg_c.predict(solution[predictors].astype(float))"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"solution['Buy_or_not'] = classifications"
]
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"2019"
]
},
"execution_count": 155,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.count_nonzero(solution['Buy_or_not'])"
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',\n",
" n_estimators=50, random_state=None)"
]
},
"execution_count": 156,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"alg_r.fit(train.loc[train['Buy_or_not'] == 1, predictors], train.loc[train['Buy_or_not'] == 1, 'Year Total'])"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"regressions = alg_r.predict(solution.loc[solution['Buy_or_not'] == 1, predictors])"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"solution.loc[solution['Buy_or_not'] == 1, 'Revenue'] = regressions"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.read_csv('/home/devashish/datasets/ZS/Solution.csv')"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df['Buy_or_not'] = solution['Buy_or_not']"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df['Revenue'] = solution['Revenue']"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df['Buy_or_not'] = df['Buy_or_not'].astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df['Revenue'] = df['Revenue'].astype(int)"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"df.loc[df['Buy_or_not'] == 0, 'Revenue'] = 0"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"np.count_nonzero(df['Buy_or_not'])"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.to_csv(path_or_buf='/home/devashish/submission.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment