Skip to content

Instantly share code, notes, and snippets.

@BabaCafe
Created July 20, 2018 16:00
Show Gist options
  • Save BabaCafe/a3149d8c09f438a5aba7d631d5a3e28e to your computer and use it in GitHub Desktop.
Save BabaCafe/a3149d8c09f438a5aba7d631d5a3e28e to your computer and use it in GitHub Desktop.
Mckinsey Hackathon (Healthcare Analytics)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"F:\\Anaconda\\lib\\site-packages\\sklearn\\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
" \"This module will be removed in 0.20.\", DeprecationWarning)\n"
]
}
],
"source": [
"import operator\n",
"import xgboost as xgb\n",
"from xgboost.sklearn import XGBClassifier\n",
"import pandas as pd\n",
"import numpy as np\n",
"import random as rnd\n",
"from sklearn import preprocessing\n",
"\n",
"\n",
"from sklearn.cross_validation import train_test_split , StratifiedKFold\n",
"from sklearn.feature_selection import RFECV\n",
"\n",
"import seaborn as sns\n",
"\n",
"\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.pylab as pylab"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Single LabelEncoder instance, created once here under the name `label`.\n",
"label = preprocessing.LabelEncoder()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def describe_more(df):\n",
"    \"\"\"Summarise every column of ``df`` by cardinality and dtype.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Frame whose columns are inspected.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per column of ``df`` with ``Variable`` (column name),\n",
"        ``Levels`` (number of distinct non-null values) and ``Datatype``\n",
"        (the column dtype), ordered by ascending ``Levels``.\n",
"    \"\"\"\n",
"    columns = list(df.columns)\n",
"    levels = pd.DataFrame({\n",
"        'Variable': columns,\n",
"        # nunique() skips NaN, matching the old len(pd.value_counts(...)) count\n",
"        # without calling the deprecated top-level pd.value_counts.\n",
"        'Levels': [df[name].nunique() for name in columns],\n",
"        'Datatype': [df[name].dtype for name in columns],\n",
"    })\n",
"    # Return a sorted copy rather than sorting in place.\n",
"    return levels.sort_values(by='Levels')"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Load the labelled training set from the working directory.\n",
"train = pd.read_csv('train.csv')"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>gender</th>\n",
" <th>age</th>\n",
" <th>hypertension</th>\n",
" <th>heart_disease</th>\n",
" <th>ever_married</th>\n",
" <th>work_type</th>\n",
" <th>Residence_type</th>\n",
" <th>avg_glucose_level</th>\n",
" <th>bmi</th>\n",
" <th>smoking_status</th>\n",
" <th>stroke</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>30669</td>\n",
" <td>Male</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" <td>children</td>\n",
" <td>Rural</td>\n",
" <td>95.12</td>\n",
" <td>18.0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>30468</td>\n",
" <td>Male</td>\n",
" <td>58.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Yes</td>\n",
" <td>Private</td>\n",
" <td>Urban</td>\n",
" <td>87.96</td>\n",
" <td>39.2</td>\n",
" <td>never smoked</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>16523</td>\n",
" <td>Female</td>\n",
" <td>8.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" <td>Private</td>\n",
" <td>Urban</td>\n",
" <td>110.89</td>\n",
" <td>17.6</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>56543</td>\n",
" <td>Female</td>\n",
" <td>70.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Yes</td>\n",
" <td>Private</td>\n",
" <td>Rural</td>\n",
" <td>69.04</td>\n",
" <td>35.9</td>\n",
" <td>formerly smoked</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>46136</td>\n",
" <td>Male</td>\n",
" <td>14.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" <td>Never_worked</td>\n",
" <td>Rural</td>\n",
" <td>161.28</td>\n",
" <td>19.1</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id gender age hypertension heart_disease ever_married \\\n",
"0 30669 Male 3.0 0 0 No \n",
"1 30468 Male 58.0 1 0 Yes \n",
"2 16523 Female 8.0 0 0 No \n",
"3 56543 Female 70.0 0 0 Yes \n",
"4 46136 Male 14.0 0 0 No \n",
"\n",
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
"0 children Rural 95.12 18.0 NaN \n",
"1 Private Urban 87.96 39.2 never smoked \n",
"2 Private Urban 110.89 17.6 NaN \n",
"3 Private Rural 69.04 35.9 formerly smoked \n",
"4 Never_worked Rural 161.28 19.1 NaN \n",
"\n",
" stroke \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 "
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the first five rows as a quick look at the columns and values.\n",
"train.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 278,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"id 0\n",
"gender 0\n",
"age 0\n",
"hypertension 0\n",
"heart_disease 0\n",
"ever_married 0\n",
"work_type 0\n",
"Residence_type 0\n",
"avg_glucose_level 0\n",
"bmi 1462\n",
"smoking_status 13292\n",
"stroke 0\n",
"dtype: int64"
]
},
"execution_count": 278,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Missing values per column in the training data.\n",
"train.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Load the test set from the working directory.\n",
"test = pd.read_csv('test.csv')"
]
},
{
"cell_type": "code",
"execution_count": 280,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"id 0\n",
"gender 0\n",
"age 0\n",
"hypertension 0\n",
"heart_disease 0\n",
"ever_married 0\n",
"work_type 0\n",
"Residence_type 0\n",
"avg_glucose_level 0\n",
"bmi 591\n",
"smoking_status 5751\n",
"dtype: int64"
]
},
"execution_count": 280,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Missing values per column in the test data.\n",
"test.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 281,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"43400"
]
},
"execution_count": 281,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows in the training data.\n",
"train.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 282,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"18601"
]
},
"execution_count": 282,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows in the test data.\n",
"test.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Stack train and test into one frame with a fresh 0..n-1 index.\n",
"# DataFrame.append was removed in pandas 2.0; pd.concat is the supported call.\n",
"# Rows coming from `test` get NaN for any column that only `train` has.\n",
"Total = pd.concat([train, test], ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 344,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Fill missing bmi_interval values with the most frequent interval.\n",
"# Assign the result back: fillna(inplace=True) on a column selection may act on\n",
"# a temporary copy and leave `data` unchanged (pandas copy-on-write).\n",
"# NOTE(review): `data` is not defined by any earlier cell in this notebook - confirm its origin.\n",
"data['bmi_interval'] = data['bmi_interval'].fillna(data['bmi_interval'].mode()[0])"
]
},
{
"cell_type": "code",
"execution_count": 345,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Fill missing smoking_status values with the most frequent category.\n",
"# Assign the result back: fillna(inplace=True) on a column selection may act on\n",
"# a temporary copy and leave `data` unchanged (pandas copy-on-write).\n",
"data['smoking_status'] = data['smoking_status'].fillna(data['smoking_status'].mode()[0])"
]
},
{
"cell_type": "code",
"execution_count": 321,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Residence_type</th>\n",
" <th>age</th>\n",
" <th>age_interval</th>\n",
" <th>avg_glucose_level</th>\n",
" <th>bmi</th>\n",
" <th>bmi_interval</th>\n",
" <th>ever_married</th>\n",
" <th>gender</th>\n",
" <th>glucose_level</th>\n",
" <th>heart_disease</th>\n",
" <th>hypertension</th>\n",
" <th>id</th>\n",
" <th>smoking_status</th>\n",
" <th>stroke</th>\n",
" <th>work_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Rural</td>\n",
" <td>3.0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>95.12</td>\n",
" <td>18.0</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>30669</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>children</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Urban</td>\n",
" <td>58.0</td>\n",
" <td>(50.0, 64.0]</td>\n",
" <td>87.96</td>\n",
" <td>39.2</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Male</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>30468</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Urban</td>\n",
" <td>8.0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>110.89</td>\n",
" <td>17.6</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>16523</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Rural</td>\n",
" <td>70.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>69.04</td>\n",
" <td>35.9</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>56543</td>\n",
" <td>formerly smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Rural</td>\n",
" <td>14.0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>161.28</td>\n",
" <td>19.1</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>46136</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Never_worked</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Urban</td>\n",
" <td>47.0</td>\n",
" <td>(36.0, 50.0]</td>\n",
" <td>210.95</td>\n",
" <td>50.1</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>32257</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Urban</td>\n",
" <td>52.0</td>\n",
" <td>(50.0, 64.0]</td>\n",
" <td>77.59</td>\n",
" <td>17.7</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>52800</td>\n",
" <td>formerly smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Rural</td>\n",
" <td>75.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>243.53</td>\n",
" <td>27.0</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>41413</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Self-employed</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Rural</td>\n",
" <td>32.0</td>\n",
" <td>(20.0, 36.0]</td>\n",
" <td>77.67</td>\n",
" <td>32.3</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>15266</td>\n",
" <td>smokes</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Urban</td>\n",
" <td>74.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>205.84</td>\n",
" <td>54.6</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>28674</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Self-employed</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Urban</td>\n",
" <td>79.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>77.08</td>\n",
" <td>35.0</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>10460</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Govt_job</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Urban</td>\n",
" <td>79.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>57.08</td>\n",
" <td>22.0</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Male</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>64908</td>\n",
" <td>formerly smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Rural</td>\n",
" <td>37.0</td>\n",
" <td>(36.0, 50.0]</td>\n",
" <td>162.96</td>\n",
" <td>39.4</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>63884</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Rural</td>\n",
" <td>37.0</td>\n",
" <td>(36.0, 50.0]</td>\n",
" <td>73.50</td>\n",
" <td>26.1</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>37893</td>\n",
" <td>formerly smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Rural</td>\n",
" <td>40.0</td>\n",
" <td>(36.0, 50.0]</td>\n",
" <td>95.04</td>\n",
" <td>42.4</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>67855</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Rural</td>\n",
" <td>35.0</td>\n",
" <td>(20.0, 36.0]</td>\n",
" <td>85.37</td>\n",
" <td>33.0</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>25774</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Urban</td>\n",
" <td>20.0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>84.62</td>\n",
" <td>19.7</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>19584</td>\n",
" <td>smokes</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Rural</td>\n",
" <td>42.0</td>\n",
" <td>(36.0, 50.0]</td>\n",
" <td>82.67</td>\n",
" <td>22.5</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>24447</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Urban</td>\n",
" <td>44.0</td>\n",
" <td>(36.0, 50.0]</td>\n",
" <td>57.33</td>\n",
" <td>24.6</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>49589</td>\n",
" <td>smokes</td>\n",
" <td>0.0</td>\n",
" <td>Govt_job</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Urban</td>\n",
" <td>79.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>67.84</td>\n",
" <td>25.2</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>17986</td>\n",
" <td>smokes</td>\n",
" <td>0.0</td>\n",
" <td>Self-employed</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Rural</td>\n",
" <td>65.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>75.70</td>\n",
" <td>41.8</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>29217</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>Rural</td>\n",
" <td>57.0</td>\n",
" <td>(50.0, 64.0]</td>\n",
" <td>129.54</td>\n",
" <td>60.9</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>72911</td>\n",
" <td>smokes</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>Rural</td>\n",
" <td>49.0</td>\n",
" <td>(36.0, 50.0]</td>\n",
" <td>60.22</td>\n",
" <td>31.5</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>47175</td>\n",
" <td>smokes</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>Urban</td>\n",
" <td>71.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>198.21</td>\n",
" <td>27.3</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Male</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4057</td>\n",
" <td>formerly smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>Urban</td>\n",
" <td>59.0</td>\n",
" <td>(50.0, 64.0]</td>\n",
" <td>109.82</td>\n",
" <td>23.7</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>48588</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>Urban</td>\n",
" <td>25.0</td>\n",
" <td>(20.0, 36.0]</td>\n",
" <td>60.84</td>\n",
" <td>24.5</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>70336</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>Rural</td>\n",
" <td>67.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>94.61</td>\n",
" <td>28.4</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>66767</td>\n",
" <td>smokes</td>\n",
" <td>0.0</td>\n",
" <td>Govt_job</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>Rural</td>\n",
" <td>38.0</td>\n",
" <td>(36.0, 50.0]</td>\n",
" <td>97.49</td>\n",
" <td>26.9</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>45801</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>Rural</td>\n",
" <td>54.0</td>\n",
" <td>(50.0, 64.0]</td>\n",
" <td>206.72</td>\n",
" <td>26.7</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>36275</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>Rural</td>\n",
" <td>70.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>214.45</td>\n",
" <td>31.2</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>11577</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Self-employed</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18571</th>\n",
" <td>Rural</td>\n",
" <td>76.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>84.49</td>\n",
" <td>23.7</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Male</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>30015</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>Self-employed</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18572</th>\n",
" <td>Urban</td>\n",
" <td>29.0</td>\n",
" <td>(20.0, 36.0]</td>\n",
" <td>85.78</td>\n",
" <td>22.1</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>24761</td>\n",
" <td>smokes</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18573</th>\n",
" <td>Rural</td>\n",
" <td>16.0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>80.14</td>\n",
" <td>22.1</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>27094</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18574</th>\n",
" <td>Rural</td>\n",
" <td>2.0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>69.56</td>\n",
" <td>18.9</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>43495</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>children</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18575</th>\n",
" <td>Urban</td>\n",
" <td>51.0</td>\n",
" <td>(50.0, 64.0]</td>\n",
" <td>97.49</td>\n",
" <td>26.2</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>56348</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18576</th>\n",
" <td>Urban</td>\n",
" <td>24.0</td>\n",
" <td>(20.0, 36.0]</td>\n",
" <td>72.11</td>\n",
" <td>23.5</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>11876</td>\n",
" <td>formerly smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18577</th>\n",
" <td>Rural</td>\n",
" <td>45.0</td>\n",
" <td>(36.0, 50.0]</td>\n",
" <td>75.37</td>\n",
" <td>NaN</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Male</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>585</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18578</th>\n",
" <td>Urban</td>\n",
" <td>23.0</td>\n",
" <td>(20.0, 36.0]</td>\n",
" <td>84.13</td>\n",
" <td>29.6</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>43879</td>\n",
" <td>smokes</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18579</th>\n",
" <td>Rural</td>\n",
" <td>24.0</td>\n",
" <td>(20.0, 36.0]</td>\n",
" <td>76.93</td>\n",
" <td>23.7</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>38239</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18580</th>\n",
" <td>Urban</td>\n",
" <td>2.0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>93.32</td>\n",
" <td>21.8</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>28943</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>children</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18581</th>\n",
" <td>Rural</td>\n",
" <td>22.0</td>\n",
" <td>(20.0, 36.0]</td>\n",
" <td>81.87</td>\n",
" <td>27.9</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>17895</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18582</th>\n",
" <td>Rural</td>\n",
" <td>69.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>105.31</td>\n",
" <td>26.7</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Male</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>35147</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18583</th>\n",
" <td>Rural</td>\n",
" <td>39.0</td>\n",
" <td>(36.0, 50.0]</td>\n",
" <td>76.09</td>\n",
" <td>32.7</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>72758</td>\n",
" <td>smokes</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18584</th>\n",
" <td>Urban</td>\n",
" <td>9.0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>98.39</td>\n",
" <td>24.6</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>28661</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>children</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18585</th>\n",
" <td>Rural</td>\n",
" <td>21.0</td>\n",
" <td>(20.0, 36.0]</td>\n",
" <td>114.73</td>\n",
" <td>24.8</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>46853</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18586</th>\n",
" <td>Rural</td>\n",
" <td>25.0</td>\n",
" <td>(20.0, 36.0]</td>\n",
" <td>178.14</td>\n",
" <td>27.8</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>69190</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18587</th>\n",
" <td>Urban</td>\n",
" <td>79.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>110.27</td>\n",
" <td>36.2</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>26705</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18588</th>\n",
" <td>Urban</td>\n",
" <td>82.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>180.24</td>\n",
" <td>40.4</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>36976</td>\n",
" <td>formerly smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18589</th>\n",
" <td>Urban</td>\n",
" <td>24.0</td>\n",
" <td>(20.0, 36.0]</td>\n",
" <td>97.05</td>\n",
" <td>27.6</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>58578</td>\n",
" <td>smokes</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18590</th>\n",
" <td>Urban</td>\n",
" <td>52.0</td>\n",
" <td>(50.0, 64.0]</td>\n",
" <td>85.57</td>\n",
" <td>21.4</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>61508</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18591</th>\n",
" <td>Urban</td>\n",
" <td>13.0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>62.33</td>\n",
" <td>27.0</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>70296</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18592</th>\n",
" <td>Rural</td>\n",
" <td>11.0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>83.25</td>\n",
" <td>15.9</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>35299</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>children</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18593</th>\n",
" <td>Rural</td>\n",
" <td>80.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>102.85</td>\n",
" <td>25.3</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>Yes</td>\n",
" <td>Male</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>15023</td>\n",
" <td>formerly smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18594</th>\n",
" <td>Urban</td>\n",
" <td>40.0</td>\n",
" <td>(36.0, 50.0]</td>\n",
" <td>62.75</td>\n",
" <td>29.8</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Male</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>56291</td>\n",
" <td>formerly smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18595</th>\n",
" <td>Rural</td>\n",
" <td>5.0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>126.32</td>\n",
" <td>17.0</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>53431</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>children</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18596</th>\n",
" <td>Rural</td>\n",
" <td>20.0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>74.43</td>\n",
" <td>18.4</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>67353</td>\n",
" <td>formerly smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18597</th>\n",
" <td>Rural</td>\n",
" <td>61.0</td>\n",
" <td>(50.0, 64.0]</td>\n",
" <td>211.55</td>\n",
" <td>31.6</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Male</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>362</td>\n",
" <td>smokes</td>\n",
" <td>NaN</td>\n",
" <td>Govt_job</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18598</th>\n",
" <td>Rural</td>\n",
" <td>79.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>125.74</td>\n",
" <td>29.4</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>29839</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18599</th>\n",
" <td>Rural</td>\n",
" <td>55.0</td>\n",
" <td>(50.0, 64.0]</td>\n",
" <td>69.46</td>\n",
" <td>33.8</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>Yes</td>\n",
" <td>Male</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>6438</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>Govt_job</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18600</th>\n",
" <td>Rural</td>\n",
" <td>38.0</td>\n",
" <td>(36.0, 50.0]</td>\n",
" <td>91.23</td>\n",
" <td>24.4</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>16770</td>\n",
" <td>never smoked</td>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>62001 rows × 15 columns</p>\n",
"</div>"
],
"text/plain": [
" Residence_type age age_interval avg_glucose_level bmi \\\n",
"0 Rural 3.0 (0.079, 20.0] 95.12 18.0 \n",
"1 Urban 58.0 (50.0, 64.0] 87.96 39.2 \n",
"2 Urban 8.0 (0.079, 20.0] 110.89 17.6 \n",
"3 Rural 70.0 (64.0, 82.0] 69.04 35.9 \n",
"4 Rural 14.0 (0.079, 20.0] 161.28 19.1 \n",
"5 Urban 47.0 (36.0, 50.0] 210.95 50.1 \n",
"6 Urban 52.0 (50.0, 64.0] 77.59 17.7 \n",
"7 Rural 75.0 (64.0, 82.0] 243.53 27.0 \n",
"8 Rural 32.0 (20.0, 36.0] 77.67 32.3 \n",
"9 Urban 74.0 (64.0, 82.0] 205.84 54.6 \n",
"10 Urban 79.0 (64.0, 82.0] 77.08 35.0 \n",
"11 Urban 79.0 (64.0, 82.0] 57.08 22.0 \n",
"12 Rural 37.0 (36.0, 50.0] 162.96 39.4 \n",
"13 Rural 37.0 (36.0, 50.0] 73.50 26.1 \n",
"14 Rural 40.0 (36.0, 50.0] 95.04 42.4 \n",
"15 Rural 35.0 (20.0, 36.0] 85.37 33.0 \n",
"16 Urban 20.0 (0.079, 20.0] 84.62 19.7 \n",
"17 Rural 42.0 (36.0, 50.0] 82.67 22.5 \n",
"18 Urban 44.0 (36.0, 50.0] 57.33 24.6 \n",
"19 Urban 79.0 (64.0, 82.0] 67.84 25.2 \n",
"20 Rural 65.0 (64.0, 82.0] 75.70 41.8 \n",
"21 Rural 57.0 (50.0, 64.0] 129.54 60.9 \n",
"22 Rural 49.0 (36.0, 50.0] 60.22 31.5 \n",
"23 Urban 71.0 (64.0, 82.0] 198.21 27.3 \n",
"24 Urban 59.0 (50.0, 64.0] 109.82 23.7 \n",
"25 Urban 25.0 (20.0, 36.0] 60.84 24.5 \n",
"26 Rural 67.0 (64.0, 82.0] 94.61 28.4 \n",
"27 Rural 38.0 (36.0, 50.0] 97.49 26.9 \n",
"28 Rural 54.0 (50.0, 64.0] 206.72 26.7 \n",
"29 Rural 70.0 (64.0, 82.0] 214.45 31.2 \n",
"... ... ... ... ... ... \n",
"18571 Rural 76.0 (64.0, 82.0] 84.49 23.7 \n",
"18572 Urban 29.0 (20.0, 36.0] 85.78 22.1 \n",
"18573 Rural 16.0 (0.079, 20.0] 80.14 22.1 \n",
"18574 Rural 2.0 (0.079, 20.0] 69.56 18.9 \n",
"18575 Urban 51.0 (50.0, 64.0] 97.49 26.2 \n",
"18576 Urban 24.0 (20.0, 36.0] 72.11 23.5 \n",
"18577 Rural 45.0 (36.0, 50.0] 75.37 NaN \n",
"18578 Urban 23.0 (20.0, 36.0] 84.13 29.6 \n",
"18579 Rural 24.0 (20.0, 36.0] 76.93 23.7 \n",
"18580 Urban 2.0 (0.079, 20.0] 93.32 21.8 \n",
"18581 Rural 22.0 (20.0, 36.0] 81.87 27.9 \n",
"18582 Rural 69.0 (64.0, 82.0] 105.31 26.7 \n",
"18583 Rural 39.0 (36.0, 50.0] 76.09 32.7 \n",
"18584 Urban 9.0 (0.079, 20.0] 98.39 24.6 \n",
"18585 Rural 21.0 (20.0, 36.0] 114.73 24.8 \n",
"18586 Rural 25.0 (20.0, 36.0] 178.14 27.8 \n",
"18587 Urban 79.0 (64.0, 82.0] 110.27 36.2 \n",
"18588 Urban 82.0 (64.0, 82.0] 180.24 40.4 \n",
"18589 Urban 24.0 (20.0, 36.0] 97.05 27.6 \n",
"18590 Urban 52.0 (50.0, 64.0] 85.57 21.4 \n",
"18591 Urban 13.0 (0.079, 20.0] 62.33 27.0 \n",
"18592 Rural 11.0 (0.079, 20.0] 83.25 15.9 \n",
"18593 Rural 80.0 (64.0, 82.0] 102.85 25.3 \n",
"18594 Urban 40.0 (36.0, 50.0] 62.75 29.8 \n",
"18595 Rural 5.0 (0.079, 20.0] 126.32 17.0 \n",
"18596 Rural 20.0 (0.079, 20.0] 74.43 18.4 \n",
"18597 Rural 61.0 (50.0, 64.0] 211.55 31.6 \n",
"18598 Rural 79.0 (64.0, 82.0] 125.74 29.4 \n",
"18599 Rural 55.0 (50.0, 64.0] 69.46 33.8 \n",
"18600 Rural 38.0 (36.0, 50.0] 91.23 24.4 \n",
"\n",
" bmi_interval ever_married gender glucose_level heart_disease \\\n",
"0 (10.099, 27.7] No Male (82.16, 103.3] 0 \n",
"1 (27.7, 97.6] Yes Male (82.16, 103.3] 0 \n",
"2 (10.099, 27.7] No Female (103.3, 291.05] 0 \n",
"3 (27.7, 97.6] Yes Female (54.999, 82.16] 0 \n",
"4 (10.099, 27.7] No Male (103.3, 291.05] 0 \n",
"5 (27.7, 97.6] Yes Female (103.3, 291.05] 0 \n",
"6 (10.099, 27.7] Yes Female (54.999, 82.16] 0 \n",
"7 (10.099, 27.7] Yes Female (103.3, 291.05] 1 \n",
"8 (27.7, 97.6] Yes Female (54.999, 82.16] 0 \n",
"9 (27.7, 97.6] Yes Female (103.3, 291.05] 0 \n",
"10 (27.7, 97.6] Yes Female (54.999, 82.16] 0 \n",
"11 (10.099, 27.7] Yes Male (54.999, 82.16] 1 \n",
"12 (27.7, 97.6] Yes Female (103.3, 291.05] 0 \n",
"13 (10.099, 27.7] Yes Female (54.999, 82.16] 0 \n",
"14 (27.7, 97.6] Yes Female (82.16, 103.3] 0 \n",
"15 (27.7, 97.6] No Male (82.16, 103.3] 0 \n",
"16 (10.099, 27.7] No Female (82.16, 103.3] 0 \n",
"17 (10.099, 27.7] Yes Female (82.16, 103.3] 0 \n",
"18 (10.099, 27.7] Yes Female (54.999, 82.16] 0 \n",
"19 (10.099, 27.7] Yes Female (54.999, 82.16] 1 \n",
"20 (27.7, 97.6] Yes Female (54.999, 82.16] 0 \n",
"21 (27.7, 97.6] Yes Female (103.3, 291.05] 0 \n",
"22 (27.7, 97.6] Yes Female (54.999, 82.16] 0 \n",
"23 (10.099, 27.7] Yes Male (103.3, 291.05] 0 \n",
"24 (10.099, 27.7] Yes Female (103.3, 291.05] 0 \n",
"25 (10.099, 27.7] Yes Female (54.999, 82.16] 0 \n",
"26 (27.7, 97.6] Yes Female (82.16, 103.3] 0 \n",
"27 (10.099, 27.7] No Female (82.16, 103.3] 0 \n",
"28 (10.099, 27.7] Yes Female (103.3, 291.05] 0 \n",
"29 (27.7, 97.6] Yes Female (103.3, 291.05] 0 \n",
"... ... ... ... ... ... \n",
"18571 (10.099, 27.7] Yes Male (82.16, 103.3] 0 \n",
"18572 (10.099, 27.7] No Female (82.16, 103.3] 0 \n",
"18573 (10.099, 27.7] No Female (54.999, 82.16] 0 \n",
"18574 (10.099, 27.7] No Female (54.999, 82.16] 0 \n",
"18575 (10.099, 27.7] Yes Female (82.16, 103.3] 0 \n",
"18576 (10.099, 27.7] No Female (54.999, 82.16] 0 \n",
"18577 (10.099, 27.7] Yes Male (54.999, 82.16] 0 \n",
"18578 (27.7, 97.6] No Male (82.16, 103.3] 0 \n",
"18579 (10.099, 27.7] No Female (54.999, 82.16] 0 \n",
"18580 (10.099, 27.7] No Male (82.16, 103.3] 0 \n",
"18581 (27.7, 97.6] No Female (54.999, 82.16] 0 \n",
"18582 (10.099, 27.7] Yes Male (103.3, 291.05] 1 \n",
"18583 (27.7, 97.6] No Female (54.999, 82.16] 0 \n",
"18584 (10.099, 27.7] No Female (82.16, 103.3] 0 \n",
"18585 (10.099, 27.7] No Female (103.3, 291.05] 0 \n",
"18586 (27.7, 97.6] No Male (103.3, 291.05] 0 \n",
"18587 (27.7, 97.6] No Male (103.3, 291.05] 1 \n",
"18588 (27.7, 97.6] Yes Female (103.3, 291.05] 0 \n",
"18589 (10.099, 27.7] No Male (82.16, 103.3] 0 \n",
"18590 (10.099, 27.7] Yes Female (82.16, 103.3] 0 \n",
"18591 (10.099, 27.7] No Female (54.999, 82.16] 0 \n",
"18592 (10.099, 27.7] No Male (82.16, 103.3] 0 \n",
"18593 (10.099, 27.7] Yes Male (82.16, 103.3] 0 \n",
"18594 (27.7, 97.6] Yes Male (54.999, 82.16] 0 \n",
"18595 (10.099, 27.7] No Male (103.3, 291.05] 0 \n",
"18596 (10.099, 27.7] No Male (54.999, 82.16] 0 \n",
"18597 (27.7, 97.6] Yes Male (103.3, 291.05] 0 \n",
"18598 (27.7, 97.6] Yes Female (103.3, 291.05] 0 \n",
"18599 (27.7, 97.6] Yes Male (54.999, 82.16] 0 \n",
"18600 (10.099, 27.7] No Female (82.16, 103.3] 0 \n",
"\n",
" hypertension id smoking_status stroke work_type \n",
"0 0 30669 never smoked 0.0 children \n",
"1 1 30468 never smoked 0.0 Private \n",
"2 0 16523 never smoked 0.0 Private \n",
"3 0 56543 formerly smoked 0.0 Private \n",
"4 0 46136 never smoked 0.0 Never_worked \n",
"5 0 32257 never smoked 0.0 Private \n",
"6 0 52800 formerly smoked 0.0 Private \n",
"7 0 41413 never smoked 0.0 Self-employed \n",
"8 0 15266 smokes 0.0 Private \n",
"9 1 28674 never smoked 0.0 Self-employed \n",
"10 0 10460 never smoked 0.0 Govt_job \n",
"11 0 64908 formerly smoked 0.0 Private \n",
"12 0 63884 never smoked 0.0 Private \n",
"13 0 37893 formerly smoked 0.0 Private \n",
"14 0 67855 never smoked 0.0 Private \n",
"15 0 25774 never smoked 0.0 Private \n",
"16 0 19584 smokes 0.0 Private \n",
"17 0 24447 never smoked 0.0 Private \n",
"18 0 49589 smokes 0.0 Govt_job \n",
"19 0 17986 smokes 0.0 Self-employed \n",
"20 1 29217 never smoked 0.0 Private \n",
"21 1 72911 smokes 0.0 Private \n",
"22 0 47175 smokes 0.0 Private \n",
"23 0 4057 formerly smoked 0.0 Private \n",
"24 0 48588 never smoked 0.0 Private \n",
"25 0 70336 never smoked 0.0 Private \n",
"26 0 66767 smokes 0.0 Govt_job \n",
"27 0 45801 never smoked 0.0 Private \n",
"28 0 36275 never smoked 0.0 Private \n",
"29 0 11577 never smoked 0.0 Self-employed \n",
"... ... ... ... ... ... \n",
"18571 0 30015 never smoked NaN Self-employed \n",
"18572 0 24761 smokes NaN Private \n",
"18573 0 27094 never smoked NaN Private \n",
"18574 0 43495 never smoked NaN children \n",
"18575 0 56348 never smoked NaN Private \n",
"18576 0 11876 formerly smoked NaN Private \n",
"18577 0 585 never smoked NaN Private \n",
"18578 0 43879 smokes NaN Private \n",
"18579 0 38239 never smoked NaN Private \n",
"18580 0 28943 never smoked NaN children \n",
"18581 0 17895 never smoked NaN Private \n",
"18582 0 35147 never smoked NaN Private \n",
"18583 0 72758 smokes NaN Private \n",
"18584 0 28661 never smoked NaN children \n",
"18585 0 46853 never smoked NaN Private \n",
"18586 0 69190 never smoked NaN Private \n",
"18587 0 26705 never smoked NaN Private \n",
"18588 1 36976 formerly smoked NaN Private \n",
"18589 0 58578 smokes NaN Private \n",
"18590 0 61508 never smoked NaN Private \n",
"18591 0 70296 never smoked NaN Private \n",
"18592 0 35299 never smoked NaN children \n",
"18593 0 15023 formerly smoked NaN Private \n",
"18594 1 56291 formerly smoked NaN Private \n",
"18595 0 53431 never smoked NaN children \n",
"18596 0 67353 formerly smoked NaN Private \n",
"18597 0 362 smokes NaN Govt_job \n",
"18598 0 29839 never smoked NaN Private \n",
"18599 0 6438 never smoked NaN Govt_job \n",
"18600 0 16770 never smoked NaN Private \n",
"\n",
"[62001 rows x 15 columns]"
]
},
"execution_count": 321,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def order(data, output, on, by):\n",
"    \"\"\"Count rows per `by` category among the rows where `data[output] == on`.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data : DataFrame containing the `output` and `by` columns.\n",
"    output : name of the column that is filtered on.\n",
"    on : value of `output` to keep.\n",
"    by : name of the column whose categories are counted.\n",
"\n",
"    Returns\n",
"    -------\n",
"    DataFrame indexed by (output, by) with a single 'Frequency' column,\n",
"    sorted by 'Frequency' in descending order.\n",
"    \"\"\"\n",
"    selected = data[data[output] == on]\n",
"    # Renaming through a dict passed to SeriesGroupBy.agg is deprecated (it emits a\n",
"    # FutureWarning) and raises in newer pandas, so count first and name the column.\n",
"    counts = selected.groupby([output, by])[by].count().to_frame(name='Frequency')\n",
"    return counts.sort_values(by='Frequency', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def order1(data, output, by):\n",
"    \"\"\"Count rows for every (output, by) combination.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data : DataFrame containing the `output` and `by` columns.\n",
"    output : name of the first grouping column.\n",
"    by : name of the second grouping column; its non-null values are counted.\n",
"\n",
"    Returns\n",
"    -------\n",
"    Flat DataFrame with columns [output, by, 'Frequency'] and a fresh 0..n-1 index,\n",
"    ordered by the group keys (not by frequency).\n",
"    \"\"\"\n",
"    # Renaming through a dict passed to SeriesGroupBy.agg is deprecated (it emits a\n",
"    # FutureWarning) and raises in newer pandas; reset_index(name=...) names the counts.\n",
"    counts = data.groupby([output, by])[by].count()\n",
"    return counts.reset_index(name='Frequency')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def order2(data, output, bye):\n",
"    \"\"\"Mean of column `bye` for each value of `output`, largest mean first.\n",
"\n",
"    Returns a DataFrame with the columns [output, bye]; `output` stays a regular\n",
"    column because the grouping is done with as_index=False.\n",
"    \"\"\"\n",
"    pair = data[[output, bye]]\n",
"    means = pair.groupby([output], as_index=False).mean()\n",
"    return means.sort_values(by=bye, ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 343,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Stack train on top of test (original row labels are kept, so they repeat).\n",
"# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.\n",
"data=pd.concat([train,test])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#data.drop('stroke',inplace=True, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 300,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"62001"
]
},
"execution_count": 300,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(data)"
]
},
{
"cell_type": "code",
"execution_count": 328,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"F:\\Anaconda\\lib\\site-packages\\ipykernel\\__main__.py:2: FutureWarning: using a dict on a Series for aggregation\n",
"is deprecated and will be removed in a future version\n",
" from ipykernel import kernelapp as app\n"
]
}
],
"source": [
"G_s=order(train,'stroke',1,'gender')"
]
},
{
"cell_type": "code",
"execution_count": 329,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" <tr>\n",
" <th>stroke</th>\n",
" <th>gender</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">1</th>\n",
" <th>Female</th>\n",
" <td>431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Male</th>\n",
" <td>352</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Frequency\n",
"stroke gender \n",
"1 Female 431\n",
" Male 352"
]
},
"execution_count": 329,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"G_s"
]
},
{
"cell_type": "code",
"execution_count": 330,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"G_s1=order1(train,'stroke','gender')"
]
},
{
"cell_type": "code",
"execution_count": 331,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stroke</th>\n",
" <th>gender</th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Female</td>\n",
" <td>25234</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>Male</td>\n",
" <td>17372</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>Other</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>Female</td>\n",
" <td>431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" <td>352</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stroke gender Frequency\n",
"0 0 Female 25234\n",
"1 0 Male 17372\n",
"2 0 Other 11\n",
"3 1 Female 431\n",
"4 1 Male 352"
]
},
"execution_count": 331,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"G_s1"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['gender']=='Female','gender']=0.017"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['gender']=='Male','gender']=0.02"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['gender']=='Other','gender']=0"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train['age_interval'],bins=pd.qcut(train['age'],5,retbins=True)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test['age_interval']=pd.cut(test['age'],bins=bins,include_lowest=True)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train['glucose_level'],bins=pd.qcut(train['avg_glucose_level'],3,retbins=True)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test['glucose_level']=pd.cut(test['avg_glucose_level'],bins=bins,include_lowest=True)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train['bmi_interval'],bins=pd.qcut(train['bmi'],2,retbins=True)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test['bmi_interval']=pd.cut(test['bmi'],bins=bins,include_lowest=True)"
]
},
{
"cell_type": "code",
"execution_count": 338,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"F:\\Anaconda\\lib\\site-packages\\ipykernel\\__main__.py:2: FutureWarning: using a dict on a Series for aggregation\n",
"is deprecated and will be removed in a future version\n",
" from ipykernel import kernelapp as app\n"
]
}
],
"source": [
"B_s=order(train,'stroke',1,'bmi_interval')"
]
},
{
"cell_type": "code",
"execution_count": 339,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"B_s1=order1(train,'stroke','bmi_interval')"
]
},
{
"cell_type": "code",
"execution_count": 340,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stroke</th>\n",
" <th>bmi_interval</th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>20708</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>20587</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>(10.099, 27.7]</td>\n",
" <td>270</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>(27.7, 97.6]</td>\n",
" <td>373</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stroke bmi_interval Frequency\n",
"0 0 (10.099, 27.7] 20708\n",
"1 0 (27.7, 97.6] 20587\n",
"2 1 (10.099, 27.7] 270\n",
"3 1 (27.7, 97.6] 373"
]
},
"execution_count": 340,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"B_s1"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def encod(col):\n",
"    \"\"\"Encode the values of `col` with the notebook-level `label` encoder.\n",
"\n",
"    The shared encoder is re-fitted on `col` at every call, so whatever it learned\n",
"    from a previous column is overwritten. Returns the transformed column.\n",
"    \"\"\"\n",
"    return label.fit(col).transform(col)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def wts(dist):\n",
"    \"\"\"Ratio of second-class to first-class counts for each category.\n",
"\n",
"    Assumes `dist` is shaped like the output of `order1`: a default 0..len-1 index,\n",
"    a 'Frequency' column, and all rows of the first target class listed before the\n",
"    rows of the second class in the same category order.\n",
"\n",
"    Returns a list of Frequency[second-class row] / Frequency[first-class row].\n",
"    NOTE: the list runs from the LAST category to the FIRST (reverse row order),\n",
"    so wts(...)[0] belongs to the highest-sorted category.\n",
"    \"\"\"\n",
"    n = len(dist) - 1\n",
"    # Floor division keeps the row label an integer on Python 3; with true division\n",
"    # an odd number of rows produced a label such as 1.5 and .loc raised KeyError.\n",
"    m = (n + 1) // 2 - 1\n",
"    wt = []\n",
"    # Walk both class blocks backwards in lock-step: n over the second block,\n",
"    # m over the first block.\n",
"    while m >= 0:\n",
"        wt.append(dist.loc[n, 'Frequency'] / dist.loc[m, 'Frequency'])\n",
"        m -= 1\n",
"        n -= 1\n",
"    return wt"
]
},
{
"cell_type": "code",
"execution_count": 304,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[0.018118229950939913, 0.013038439250531195]"
]
},
"execution_count": 304,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wts(B_s1)"
]
},
{
"cell_type": "code",
"execution_count": 352,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data['bmi_interval']=encod(data['bmi_interval'])"
]
},
{
"cell_type": "code",
"execution_count": 354,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# wts(B_s1) lists the rates from the last interval to the first, so the upper bmi\n",
"# interval (encoded 1) has the ~0.018 stroke rate and the lower one (encoded 0) ~0.013.\n",
"# The two weights were previously assigned the other way round.\n",
"data.loc[data['bmi_interval']==1,'bmi_interval']=0.018"
]
},
{
"cell_type": "code",
"execution_count": 355,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Lower bmi interval (encoded 0): stroke rate 270/20708, about 0.013.\n",
"data.loc[data['bmi_interval']==0,'bmi_interval']=0.013"
]
},
{
"cell_type": "code",
"execution_count": 182,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"F:\\Anaconda\\lib\\site-packages\\ipykernel\\__main__.py:2: FutureWarning: using a dict on a Series for aggregation\n",
"is deprecated and will be removed in a future version\n",
" from ipykernel import kernelapp as app\n"
]
}
],
"source": [
"Gl_s=order(train,'stroke',1,'glucose_level')"
]
},
{
"cell_type": "code",
"execution_count": 183,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" <tr>\n",
" <th>stroke</th>\n",
" <th>glucose_level</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">1</th>\n",
" <th>(103.3, 291.05]</th>\n",
" <td>396</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(54.999, 82.16]</th>\n",
" <td>211</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(82.16, 103.3]</th>\n",
" <td>176</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Frequency\n",
"stroke glucose_level \n",
"1 (103.3, 291.05] 396\n",
" (54.999, 82.16] 211\n",
" (82.16, 103.3] 176"
]
},
"execution_count": 183,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Gl_s"
]
},
{
"cell_type": "code",
"execution_count": 357,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"Gl_s1=order1(train,'stroke','glucose_level')"
]
},
{
"cell_type": "code",
"execution_count": 358,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stroke</th>\n",
" <th>glucose_level</th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>14260</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>14287</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>14070</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>(54.999, 82.16]</td>\n",
" <td>211</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>(82.16, 103.3]</td>\n",
" <td>176</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>(103.3, 291.05]</td>\n",
" <td>396</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stroke glucose_level Frequency\n",
"0 0 (54.999, 82.16] 14260\n",
"1 0 (82.16, 103.3] 14287\n",
"2 0 (103.3, 291.05] 14070\n",
"3 1 (54.999, 82.16] 211\n",
"4 1 (82.16, 103.3] 176\n",
"5 1 (103.3, 291.05] 396"
]
},
"execution_count": 358,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Gl_s1"
]
},
{
"cell_type": "code",
"execution_count": 359,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[0.02814498933901919, 0.01231889129978302, 0.014796633941093968]"
]
},
"execution_count": 359,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wts(Gl_s1)"
]
},
{
"cell_type": "code",
"execution_count": 360,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data['glucose_level']=encod(data['glucose_level'])"
]
},
{
"cell_type": "code",
"execution_count": 361,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Residence_type</th>\n",
" <th>age</th>\n",
" <th>age_interval</th>\n",
" <th>avg_glucose_level</th>\n",
" <th>bmi</th>\n",
" <th>bmi_interval</th>\n",
" <th>ever_married</th>\n",
" <th>gender</th>\n",
" <th>glucose_level</th>\n",
" <th>heart_disease</th>\n",
" <th>hypertension</th>\n",
" <th>id</th>\n",
" <th>smoking_status</th>\n",
" <th>stroke</th>\n",
" <th>work_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Rural</td>\n",
" <td>3.0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>95.12</td>\n",
" <td>18.0</td>\n",
" <td>0.018</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>30669</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>children</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Urban</td>\n",
" <td>58.0</td>\n",
" <td>(50.0, 64.0]</td>\n",
" <td>87.96</td>\n",
" <td>39.2</td>\n",
" <td>0.013</td>\n",
" <td>Yes</td>\n",
" <td>Male</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>30468</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Urban</td>\n",
" <td>8.0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>110.89</td>\n",
" <td>17.6</td>\n",
" <td>0.018</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>16523</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Rural</td>\n",
" <td>70.0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>69.04</td>\n",
" <td>35.9</td>\n",
" <td>0.013</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>56543</td>\n",
" <td>formerly smoked</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Rural</td>\n",
" <td>14.0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>161.28</td>\n",
" <td>19.1</td>\n",
" <td>0.018</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>46136</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>Never_worked</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Residence_type age age_interval avg_glucose_level bmi bmi_interval \\\n",
"0 Rural 3.0 (0.079, 20.0] 95.12 18.0 0.018 \n",
"1 Urban 58.0 (50.0, 64.0] 87.96 39.2 0.013 \n",
"2 Urban 8.0 (0.079, 20.0] 110.89 17.6 0.018 \n",
"3 Rural 70.0 (64.0, 82.0] 69.04 35.9 0.013 \n",
"4 Rural 14.0 (0.079, 20.0] 161.28 19.1 0.018 \n",
"\n",
" ever_married gender glucose_level heart_disease hypertension id \\\n",
"0 No Male 1 0 0 30669 \n",
"1 Yes Male 1 0 1 30468 \n",
"2 No Female 2 0 0 16523 \n",
"3 Yes Female 0 0 0 56543 \n",
"4 No Male 2 0 0 46136 \n",
"\n",
" smoking_status stroke work_type \n",
"0 never smoked 0.0 children \n",
"1 never smoked 0.0 Private \n",
"2 never smoked 0.0 Private \n",
"3 formerly smoked 0.0 Private \n",
"4 never smoked 0.0 Never_worked "
]
},
"execution_count": 361,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 362,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['glucose_level']==2,'glucose_level']=0.028"
]
},
{
"cell_type": "code",
"execution_count": 363,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['glucose_level']==1,'glucose_level']=0.0123"
]
},
{
"cell_type": "code",
"execution_count": 364,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['glucose_level']==0,'glucose_level']=0.0147"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"F:\\Anaconda\\lib\\site-packages\\ipykernel\\__main__.py:2: FutureWarning: using a dict on a Series for aggregation\n",
"is deprecated and will be removed in a future version\n",
" from ipykernel import kernelapp as app\n"
]
}
],
"source": [
"A_s=order(train,'stroke',1,'age_interval')"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" <tr>\n",
" <th>stroke</th>\n",
" <th>age_interval</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">1</th>\n",
" <th>(64.0, 82.0]</th>\n",
" <td>528</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(50.0, 64.0]</th>\n",
" <td>180</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(36.0, 50.0]</th>\n",
" <td>63</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(20.0, 36.0]</th>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.079, 20.0]</th>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Frequency\n",
"stroke age_interval \n",
"1 (64.0, 82.0] 528\n",
" (50.0, 64.0] 180\n",
" (36.0, 50.0] 63\n",
" (20.0, 36.0] 10\n",
" (0.079, 20.0] 2"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"A_s"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"A_s1=order1(train,'stroke','age_interval')"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stroke</th>\n",
" <th>age_interval</th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>9013</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>(20.0, 36.0]</td>\n",
" <td>8416</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>(36.0, 50.0]</td>\n",
" <td>8694</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>(50.0, 64.0]</td>\n",
" <td>8863</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>7631</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>(0.079, 20.0]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1</td>\n",
" <td>(20.0, 36.0]</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>(36.0, 50.0]</td>\n",
" <td>63</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1</td>\n",
" <td>(50.0, 64.0]</td>\n",
" <td>180</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>(64.0, 82.0]</td>\n",
" <td>528</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stroke age_interval Frequency\n",
"0 0 (0.079, 20.0] 9013\n",
"1 0 (20.0, 36.0] 8416\n",
"2 0 (36.0, 50.0] 8694\n",
"3 0 (50.0, 64.0] 8863\n",
"4 0 (64.0, 82.0] 7631\n",
"5 1 (0.079, 20.0] 2\n",
"6 1 (20.0, 36.0] 10\n",
"7 1 (36.0, 50.0] 63\n",
"8 1 (50.0, 64.0] 180\n",
"9 1 (64.0, 82.0] 528"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"A_s1"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[0.069191455903551302,\n",
" 0.020309150400541577,\n",
" 0.007246376811594203,\n",
" 0.001188212927756654,\n",
" 0.00022190169754798624]"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wts(A_s1)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Fresh train+test stack, used below to re-encode age_interval.\n",
"# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.\n",
"Data=pd.concat([train,test])"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data['age_interval']=encod(Data['age_interval'])"
]
},
{
"cell_type": "code",
"execution_count": 366,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data['age_interval']=encod(data['age_interval'])"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['age_interval']==0.069,'age_interval']=4 #0.069"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['age_interval']==0.020,'age_interval']=3 #0.020"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['age_interval']==0.007,'age_interval']=2 #0.007"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Series([], Name: age_interval, dtype: float64)"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.loc[data['age_interval']==0.007,'age_interval']"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['age_interval']==0.0011,'age_interval']=1 #0.0011"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['age_interval']==0.0002,'age_interval']=0 #0.0002"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"F:\\Anaconda\\lib\\site-packages\\ipykernel\\__main__.py:2: FutureWarning: using a dict on a Series for aggregation\n",
"is deprecated and will be removed in a future version\n",
" from ipykernel import kernelapp as app\n"
]
}
],
"source": [
"H_s=order(train,'stroke',1,'hypertension')"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" <tr>\n",
" <th>stroke</th>\n",
" <th>hypertension</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">1</th>\n",
" <th>0</th>\n",
" <td>583</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>200</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Frequency\n",
"stroke hypertension \n",
"1 0 583\n",
" 1 200"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"H_s"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"H_s1=order1(train,'stroke','hypertension')"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stroke</th>\n",
" <th>hypertension</th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>38756</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>3861</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>583</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>200</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stroke hypertension Frequency\n",
"0 0 0 38756\n",
"1 0 1 3861\n",
"2 1 0 583\n",
"3 1 1 200"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"H_s1"
]
},
{
"cell_type": "code",
"execution_count": 368,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[0.051800051800051802, 0.015042832077613789]"
]
},
"execution_count": 368,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wts(H_s1)"
]
},
{
"cell_type": "code",
"execution_count": 369,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['hypertension']==1,'hypertension']=0.051"
]
},
{
"cell_type": "code",
"execution_count": 370,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['hypertension']==0,'hypertension']=0.015"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"F:\\Anaconda\\lib\\site-packages\\ipykernel\\__main__.py:2: FutureWarning: using a dict on a Series for aggregation\n",
"is deprecated and will be removed in a future version\n",
" from ipykernel import kernelapp as app\n"
]
}
],
"source": [
"Hd_s=order(train,'stroke',1,'heart_disease')"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" <tr>\n",
" <th>stroke</th>\n",
" <th>heart_disease</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">1</th>\n",
" <th>0</th>\n",
" <td>606</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>177</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Frequency\n",
"stroke heart_disease \n",
"1 0 606\n",
" 1 177"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Hd_s"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"Hd_s1=order1(train,'stroke','heart_disease')"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"Hd_s1=Hd_s1.astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stroke</th>\n",
" <th>heart_disease</th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40732</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1885</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>606</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>177</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stroke heart_disease Frequency\n",
"0 0 0 40732\n",
"1 0 1 1885\n",
"2 1 0 606\n",
"3 1 1 177"
]
},
"execution_count": 138,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Hd_s1"
]
},
{
"cell_type": "code",
"execution_count": 371,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[0.093899204244031836, 0.014877737405479721]"
]
},
"execution_count": 371,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wts(Hd_s1)"
]
},
{
"cell_type": "code",
"execution_count": 372,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['heart_disease']==1,'heart_disease']=0.094"
]
},
{
"cell_type": "code",
"execution_count": 373,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['heart_disease']==0,'heart_disease']=0.014"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"F:\\Anaconda\\lib\\site-packages\\ipykernel\\__main__.py:2: FutureWarning: using a dict on a Series for aggregation\n",
"is deprecated and will be removed in a future version\n",
" from ipykernel import kernelapp as app\n"
]
}
],
"source": [
"W_s=order(train,'stroke',1,'work_type')"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" <tr>\n",
" <th>stroke</th>\n",
" <th>work_type</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"4\" valign=\"top\">1</th>\n",
" <th>Private</th>\n",
" <td>441</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Self-employed</th>\n",
" <td>251</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Govt_job</th>\n",
" <td>89</td>\n",
" </tr>\n",
" <tr>\n",
" <th>children</th>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Frequency\n",
"stroke work_type \n",
"1 Private 441\n",
" Self-employed 251\n",
" Govt_job 89\n",
" children 2"
]
},
"execution_count": 145,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"W_s"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"W_s1=order1(train,'stroke','work_type')"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stroke</th>\n",
" <th>work_type</th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Govt_job</td>\n",
" <td>5351</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>Never_worked</td>\n",
" <td>177</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>Private</td>\n",
" <td>24393</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>Self-employed</td>\n",
" <td>6542</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>children</td>\n",
" <td>6154</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>Govt_job</td>\n",
" <td>89</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1</td>\n",
" <td>Private</td>\n",
" <td>441</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>Self-employed</td>\n",
" <td>251</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1</td>\n",
" <td>children</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stroke work_type Frequency\n",
"0 0 Govt_job 5351\n",
"1 0 Never_worked 177\n",
"2 0 Private 24393\n",
"3 0 Self-employed 6542\n",
"4 0 children 6154\n",
"5 1 Govt_job 89\n",
"6 1 Private 441\n",
"7 1 Self-employed 251\n",
"8 1 children 2"
]
},
"execution_count": 147,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"W_s1"
]
},
{
"cell_type": "code",
"execution_count": 376,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data.loc[data['work_type']=='Govt_job','work_type']=0.016"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['work_type']=='Self-employed','work_type']=0.038"
]
},
{
"cell_type": "code",
"execution_count": 404,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data.loc[data['work_type']=='Private','work_type']=0.018"
]
},
{
"cell_type": "code",
"execution_count": 405,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['work_type']=='children','work_type']=0"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['work_type']=='Never_worked','work_type']=0"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"R_s1=order1(train,'stroke','Residence_type')"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stroke</th>\n",
" <th>Residence_type</th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Rural</td>\n",
" <td>21260</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>Urban</td>\n",
" <td>21357</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>Rural</td>\n",
" <td>384</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>Urban</td>\n",
" <td>399</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stroke Residence_type Frequency\n",
"0 0 Rural 21260\n",
"1 0 Urban 21357\n",
"2 1 Rural 384\n",
"3 1 Urban 399"
]
},
"execution_count": 150,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"R_s1"
]
},
{
"cell_type": "code",
"execution_count": 381,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['Residence_type']=='Rural','Residence_type']=0.018"
]
},
{
"cell_type": "code",
"execution_count": 382,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Urban stroke odds from the R_s1 table: 399/21357 ~= 0.0187.\n",
"# The previous hand-typed value 0.018 was identical to the Rural encoding,\n",
"# which turned Residence_type into a constant (uninformative) column.\n",
"data.loc[data['Residence_type']=='Urban','Residence_type']=0.0187"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"S_s=order1(train,'stroke','smoking_status')"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stroke</th>\n",
" <th>smoking_status</th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>formerly smoked</td>\n",
" <td>7272</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>never smoked</td>\n",
" <td>15769</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>smokes</td>\n",
" <td>6429</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>formerly smoked</td>\n",
" <td>221</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>never smoked</td>\n",
" <td>284</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>smokes</td>\n",
" <td>133</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stroke smoking_status Frequency\n",
"0 0 formerly smoked 7272\n",
"1 0 never smoked 15769\n",
"2 0 smokes 6429\n",
"3 1 formerly smoked 221\n",
"4 1 never smoked 284\n",
"5 1 smokes 133"
]
},
"execution_count": 154,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"S_s"
]
},
{
"cell_type": "code",
"execution_count": 383,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['smoking_status']=='formerly smoked','smoking_status']=0.03"
]
},
{
"cell_type": "code",
"execution_count": 384,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['smoking_status']=='never smoked','smoking_status']=0.018"
]
},
{
"cell_type": "code",
"execution_count": 385,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['smoking_status']=='smokes','smoking_status']=0.02"
]
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"M_s=order1(train,'stroke','ever_married')"
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stroke</th>\n",
" <th>ever_married</th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" <td>15382</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>Yes</td>\n",
" <td>27235</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>No</td>\n",
" <td>80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>Yes</td>\n",
" <td>703</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stroke ever_married Frequency\n",
"0 0 No 15382\n",
"1 0 Yes 27235\n",
"2 1 No 80\n",
"3 1 Yes 703"
]
},
"execution_count": 156,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"M_s"
]
},
{
"cell_type": "code",
"execution_count": 386,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['ever_married']=='Yes','ever_married']=0.0258"
]
},
{
"cell_type": "code",
"execution_count": 387,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.loc[data['ever_married']=='No','ever_married']=0.0052"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data=pd.read_csv('data.csv')"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data=pd.get_dummies(data,columns=['age_interval','gender'])"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Residence_type', 'bmi_interval', 'ever_married', 'glucose_level',\n",
" 'heart_disease', 'hypertension', 'smoking_status', 'stroke',\n",
" 'work_type', 'age_interval_0', 'age_interval_1', 'age_interval_2',\n",
" 'age_interval_3', 'age_interval_4', 'gender_0.0', 'gender_0.017',\n",
" 'gender_0.02'],\n",
" dtype='object')"
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.columns"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Remove the helper 'weights' column in place. errors='ignore' keeps the cell\n",
"# re-runnable: a second run, or a frame that never had the column, no longer\n",
"# raises KeyError.\n",
"data.drop('weights',inplace=True,axis=1,errors='ignore')"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data.to_csv('Datafinal.csv')"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data['wts']=data['bmi_interval']+data['hypertension']+data['heart_disease']+data['glucose_level']+data['smoking_status']"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data['weights']=data['wts']+data['wts1']+data['age_interval']"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data.to_csv('data.csv')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Residence_type</th>\n",
" <th>bmi_interval</th>\n",
" <th>ever_married</th>\n",
" <th>glucose_level</th>\n",
" <th>heart_disease</th>\n",
" <th>hypertension</th>\n",
" <th>smoking_status</th>\n",
" <th>stroke</th>\n",
" <th>work_type</th>\n",
" <th>age_interval_0</th>\n",
" <th>age_interval_1</th>\n",
" <th>age_interval_2</th>\n",
" <th>age_interval_3</th>\n",
" <th>age_interval_4</th>\n",
" <th>gender_0.0</th>\n",
" <th>gender_0.017</th>\n",
" <th>gender_0.02</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.018</td>\n",
" <td>0.018</td>\n",
" <td>0.0052</td>\n",
" <td>0.0123</td>\n",
" <td>0.014</td>\n",
" <td>0.015</td>\n",
" <td>0.018</td>\n",
" <td>0.0</td>\n",
" <td>0.000</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.018</td>\n",
" <td>0.013</td>\n",
" <td>0.0258</td>\n",
" <td>0.0123</td>\n",
" <td>0.014</td>\n",
" <td>0.051</td>\n",
" <td>0.018</td>\n",
" <td>0.0</td>\n",
" <td>0.018</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.018</td>\n",
" <td>0.018</td>\n",
" <td>0.0052</td>\n",
" <td>0.0280</td>\n",
" <td>0.014</td>\n",
" <td>0.015</td>\n",
" <td>0.018</td>\n",
" <td>0.0</td>\n",
" <td>0.018</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.018</td>\n",
" <td>0.013</td>\n",
" <td>0.0258</td>\n",
" <td>0.0147</td>\n",
" <td>0.014</td>\n",
" <td>0.015</td>\n",
" <td>0.030</td>\n",
" <td>0.0</td>\n",
" <td>0.018</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.018</td>\n",
" <td>0.018</td>\n",
" <td>0.0052</td>\n",
" <td>0.0280</td>\n",
" <td>0.014</td>\n",
" <td>0.015</td>\n",
" <td>0.018</td>\n",
" <td>0.0</td>\n",
" <td>0.000</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Residence_type bmi_interval ever_married glucose_level heart_disease \\\n",
"0 0.018 0.018 0.0052 0.0123 0.014 \n",
"1 0.018 0.013 0.0258 0.0123 0.014 \n",
"2 0.018 0.018 0.0052 0.0280 0.014 \n",
"3 0.018 0.013 0.0258 0.0147 0.014 \n",
"4 0.018 0.018 0.0052 0.0280 0.014 \n",
"\n",
" hypertension smoking_status stroke work_type age_interval_0 \\\n",
"0 0.015 0.018 0.0 0.000 1 \n",
"1 0.051 0.018 0.0 0.018 0 \n",
"2 0.015 0.018 0.0 0.018 1 \n",
"3 0.015 0.030 0.0 0.018 0 \n",
"4 0.015 0.018 0.0 0.000 1 \n",
"\n",
" age_interval_1 age_interval_2 age_interval_3 age_interval_4 gender_0.0 \\\n",
"0 0 0 0 0 0 \n",
"1 0 0 1 0 0 \n",
"2 0 0 0 0 0 \n",
"3 0 0 0 1 0 \n",
"4 0 0 0 0 0 \n",
"\n",
" gender_0.017 gender_0.02 \n",
"0 0 1 \n",
"1 0 1 \n",
"2 1 0 \n",
"3 1 0 \n",
"4 0 1 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data['Weights']=data['Residence_type'].astype(float)+data['ever_married'].astype(float)+data['work_type'].astype(float)+data['bmi_interval']+data['hypertension']+data['heart_disease']+data['glucose_level']+data['smoking_status']"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Rows at positions 0..43399 are taken as the training part (43400 equals the\n",
"# row total of the train frequency tables shown earlier in the notebook).\n",
"# .copy() makes Train independent of data, so adding columns to it later is\n",
"# not a write to a slice and does not raise SettingWithCopyWarning.\n",
"Train=data[0:43400].copy()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Remaining rows (position 43400 onwards) form the test part.\n",
"# .copy() makes Test independent of data, so adding columns to it later is\n",
"# not a write to a slice and does not raise SettingWithCopyWarning.\n",
"Test=data[43400:].copy()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"F:\\Anaconda\\lib\\site-packages\\ipykernel\\__main__.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
" if __name__ == '__main__':\n"
]
}
],
"source": [
"Train['Weights_interval'],bins=pd.qcut(Train['Weights'],4,retbins=True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"F:\\Anaconda\\lib\\site-packages\\ipykernel\\__main__.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
" if __name__ == '__main__':\n"
]
}
],
"source": [
"Test['Weights_interval']=pd.cut(Test['Weights'],bins=bins,include_lowest=True)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"F:\\Anaconda\\lib\\site-packages\\ipykernel\\__main__.py:2: FutureWarning: using a dict on a Series for aggregation\n",
"is deprecated and will be removed in a future version\n",
" from ipykernel import kernelapp as app\n"
]
}
],
"source": [
"Wt_s=order(Train,'stroke',1,'Weights_interval')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" <tr>\n",
" <th>stroke</th>\n",
" <th>Weights_interval</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"4\" valign=\"top\">1.0</th>\n",
" <th>(0.157, 0.303]</th>\n",
" <td>468</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.14, 0.157]</th>\n",
" <td>203</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.121, 0.14]</th>\n",
" <td>91</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0945, 0.121]</th>\n",
" <td>21</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Frequency\n",
"stroke Weights_interval \n",
"1.0 (0.157, 0.303] 468\n",
" (0.14, 0.157] 203\n",
" (0.121, 0.14] 91\n",
" (0.0945, 0.121] 21"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Wt_s"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"Wt_s1=order1(Train,'stroke','Weights_interval')"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stroke</th>\n",
" <th>Weights_interval</th>\n",
" <th>Frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>(0.0945, 0.121]</td>\n",
" <td>11026</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>(0.121, 0.14]</td>\n",
" <td>10732</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.0</td>\n",
" <td>(0.14, 0.157]</td>\n",
" <td>10744</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>(0.157, 0.303]</td>\n",
" <td>10115</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1.0</td>\n",
" <td>(0.0945, 0.121]</td>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1.0</td>\n",
" <td>(0.121, 0.14]</td>\n",
" <td>91</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1.0</td>\n",
" <td>(0.14, 0.157]</td>\n",
" <td>203</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1.0</td>\n",
" <td>(0.157, 0.303]</td>\n",
" <td>468</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stroke Weights_interval Frequency\n",
"0 0.0 (0.0945, 0.121] 11026\n",
"1 0.0 (0.121, 0.14] 10732\n",
"2 0.0 (0.14, 0.157] 10744\n",
"3 0.0 (0.157, 0.303] 10115\n",
"4 1.0 (0.0945, 0.121] 21\n",
"5 1.0 (0.121, 0.14] 91\n",
"6 1.0 (0.14, 0.157] 203\n",
"7 1.0 (0.157, 0.303] 468"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Wt_s1"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[0.046267918932278794,\n",
" 0.018894266567386447,\n",
" 0.008479314200521804,\n",
" 0.0019045891529113005]"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wts(Wt_s1)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Re-join the train and test bins (train first) and write them back to data;\n",
"# the assignment aligns on row labels. pd.concat replaces Series.append,\n",
"# which is deprecated and was removed in pandas 2.0.\n",
"data['Weights_interval']=pd.concat([Train['Weights_interval'],Test['Weights_interval']])"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data['Weights_interval']=encod(data['Weights_interval'])"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data.loc[data['Weights_interval']==3,'Weights_interval']=0.046"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data.loc[data['Weights_interval']==2,'Weights_interval']=0.019"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data.loc[data['Weights_interval']==1,'Weights_interval']=0.0084"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data.loc[data['Weights_interval']==0,'Weights_interval']=0.002"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Residence_type', 'bmi_interval', 'ever_married', 'glucose_level',\n",
" 'heart_disease', 'hypertension', 'smoking_status', 'stroke',\n",
" 'work_type', 'age_interval_0', 'age_interval_1', 'age_interval_2',\n",
" 'age_interval_3', 'age_interval_4', 'gender_0.0', 'gender_0.017',\n",
" 'gender_0.02', 'Weights', 'Weights_interval'],\n",
" dtype='object')"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.columns"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"54.427841634738186"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(WW)/len(WWW)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data.to_csv('DataFinal.csv')"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"62001"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(data)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"2053"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Total['bmi'].isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data=pd.get_dummies(data,columns=['Residence_type','bmi_interval','ever_married','glucose_level','heart_disease','hypertension','smoking_status','work_type'])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Remove the stray index column left behind by a CSV round-trip.\n",
"# errors='ignore' keeps the cell re-runnable: the column only exists when\n",
"# `data` was loaded from a file that had been saved together with its index.\n",
"data.drop('Unnamed: 0', inplace=True, axis=1, errors='ignore')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Save without the row index so that reloading the file does not introduce\n",
"# an extra 'Unnamed: 0' column.\n",
"data.to_csv('Datafinal1.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"Data=pd.read_csv('Data.csv')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"62001"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(Total)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data.loc[Total['bmi'].isnull(),'Weights_interval']=np.nan"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data=pd.read_csv('Data_nan.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data=pd.read_csv('DataFinal.csv')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Unnamed: 0', 'Unnamed: 0.1', 'Residence_type', 'bmi_interval',\n",
" 'ever_married', 'glucose_level', 'heart_disease', 'hypertension',\n",
" 'smoking_status', 'stroke', 'work_type', 'age_interval_0',\n",
" 'age_interval_1', 'age_interval_2', 'age_interval_3', 'age_interval_4',\n",
" 'gender_0.0', 'gender_0.017', 'gender_0.02', 'Weights',\n",
" 'Weights_interval', 'gender', 'age_interval'],\n",
" dtype='object')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.columns"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"P=['age_interval','gender', 'Weights_interval']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"P=[ 'age_interval_0', 'age_interval_1',\n",
" 'age_interval_2', 'age_interval_3', 'age_interval_4', 'gender_0.0',\n",
" 'gender_0.017', 'gender_0.02','Weights_interval']"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"P=['Residence_type', 'bmi_interval', 'ever_married', 'glucose_level',\n",
" 'heart_disease', 'hypertension', 'smoking_status',\n",
" 'work_type', 'age_interval_0', 'age_interval_1', 'age_interval_2',\n",
" 'age_interval_3', 'age_interval_4', 'gender_0.0', 'gender_0.017',\n",
" 'gender_0.02']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"P=[ 'age_interval', 'gender', 'Residence_type_0.018000000000000002', 'bmi_interval_0.013',\n",
" 'bmi_interval_0.018000000000000002', 'ever_married_0.0052',\n",
" 'ever_married_0.0258', 'glucose_level_0.0123', 'glucose_level_0.0147',\n",
" 'glucose_level_0.028', 'heart_disease_0.014', 'heart_disease_0.094',\n",
" 'hypertension_0.015', 'hypertension_0.051',\n",
" 'smoking_status_0.018000000000000002', 'smoking_status_0.02',\n",
" 'smoking_status_0.03', 'work_type_0.0', 'work_type_0.016',\n",
" 'work_type_0.018000000000000002', 'work_type_0.038']"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"P=[ 'age_interval_0', 'age_interval_1',\n",
" 'age_interval_2', 'age_interval_3', 'age_interval_4', 'gender_0.0',\n",
" 'gender_0.017', 'gender_0.02', 'Residence_type_0.018000000000000002', 'bmi_interval_0.013',\n",
" 'bmi_interval_0.018000000000000002', 'ever_married_0.0052',\n",
" 'ever_married_0.0258', 'glucose_level_0.0123', 'glucose_level_0.0147',\n",
" 'glucose_level_0.028', 'heart_disease_0.014', 'heart_disease_0.094',\n",
" 'hypertension_0.015', 'hypertension_0.051',\n",
" 'smoking_status_0.018000000000000002', 'smoking_status_0.02',\n",
" 'smoking_status_0.03', 'work_type_0.0', 'work_type_0.016',\n",
" 'work_type_0.018000000000000002', 'work_type_0.038']"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"Xtest=pd.DataFrame(data[P][0:43400])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"target=pd.DataFrame(data['stroke'][0:43400])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(43400, 1)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target.shape"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(43400, 27)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Xtest.shape"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train_X, test_X, train_target, test_target= train_test_split(Xtest , target, train_size = 0.7,random_state=2)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"Train=data[0:43400]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train_X=Train[P]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(43400, 9)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_X.shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train_target=pd.DataFrame(Train['stroke'])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(43400, 1)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_target.shape"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"Test=data[43400:]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test_X=Test[P]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"xgtest= xgb.DMatrix(test_X.values)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"predict=model.predict(xgtest)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"predict=predict>0.5\n",
"predict=predict.astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 0, ..., 0, 0, 0])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"test_target=pd.DataFrame({'id':test['id'],'stroke':predict})"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(18601, 2)"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_target.shape"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Write only the 'id' and 'stroke' columns; index=False prevents pandas from\n",
"# prepending the DataFrame index as an additional unnamed column.\n",
"test_target.to_csv('submission2.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows predicted as stroke. Counting on `test_target` itself avoids\n",
"# masking `test` with a boolean Series that belongs to a different frame,\n",
"# which only gives the right answer when both indexes line up.\n",
"int((test_target['stroke'] == 1).sum())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 417,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"P=['bmi_interval','hypertension','heart_disease','glucose_level','Residence_type','ever_married','smoking_status','work_type','gender']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0]\ttrain-auc:0.841246\n",
"[1]\ttrain-auc:0.841246\n",
"[2]\ttrain-auc:0.841327\n",
"[3]\ttrain-auc:0.841327\n",
"[4]\ttrain-auc:0.841327\n",
"[5]\ttrain-auc:0.841334\n",
"[6]\ttrain-auc:0.841367\n",
"[7]\ttrain-auc:0.841367\n"
]
}
],
"source": [
"params={'objective': 'binary:logistic','eval_metric': 'auc','scale_pos_weight':54,'silent':0} \n",
"xgtrain = xgb.DMatrix(train_X.values, label=train_target.values)\n",
"#xgtest= xgb.DMatrix(test_X.values,label=test_target.values) \n",
"\n",
"watchlist = [(xgtrain, 'train')]\n",
"num_round=8\n",
"\n",
"model=xgb.train(params,xgtrain,num_round,watchlist)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0]\ttrain-auc:0.877561\teval-auc:0.845137\n",
"[1]\ttrain-auc:0.880451\teval-auc:0.847989\n",
"[2]\ttrain-auc:0.883226\teval-auc:0.850051\n",
"[3]\ttrain-auc:0.886788\teval-auc:0.851737\n",
"[4]\ttrain-auc:0.890359\teval-auc:0.851997\n",
"[5]\ttrain-auc:0.890853\teval-auc:0.851832\n",
"[6]\ttrain-auc:0.891175\teval-auc:0.85165\n",
"[7]\ttrain-auc:0.892022\teval-auc:0.851533\n",
"[8]\ttrain-auc:0.892413\teval-auc:0.852003\n",
"[9]\ttrain-auc:0.892878\teval-auc:0.851517\n",
"[10]\ttrain-auc:0.89464\teval-auc:0.851343\n",
"[11]\ttrain-auc:0.895186\teval-auc:0.850506\n",
"[12]\ttrain-auc:0.897878\teval-auc:0.849897\n",
"[13]\ttrain-auc:0.899182\teval-auc:0.850583\n",
"[14]\ttrain-auc:0.899595\teval-auc:0.84993\n",
"[15]\ttrain-auc:0.901118\teval-auc:0.850049\n",
"[16]\ttrain-auc:0.902494\teval-auc:0.849881\n"
]
}
],
"source": [
"params={'objective': 'binary:logistic','eval_metric': 'auc', 'scale_pos_weight':54,'silent':0} \n",
"xgtrain = xgb.DMatrix(train_X.values, label=train_target.values)\n",
"xgtest= xgb.DMatrix(test_X.values,label=test_target.values) \n",
"\n",
"watchlist = [(xgtrain, 'train'),(xgtest,'eval')]\n",
"num_round=8\n",
"\n",
"model=xgb.train(params,xgtrain,num_round,watchlist)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0]\ttrain-auc:0.86673\teval-auc:0.799802\n",
"[1]\ttrain-auc:0.871534\teval-auc:0.800611\n",
"[2]\ttrain-auc:0.873838\teval-auc:0.802685\n",
"[3]\ttrain-auc:0.876299\teval-auc:0.803171\n",
"[4]\ttrain-auc:0.876774\teval-auc:0.802577\n",
"[5]\ttrain-auc:0.878146\teval-auc:0.808125\n",
"[6]\ttrain-auc:0.879084\teval-auc:0.809344\n",
"[7]\ttrain-auc:0.87952\teval-auc:0.810456\n",
"[8]\ttrain-auc:0.880245\teval-auc:0.811217\n",
"[9]\ttrain-auc:0.880607\teval-auc:0.811298\n",
"[10]\ttrain-auc:0.881696\teval-auc:0.810341\n",
"[11]\ttrain-auc:0.882438\teval-auc:0.810638\n",
"[12]\ttrain-auc:0.883275\teval-auc:0.810752\n",
"[13]\ttrain-auc:0.884984\teval-auc:0.809\n",
"[14]\ttrain-auc:0.886189\teval-auc:0.807483\n",
"[15]\ttrain-auc:0.887482\teval-auc:0.806357\n",
"[16]\ttrain-auc:0.887948\teval-auc:0.806266\n",
"[17]\ttrain-auc:0.888362\teval-auc:0.805683\n",
"[18]\ttrain-auc:0.889922\teval-auc:0.803806\n",
"[19]\ttrain-auc:0.890519\teval-auc:0.803836\n"
]
}
],
"source": [
"params={'objective': 'binary:logistic','eval_metric': 'auc', 'scale_pos_weight':54,'silent':0} \n",
"xgtrain = xgb.DMatrix(train_X.values, label=train_target.values)\n",
"xgtest= xgb.DMatrix(test_X.values,label=test_target.values) \n",
"\n",
"watchlist = [(xgtrain, 'train'),(xgtest,'eval')]\n",
"num_round=20\n",
"\n",
"model=xgb.train(params,xgtrain,num_round,watchlist)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0]\ttrain-auc:0.860487\teval-auc:0.843272\n",
"[1]\ttrain-auc:0.864726\teval-auc:0.841243\n",
"[2]\ttrain-auc:0.868072\teval-auc:0.843214\n",
"[3]\ttrain-auc:0.869415\teval-auc:0.844786\n",
"[4]\ttrain-auc:0.871723\teval-auc:0.843371\n",
"[5]\ttrain-auc:0.87383\teval-auc:0.845329\n",
"[6]\ttrain-auc:0.874583\teval-auc:0.845435\n",
"[7]\ttrain-auc:0.875655\teval-auc:0.845041\n",
"[8]\ttrain-auc:0.875792\teval-auc:0.844825\n",
"[9]\ttrain-auc:0.87612\teval-auc:0.844958\n",
"[10]\ttrain-auc:0.876931\teval-auc:0.843441\n",
"[11]\ttrain-auc:0.877273\teval-auc:0.842603\n",
"[12]\ttrain-auc:0.877801\teval-auc:0.843569\n",
"[13]\ttrain-auc:0.87879\teval-auc:0.843148\n",
"[14]\ttrain-auc:0.879359\teval-auc:0.843363\n",
"[15]\ttrain-auc:0.880228\teval-auc:0.842602\n",
"[16]\ttrain-auc:0.881113\teval-auc:0.841346\n",
"[17]\ttrain-auc:0.882921\teval-auc:0.840363\n",
"[18]\ttrain-auc:0.883353\teval-auc:0.840222\n",
"[19]\ttrain-auc:0.883529\teval-auc:0.84016\n"
]
}
],
"source": [
"params={'objective': 'binary:logistic','eval_metric': 'auc', 'scale_pos_weight':54,'silent':0} \n",
"xgtrain = xgb.DMatrix(train_X.values, label=train_target.values)\n",
"xgtest= xgb.DMatrix(test_X.values,label=test_target.values) \n",
"\n",
"watchlist = [(xgtrain, 'train'),(xgtest,'eval')]\n",
"num_round=20\n",
"\n",
"model=xgb.train(params,xgtrain,num_round,watchlist)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0]\ttrain-auc:0.859473\teval-auc:0.849337\n",
"[1]\ttrain-auc:0.864813\teval-auc:0.849273\n",
"[2]\ttrain-auc:0.86703\teval-auc:0.848219\n",
"[3]\ttrain-auc:0.869633\teval-auc:0.846106\n",
"[4]\ttrain-auc:0.871101\teval-auc:0.844758\n",
"[5]\ttrain-auc:0.872499\teval-auc:0.845837\n",
"[6]\ttrain-auc:0.872957\teval-auc:0.846623\n",
"[7]\ttrain-auc:0.873198\teval-auc:0.847826\n",
"[8]\ttrain-auc:0.873717\teval-auc:0.847706\n",
"[9]\ttrain-auc:0.874418\teval-auc:0.846017\n",
"[10]\ttrain-auc:0.87503\teval-auc:0.84529\n",
"[11]\ttrain-auc:0.875315\teval-auc:0.844226\n",
"[12]\ttrain-auc:0.875688\teval-auc:0.844982\n",
"[13]\ttrain-auc:0.876293\teval-auc:0.844024\n",
"[14]\ttrain-auc:0.877974\teval-auc:0.845358\n",
"[15]\ttrain-auc:0.878252\teval-auc:0.844628\n",
"[16]\ttrain-auc:0.878822\teval-auc:0.844068\n",
"[17]\ttrain-auc:0.879803\teval-auc:0.843451\n",
"[18]\ttrain-auc:0.880926\teval-auc:0.84163\n",
"[19]\ttrain-auc:0.881054\teval-auc:0.841593\n"
]
}
],
"source": [
"params={'objective': 'binary:logistic','eval_metric': 'auc', 'scale_pos_weight':54,'silent':0} \n",
"xgtrain = xgb.DMatrix(train_X.values, label=train_target.values)\n",
"xgtest= xgb.DMatrix(test_X.values,label=test_target.values) \n",
"\n",
"watchlist = [(xgtrain, 'train'),(xgtest,'eval')]\n",
"num_round=20\n",
"\n",
"model=xgb.train(params,xgtrain,num_round,watchlist)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# `metrics` is not part of the notebook's import cell, so bring it into scope\n",
"# here to keep this cell runnable on a fresh kernel.\n",
"from sklearn import metrics\n",
"\n",
"# NOTE(review): `xgb1` is expected to be an already fitted classifier exposing\n",
"# predict / predict_proba - confirm it is defined before this cell is run.\n",
"dtest_predictions = xgb1.predict(test_X) \n",
"dtest_predprob = xgb1.predict_proba(test_X)[:,1]\n",
"#Print model report:\n",
"print(\"\\nModel Report\")\n",
"print(\"Accuracy on Test set: %.4g\" % metrics.accuracy_score(test_target.values, dtest_predictions))\n",
"print(\"AUC Score on test: %f\" % metrics.roc_auc_score(test_target, dtest_predprob))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment