-
-
Save ocoyawale/10b757302b6deec45ced16e314254584 to your computer and use it in GitHub Desktop.
pandas scikit learn
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 151, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import matplotlib.pyplot as plt\n", | |
"import sklearn as skl\n", | |
"import scipy.stats as stats\n", | |
"import pylab\n", | |
"import statsmodels.api as sm\n", | |
"import statsmodels.formula.api as smf\n", | |
"from sklearn import linear_model\n", | |
"from sklearn.tree import DecisionTreeClassifier\n", | |
"from sklearn.tree import DecisionTreeRegressor\n", | |
"from sklearn.ensemble import RandomForestClassifier\n", | |
"from sklearn.ensemble import RandomForestRegressor\n", | |
"from sklearn.cross_validation import cross_val_score\n", | |
"from sklearn import metrics" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 152, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"train_path='D:/kaggle/Insurance/train.csv'\n", | |
"test_path='D:/kaggle/Insurance/test.csv'\n", | |
"train=pd.read_csv(train_path)\n", | |
"test=pd.read_csv(test_path)\n", | |
"train=pd.DataFrame(train)\n", | |
"test=pd.DataFrame(test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#EDA\n", | |
"train.describe(percentiles=[.01,.05,.95,.99])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#list of continuous variables\n", | |
"cont_var=pd.DataFrame(train,columns=['Hazard','T1_V1','T1_V2','T1_V3','T2_V1','T2_V15','T2_V2','T2_V4','T2_V9'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"pd.scatter_matrix(cont_var,diagonal='kde',color='k',alpha=0.3)\n", | |
"plt.show()\n", | |
"\n", | |
"#pd.scatter_matrix(train,diagonal='kde',color='k',alpha=0.3)\n", | |
"#plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.scatter(train['Hazard'],train['T1_V1'])\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"stats.probplot(train['Hazard'], dist=\"norm\", plot=pylab)\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.scatter(train['Hazard'],train['T1_V2'])\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"stats.probplot(train['T1_V2'], dist=\"norm\", plot=pylab)\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.scatter(train['Hazard'],train['T2_V1'])\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"stats.probplot(train['T2_V1'], dist=\"norm\", plot=pylab)\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.scatter(train['Hazard'],train['T2_V15'])\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"stats.probplot(train['T2_V15'], dist=\"norm\", plot=pylab)\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.scatter(train['Hazard'],train['T2_V2'])\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"stats.probplot(train['T2_V2'], dist=\"norm\", plot=pylab)\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.scatter(train['Hazard'],train['T2_V4'])\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"stats.probplot(train['T2_V4'], dist=\"norm\", plot=pylab)\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"plt.scatter(train['Hazard'],train['T2_V9'])\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"stats.probplot(train['T2_V9'], dist=\"norm\", plot=pylab)\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#indicator variables\n", | |
"ind1=pd.get_dummies(train['T1_V4'],prefix='imp_T1_V4') #categorical\n", | |
"ind2=pd.get_dummies(train['T1_V5'],prefix='imp_T1_V5') #categorical\n", | |
"ind3=pd.get_dummies(train['T1_V6'],prefix='imp_T1_V6')#categorical\n", | |
"ind4=pd.get_dummies(train['T1_V7'],prefix='imp_T1_V7')#categorical\n", | |
"ind5=pd.get_dummies(train['T1_V8'],prefix='imp_T1_V8') #categorical\n", | |
"ind6=pd.get_dummies(train['T1_V9'],prefix='imp_T1_V9') #categorical\n", | |
"ind7=pd.get_dummies(train['T1_V10'],prefix='imp_T1_V10') #categorical\n", | |
"ind8=pd.get_dummies(train['T1_V11'],prefix='imp_T1_V11') #categorical\n", | |
"ind9=pd.get_dummies(train['T1_V12'],prefix='imp_T1_V12')#categorical\n", | |
"ind10=pd.get_dummies(train['T1_V13'],prefix='imp_T1_V13') #categorical\n", | |
"ind11=pd.get_dummies(train['T1_V15'],prefix='imp_T1_V15') #categorical\n", | |
"ind12=pd.get_dummies(train['T1_V16'],prefix='imp_T1_V16') #categorical\n", | |
"ind13=pd.get_dummies(train['T1_V17'],prefix='imp_T1_V17') #categorical\n", | |
"ind14=pd.get_dummies(train['T2_V3'],prefix='imp_T2_V3') #categorical\n", | |
"ind15=pd.get_dummies(train['T2_V5'],prefix='imp_T2_V5')#categorical\n", | |
"ind16=pd.get_dummies(train['T2_V7'],prefix='imp_T2_V7') #categorical\n", | |
"ind17=pd.get_dummies(train['T2_V8'],prefix='imp_T2_V8') #ordinal\n", | |
"ind18=pd.get_dummies(train['T2_V11'],prefix='imp_T2_V11') #categorical\n", | |
"ind19=pd.get_dummies(train['T2_V12'],prefix='imp_T2_V12') #categorical\n", | |
"ind20=pd.get_dummies(train['T2_V13'],prefix='imp_T2_V13') #categorical\n", | |
"#Transform to same scale, dampen outlier influence, etc.\n", | |
"train['imp_T1_V1']=np.log(train['T1_V1']+1)\n", | |
"train['imp_T1_V2']=np.log(train['T1_V2']+1)\n", | |
"train['imp_T1_V3']=np.log(train['T1_V3']+1)\n", | |
"train['imp_T1_V14']=np.log(train['T1_V14']+1)\n", | |
"train['imp_T2_V1']=np.log(train['T2_V1']+1) #0-100\n", | |
"train['imp_T2_V2']=np.log(train['T2_V2']+1)\n", | |
"train['imp_T2_V4']=np.log(train['T2_V4']+1)\n", | |
"train['imp_T2_V6']=np.log(train['T2_V6']+1)\n", | |
"train['imp_T2_V9']=np.log(train['T2_V9']+1)\n", | |
"train['imp_T2_V10']=np.log(train['T2_V10']+1)\n", | |
"train['imp_T2_V14']=np.log(train['T2_V14']+1)\n", | |
"train['imp_T2_V15']=np.log(train['T2_V15']+1)\n", | |
"train['imp_hazard']=train['Hazard']\n", | |
"#train['imp_hazard']=np.where(train['Hazard'] > 100,100,train['Hazard'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 153, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#indicator variables for all columns (train and test)\n", | |
"ind1=pd.get_dummies(train['T1_V4'],prefix='imp_T1_V4') #categorical\n", | |
"ind2=pd.get_dummies(train['T1_V5'],prefix='imp_T1_V5') #categorical\n", | |
"ind3=pd.get_dummies(train['T1_V6'],prefix='imp_T1_V6')#categorical\n", | |
"ind4=pd.get_dummies(train['T1_V7'],prefix='imp_T1_V7')#categorical\n", | |
"ind5=pd.get_dummies(train['T1_V8'],prefix='imp_T1_V8') #categorical\n", | |
"ind6=pd.get_dummies(train['T1_V9'],prefix='imp_T1_V9') #categorical\n", | |
"ind7=pd.get_dummies(train['T1_V10'],prefix='imp_T1_V10') #categorical\n", | |
"ind8=pd.get_dummies(train['T1_V11'],prefix='imp_T1_V11') #categorical\n", | |
"ind9=pd.get_dummies(train['T1_V12'],prefix='imp_T1_V12')#categorical\n", | |
"ind10=pd.get_dummies(train['T1_V13'],prefix='imp_T1_V13') #categorical\n", | |
"ind11=pd.get_dummies(train['T1_V15'],prefix='imp_T1_V15') #categorical\n", | |
"ind12=pd.get_dummies(train['T1_V16'],prefix='imp_T1_V16') #categorical\n", | |
"ind13=pd.get_dummies(train['T1_V17'],prefix='imp_T1_V17') #categorical\n", | |
"ind14=pd.get_dummies(train['T2_V3'],prefix='imp_T2_V3') #categorical\n", | |
"ind15=pd.get_dummies(train['T2_V5'],prefix='imp_T2_V5')#categorical\n", | |
"ind16=pd.get_dummies(train['T2_V7'],prefix='imp_T2_V7') #categorical\n", | |
"ind17=pd.get_dummies(train['T2_V8'],prefix='imp_T2_V8') #ordinal\n", | |
"ind18=pd.get_dummies(train['T2_V11'],prefix='imp_T2_V11') #categorical\n", | |
"ind19=pd.get_dummies(train['T2_V12'],prefix='imp_T2_V12') #categorical\n", | |
"ind20=pd.get_dummies(train['T2_V13'],prefix='imp_T2_V13') #categorical\n", | |
"ind21=pd.get_dummies(train['T1_V1'],prefix='imp_T1_V1') #numeric, one-hot encoded\n", | |
"ind22=pd.get_dummies(train['T1_V2'],prefix='imp_T1_V2') #numeric, one-hot encoded\n", | |
"ind23=pd.get_dummies(train['T1_V3'],prefix='imp_T1_V3') #numeric, one-hot encoded\n", | |
"ind24=pd.get_dummies(train['T1_V14'],prefix='imp_T1_V14') #numeric, one-hot encoded\n", | |
"ind25=pd.get_dummies(train['T2_V1'],prefix='imp_T2_V1') #numeric, one-hot encoded\n", | |
"ind26=pd.get_dummies(train['T2_V2'],prefix='imp_T2_V2') #numeric, one-hot encoded\n", | |
"ind27=pd.get_dummies(train['T2_V4'],prefix='imp_T2_V4') #numeric, one-hot encoded\n", | |
"ind28=pd.get_dummies(train['T2_V6'],prefix='imp_T2_V6') #numeric, one-hot encoded\n", | |
"ind29=pd.get_dummies(train['T2_V9'],prefix='imp_T2_V9') #numeric, one-hot encoded\n", | |
"ind30=pd.get_dummies(train['T2_V10'],prefix='imp_T2_V10') #numeric, one-hot encoded\n", | |
"ind31=pd.get_dummies(train['T2_V14'],prefix='imp_T2_V14') #numeric, one-hot encoded\n", | |
"ind32=pd.get_dummies(train['T2_V15'],prefix='imp_T2_V15') #numeric, one-hot encoded\n", | |
"train['imp_hazard']=train['Hazard']\n", | |
"in1=pd.get_dummies(test['T1_V4'],prefix='imp_T1_V4') #categorical\n", | |
"in2=pd.get_dummies(test['T1_V5'],prefix='imp_T1_V5') #categorical\n", | |
"in3=pd.get_dummies(test['T1_V6'],prefix='imp_T1_V6')#categorical\n", | |
"in4=pd.get_dummies(test['T1_V7'],prefix='imp_T1_V7')#categorical\n", | |
"in5=pd.get_dummies(test['T1_V8'],prefix='imp_T1_V8') #categorical\n", | |
"in6=pd.get_dummies(test['T1_V9'],prefix='imp_T1_V9') #categorical\n", | |
"in7=pd.get_dummies(test['T1_V10'],prefix='imp_T1_V10') #categorical\n", | |
"in8=pd.get_dummies(test['T1_V11'],prefix='imp_T1_V11') #categorical\n", | |
"in9=pd.get_dummies(test['T1_V12'],prefix='imp_T1_V12')#categorical\n", | |
"in10=pd.get_dummies(test['T1_V13'],prefix='imp_T1_V13') #categorical\n", | |
"in11=pd.get_dummies(test['T1_V15'],prefix='imp_T1_V15') #categorical\n", | |
"in12=pd.get_dummies(test['T1_V16'],prefix='imp_T1_V16') #categorical\n", | |
"in13=pd.get_dummies(test['T1_V17'],prefix='imp_T1_V17') #categorical\n", | |
"in14=pd.get_dummies(test['T2_V3'],prefix='imp_T2_V3') #categorical\n", | |
"in15=pd.get_dummies(test['T2_V5'],prefix='imp_T2_V5')#categorical\n", | |
"in16=pd.get_dummies(test['T2_V7'],prefix='imp_T2_V7') #categorical\n", | |
"in17=pd.get_dummies(test['T2_V8'],prefix='imp_T2_V8') #ordinal\n", | |
"in18=pd.get_dummies(test['T2_V11'],prefix='imp_T2_V11') #categorical\n", | |
"in19=pd.get_dummies(test['T2_V12'],prefix='imp_T2_V12') #categorical\n", | |
"in20=pd.get_dummies(test['T2_V13'],prefix='imp_T2_V13') #categorical\n", | |
"in21=pd.get_dummies(test['T1_V1'],prefix='imp_T1_V1') #numeric, one-hot encoded\n", | |
"in22=pd.get_dummies(test['T1_V2'],prefix='imp_T1_V2') #numeric, one-hot encoded\n", | |
"in23=pd.get_dummies(test['T1_V3'],prefix='imp_T1_V3') #numeric, one-hot encoded\n", | |
"in24=pd.get_dummies(test['T1_V14'],prefix='imp_T1_V14') #numeric, one-hot encoded\n", | |
"in25=pd.get_dummies(test['T2_V1'],prefix='imp_T2_V1') #numeric, one-hot encoded\n", | |
"in26=pd.get_dummies(test['T2_V2'],prefix='imp_T2_V2') #numeric, one-hot encoded\n", | |
"in27=pd.get_dummies(test['T2_V4'],prefix='imp_T2_V4') #numeric, one-hot encoded\n", | |
"in28=pd.get_dummies(test['T2_V6'],prefix='imp_T2_V6') #numeric, one-hot encoded\n", | |
"in29=pd.get_dummies(test['T2_V9'],prefix='imp_T2_V9') #numeric, one-hot encoded\n", | |
"in30=pd.get_dummies(test['T2_V10'],prefix='imp_T2_V10') #numeric, one-hot encoded\n", | |
"in31=pd.get_dummies(test['T2_V14'],prefix='imp_T2_V14') #numeric, one-hot encoded\n", | |
"in32=pd.get_dummies(test['T2_V15'],prefix='imp_T2_V15') #numeric, one-hot encoded\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"in1=pd.get_dummies(test['T1_V4'],prefix='imp_T1_V4') #categorical\n", | |
"in2=pd.get_dummies(test['T1_V5'],prefix='imp_T1_V5') #categorical\n", | |
"in3=pd.get_dummies(test['T1_V6'],prefix='imp_T1_V6')#categorical\n", | |
"in4=pd.get_dummies(test['T1_V7'],prefix='imp_T1_V7')#categorical\n", | |
"in5=pd.get_dummies(test['T1_V8'],prefix='imp_T1_V8') #categorical\n", | |
"in6=pd.get_dummies(test['T1_V9'],prefix='imp_T1_V9') #categorical\n", | |
"in7=pd.get_dummies(test['T1_V10'],prefix='imp_T1_V10') #categorical\n", | |
"in8=pd.get_dummies(test['T1_V11'],prefix='imp_T1_V11') #categorical\n", | |
"in9=pd.get_dummies(test['T1_V12'],prefix='imp_T1_V12')#categorical\n", | |
"in10=pd.get_dummies(test['T1_V13'],prefix='imp_T1_V13') #categorical\n", | |
"in11=pd.get_dummies(test['T1_V15'],prefix='imp_T1_V15') #categorical\n", | |
"in12=pd.get_dummies(test['T1_V16'],prefix='imp_T1_V16') #categorical\n", | |
"in13=pd.get_dummies(test['T1_V17'],prefix='imp_T1_V17') #categorical\n", | |
"in14=pd.get_dummies(test['T2_V3'],prefix='imp_T2_V3') #categorical\n", | |
"in15=pd.get_dummies(test['T2_V5'],prefix='imp_T2_V5')#categorical\n", | |
"in16=pd.get_dummies(test['T2_V7'],prefix='imp_T2_V7') #categorical\n", | |
"in17=pd.get_dummies(test['T2_V8'],prefix='imp_T2_V8') #ordinal\n", | |
"in18=pd.get_dummies(test['T2_V11'],prefix='imp_T2_V11') #categorical\n", | |
"in19=pd.get_dummies(test['T2_V12'],prefix='imp_T2_V12') #categorical\n", | |
"in20=pd.get_dummies(test['T2_V13'],prefix='imp_T2_V13') #categorical\n", | |
"#Transform to same scale, dampen outlier influence, etc.\n", | |
"test['imp_T1_V1']=np.log(test['T1_V1']+1)\n", | |
"test['imp_T1_V2']=np.log(test['T1_V2']+1)\n", | |
"test['imp_T1_V3']=np.log(test['T1_V3']+1)\n", | |
"test['imp_T1_V14']=np.log(test['T1_V14']+1)\n", | |
"test['imp_T2_V1']=np.log(test['T2_V1']+1)\n", | |
"test['imp_T2_V2']=np.log(test['T2_V2']+1)\n", | |
"test['imp_T2_V4']=np.log(test['T2_V4']+1)\n", | |
"test['imp_T2_V6']=np.log(test['T2_V6']+1)\n", | |
"test['imp_T2_V9']=np.log(test['T2_V9']+1)\n", | |
"test['imp_T2_V10']=np.log(test['T2_V10']+1)\n", | |
"test['imp_T2_V14']=np.log(test['T2_V14']+1)\n", | |
"test['imp_T2_V15']=np.log(test['T2_V15']+1)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 154, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"train_imp=train[['Id','imp_hazard']].join(ind1).join(ind2).join(ind3).join(ind4).join(ind5).join(ind6).join(ind7).join(ind8).join(ind9).join(ind10).join(ind11).join(ind12).join(ind13).join(ind14).join(ind15).join(ind16).join(ind17).join(ind18).join(ind19).join(ind20).join(ind21).join(ind22).join(ind23).join(ind24).join(ind25).join(ind26).join(ind27).join(ind28).join(ind29).join(ind30).join(ind31).join(ind32)\n", | |
"#train_imp=train[['Id','imp_hazard','imp_T1_V1','imp_T1_V2','imp_T1_V3','imp_T1_V14','imp_T2_V1','imp_T2_V2','imp_T2_V4','imp_T2_V6','imp_T2_V9','imp_T2_V10','imp_T2_V14','imp_T2_V15']].join(ind1).join(ind2).join(ind3).join(ind4).join(ind5).join(ind6).join(ind7).join(ind8).join(ind9).join(ind10).join(ind11).join(ind12).join(ind13).join(ind14).join(ind15).join(ind16).join(ind17).join(ind18).join(ind19).join(ind20)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#train_imp2=train_imp.drop(['imp_T1_V4_H','imp_T1_V5_L','imp_T1_V6_N','imp_T1_V7_C','imp_T1_V8_A','imp_T1_V9_G','imp_T1_V10_2','imp_T1_V11_K','imp_T1_V12_A','imp_T1_V13_5','imp_T1_V15_F','imp_T1_V16_O','imp_T1_V17_N','imp_T2_V3_N','imp_T2_V5_F','imp_T2_V7_22','imp_T2_V8_1','imp_T2_V11_N','imp_T2_V12_N','imp_T2_V13_B'],1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 155, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"test_imp=test[['Id']].join(in1).join(in2).join(in3).join(in4).join(in5).join(in6).join(in7).join(in8).join(in9).join(in10).join(in11).join(in12).join(in13).join(in14).join(in15).join(in16).join(in17).join(in18).join(in19).join(in20).join(in21).join(in22).join(in23).join(in24).join(in25).join(in26).join(in27).join(in28).join(in29).join(in30).join(in31).join(in32)\n", | |
"#test_imp=test[['Id','imp_T1_V1','imp_T1_V2','imp_T1_V3','imp_T1_V14','imp_T2_V1','imp_T2_V2','imp_T2_V4','imp_T2_V6','imp_T2_V9','imp_T2_V10','imp_T2_V14','imp_T2_V15']].join(in1).join(in2).join(in3).join(in4).join(in5).join(in6).join(in7).join(in8).join(in9).join(in10).join(in11).join(in12).join(in13).join(in14).join(in15).join(in16).join(in17).join(in18).join(in19).join(in20)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#test_imp2=test_imp.drop(['imp_T1_V4_H','imp_T1_V5_L','imp_T1_V6_N','imp_T1_V7_C','imp_T1_V8_A','imp_T1_V9_G','imp_T1_V10_2','imp_T1_V11_K','imp_T1_V12_A','imp_T1_V13_5','imp_T1_V15_F','imp_T1_V16_O','imp_T1_V17_N','imp_T2_V3_N','imp_T2_V5_F','imp_T2_V7_22','imp_T2_V8_1','imp_T2_V11_N','imp_T2_V12_N','imp_T2_V13_B'],1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"train_imp.to_csv('d:/kaggle/Insurance/out_train.csv', index=False)\n", | |
"test_imp.to_csv('d:/kaggle/Insurance/out_test.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"train_imp2.to_csv('d:/kaggle/Insurance/out_train.csv', index=False)\n", | |
"test_imp2.to_csv('d:/kaggle/Insurance/out_test.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 156, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"x = train_imp2.ix[:,2:392] #predictors train\n", | |
"y = train_imp2.ix[:,1]# response train\n", | |
"test_data=test_imp2.ix[:,1:391] # predictors test" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 157, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#classification\n", | |
"from sklearn.svm import SVC\n", | |
"from sklearn.svm import NuSVC\n", | |
"from sklearn.svm import LinearSVC\n", | |
"from sklearn.ensemble import RandomForestClassifier\n", | |
"from sklearn.naive_bayes import GaussianNB\n", | |
"from sklearn.naive_bayes import MultinomialNB\n", | |
"from sklearn.naive_bayes import BernoulliNB\n", | |
"from sklearn.linear_model import SGDClassifier\n", | |
"from sklearn.tree import DecisionTreeClassifier\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#SVC\n", | |
"\n", | |
"svc = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,gamma=0.0, kernel='rbf', max_iter=-1, probability=False,random_state=None, shrinking=True, tol=0.001, verbose=False)\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"svc.fit(x, y)\n", | |
"# The coefficients\n", | |
"#print('Coefficients: \\n', svc.coef_)\n", | |
"# The mean square error\n", | |
"#print(\"Residual sum of squares: %.2f\" % np.mean((svc.predict(x) - y) ** 2))\n", | |
"# Mean accuracy (classifier score): 1 is perfect prediction\n", | |
"#print('Variance score: %.2f' % svc.score(x, y))\n", | |
"#print(\"Accuracy: %.2f\" % metrics.accuracy_score(svc.predict(x),y))\n", | |
"#print(\"avg Precision: %.2f\" % metrics.average_precision_score(svc.predict(x),y))\n", | |
"#print(\"Precision: %.2f\" % metrics.precision_score(svc.predict(x),y))\n", | |
"#print(\"Recall: %.2f\" % metrics.recall_score(svc.predict(x),y))\n", | |
"#print(\"AUC: %.2f\" % metrics.roc_auc_score(svc.predict(x),y))\n", | |
"#print(\"MAE: %.2f\" % metrics.mean_absolute_error(svc.predict(x),y))\n", | |
"#print(\"MSE: %.2f\" % metrics.mean_squared_error(svc.predict(x),y))\n", | |
"#print(\"MedianAE: %.2f\" % metrics.median_absolute_error(svc.predict(x),y))\n", | |
"#print(\"r^2: %.2f\" % metrics.r2_score(svc.predict(x),y))\n", | |
"\n", | |
"train_svc=pd.DataFrame(np.round(np.exp(svc.predict(x))-1,0),columns=['yhat'])\n", | |
"train_svc2=train_imp2[['Id','imp_hazard']].join(train_svc)\n", | |
"score_svc=pd.DataFrame(np.round(np.exp(svc.predict(test_data))-1,0),columns=['Hazard'])\n", | |
"score_svc2=test_imp2[['Id']].join(score_svc)\n", | |
"\n", | |
"train_svc2.to_csv('d:/kaggle/Insurance/train_svc2.csv', index=False)\n", | |
"score_svc2.to_csv('d:/kaggle/Insurance/score_svc2.csv', index=False)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#NuSVC\n", | |
"\n", | |
"nusvc = NuSVC(nu=0.5, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, verbose=False, max_iter=-1, random_state=None)\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"nusvc.fit(x, y)\n", | |
"# The coefficients\n", | |
"#print('Coefficients: \\n', nusvc.coef_)\n", | |
"# The mean square error\n", | |
"#print(\"Residual sum of squares: %.2f\"\n", | |
"# % np.mean((nusvc.predict(x) - y) ** 2))\n", | |
"# Mean accuracy (classifier score): 1 is perfect prediction\n", | |
"#print('Variance score: %.2f' % nusvc.score(x, y))\n", | |
"#print(\"Accuracy: %.2f\" % metrics.accuracy_score(nusvc.predict(x),y))\n", | |
"#print(\"avg Precision: %.2f\" % metrics.average_precision_score(nusvc.predict(x),y))\n", | |
"#print(\"Precision: %.2f\" % metrics.precision_score(nusvc.predict(x),y))\n", | |
"#print(\"Recall: %.2f\" % metrics.recall_score(nusvc.predict(x),y))\n", | |
"#print(\"AUC: %.2f\" % metrics.roc_auc_score(nusvc.predict(x),y))\n", | |
"#print(\"MAE: %.2f\" % metrics.mean_absolute_error(nusvc.predict(x),y))\n", | |
"#print(\"MSE: %.2f\" % metrics.mean_squared_error(nusvc.predict(x),y))\n", | |
"#print(\"MedianAE: %.2f\" % metrics.median_absolute_error(nusvc.predict(x),y))\n", | |
"#print(\"r^2: %.2f\" % metrics.r2_score(nusvc.predict(x),y))\n", | |
"\n", | |
"train_nusvc=pd.DataFrame(np.round(np.exp(nusvc.predict(x))-1,0),columns=['yhat'])\n", | |
"train_nusvc2=train_imp2[['Id','imp_hazard']].join(train_nusvc)\n", | |
"score_nusvc=pd.DataFrame(np.round(np.exp(nusvc.predict(test_data))-1,0),columns=['Hazard'])\n", | |
"score_nusvc2=test_imp2[['Id']].join(score_nusvc)\n", | |
"\n", | |
"train_nusvc2.to_csv('d:/kaggle/Insurance/train_nusvc2.csv', index=False)\n", | |
"score_nusvc2.to_csv('d:/kaggle/Insurance/score_nusvc2.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#LinearSVC\n", | |
"\n", | |
"lsvc = LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000)\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"lsvc.fit(x, y)\n", | |
"# The coefficients\n", | |
"#print('Coefficients: \\n', lsvc.coef_)\n", | |
"# The mean square error\n", | |
"#print(\"Residual sum of squares: %.2f\"\n", | |
"# % np.mean((lsvc.predict(x) - y) ** 2))\n", | |
"# Mean accuracy (classifier score): 1 is perfect prediction\n", | |
"#print('Variance score: %.2f' % lsvc.score(x, y))\n", | |
"#print(\"Accuracy: %.2f\" % metrics.accuracy_score(lsvc.predict(x),y))\n", | |
"#print(\"avg Precision: %.2f\" % metrics.average_precision_score(lsvc.predict(x),y))\n", | |
"#print(\"Precision: %.2f\" % metrics.precision_score(lsvc.predict(x),y))\n", | |
"#print(\"Recall: %.2f\" % metrics.recall_score(lsvc.predict(x),y))\n", | |
"#print(\"AUC: %.2f\" % metrics.roc_auc_score(lsvc.predict(x),y))\n", | |
"#print(\"MAE: %.2f\" % metrics.mean_absolute_error(lsvc.predict(x),y))\n", | |
"#print(\"MSE: %.2f\" % metrics.mean_squared_error(lsvc.predict(x),y))\n", | |
"#print(\"MedianAE: %.2f\" % metrics.median_absolute_error(lsvc.predict(x),y))\n", | |
"#print(\"r^2: %.2f\" % metrics.r2_score(lsvc.predict(x),y))\n", | |
"\n", | |
"train_lsvc=pd.DataFrame(np.round(np.exp(lsvc.predict(x))-1,0),columns=['yhat'])\n", | |
"train_lsvc2=train_imp2[['Id','imp_hazard']].join(train_lsvc)\n", | |
"score_lsvc=pd.DataFrame(np.round(np.exp(lsvc.predict(test_data))-1,0),columns=['Hazard'])\n", | |
"score_lsvc2=test_imp2[['Id']].join(score_lsvc)\n", | |
"\n", | |
"train_lsvc2.to_csv('d:/kaggle/Insurance/train_lsvc2.csv', index=False)\n", | |
"score_lsvc2.to_csv('d:/kaggle/Insurance/score_lsvc2.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#random forest classifier\n", | |
"\n", | |
"rfc = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)\n", | |
"# Train the model using the training sets\n", | |
"rfc=rfc.fit(x, y)\n", | |
"# The coefficients\n", | |
"#print('Coefficients: \\n', rfc.coef_)\n", | |
"# The mean square error\n", | |
"#print(\"Residual sum of squares: %.2f\"\n", | |
"# % np.mean((rfc.predict(x) - y) ** 2))\n", | |
"# Mean accuracy (classifier score): 1 is perfect prediction\n", | |
"#print('Variance score: %.2f' % rfc.score(x, y))\n", | |
"#print(\"Accuracy: %.2f\" % metrics.accuracy_score(rfc.predict(x),y))\n", | |
"#print(\"avg Precision: %.2f\" % metrics.average_precision_score(rfc.predict(x),y))\n", | |
"#print(\"Precision: %.2f\" % metrics.precision_score(rfc.predict(x),y))\n", | |
"#print(\"Recall: %.2f\" % metrics.recall_score(rfc.predict(x),y))\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(rfc.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(rfc.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(rfc.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(rfc.predict(x),y))\n", | |
"\n", | |
"train_rfc=pd.DataFrame(np.round(np.exp(rfc.predict(x))-1,0),columns=['yhat'])\n", | |
"train_rfc2=train_imp2[['Id','imp_hazard']].join(train_rfc)\n", | |
"score_rfc=pd.DataFrame(np.round(np.exp(rfc.predict(test_data))-1,0),columns=['Hazard'])\n", | |
"score_rfc2=test_imp2[['Id']].join(score_rfc)\n", | |
"\n", | |
"train_rfc2.to_csv('d:/kaggle/Insurance/train_rfc2.csv', index=False)\n", | |
"score_rfc2.to_csv('d:/kaggle/Insurance/score_rfc2.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#decision Tree\n", | |
"\n", | |
"dtc = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, class_weight=None)\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"dtc.fit(x, y)\n", | |
"# The coefficients\n", | |
"#print('Coefficients: \\n', dtc.coef_)\n", | |
"# The mean square error\n", | |
"#print(\"Residual sum of squares: %.2f\"\n", | |
"# % np.mean((dtc.predict(x) - y) ** 2))\n", | |
"# Mean accuracy (classifier score): 1 is perfect prediction\n", | |
"#print('Variance score: %.2f' % dtc.score(x, y))\n", | |
"#print(\"Accuracy: %.2f\" % metrics.accuracy_score(dtc.predict(x),y))\n", | |
"#print(\"avg Precision: %.2f\" % metrics.average_precision_score(dtc.predict(x),y))\n", | |
"#print(\"Precision: %.2f\" % metrics.precision_score(dtc.predict(x),y))\n", | |
"#print(\"Recall: %.2f\" % metrics.recall_score(dtc.predict(x),y))\n", | |
"\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(dtc.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(dtc.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(dtc.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(dtc.predict(x),y))\n", | |
"\n", | |
"train_dtc=pd.DataFrame(np.round(np.exp(dtc.predict(x))-1,0),columns=['yhat'])\n", | |
"train_dtc2=train_imp2[['Id','imp_hazard']].join(train_dtc)\n", | |
"score_dtc=pd.DataFrame(np.round(np.exp(dtc.predict(test_data))-1,0),columns=['Hazard'])\n", | |
"score_dtc2=test_imp2[['Id']].join(score_dtc)\n", | |
"\n", | |
"train_dtc2.to_csv('d:/kaggle/Insurance/train_dtc2.csv', index=False)\n", | |
"score_dtc2.to_csv('d:/kaggle/Insurance/score_dtc2.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#OLS Regression\n", | |
"\n", | |
"# Create linear regression object\n", | |
"regr_ols = linear_model.LinearRegression()\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"regr_ols.fit(x, y)\n", | |
"# The coefficients\n", | |
"print('Coefficients: \\n', regr_ols.coef_)\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((regr_ols.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % regr_ols.score(x, y))\n", | |
"\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(regr_ols.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(regr_ols.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(regr_ols.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(regr_ols.predict(x),y))\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#Lasso\n", | |
"regr_lasso = linear_model.Lasso(alpha=.1, copy_X=True, fit_intercept=True, max_iter=1000,\n", | |
" normalize=False, positive=False, precompute=False, random_state=None,\n", | |
" selection='cyclic', tol=0.0001, warm_start=False)\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"regr_lasso.fit(x, y)\n", | |
"# The coefficients\n", | |
"print('Coefficients: \\n', regr_lasso.coef_)\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((regr_lasso.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % regr_lasso.score(x, y))\n", | |
"\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(regr_lasso.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(regr_lasso.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(regr_lasso.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(regr_lasso.predict(x),y))\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#Ridge\n", | |
"regr_ridge = linear_model.Ridge(alpha=.5,copy_X=True, fit_intercept=True, max_iter=None,\n", | |
" normalize=False, solver='auto', tol=0.001)\n", | |
" \n", | |
"# Train the model using the training sets\n", | |
"regr_ridge.fit(x, y)\n", | |
"# The coefficients\n", | |
"print('Coefficients: \\n', regr_ridge.coef_)\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((regr_ridge.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % regr_ridge.score(x, y))\n", | |
"\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(regr_ridge.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(regr_ridge.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(regr_ridge.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(regr_ridge.predict(x),y))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#elastic net\n", | |
"regr_enet = linear_model.ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=True, normalize=False, precompute=False, max_iter=1000, copy_X=True, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic')\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"regr_enet.fit(x, y)\n", | |
"# The coefficients\n", | |
"print('Coefficients: \\n', regr_enet.coef_)\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((regr_enet.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % regr_enet.score(x, y))\n", | |
"\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(regr_enet.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(regr_enet.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(regr_enet.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(regr_enet.predict(x),y))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#lars\n", | |
"regr_lars = linear_model.Lars(fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, eps=2.2204460492503131e-16, copy_X=True, fit_path=True)\n", | |
"# Train the model using the training sets\n", | |
"regr_lars.fit(x, y)\n", | |
"# The coefficients\n", | |
"print('Coefficients: \\n', regr_lars.coef_)\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((regr_lars.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % regr_lars.score(x, y))\n", | |
"\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(regr_lars.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(regr_lars.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(regr_lars.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(regr_lars.predict(x),y))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#lasso lars\n", | |
"regr_llars = linear_model.LassoLars(alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=2.2204460492503131e-16, copy_X=True, fit_path=True)\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"regr_llars.fit(x, y)\n", | |
"# The coefficients\n", | |
"print('Coefficients: \\n', regr_llars.coef_)\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((regr_llars.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % regr_llars.score(x, y))\n", | |
"\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(regr_llars.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(regr_llars.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(regr_llars.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(regr_llars.predict(x),y))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#OrthogonalMatchingPursuit\n", | |
"regr_omp = linear_model.OrthogonalMatchingPursuit(n_nonzero_coefs=None, tol=None, fit_intercept=True, normalize=True, precompute='auto')\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"regr_omp.fit(x, y)\n", | |
"# The coefficients\n", | |
"print('Coefficients: \\n', regr_omp.coef_)\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((regr_omp.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % regr_omp.score(x, y))\n", | |
"\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(regr_omp.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(regr_omp.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(regr_omp.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(regr_omp.predict(x),y))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#Bayesian Ridge\n", | |
"regr_bridge = linear_model.BayesianRidge(n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06, compute_score=False, fit_intercept=True, normalize=False, copy_X=True, verbose=False)\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"regr_bridge.fit(x, y)\n", | |
"# The coefficients\n", | |
"print('Coefficients: \\n', regr_bridge.coef_)\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((regr_bridge.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % regr_bridge.score(x, y))\n", | |
"\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(regr_bridge.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(regr_bridge.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(regr_bridge.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(regr_bridge.predict(x),y))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#passive aggressive regressor\n", | |
"regr_par = linear_model.PassiveAggressiveRegressor(C=1.0, fit_intercept=True, n_iter=5, shuffle=True, verbose=0, loss='epsilon_insensitive', epsilon=0.1, random_state=None, class_weight=None, warm_start=False)\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"regr_par.fit(x, y)\n", | |
"# The coefficients\n", | |
"print('Coefficients: \\n', regr_par.coef_)\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((regr_par.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % regr_par.score(x, y))\n", | |
"\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(regr_par.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(regr_par.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(regr_par.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(regr_par.predict(x),y))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#TheilSenRegressor\n", | |
"regr_tsr = linear_model.TheilSenRegressor(fit_intercept=True, copy_X=True, max_subpopulation=10000.0, n_subsamples=None, max_iter=300, tol=0.001, random_state=None, n_jobs=1, verbose=False)\n", | |
"\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"regr_tsr.fit(x, y)\n", | |
"# The coefficients\n", | |
"print('Coefficients: \\n', regr_tsr.coef_)\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((regr_tsr.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % regr_tsr.score(x, y))\n", | |
"\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(regr_tsr.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(regr_tsr.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(regr_tsr.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(regr_tsr.predict(x),y))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#Stochastic Gradient Descent Regression\n", | |
"\n", | |
"\n", | |
"regr_sgdr = linear_model.SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,\n", | |
" fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',\n", | |
" loss='squared_loss', n_iter=5, penalty='l2', power_t=0.25,\n", | |
" random_state=None, shuffle=True, verbose=0, warm_start=False)\n", | |
"# Train the model using the training sets\n", | |
"regr_sgdr.fit(x, y)\n", | |
"# The coefficients\n", | |
"print('Coefficients: \\n', regr_sgdr.coef_)\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((regr_sgdr.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % regr_sgdr.score(x, y))\n", | |
"\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(regr_sgdr.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(regr_sgdr.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(regr_sgdr.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(regr_sgdr.predict(x),y))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#decision Tree regressor\n", | |
"\n", | |
"\n", | |
"regr_dtr = DecisionTreeRegressor(criterion='mse', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None)\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"regr_dtr.fit(x, y)\n", | |
"# The coefficients\n", | |
"#print('Coefficients: \\n', regr_dtr.coef_)\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((regr_dtr.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % regr_dtr.score(x, y))\n", | |
"\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(regr_dtr.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(regr_dtr.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(regr_dtr.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(regr_dtr.predict(x),y))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#random forest regressor\n", | |
"\n", | |
"\n", | |
"regr_rfr = RandomForestRegressor(n_estimators=10, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False)\n", | |
"\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"model_rfr = regr_rfr.fit(x, y)\n", | |
"# The coefficients\n", | |
"#print('Estimators: \\n', regr_rfr.estimators_)\n", | |
"#print('feature importances: \\n', regr_rfr.feature_importances_)\n", | |
"\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((regr_rfr.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % regr_rfr.score(x, y))\n", | |
"\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(regr_rfr.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(regr_rfr.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(regr_rfr.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(regr_rfr.predict(x),y))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#Gaussian Naive Bayes\n", | |
"\n", | |
"gnb = GaussianNB()\n", | |
"# Train the model using the training sets\n", | |
"gnb.fit(x, y)\n", | |
"# The coefficients\n", | |
"#print('Coefficients: \\n', gnb.coef_)\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((gnb.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % gnb.score(x, y))\n", | |
"print(\"Accuracy: %.2f\" % metrics.accuracy_score(gnb.predict(x),y))\n", | |
"#print(\"avg Precision: %.2f\" % metrics.average_precision_score(gnb.predict(x),y))\n", | |
"print(\"Precision: %.2f\" % metrics.precision_score(gnb.predict(x),y))\n", | |
"print(\"Recall: %.2f\" % metrics.recall_score(gnb.predict(x),y))\n", | |
"#print(\"AUC: %.2f\" % metrics.roc_auc_score(gnb.predict(x),y))\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(gnb.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(gnb.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(gnb.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(gnb.predict(x),y))\n", | |
"\n", | |
"train_gnb=pd.DataFrame(np.round(np.exp(gnb.predict(x))-1,0),columns=['yhat'])\n", | |
"train_gnb2=train_imp2[['Id','imp_hazard']].join(train_gnb)\n", | |
"score_gnb=pd.DataFrame(np.round(np.exp(gnb.predict(test_data))-1,0),columns=['Hazard'])\n", | |
"score_gnb2=test_imp2[['Id']].join(score_gnb)\n", | |
"\n", | |
"train_gnb2.to_csv('d:/kaggle/Insurance/train_gnb2.csv', index=False)\n", | |
"score_gnb2.to_csv('d:/kaggle/Insurance/score_gnb2.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#multinomial nb\n", | |
"\n", | |
"#mnb = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)\n", | |
"mnb = MultinomialNB()\n", | |
"# Train the model using the training sets\n", | |
"mnb.fit(x, y)\n", | |
"# The coefficients\n", | |
"print('Coefficients: \\n', mnb.coef_)\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((mnb.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % mnb.score(x, y))\n", | |
"print(\"Accuracy: %.2f\" % metrics.accuracy_score(mnb.predict(x),y))\n", | |
"#print(\"avg Precision: %.2f\" % metrics.average_precision_score(mnb.predict(x),y))\n", | |
"print(\"Precision: %.2f\" % metrics.precision_score(mnb.predict(x),y))\n", | |
"print(\"Recall: %.2f\" % metrics.recall_score(mnb.predict(x),y))\n", | |
"#print(\"AUC: %.2f\" % metrics.roc_auc_score(mnb.predict(x),y))\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(mnb.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(mnb.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(mnb.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(mnb.predict(x),y))\n", | |
"\n", | |
"train_mnb=pd.DataFrame(np.round(np.exp(mnb.predict(x))-1,0),columns=['yhat'])\n", | |
"train_mnb2=train_imp2[['Id','imp_hazard']].join(train_mnb)\n", | |
"score_mnb=pd.DataFrame(np.round(np.exp(mnb.predict(test_data))-1,0),columns=['Hazard'])\n", | |
"score_mnb2=test_imp2[['Id']].join(score_mnb)\n", | |
"\n", | |
"train_mnb2.to_csv('d:/kaggle/Insurance/train_mnb2.csv', index=False)\n", | |
"score_mnb2.to_csv('d:/kaggle/Insurance/score_mnb2.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#BernoulliNB\n", | |
"\n", | |
"bnb = BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None)\n", | |
"# Train the model using the training sets\n", | |
"bnb.fit(x, y)\n", | |
"# The coefficients\n", | |
"print('Coefficients: \\n', bnb.coef_)\n", | |
"# The mean square error\n", | |
"print(\"Residual sum of squares: %.2f\"\n", | |
" % np.mean((bnb.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"print('Variance score: %.2f' % bnb.score(x, y))\n", | |
"print(\"Accuracy: %.2f\" % metrics.accuracy_score(bnb.predict(x),y))\n", | |
"#print(\"avg Precision: %.2f\" % metrics.average_precision_score(bnb.predict(x),y))\n", | |
"print(\"Precision: %.2f\" % metrics.precision_score(bnb.predict(x),y))\n", | |
"print(\"Recall: %.2f\" % metrics.recall_score(bnb.predict(x),y))\n", | |
"#print(\"AUC: %.2f\" % metrics.roc_auc_score(bnb.predict(x),y))\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(bnb.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(bnb.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(bnb.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(bnb.predict(x),y))\n", | |
"\n", | |
"train_bnb=pd.DataFrame(np.round(np.exp(bnb.predict(x))-1,0),columns=['yhat'])\n", | |
"train_bnb2=train_imp2[['Id','imp_hazard']].join(train_bnb)\n", | |
"score_bnb=pd.DataFrame(np.round(np.exp(bnb.predict(test_data))-1,0),columns=['Hazard'])\n", | |
"score_bnb2=test_imp2[['Id']].join(score_bnb)\n", | |
"\n", | |
"train_bnb2.to_csv('d:/kaggle/Insurance/train_bnb2.csv', index=False)\n", | |
"score_bnb2.to_csv('d:/kaggle/Insurance/score_bnb2.csv', index=False)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#sgd\n", | |
"\n", | |
"sgd = SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,eta0=0.0, fit_intercept=True,l1_ratio=0.15,learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,penalty='l2', \n", | |
" power_t=0.5, random_state=None, shuffle=True,verbose=0, warm_start=False)\n", | |
"# Train the model using the training sets\n", | |
"sgd.fit(x, y)\n", | |
"# The coefficients\n", | |
"#print('Coefficients: \\n', sgd.coef_)\n", | |
"# The mean square error\n", | |
"#print(\"Residual sum of squares: %.2f\"\n", | |
"# % np.mean((sgd.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"#print('Variance score: %.2f' % sgd.score(x, y))\n", | |
"#print(\"Accuracy: %.2f\" % metrics.accuracy_score(sgd.predict(x),y))\n", | |
"#print(\"avg Precision: %.2f\" % metrics.average_precision_score(sgd.predict(x),y))\n", | |
"#print(\"Precision: %.2f\" % metrics.precision_score(sgd.predict(x),y))\n", | |
"#print(\"Recall: %.2f\" % metrics.recall_score(sgd.predict(x),y))\n", | |
"#print(\"AUC: %.2f\" % metrics.roc_auc_score(sgd.predict(x),y))\n", | |
"#print(\"MAE: %.2f\" % metrics.mean_absolute_error(sgd.predict(x),y))\n", | |
"#print(\"MSE: %.2f\" % metrics.mean_squared_error(sgd.predict(x),y))\n", | |
"#print(\"MedianAE: %.2f\" % metrics.median_absolute_error(sgd.predict(x),y))\n", | |
"#print(\"r^2: %.2f\" % metrics.r2_score(sgd.predict(x),y))\n", | |
"\n", | |
"train_sgd=pd.DataFrame(np.round(np.exp(sgd.predict(x))-1,0),columns=['yhat'])\n", | |
"train_sgd2=train_imp2[['Id','imp_hazard']].join(train_sgd)\n", | |
"score_sgd=pd.DataFrame(np.round(np.exp(sgd.predict(test_data))-1,0),columns=['Hazard'])\n", | |
"score_sgd2=test_imp2[['Id']].join(score_sgd)\n", | |
"\n", | |
"train_sgd2.to_csv('d:/kaggle/Insurance/train_sgd2.csv', index=False)\n", | |
"score_sgd2.to_csv('d:/kaggle/Insurance/score_sgd2.csv', index=False)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#ExtratreesRegressor\n", | |
"from sklearn.ensemble import ExtraTreesRegressor\n", | |
"\n", | |
"etr =ExtraTreesRegressor(n_estimators=10, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=False, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False)\n", | |
"\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"etr.fit(x, y)\n", | |
"# The coefficients\n", | |
"#print('Coefficients: \\n', dtc.coef_)\n", | |
"# The mean square error\n", | |
"#print(\"Residual sum of squares: %.2f\"\n", | |
"# % np.mean((dtc.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"#print('Variance score: %.2f' % dtc.score(x, y))\n", | |
"#print(\"Accuracy: %.2f\" % metrics.accuracy_score(dtc.predict(x),y))\n", | |
"#print(\"avg Precision: %.2f\" % metrics.average_precision_score(dtc.predict(x),y))\n", | |
"#print(\"Precision: %.2f\" % metrics.precision_score(dtc.predict(x),y))\n", | |
"#print(\"Recall: %.2f\" % metrics.recall_score(dtc.predict(x),y))\n", | |
"\n", | |
"print(\"MAE: %.2f\" % metrics.mean_absolute_error(etr.predict(x),y))\n", | |
"print(\"MSE: %.2f\" % metrics.mean_squared_error(etr.predict(x),y))\n", | |
"print(\"MedianAE: %.2f\" % metrics.median_absolute_error(etr.predict(x),y))\n", | |
"print(\"r^2: %.2f\" % metrics.r2_score(etr.predict(x),y))\n", | |
"\n", | |
"train_etr=pd.DataFrame(etr.predict(x),columns=['yhat'])\n", | |
"train_etr2=train_imp2[['Id','imp_hazard']].join(train_etr)\n", | |
"score_etr=pd.DataFrame(etr.predict(test_data),columns=['Hazard'])\n", | |
"score_etr2=test_imp2[['Id']].join(score_etr)\n", | |
"\n", | |
"train_etr2.to_csv('d:/kaggle/Insurance/train_etr2.csv', index=False)\n", | |
"score_etr2.to_csv('d:/kaggle/Insurance/score_etr2.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#GradientBoostingClassifier\n", | |
"from sklearn.ensemble import GradientBoostingClassifier\n", | |
"\n", | |
"gbc = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False)\n", | |
"\n", | |
"# Train the model using the training sets\n", | |
"gbc.fit(x, y)\n", | |
"# The coefficients\n", | |
"#print('Coefficients: \\n', dtc.coef_)\n", | |
"# The mean square error\n", | |
"#print(\"Residual sum of squares: %.2f\"\n", | |
"# % np.mean((dtc.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"#print('Variance score: %.2f' % dtc.score(x, y))\n", | |
"#print(\"Accuracy: %.2f\" % metrics.accuracy_score(dtc.predict(x),y))\n", | |
"#print(\"avg Precision: %.2f\" % metrics.average_precision_score(dtc.predict(x),y))\n", | |
"#print(\"Precision: %.2f\" % metrics.precision_score(dtc.predict(x),y))\n", | |
"#print(\"Recall: %.2f\" % metrics.recall_score(dtc.predict(x),y))\n", | |
"\n", | |
"#print(\"MAE: %.2f\" % metrics.mean_absolute_error(gbc.predict(x),y))\n", | |
"#print(\"MSE: %.2f\" % metrics.mean_squared_error(gbc.predict(x),y))\n", | |
"#print(\"MedianAE: %.2f\" % metrics.median_absolute_error(gbc.predict(x),y))\n", | |
"#print(\"r^2: %.2f\" % metrics.r2_score(gbc.predict(x),y))\n", | |
"\n", | |
"train_gbc=pd.DataFrame(gbc.predict(x),columns=['yhat'])\n", | |
"train_gbc2=train_imp2[['Id','imp_hazard']].join(train_gbc)\n", | |
"score_gbc=pd.DataFrame(gbc.predict(test_data),columns=['Hazard'])\n", | |
"score_gbc2=test_imp2[['Id']].join(score_gbc)\n", | |
"\n", | |
"train_gbc2.to_csv('d:/kaggle/Insurance/train_gbc2.csv', index=False)\n", | |
"score_gbc2.to_csv('d:/kaggle/Insurance/score_gbc2.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#AdaBoostClassifier\n", | |
"from sklearn.ensemble import AdaBoostClassifier\n", | |
"\n", | |
"abc = AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)\n", | |
"# Train the model using the training sets\n", | |
"abc.fit(x, y)\n", | |
"# The coefficients\n", | |
"#print('Coefficients: \\n', dtc.coef_)\n", | |
"# The mean square error\n", | |
"#print(\"Residual sum of squares: %.2f\"\n", | |
"# % np.mean((dtc.predict(x) - y) ** 2))\n", | |
"# Explained variance score: 1 is perfect prediction\n", | |
"#print('Variance score: %.2f' % dtc.score(x, y))\n", | |
"#print(\"Accuracy: %.2f\" % metrics.accuracy_score(dtc.predict(x),y))\n", | |
"#print(\"avg Precision: %.2f\" % metrics.average_precision_score(dtc.predict(x),y))\n", | |
"#print(\"Precision: %.2f\" % metrics.precision_score(dtc.predict(x),y))\n", | |
"#print(\"Recall: %.2f\" % metrics.recall_score(dtc.predict(x),y))\n", | |
"\n", | |
"#print(\"MAE: %.2f\" % metrics.mean_absolute_error(gbc.predict(x),y))\n", | |
"#print(\"MSE: %.2f\" % metrics.mean_squared_error(gbc.predict(x),y))\n", | |
"#print(\"MedianAE: %.2f\" % metrics.median_absolute_error(gbc.predict(x),y))\n", | |
"#print(\"r^2: %.2f\" % metrics.r2_score(gbc.predict(x),y))\n", | |
"\n", | |
"train_abc=pd.DataFrame(abc.predict(x),columns=['yhat'])\n", | |
"train_abc2=train_imp2[['Id','imp_hazard']].join(train_abc)\n", | |
"score_abc=pd.DataFrame(abc.predict(test_data),columns=['Hazard'])\n", | |
"score_abc2=test_imp2[['Id']].join(score_abc)\n", | |
"\n", | |
"train_abc2.to_csv('d:/kaggle/Insurance/train_abc2.csv', index=False)\n", | |
"score_abc2.to_csv('d:/kaggle/Insurance/score_abc2.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.decomposition import PCA\n", | |
"\n", | |
"pca = PCA(copy=True, n_components=None, whiten=False)\n", | |
"model_pca=pca.fit(x)\n", | |
"print(pca.explained_variance_ratio_)\n", | |
"print(pca.components_)\n", | |
"print(pca.mean_)\n", | |
"print(pca.n_components_)\n", | |
"print(pca.noise_variance_)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.decomposition import IncrementalPCA\n", | |
"ipca = IncrementalPCA(copy=True, n_components=None, whiten=False,batch_size=None)\n", | |
"ipca.fit(x)\n", | |
"print(ipca.explained_variance_ratio_)\n", | |
"# print(ipca.explained_variance_)\n", | |
"print(ipca.components_)\n", | |
"print(ipca.mean_)\n", | |
"# print(ipca.var_)\n", | |
"# print(ipca.n_components_)\n", | |
"# print(ipca.noise_variance_)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.decomposition import RandomizedPCA\n", | |
"rpca = RandomizedPCA(copy=True, n_components=None, whiten=False,iterated_power=3,random_state=None)\n", | |
"rpca.fit(x)\n", | |
"print(rpca.explained_variance_ratio_)\n", | |
"print(rpca.components_)\n", | |
"print(rpca.mean_)\n", | |
"#print(rpca.n_components_)\n", | |
"#print(rpca.noise_variance_)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.decomposition import MiniBatchSparsePCA\n", | |
"mspca = MiniBatchSparsePCA(n_components=None, alpha=1, ridge_alpha=0.01, n_iter=100, callback=None, batch_size=3, verbose=False, shuffle=True, n_jobs=1, method='lars', random_state=None)\n", | |
"mspca.fit(x)\n", | |
"#print(mpca.explained_variance_ratio_)\n", | |
"print(mspca.components_)\n", | |
"#print(mspca.error_)\n", | |
"print(mspca.n_iter_)\n", | |
"#print(mpca.noise_variance_)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"poi_res.predict(test_data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import statsmodels.api as sm\n", | |
"\n", | |
"#statsmodels.discrete.discrete_model.Poisson\n", | |
"poi_model = sm.Poisson(y, x) # offset=None, exposure=None, missing='none')\n", | |
"poi_res = poi_model.fit(method=\"newton\")\n", | |
"print(poi_res.summary())\n", | |
"#poi_model.predict(test_data)\n", | |
"poi_res.predict(test_data)\n", | |
"\n", | |
"train_poi=pd.DataFrame(exp(poi_res.predict(x)),columns=['yhat'])\n", | |
"train_poi2=train_imp2[['Id','imp_hazard']].join(train_poi)\n", | |
"score_poi=pd.DataFrame(exp(poi_res.predict(test_data)),columns=['Hazard'])\n", | |
"score_poi2=test_imp2[['Id']].join(score_poi)\n", | |
"\n", | |
"train_poi2.to_csv('d:/kaggle/Insurance/train_poi2.csv', index=False)\n", | |
"score_poi2.to_csv('d:/kaggle/Insurance/score_poi2.csv', index=False)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"poisummary()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# API reference notes (signatures copied from the statsmodels docs; not executable code)\n", | |
"# class statsmodels.discrete.discrete_model.Poisson(endog, exog, offset=None, exposure=None, missing='none', **kwargs)\n", | |
"# class statsmodels.discrete.discrete_model.CountResults(model, mlefit, cov_type='nonrobust', cov_kwds=None, use_t=None)\n", | |
"\n", | |
"# class statsmodels.discrete.discrete_model.NegativeBinomial(endog, exog, loglike_method='nb2', offset=None, exposure=None, missing='none', **kwargs)\n", | |
"# class statsmodels.discrete.discrete_model.NegativeBinomialResults(model, mlefit, cov_type='nonrobust', cov_kwds=None, use_t=None)\n", | |
"\n", | |
"\n", | |
"\n", | |
"# class statsmodels.discrete.discrete_model.Logit(endog, exog, **kwargs)\n", | |
"# class statsmodels.discrete.discrete_model.LogitResults(model, mlefit, cov_type='nonrobust', cov_kwds=None, use_t=None)\n", | |
"\n", | |
"# class statsmodels.discrete.discrete_model.Probit(endog, exog, **kwargs)\n", | |
"# class statsmodels.discrete.discrete_model.ProbitResults(model, mlefit, cov_type='nonrobust', cov_kwds=None, use_t=None)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# API reference note (signature copied from the scikit-learn docs; not executable code)\n", | |
"# class sklearn.linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', verbose=0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.4.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment