Last active
June 2, 2018 11:30
-
-
Save satory074/61c42da4da242d03ffd2997329539396 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# read the raw training and test tables from the input directory\n", | |
"train, test = (pd.read_csv(csv_path)\n", | |
"               for csv_path in ('input/train.csv', 'input/test.csv'))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# keep only the modelling columns: train also keeps the target column,\n", | |
"# test also keeps the passenger id needed for the submission file\n", | |
"features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']\n", | |
"train = train.loc[:, ['Survived'] + features]\n", | |
"test = test.loc[:, ['PassengerId'] + features]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Survived 0\n", | |
"Pclass 0\n", | |
"Sex 0\n", | |
"Age 177\n", | |
"SibSp 0\n", | |
"Parch 0\n", | |
"Fare 0\n", | |
"Embarked 2\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# number of missing values in each training column\n", | |
"pd.isnull(train).sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"PassengerId 0\n", | |
"Pclass 0\n", | |
"Sex 0\n", | |
"Age 86\n", | |
"SibSp 0\n", | |
"Parch 0\n", | |
"Fare 1\n", | |
"Embarked 0\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# number of missing values in each test column\n", | |
"pd.isnull(test).sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Treat empty strings as missing, then drop the training rows that have\n", | |
"# no 'Embarked' value (listwise deletion on that column).\n", | |
"train = train.replace('', np.nan).dropna(subset=['Embarked'])\n", | |
"\n", | |
"# Fill the missing test fare with the mean fare of the test set.\n", | |
"mean_test_fare = test['Fare'].mean()\n", | |
"test = test.fillna({'Fare': mean_test_fare})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# replace String with value\n", | |
"from sklearn.preprocessing import LabelEncoder\n", | |
"\n", | |
"# Learn each column's category -> integer mapping from the training set only\n", | |
"# and reuse it for the test set, so a given code means the same category in\n", | |
"# both frames. transform() raises ValueError if test contains a category\n", | |
"# that train never saw, instead of silently assigning shifted codes.\n", | |
"for label in ['Embarked', 'Sex']:\n", | |
"    le = LabelEncoder()\n", | |
"    train[label] = le.fit_transform(train[label])\n", | |
"    test[label] = le.transform(test[label])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# complements missing value of age\n", | |
"# [reference] kernel by Poonam Ligade\n", | |
"from sklearn.ensemble import RandomForestRegressor\n", | |
"def fill_missing_age(df, random_state=None):\n", | |
"    \"\"\"Impute missing 'Age' values in ``df`` with a random-forest regressor.\n", | |
"\n", | |
"    A RandomForestRegressor is fitted on the rows whose Age is known, using\n", | |
"    Embarked, Fare, Parch, SibSp and Pclass as predictors, and its\n", | |
"    predictions are written into the rows whose Age is missing.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    df : pandas.DataFrame\n", | |
"        Must contain the columns Age, Embarked, Fare, Parch, SibSp and\n", | |
"        Pclass. The predictor columns are handed to the regressor as-is.\n", | |
"        ``df`` is modified in place.\n", | |
"    random_state : int or None, optional\n", | |
"        Seed forwarded to the regressor; pass an int for reproducible\n", | |
"        imputations. The default (None) leaves the regressor unseeded.\n", | |
"\n", | |
"    Returns\n", | |
"    -------\n", | |
"    pandas.DataFrame\n", | |
"        The same ``df`` object, with the missing ages filled in.\n", | |
"    \"\"\"\n", | |
"    age_missing = df['Age'].isnull()\n", | |
"    if not age_missing.any():\n", | |
"        # Nothing to impute; asking the regressor to predict zero rows fails.\n", | |
"        return df\n", | |
"\n", | |
"    age_df = df[['Age', 'Embarked', 'Fare', 'Parch', 'SibSp', 'Pclass']]\n", | |
"    known = age_df.loc[~age_missing].values\n", | |
"    unknown = age_df.loc[age_missing].values\n", | |
"\n", | |
"    y = known[:, 0]   # Age\n", | |
"    X = known[:, 1:]  # every column except Age\n", | |
"\n", | |
"    # Random Forest\n", | |
"    rtr = RandomForestRegressor(n_estimators=2000, n_jobs=-1,\n", | |
"                                random_state=random_state)\n", | |
"    rtr.fit(X, y)\n", | |
"\n", | |
"    df.loc[age_missing, 'Age'] = rtr.predict(unknown[:, 1:])\n", | |
"\n", | |
"    return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# impute ages frame by frame; each call fits its own regressor on that frame\n", | |
"train, test = [fill_missing_age(frame) for frame in (train, test)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# standardize features\n", | |
"from sklearn.preprocessing import StandardScaler\n", | |
"\n", | |
"# Fit the scaler on the training set only and apply the same mean/variance\n", | |
"# to the test set: the model must see test features on the scale it was\n", | |
"# trained on, and test-set statistics must not enter the preprocessing.\n", | |
"scaled_cols = ['Age', 'Fare']\n", | |
"std_scale = StandardScaler().fit(train[scaled_cols])\n", | |
"train[scaled_cols] = std_scale.transform(train[scaled_cols])\n", | |
"test[scaled_cols] = std_scale.transform(test[scaled_cols])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# dataset: separate the target from the inputs; the id column is not a feature\n", | |
"Y_train = train[\"Survived\"]\n", | |
"X_train = train.drop(labels=[\"Survived\"], axis='columns')\n", | |
"X_test = test.drop(labels=[\"PassengerId\"], axis='columns').copy()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# model selection\n", | |
"from sklearn.neighbors import KNeighborsClassifier\n", | |
"from sklearn.tree import DecisionTreeClassifier\n", | |
"from sklearn.svm import SVC\n", | |
"from sklearn.ensemble import AdaBoostClassifier\n", | |
"from sklearn.ensemble import GradientBoostingClassifier\n", | |
"\n", | |
"from sklearn.model_selection import KFold\n", | |
"from sklearn.model_selection import cross_val_score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"--- accuracy ---\n", | |
"KNC: 0.834\n", | |
"DTC: 0.751\n", | |
"SVM: 0.852\n", | |
"AdaBoost: 0.85\n", | |
"GradientBoosting: 0.864\n" | |
] | |
} | |
], | |
"source": [ | |
"models = [(\"KNC\", KNeighborsClassifier()),\n", | |
"          (\"DTC\", DecisionTreeClassifier()),\n", | |
"          (\"SVM\", SVC()),\n", | |
"          (\"AdaBoost\", AdaBoostClassifier()),\n", | |
"          (\"GradientBoosting\", GradientBoostingClassifier())]\n", | |
"\n", | |
"# KFold only honours random_state when shuffle=True; without it the seed is\n", | |
"# ignored, and recent scikit-learn versions raise ValueError.\n", | |
"# One seeded splitter is shared so every model is scored on the same folds.\n", | |
"kfold = KFold(n_splits=10, shuffle=True, random_state=42)\n", | |
"\n", | |
"# scoring is 'roc_auc', so the values printed below are mean ROC AUC scores\n", | |
"print('--- ROC AUC ---')\n", | |
"for name, model in models:\n", | |
"    result = cross_val_score(model, X_train, Y_train,\n", | |
"                             cv=kfold, scoring=\"roc_auc\")\n", | |
"    print('{}: {:.3}'.format(name, result.mean()))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Gradient boosting (GBM) achieved the highest cross-validated score above, so it is selected as the model." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# tuning the hyper-parameters of an estimator\n", | |
"from sklearn.model_selection import GridSearchCV" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.8688672260850385\n", | |
"{'n_estimators': 98}\n" | |
] | |
} | |
], | |
"source": [ | |
"# tuning 'n_estimators'\n", | |
"param1 = dict(n_estimators=range(20, 101))\n", | |
"gsearch1 = GridSearchCV(GradientBoostingClassifier(), param1,\n", | |
"                        scoring='roc_auc', cv=5).fit(X_train, Y_train)\n", | |
"\n", | |
"# best mean cross-validated ROC AUC, then the setting that produced it\n", | |
"print(gsearch1.best_score_, gsearch1.best_params_, sep='\\n')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.8759890738587764\n", | |
"{'max_depth': 5, 'min_samples_split': 302}\n" | |
] | |
} | |
], | |
"source": [ | |
"# tuning 'max_depth' and 'min_samples_split'\n", | |
"# The base estimator takes n_estimators from the previous search instead of\n", | |
"# a hand-copied number, so this step cannot drift out of sync with it.\n", | |
"param2 = {'max_depth': range(2, 16), 'min_samples_split': range(2, 1002, 50)}\n", | |
"gsearch2 = GridSearchCV(\n", | |
"    estimator=GradientBoostingClassifier(**gsearch1.best_params_),\n", | |
"    param_grid=param2, scoring='roc_auc', cv=5)\n", | |
"\n", | |
"gsearch2.fit(X_train, Y_train)\n", | |
"\n", | |
"print(gsearch2.best_score_)\n", | |
"print(gsearch2.best_params_)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# predict\n", | |
"# Build the final model from the hyper-parameters selected by the two grid\n", | |
"# searches rather than copying their printed values by hand, so a re-run\n", | |
"# with different search results cannot leave stale numbers here.\n", | |
"best_params = {**gsearch1.best_params_, **gsearch2.best_params_}\n", | |
"clf = GradientBoostingClassifier(**best_params)\n", | |
"clf.fit(X_train, Y_train)\n", | |
"result = clf.predict(X_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# output\n", | |
"# one row per test passenger: its id followed by the predicted label\n", | |
"submission = test[[\"PassengerId\"]].assign(Survived=result)\n", | |
"submission.to_csv(\"output/submission.csv\", index=False)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment