Skip to content

Instantly share code, notes, and snippets.

@satory074
Last active June 2, 2018 11:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save satory074/61c42da4da242d03ffd2997329539396 to your computer and use it in GitHub Desktop.
Save satory074/61c42da4da242d03ffd2997329539396 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# load\n",
"train = pd.read_csv('input/train.csv')\n",
"test = pd.read_csv('input/test.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# extract\n",
"train = train.loc[:, ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]\n",
"test = test.loc[:, ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Survived 0\n",
"Pclass 0\n",
"Sex 0\n",
"Age 177\n",
"SibSp 0\n",
"Parch 0\n",
"Fare 0\n",
"Embarked 2\n",
"dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PassengerId 0\n",
"Pclass 0\n",
"Sex 0\n",
"Age 86\n",
"SibSp 0\n",
"Parch 0\n",
"Fare 1\n",
"Embarked 0\n",
"dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# handle missing values: drop train rows without Embarked, fill missing test Fare with the mean\n",
"train.replace('', np.nan, inplace=True)\n",
"train.dropna(subset=['Embarked'], inplace=True)\n",
"\n",
"test.fillna({'Fare': test['Fare'].mean()}, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# encode string categories as integer labels\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"le = LabelEncoder()\n",
"for label in ['Embarked','Sex']:\n",
" train[label] = le.fit_transform(train[label])\n",
" test[label] = le.fit_transform(test[label])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# fill in missing Age values using a random forest regressor\n",
"# [reference] kernel by Poonam Ligade\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"def fill_missing_age(df):\n",
" age_df = df[['Age','Embarked','Fare', 'Parch', 'SibSp', 'Pclass']]\n",
"\n",
" train = age_df.loc[ (df.Age.notnull()) ]\n",
" test = age_df.loc[ (df.Age.isnull()) ]\n",
"\n",
" y = train.values[:, 0] # Age\n",
" X = train.values[:, 1::] # except Age\n",
"\n",
" # Random Forest\n",
" rtr = RandomForestRegressor(n_estimators=2000, n_jobs=-1)\n",
" rtr.fit(X, y)\n",
"\n",
" predictedAges = rtr.predict(test.values[:, 1::])\n",
"\n",
" df.loc[ (df.Age.isnull()), 'Age' ] = predictedAges\n",
"\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"train = fill_missing_age(train)\n",
"test = fill_missing_age(test)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# standardize features\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"std_scale = StandardScaler().fit(train[['Age', 'Fare']])\n",
"train[['Age', 'Fare']] = std_scale.transform(train[['Age', 'Fare']])\n",
"\n",
"std_scale = StandardScaler().fit(test[['Age', 'Fare']])\n",
"test[['Age', 'Fare']] = std_scale.transform(test[['Age', 'Fare']])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# dataset\n",
"X_train = train.drop(\"Survived\", axis=1)\n",
"Y_train = train[\"Survived\"]\n",
"X_test = test.drop(\"PassengerId\", axis=1).copy()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# model selection\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.svm import SVC\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"\n",
"from sklearn.model_selection import KFold\n",
"from sklearn.model_selection import cross_val_score"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- accuracy ---\n",
"KNC: 0.834\n",
"DTC: 0.751\n",
"SVM: 0.852\n",
"AdaBoost: 0.85\n",
"GradientBoosting: 0.864\n"
]
}
],
"source": [
"models = [(\"KNC\", KNeighborsClassifier()),\n",
" (\"DTC\", DecisionTreeClassifier()),\n",
" (\"SVM\", SVC()),\n",
" (\"AdaBoost\", AdaBoostClassifier()),\n",
" (\"GradientBoosting\",GradientBoostingClassifier())]\n",
"\n",
"print ('--- accuracy ---')\n",
"for name, model in models:\n",
" kfold = KFold(n_splits=10, random_state=42)\n",
" result = cross_val_score(model, X_train,Y_train,\n",
" cv=kfold,scoring=\"roc_auc\")\n",
" \n",
" print ('{}: {:.3}'.format(name, result.mean()))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Select GBM (gradient boosting) as the model, since it has the highest cross-validated score above."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# tuning the hyper-parameters of an estimator\n",
"from sklearn.model_selection import GridSearchCV"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.8688672260850385\n",
"{'n_estimators': 98}\n"
]
}
],
"source": [
"# tuning 'n_estimators'\n",
"param1 = {'n_estimators':range(20,101)}\n",
"gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(), \n",
" param_grid=param1, scoring='roc_auc', cv=5)\n",
"\n",
"gsearch1.fit(X_train, Y_train)\n",
"\n",
"print(gsearch1.best_score_)\n",
"print(gsearch1.best_params_)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.8759890738587764\n",
"{'max_depth': 5, 'min_samples_split': 302}\n"
]
}
],
"source": [
"# tuning 'max_depth' and 'min_samples_split'\n",
"param2 = {'max_depth':range(2,16), 'min_samples_split':range(2, 1002, 50)}\n",
"gsearch2 = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=84), \n",
" param_grid=param2, scoring='roc_auc', cv=5)\n",
"\n",
"gsearch2.fit(X_train, Y_train)\n",
"\n",
"print(gsearch2.best_score_)\n",
"print(gsearch2.best_params_)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# predict\n",
"clf = GradientBoostingClassifier(n_estimators=98, max_depth=5, min_samples_split=302)\n",
"clf.fit(X_train, Y_train)\n",
"result = clf.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# output\n",
"submission = pd.DataFrame({\n",
" \"PassengerId\": test[\"PassengerId\"],\n",
" \"Survived\": result})\n",
"\n",
"submission.to_csv(\"output/submission.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment