satory074/kaggle_titanic.ipynb

## kaggle_titanic.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the training and test CSVs from the local input/ directory.\n",
    "train, test = (pd.read_csv(csv_path) for csv_path in ('input/train.csv', 'input/test.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the columns used below: 'Survived' (train) or 'PassengerId' (test)\n",
    "# followed by the seven columns the two sets share.\n",
    "feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']\n",
    "train = train.loc[:, ['Survived'] + feature_cols]\n",
    "test = test.loc[:, ['PassengerId'] + feature_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Survived      0\n",
       "Pclass        0\n",
       "Sex           0\n",
       "Age         177\n",
       "SibSp         0\n",
       "Parch         0\n",
       "Fare          0\n",
       "Embarked      2\n",
       "dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PassengerId     0\n",
       "Pclass          0\n",
       "Sex             0\n",
       "Age            86\n",
       "SibSp           0\n",
       "Parch           0\n",
       "Fare            1\n",
       "Embarked        0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Handle the missing values outside of Age.\n",
    "# train: treat empty strings as missing, then drop the rows without an Embarked value.\n",
    "train.replace(to_replace='', value=np.nan, inplace=True)\n",
    "train.dropna(subset=['Embarked'], inplace=True)\n",
    "\n",
    "# test: fill missing Fare values with the mean Fare of the test set.\n",
    "test.fillna(value={'Fare': test['Fare'].mean()}, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# replace String with value\n",
    "# Each encoder is fitted on the training column only and then reused for the test\n",
    "# column, so a category always maps to the same integer in both sets. A category\n",
    "# that appears only in the test set raises ValueError instead of being silently\n",
    "# given an unrelated code.\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "for label in ['Embarked', 'Sex']:\n",
    "    le = LabelEncoder().fit(train[label])\n",
    "    train[label] = le.transform(train[label])\n",
    "    test[label] = le.transform(test[label])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# complements missing value of age\n",
    "# [reference] kernel by Poonam Ligade\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "\n",
    "def fill_missing_age(df, random_state=None):\n",
    "    \"\"\"Fill the missing 'Age' values of df with random-forest predictions.\n",
    "\n",
    "    A RandomForestRegressor is fitted on the rows whose Age is present, using\n",
    "    Embarked, Fare, Parch, SibSp and Pclass as predictors, and its predictions\n",
    "    are written into the rows whose Age is missing.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with the columns Age, Embarked, Fare, Parch, SibSp and\n",
    "            Pclass. It is modified in place.\n",
    "        random_state: Optional seed passed to RandomForestRegressor. The\n",
    "            default None leaves the forest unseeded, as before.\n",
    "\n",
    "    Returns:\n",
    "        The same df object that was passed in.\n",
    "    \"\"\"\n",
    "    predictors = ['Embarked', 'Fare', 'Parch', 'SibSp', 'Pclass']\n",
    "    age_missing = df['Age'].isnull()\n",
    "\n",
    "    # Nothing to impute: return early, because predict() rejects a 0-row input.\n",
    "    if not age_missing.any():\n",
    "        return df\n",
    "\n",
    "    known = df.loc[~age_missing]\n",
    "\n",
    "    # Random Forest\n",
    "    rtr = RandomForestRegressor(n_estimators=2000, n_jobs=-1,\n",
    "                                random_state=random_state)\n",
    "    rtr.fit(known[predictors].values, known['Age'].values)\n",
    "\n",
    "    df.loc[age_missing, 'Age'] = rtr.predict(df.loc[age_missing, predictors].values)\n",
    "\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Impute Age in each dataset with its own model (train first, then test).\n",
    "train, test = fill_missing_age(train), fill_missing_age(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# standardize features\n",
    "# The scaler is fitted on the training set only and the same mean/std are then\n",
    "# applied to the test set. Fitting a second scaler on the test set would put the\n",
    "# test features on a different scale from the one the model is trained on.\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "scaled_cols = ['Age', 'Fare']\n",
    "std_scale = StandardScaler().fit(train[scaled_cols])\n",
    "train[scaled_cols] = std_scale.transform(train[scaled_cols])\n",
    "test[scaled_cols] = std_scale.transform(test[scaled_cols])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataset\n",
    "# Target and inputs for training; the test inputs are the test frame without its id column.\n",
    "Y_train = train[\"Survived\"]\n",
    "X_train = train.drop(columns=[\"Survived\"])\n",
    "X_test = test.drop(columns=[\"PassengerId\"]).copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# model selection\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.model_selection import cross_val_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--- ROC AUC ---\n",
      "KNC: 0.834\n",
      "DTC: 0.751\n",
      "SVM: 0.852\n",
      "AdaBoost: 0.85\n",
      "GradientBoosting: 0.864\n"
     ]
    }
   ],
   "source": [
    "models = [(\"KNC\", KNeighborsClassifier()),\n",
    "          (\"DTC\", DecisionTreeClassifier()),\n",
    "          (\"SVM\", SVC()),\n",
    "          (\"AdaBoost\", AdaBoostClassifier()),\n",
    "          (\"GradientBoosting\", GradientBoostingClassifier())]\n",
    "\n",
    "# Unshuffled 10-fold split, shared by all models. random_state is omitted because\n",
    "# it only has an effect with shuffle=True; recent scikit-learn versions raise\n",
    "# ValueError when it is set without shuffling.\n",
    "kfold = KFold(n_splits=10)\n",
    "\n",
    "# scoring=\"roc_auc\", so the values printed are mean ROC AUC, not accuracy.\n",
    "print('--- ROC AUC ---')\n",
    "for name, model in models:\n",
    "    result = cross_val_score(model, X_train, Y_train,\n",
    "                             cv=kfold, scoring=\"roc_auc\")\n",
    "\n",
    "    print('{}: {:.3}'.format(name, result.mean()))"
   ]
  },
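  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "GradientBoosting has the highest mean cross-validated ROC AUC of the five candidates (0.864), so GBM is selected as the model."
   ]
  },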
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# tuning the hyper-parameters of an estimator\n",
    "from sklearn.model_selection import GridSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8688672260850385\n",
      "{'n_estimators': 98}\n"
     ]
    }
   ],
   "source": [
    "# tuning 'n_estimators'\n",
    "param1 = {'n_estimators':range(20,101)}\n",
    "gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(), \n",
    "                        param_grid=param1, scoring='roc_auc', cv=5)\n",
    "\n",
    "gsearch1.fit(X_train, Y_train)\n",
    "\n",
    "print(gsearch1.best_score_)\n",
    "print(gsearch1.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8759890738587764\n",
      "{'max_depth': 5, 'min_samples_split': 302}\n"
     ]
    }
   ],
   "source": [
    "# tuning 'max_depth' and 'min_samples_split'\n",
    "param2 = {'max_depth':range(2,16), 'min_samples_split':range(2, 1002, 50)}\n",
    "gsearch2 = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=84), \n",
    "                        param_grid=param2, scoring='roc_auc', cv=5)\n",
    "\n",
    "gsearch2.fit(X_train, Y_train)\n",
    "\n",
    "print(gsearch2.best_score_)\n",
    "print(gsearch2.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# predict\n",
    "# Build the final model from the parameters chosen by the two grid searches\n",
    "# (n_estimators from gsearch1; max_depth and min_samples_split from gsearch2)\n",
    "# rather than from constants copied by hand out of their printed output.\n",
    "best_params = dict(gsearch1.best_params_, **gsearch2.best_params_)\n",
    "clf = GradientBoostingClassifier(**best_params)\n",
    "clf.fit(X_train, Y_train)\n",
    "result = clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# output\n",
    "import os\n",
    "\n",
    "submission = pd.DataFrame({\n",
    "        \"PassengerId\": test[\"PassengerId\"],\n",
    "        \"Survived\": result})\n",
    "\n",
    "# Create the output directory first: to_csv does not create missing parent\n",
    "# directories, so the write would fail if output/ does not exist yet.\n",
    "os.makedirs(\"output\", exist_ok=True)\n",
    "submission.to_csv(\"output/submission.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Load the training and test CSVs from the local input/ directory.\n",
	"train, test = (pd.read_csv(csv_path) for csv_path in ('input/train.csv', 'input/test.csv'))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Keep only the columns used below: 'Survived' (train) or 'PassengerId' (test)\n",
	"# followed by the seven columns the two sets share.\n",
	"feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']\n",
	"train = train.loc[:, ['Survived'] + feature_cols]\n",
	"test = test.loc[:, ['PassengerId'] + feature_cols]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"Survived 0\n",
	"Pclass 0\n",
	"Sex 0\n",
	"Age 177\n",
	"SibSp 0\n",
	"Parch 0\n",
	"Fare 0\n",
	"Embarked 2\n",
	"dtype: int64"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"train.isnull().sum()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"PassengerId 0\n",
	"Pclass 0\n",
	"Sex 0\n",
	"Age 86\n",
	"SibSp 0\n",
	"Parch 0\n",
	"Fare 1\n",
	"Embarked 0\n",
	"dtype: int64"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"test.isnull().sum()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Handle the missing values outside of Age.\n",
	"# train: treat empty strings as missing, then drop the rows without an Embarked value.\n",
	"train.replace(to_replace='', value=np.nan, inplace=True)\n",
	"train.dropna(subset=['Embarked'], inplace=True)\n",
	"\n",
	"# test: fill missing Fare values with the mean Fare of the test set.\n",
	"test.fillna(value={'Fare': test['Fare'].mean()}, inplace=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"# replace String with value\n",
	"# Each encoder is fitted on the training column only and then reused for the test\n",
	"# column, so a category always maps to the same integer in both sets. A category\n",
	"# that appears only in the test set raises ValueError instead of being silently\n",
	"# given an unrelated code.\n",
	"from sklearn.preprocessing import LabelEncoder\n",
	"\n",
	"for label in ['Embarked', 'Sex']:\n",
	"    le = LabelEncoder().fit(train[label])\n",
	"    train[label] = le.transform(train[label])\n",
	"    test[label] = le.transform(test[label])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"# complements missing value of age\n",
	"# [reference] kernel by Poonam Ligade\n",
	"from sklearn.ensemble import RandomForestRegressor\n",
	"\n",
	"def fill_missing_age(df, random_state=None):\n",
	"    \"\"\"Fill the missing 'Age' values of df with random-forest predictions.\n",
	"\n",
	"    A RandomForestRegressor is fitted on the rows whose Age is present, using\n",
	"    Embarked, Fare, Parch, SibSp and Pclass as predictors, and its predictions\n",
	"    are written into the rows whose Age is missing.\n",
	"\n",
	"    Args:\n",
	"        df: DataFrame with the columns Age, Embarked, Fare, Parch, SibSp and\n",
	"            Pclass. It is modified in place.\n",
	"        random_state: Optional seed passed to RandomForestRegressor. The\n",
	"            default None leaves the forest unseeded, as before.\n",
	"\n",
	"    Returns:\n",
	"        The same df object that was passed in.\n",
	"    \"\"\"\n",
	"    predictors = ['Embarked', 'Fare', 'Parch', 'SibSp', 'Pclass']\n",
	"    age_missing = df['Age'].isnull()\n",
	"\n",
	"    # Nothing to impute: return early, because predict() rejects a 0-row input.\n",
	"    if not age_missing.any():\n",
	"        return df\n",
	"\n",
	"    known = df.loc[~age_missing]\n",
	"\n",
	"    # Random Forest\n",
	"    rtr = RandomForestRegressor(n_estimators=2000, n_jobs=-1,\n",
	"                                random_state=random_state)\n",
	"    rtr.fit(known[predictors].values, known['Age'].values)\n",
	"\n",
	"    df.loc[age_missing, 'Age'] = rtr.predict(df.loc[age_missing, predictors].values)\n",
	"\n",
	"    return df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Impute Age in each dataset with its own model (train first, then test).\n",
	"train, test = fill_missing_age(train), fill_missing_age(test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [],
	"source": [
	"# standardize features\n",
	"# The scaler is fitted on the training set only and the same mean/std are then\n",
	"# applied to the test set. Fitting a second scaler on the test set would put the\n",
	"# test features on a different scale from the one the model is trained on.\n",
	"from sklearn.preprocessing import StandardScaler\n",
	"\n",
	"scaled_cols = ['Age', 'Fare']\n",
	"std_scale = StandardScaler().fit(train[scaled_cols])\n",
	"train[scaled_cols] = std_scale.transform(train[scaled_cols])\n",
	"test[scaled_cols] = std_scale.transform(test[scaled_cols])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [],
	"source": [
	"# dataset\n",
	"# Target and inputs for training; the test inputs are the test frame without its id column.\n",
	"Y_train = train[\"Survived\"]\n",
	"X_train = train.drop(columns=[\"Survived\"])\n",
	"X_test = test.drop(columns=[\"PassengerId\"]).copy()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"# model selection\n",
	"from sklearn.neighbors import KNeighborsClassifier\n",
	"from sklearn.tree import DecisionTreeClassifier\n",
	"from sklearn.svm import SVC\n",
	"from sklearn.ensemble import AdaBoostClassifier\n",
	"from sklearn.ensemble import GradientBoostingClassifier\n",
	"\n",
	"from sklearn.model_selection import KFold\n",
	"from sklearn.model_selection import cross_val_score"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"--- ROC AUC ---\n",
	"KNC: 0.834\n",
	"DTC: 0.751\n",
	"SVM: 0.852\n",
	"AdaBoost: 0.85\n",
	"GradientBoosting: 0.864\n"
	]
	}
	],
	"source": [
	"models = [(\"KNC\", KNeighborsClassifier()),\n",
	"          (\"DTC\", DecisionTreeClassifier()),\n",
	"          (\"SVM\", SVC()),\n",
	"          (\"AdaBoost\", AdaBoostClassifier()),\n",
	"          (\"GradientBoosting\", GradientBoostingClassifier())]\n",
	"\n",
	"# Unshuffled 10-fold split, shared by all models. random_state is omitted because\n",
	"# it only has an effect with shuffle=True; recent scikit-learn versions raise\n",
	"# ValueError when it is set without shuffling.\n",
	"kfold = KFold(n_splits=10)\n",
	"\n",
	"# scoring=\"roc_auc\", so the values printed are mean ROC AUC, not accuracy.\n",
	"print('--- ROC AUC ---')\n",
	"for name, model in models:\n",
	"    result = cross_val_score(model, X_train, Y_train,\n",
	"                             cv=kfold, scoring=\"roc_auc\")\n",
	"\n",
	"    print('{}: {:.3}'.format(name, result.mean()))"
	]
	},
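	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"GradientBoosting has the highest mean cross-validated ROC AUC of the five candidates (0.864), so GBM is selected as the model."
	]
	},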
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [],
	"source": [
	"# tuning the hyper-parameters of an estimator\n",
	"from sklearn.model_selection import GridSearchCV"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0.8688672260850385\n",
	"{'n_estimators': 98}\n"
	]
	}
	],
	"source": [
	"# tuning 'n_estimators'\n",
	"param1 = {'n_estimators':range(20,101)}\n",
	"gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(), \n",
	" param_grid=param1, scoring='roc_auc', cv=5)\n",
	"\n",
	"gsearch1.fit(X_train, Y_train)\n",
	"\n",
	"print(gsearch1.best_score_)\n",
	"print(gsearch1.best_params_)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0.8759890738587764\n",
	"{'max_depth': 5, 'min_samples_split': 302}\n"
	]
	}
	],
	"source": [
	"# tuning 'max_depth' and 'min_samples_split'\n",
	"param2 = {'max_depth':range(2,16), 'min_samples_split':range(2, 1002, 50)}\n",
	"gsearch2 = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=84), \n",
	" param_grid=param2, scoring='roc_auc', cv=5)\n",
	"\n",
	"gsearch2.fit(X_train, Y_train)\n",
	"\n",
	"print(gsearch2.best_score_)\n",
	"print(gsearch2.best_params_)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [],
	"source": [
	"# predict\n",
	"# Build the final model from the parameters chosen by the two grid searches\n",
	"# (n_estimators from gsearch1; max_depth and min_samples_split from gsearch2)\n",
	"# rather than from constants copied by hand out of their printed output.\n",
	"best_params = dict(gsearch1.best_params_, **gsearch2.best_params_)\n",
	"clf = GradientBoostingClassifier(**best_params)\n",
	"clf.fit(X_train, Y_train)\n",
	"result = clf.predict(X_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [],
	"source": [
	"# output\n",
	"import os\n",
	"\n",
	"submission = pd.DataFrame({\n",
	"        \"PassengerId\": test[\"PassengerId\"],\n",
	"        \"Survived\": result})\n",
	"\n",
	"# Create the output directory first: to_csv does not create missing parent\n",
	"# directories, so the write would fail if output/ does not exist yet.\n",
	"os.makedirs(\"output\", exist_ok=True)\n",
	"submission.to_csv(\"output/submission.csv\", index=False)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}