Skip to content

Instantly share code, notes, and snippets.

@okwrtdsh
Created April 20, 2019 18:55
Show Gist options
  • Save okwrtdsh/1c50ca860cd93c5c8494f3bd6145bd3f to your computer and use it in GitHub Desktop.
Save okwrtdsh/1c50ca860cd93c5c8494f3bd6145bd3f to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# これまでの処理をまとめた\n",
"%matplotlib inline\n",
"import numpy as np\n",
"import pandas as pd\n",
"from matplotlib import pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"def load_data(data_dir=\"./titanic\"):\n",
"    \"\"\"Load the Titanic training and test sets.\n",
"\n",
"    Args:\n",
"        data_dir: Directory containing ``train.csv`` and ``test.csv``.\n",
"            Defaults to ``./titanic``, the location that used to be hard-coded.\n",
"\n",
"    Returns:\n",
"        Tuple ``(df_train, df_test)`` of DataFrames read from the two CSV files.\n",
"    \"\"\"\n",
"    df_train = pd.read_csv(f\"{data_dir}/train.csv\")\n",
"    df_test = pd.read_csv(f\"{data_dir}/test.csv\")\n",
"    return df_train, df_test\n",
"\n",
"\n",
"def train(df_train, features, target='Survived', test_size=0.3, max_depth=5, n_estimators=10,\n",
"          random_state=123):\n",
"    \"\"\"Fit a random forest on a hold-out split and report its validation accuracy.\n",
"\n",
"    Prints the validation score, shows a bar plot of the feature importances\n",
"    and returns the fitted classifier.\n",
"\n",
"    Args:\n",
"        df_train: DataFrame containing ``target`` and every column in ``features``.\n",
"        features: List of column names used as model inputs.\n",
"        target: Name of the label column.\n",
"        test_size: Fraction of rows held out for validation.\n",
"        max_depth: Maximum depth of each tree.\n",
"        n_estimators: Number of trees in the forest.\n",
"        random_state: Seed used for BOTH the train/validation split and the forest,\n",
"            so repeated runs give the same score. Pass ``None`` for a random split.\n",
"\n",
"    Returns:\n",
"        The fitted ``RandomForestClassifier``.\n",
"    \"\"\"\n",
"    # The split is seeded as well; seeding only the classifier left the score non-reproducible.\n",
"    df_fit, df_val = train_test_split(\n",
"        df_train[[target] + features], test_size=test_size, random_state=random_state\n",
"    )\n",
"    X_train = df_fit[features].fillna(0).values  # fill missing values with 0 for now\n",
"    y_train = df_fit[target].values\n",
"    X_val = df_val[features].fillna(0).values\n",
"    y_val = df_val[target].values\n",
"    clf = RandomForestClassifier(\n",
"        n_estimators=n_estimators,  # number of decision trees\n",
"        max_depth=max_depth,  # maximum tree depth\n",
"        n_jobs=4,  # number of parallel jobs\n",
"        random_state=random_state,  # fixed seed for reproducibility\n",
"    )\n",
"    clf.fit(X_train, y_train)\n",
"    print(\"val score: \", clf.score(X_val, y_val))\n",
"    # Feature importances\n",
"    sns.barplot(x=clf.feature_importances_, y=features)\n",
"    plt.show()\n",
"    return clf\n",
"\n",
"\n",
"def gen_submit_data(clf, df_test, features, output_filename='pred.csv'):\n",
"    \"\"\"Generate the submission file.\n",
"\n",
"    Args:\n",
"        clf: Fitted classifier exposing ``predict``.\n",
"        df_test: DataFrame containing ``PassengerId`` and every column in ``features``.\n",
"        features: List of column names the classifier was trained on.\n",
"        output_filename: Path of the CSV file to write.\n",
"    \"\"\"\n",
"    # Predict on a plain ndarray, the same input form train() fits on; passing a DataFrame\n",
"    # to a model fitted without feature names triggers a warning on newer scikit-learn.\n",
"    X_test = df_test[features].fillna(0).values\n",
"    pred = clf.predict(X_test)\n",
"    result = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': pred})\n",
"    result.to_csv(output_filename, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"val score: 0.7164179104477612\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYYAAAD8CAYAAABzTgP2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAADItJREFUeJzt3H2MpXV5h/HrC8tWEIoVsKW8jVhadBdKy9ZUSQzFNrVtgli3rUREEpTQF5r0LWqhpGJN0/JHq1hS1tQoTS0EbBNKWqxSUCTlZdfusq6KoqClBelCQi1CV+DuH3Ooc4/Dzpnd8zIze32SSc6c85yz9y/P7FzzPM+ZSVUhSdJz9pv2AJKk5cUwSJIawyBJagyDJKkxDJKkxjBIkhrDIElqDIMkqTEMkqRmzbQH2BOHH354zczMTHsMSVpRtmzZsrOqjlhsuxUZhpmZGTZv3jztMSRpRUnytWG281SSJKkxDJKkxjBIkhrDIElqDIMkqTEMkqRmRb5d9QsPPsqpv3f1tMfQBG25/NxpjyDtMzxikCQ1hkGS1BgGSVJjGCRJjWGQJDWGQZLUGAZJUmMYJEmNYZAkNYZBktQYBklSYxgkSY1hkCQ1hkGS1BgGSVJjGCRJjWGQJDWGQZLUGAZJUmMYJEmNYZAkNYZBktQYBklSYxgkSc2iYUjyTJKtST6X5LokB+1m2z9M8rujHVGSNEnDHDE8WVWnVNV6YBdw4ZhnkiRN0VJPJd0G/BBAknOT3JNkW5K/nr9hkrcnuXvw+MeeO9JI8kuDo49tST49uG9dkrsGRyb3JDlhbxcmSdoza4bdMMka4OeAm5KsAy4GTquqnUlevMBT/q6qPjh47h8B5wNXAJcCP1tV/5HkRYNtLwTeV1V/k2QtsP+eL0mStDeGOWI4MMlWYDPwdeCvgDOA66tqJ0BVPbbA89YnuS3JduDNwLrB/bcDH07ydr4TgH8Ffj/JO4DjqurJ+S+W5IIkm5Nsfvpb31zCEiVJS7GUawynVNVFVbULCFCLPO/DwG9U1UnAu4EXAFTVhcAlwDHA1iSHVdVHgTOBJ4GPJzlj/otV1aaq2lBVG9YcdMiw65MkLdGevl31ZuCXkxwG8Dynkg4BHkpyALNHDAy2fVlV3VlVlwI7gWOSHA98tareD9wAnLyHc0mS9tLQ1xjmqqodSd4LfCrJM8C/AefN2+wPgDuBrwHbmQ0FwOWDi8thNjDbgHcC5yT5NvAwcNmezCVJ2nupWuyM0PLzwh94aZ34lndPewxN0JbLz532CNKKl2RLVW1YbDt/81mS1BgGSVJjGCRJjWGQJDWGQZLUGAZJUmMYJEmNYZAkNYZBktQYBklSYxgkSY1hkCQ1hkGS1BgGSVJjGCRJjWGQJDWGQZLUGAZJUmMYJEmNYZAkNYZBktQYBklSYxgkSY1hkCQ1a6Y9wJ54+dGHsfnyc6c9hiStSh4xSJIawyBJagyDJKkxDJKkxjBIkhrDIElqDIMkqTEMkqTGMEiSGsMgSWoMgySpMQySpMYwSJKaFfnXVXc9tIOvX3bStMeQ9tqxl26f9gjSd/GIQZLUGAZJUmMYJEmNYZAkNYZBktQYBklSYxgkSY1hkCQ1hkGS1BgGSVJjGCRJjWGQJDWGQZLUGAZJUmMYJEmNYZAkNYZBktQYBklSYxgkSY1hkCQ1hkGS1BgGSVJjGCRJjWGQJDVjCUOSNySpJCeO4/UlSeMzriOGs4HPAG8a0+tLksZk5GFIcjBwGnA+gzAk2S/JlUl2JLkxyT8m2Th47NQkn0qyJcnHkxw56pkkScMbxxHDWcBNVfUl4LEkPw78IjADnAS8DXgVQJIDgCuAjVV1KvAh4L1jmEmSNKQ1Y3jNs4E/H9y+ZvD5AcB1VfUs8HCSWwaP/wiwHvhEEoD9gYcWetEkFwAXABx16AFjGFuSBCMOQ5LDgDOA9UmK2W/0Bfz98z0F2FFVr1
rstatqE7AJ4OSjDqzRTCxJmm/Up5I2AldX1XFVNVNVxwD3AzuBNw6uNXw/cPpg+3uBI5L8/6mlJOtGPJMkaQlGHYaz+e6jg48BPwg8CHwOuAq4E3i8qnYxG5M/SbIN2Aq8esQzSZKWYKSnkqrq9AXuez/Mvlupqv5ncLrpLmD74PGtwGtGOYckac+N4+Lz87kxyYuAtcB7qurhCf7bkqQhTSwMCx1NSJKWH/9WkiSpMQySpMYwSJIawyBJagyDJKkxDJKkxjBIkhrDIElqDIMkqTEMkqTGMEiSGsMgSWoMgySpMQySpMYwSJIawyBJagyDJKkxDJKkxjBIkhrDIElqDIMkqTEMkqRmzbQH2BNrj1zHsZdunvYYkrQqecQgSWoMgySpMQySpMYwSJIawyBJagyDJKkxDJKkxjBIkhrDIElqDIMkqTEMkqTGMEiSGsMgSWoMgySpWZF/dvuLj3yR0644bdpjSNJE3X7R7RP5dzxikCQ1hkGS1BgGSVJjGCRJjWGQJDWGQZLUGAZJUmMYJEmNYZAkNYZBktQYBklSYxgkSY1hkCQ1hkGS1BgGSVJjGCRJjWGQJDWGQZLUGAZJUmMYJEmNYZAkNYZBktQYBklSYxgkSc2aUbxIkmeA7XPuOquqHhjFa0uSJmskYQCerKpTlvqkJPtX1TMjmkGSNAJjO5WUZCbJbUk+O/h49eD+05PckuSjDI4ykpyT5K4kW5NclWT/cc0lSdq9UR0xHJhk6+D2/VX1BuAR4Geq6qkkJwB/C2wYbPNKYH1V3Z/k5cCvAKdV1beTXAm8Gbh6RLNJkpZgnKeSDgA+kOQU4Bngh+c8dldV3T+4/VrgVODuJAAHMhuVJskFwAUAa79v7YjGliTNN6owLOS3gG8AP8rsKaun5jz2xJzbAT5SVe/a3YtV1SZgE8DBxx5cox1VkvSccb5d9VDgoap6FngL8HzXDW4GNiZ5CUCSFyc5boxzSZJ2Y5xhuBJ4a5I7mD2N9MRCG1XV54FLgH9Ocg/wCeDIMc4lSdqNkZxKqqqDF7jvy8DJc+561+D+W4Fb5217LXDtKGaRJO0df/NZktQYBklSYxgkSY1hkCQ1hkGS1BgGSVJjGCRJjWGQJDWGQZLUGAZJUmMYJEmNYZAkNYZBktQYBklSYxgkSY1hkCQ1hkGS1BgGSVJjGCRJjWGQJDWGQZLUGAZJUmMYJEmNYZAkNWumPcCeOPElJ3L7RbdPewxJWpU8YpAkNYZBktQYBklSYxgkSY1hkCQ1hkGS1KSqpj3DkiX5JnDvtOeYgsOBndMeYkr21bW77n3PONd+XFUdsdhGK/L3GIB7q2rDtIeYtCSb98V1w767dte971kOa/dUkiSpMQySpGalhmHTtAeYkn113bDvrt1173umvvYVefFZkjQ+K/WIQZI0Jss6DElel+TeJPcleecCj39PkmsHj9+ZZGbyU47eEOt+TZLPJnk6ycZpzDgOQ6z7t5N8Psk9SW5Octw05hyHIdZ+YZLtSbYm+UySV0xjzlFbbN1zttuYpJKsincqDbG/z0vyX4P9vTXJ2yY6YFUtyw9gf+ArwPHAWmAb8Ip52/wa8JeD228Crp323BNa9wxwMnA1sHHaM09w3T8FHDS4/aurYX8vYe3fO+f2mcBN0557EusebHcI8GngDmDDtOee0P4+D/jAtGZczkcMrwTuq6qvVtUu4Brg9fO2eT3wkcHt64HXJskEZxyHRdddVQ9U1T3As9MYcEyGWfctVfWtwad3AEdPeMZxGWbt/z3n0xcCq+Hi4DD/xwHeA/wp8NQkhxujYdc9Ncs5DEcB/z7n8wcH9y24TVU9DTwOHDaR6cZnmHWvRktd9/nAP411oskZau1Jfj3JV5j9JvmbE5ptnBZdd5IfA46pqhsnOdiYDfu1/sbBadPrkxwzmdFmLecwLPST//yfkobZZqVZjWsaxtDrTnIOsAG4fKwTTc5Qa6+qv6iqlwHvAC4Z+1Tjt9t1J9
kP+DPgdyY20WQMs7//AZipqpOBT/KdMyMTsZzD8CAwt5JHA//5fNskWQMcCjw2kenGZ5h1r0ZDrTvJTwMXA2dW1f9OaLZxW+o+vwY4a6wTTcZi6z4EWA/cmuQB4CeBG1bBBehF93dVPTrn6/uDwKkTmg1Y3mG4GzghyUuTrGX24vIN87a5AXjr4PZG4F9qcOVmBRtm3avRousenFa4itkoPDKFGcdlmLWfMOfTXwC+PMH5xmW3666qx6vq8KqaqaoZZq8rnVlVm6cz7sgMs7+PnPPpmcAXJjjf8n1X0uD7+88DX2L2Cv7Fg/suY/aLA+AFwHXAfcBdwPHTnnlC6/4JZn/qeAJ4FNgx7ZkntO5PAt8Atg4+bpj2zBNc+/uAHYN13wKsm/bMk1j3vG1vZRW8K2nI/f3Hg/29bbC/T5zkfP7msySpWc6nkiRJU2AYJEmNYZAkNYZBktQYBklSYxgkSY1hkCQ1hkGS1Pwf3gmdnytQ6gwAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load the raw data\n",
"df_train, df_test = load_data()\n",
"# Choose the feature columns\n",
"features = ['Pclass', 'Age', 'Fare']\n",
"# Train with the chosen hyper-parameters\n",
"clf = train(\n",
"    df_train=df_train,\n",
"    features=features,\n",
"    test_size=0.3,\n",
"    max_depth=5,\n",
"    n_estimators=10,\n",
")\n",
"# Try other feature / parameter combinations to find a better score"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Once the score looks good, write a submission file\n",
"gen_submit_data(clf=clf, df_test=df_test, features=features, output_filename='pred.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 特徴量作成\n",
"* 数値化\n",
"* 欠損値の補完\n",
"* 1つ以上の系列から新たな特徴量"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1440x576 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Plot the pairwise correlations between columns.\n",
"# Correlation is only defined for numeric columns, so those are selected explicitly.\n",
"def show_corr(df_train, df_test=None):\n",
"    \"\"\"Show correlation heatmaps of the numeric columns.\n",
"\n",
"    Args:\n",
"        df_train: DataFrame plotted under the title \"train\".\n",
"        df_test: Optional second DataFrame; when given, it is plotted next to\n",
"            the first one under the title \"test\".\n",
"    \"\"\"\n",
"    plt.figure(figsize=(20, 8))\n",
"    if df_test is not None:\n",
"        plt.subplot(1, 2, 1)  # first panel of a 1x2 grid\n",
"    plt.title(\"train\")\n",
"    # select_dtypes drops string columns, on which corr() raises in pandas >= 2.0\n",
"    sns.heatmap(\n",
"        df_train.select_dtypes(include='number').corr(),\n",
"        vmin=-1, vmax=1, cmap='jet', center=0, annot=True, square=True,\n",
"    )\n",
"\n",
"    if df_test is not None:\n",
"        plt.subplot(1, 2, 2)  # second panel of a 1x2 grid\n",
"        plt.title(\"test\")\n",
"        sns.heatmap(\n",
"            df_test.select_dtypes(include='number').corr(),\n",
"            vmin=-1, vmax=1, cmap='jet', center=0, annot=True, square=True,\n",
"        )\n",
"    # Render the figure\n",
"    plt.show()\n",
"\n",
"show_corr(df_train, df_test)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PassengerId 0\n",
"Survived 0\n",
"Pclass 0\n",
"Name 0\n",
"Sex 0\n",
"Age 177\n",
"SibSp 0\n",
"Parch 0\n",
"Ticket 0\n",
"Fare 0\n",
"Cabin 687\n",
"Embarked 2\n",
"dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count the missing values in each column\n",
"df_train.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PassengerId 0\n",
"Pclass 0\n",
"Name 0\n",
"Sex 0\n",
"Age 86\n",
"SibSp 0\n",
"Parch 0\n",
"Ticket 0\n",
"Fare 1\n",
"Cabin 327\n",
"Embarked 0\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_test.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Embarked</th>\n",
" <th>Survived</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>C</td>\n",
" <td>0.553571</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Q</td>\n",
" <td>0.389610</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>S</td>\n",
" <td>0.336957</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Embarked Survived\n",
"0 C 0.553571\n",
"1 Q 0.389610\n",
"2 S 0.336957"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Start by filling the missing Embarked values.\n",
"# Survival rate for each Embarked value, highest first.\n",
"(\n",
"    df_train[['Embarked', 'Survived']]\n",
"    .groupby(['Embarked'], as_index=False)\n",
"    .mean()\n",
"    .sort_values(by='Survived', ascending=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Embarked</th>\n",
" <th>C</th>\n",
" <th>Q</th>\n",
" <th>S</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Pclass</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>85</td>\n",
" <td>2</td>\n",
" <td>127</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17</td>\n",
" <td>3</td>\n",
" <td>164</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>66</td>\n",
" <td>72</td>\n",
" <td>353</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Embarked C Q S\n",
"Pclass \n",
"1 85 2 127\n",
"2 17 3 164\n",
"3 66 72 353"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cross-tabulate Embarked against Pclass\n",
"pd.crosstab(index=df_train['Pclass'], columns=df_train['Embarked'])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# 関係なさそうなので最も多い\"S\"にしてみる, fillna('S')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Pclass</th>\n",
" <th>Fare</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>84.154687</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>20.662183</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>13.675550</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Pclass Fare\n",
"0 1 84.154687\n",
"1 2 20.662183\n",
"2 3 13.675550"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Next, fill the missing Fare values.\n",
"# Mean Fare for each Pclass (Pclass is strongly correlated with Fare).\n",
"(\n",
"    df_train[['Pclass', 'Fare']]\n",
"    .groupby(['Pclass'], as_index=False)\n",
"    .mean()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PassengerId</th>\n",
" <th>Pclass</th>\n",
" <th>Name</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Ticket</th>\n",
" <th>Fare</th>\n",
" <th>Cabin</th>\n",
" <th>Embarked</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>152</th>\n",
" <td>1044</td>\n",
" <td>3</td>\n",
" <td>Storey, Mr. Thomas</td>\n",
" <td>male</td>\n",
" <td>60.5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3701</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PassengerId Pclass Name Sex Age SibSp Parch Ticket \\\n",
"152 1044 3 Storey, Mr. Thomas male 60.5 0 0 3701 \n",
"\n",
" Fare Cabin Embarked \n",
"152 NaN NaN S "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_test.loc[df_test['Fare'].isna()]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Pclass: 3なので、fillna(13.675550)で良さそう"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Ageの欠損値の補完"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Sex</th>\n",
" <th>female</th>\n",
" <th>male</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Title</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Capt</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Col</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Countess</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Don</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Dr</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Jonkheer</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Lady</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Major</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Master</th>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Miss</th>\n",
" <td>182</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mlle</th>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mme</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mr</th>\n",
" <td>0</td>\n",
" <td>517</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mrs</th>\n",
" <td>125</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Ms</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Rev</th>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Sir</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Sex female male\n",
"Title \n",
"Capt 0 1\n",
"Col 0 2\n",
"Countess 1 0\n",
"Don 0 1\n",
"Dr 1 6\n",
"Jonkheer 0 1\n",
"Lady 1 0\n",
"Major 0 2\n",
"Master 0 40\n",
"Miss 182 0\n",
"Mlle 2 0\n",
"Mme 1 0\n",
"Mr 0 517\n",
"Mrs 125 0\n",
"Ms 1 0\n",
"Rev 0 6\n",
"Sir 0 1"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Extract the honorific (title) from each name\n",
"import re\n",
"# Raw string: `\\.` must reach the regex engine as an escaped dot, not as an invalid\n",
"# string escape. str.extract yields NaN instead of raising when a name has no match.\n",
"df_train['Title'] = df_train['Name'].str.extract(r' ([A-Z][a-z]+)\\.', expand=False)\n",
"pd.crosstab(df_train['Title'], df_train['Sex'])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Sex</th>\n",
" <th>female</th>\n",
" <th>male</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Title</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Col</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Dona</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Dr</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Master</th>\n",
" <td>0</td>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Miss</th>\n",
" <td>78</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mr</th>\n",
" <td>0</td>\n",
" <td>240</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mrs</th>\n",
" <td>72</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Ms</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Rev</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Sex female male\n",
"Title \n",
"Col 0 2\n",
"Dona 1 0\n",
"Dr 0 1\n",
"Master 0 21\n",
"Miss 78 0\n",
"Mr 0 240\n",
"Mrs 72 0\n",
"Ms 1 0\n",
"Rev 0 2"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Extract the honorific (title) from each test-set name.\n",
"# Raw-string regex with the vectorised str.extract: NaN instead of an exception on no match.\n",
"df_test['Title'] = df_test['Name'].str.extract(r' ([A-Z][a-z]+)\\.', expand=False)\n",
"pd.crosstab(df_test['Title'], df_test['Sex'])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PassengerId</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Title</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Mr</th>\n",
" <td>757</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Miss</th>\n",
" <td>260</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mrs</th>\n",
" <td>197</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Master</th>\n",
" <td>61</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Rev</th>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Dr</th>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Col</th>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mlle</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Ms</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Major</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mme</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Capt</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Lady</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Jonkheer</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Dona</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Don</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Countess</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Sir</th>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PassengerId\n",
"Title \n",
"Mr 757\n",
"Miss 260\n",
"Mrs 197\n",
"Master 61\n",
"Rev 8\n",
"Dr 8\n",
"Col 4\n",
"Mlle 2\n",
"Ms 2\n",
"Major 2\n",
"Mme 1\n",
"Capt 1\n",
"Lady 1\n",
"Jonkheer 1\n",
"Dona 1\n",
"Don 1\n",
"Countess 1\n",
"Sir 1"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of passengers per title (train and test combined), most frequent first\n",
"(\n",
"    pd.concat([df_train, df_test], ignore_index=True, sort=False)[['Title', 'PassengerId']]\n",
"    .groupby(['Title'])\n",
"    .count()\n",
"    .sort_values(by='PassengerId', ascending=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Title</th>\n",
" <th>Survived</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Sir</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Countess</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Ms</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Mme</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Lady</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Mlle</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Mrs</td>\n",
" <td>0.792000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Miss</td>\n",
" <td>0.697802</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Master</td>\n",
" <td>0.575000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Col</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Major</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Dr</td>\n",
" <td>0.428571</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Mr</td>\n",
" <td>0.156673</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Jonkheer</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Don</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Rev</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Capt</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Title Survived\n",
"16 Sir 1.000000\n",
"2 Countess 1.000000\n",
"14 Ms 1.000000\n",
"11 Mme 1.000000\n",
"6 Lady 1.000000\n",
"10 Mlle 1.000000\n",
"13 Mrs 0.792000\n",
"9 Miss 0.697802\n",
"8 Master 0.575000\n",
"1 Col 0.500000\n",
"7 Major 0.500000\n",
"4 Dr 0.428571\n",
"12 Mr 0.156673\n",
"5 Jonkheer 0.000000\n",
"3 Don 0.000000\n",
"15 Rev 0.000000\n",
"0 Capt 0.000000"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Survival rate per title, highest first\n",
"(\n",
"    df_train[['Title', 'Survived']]\n",
"    .groupby(['Title'], as_index=False)\n",
"    .mean()\n",
"    .sort_values(by='Survived', ascending=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Title</th>\n",
" <th>Age</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Capt</td>\n",
" <td>70.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Col</td>\n",
" <td>54.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Sir</td>\n",
" <td>49.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Major</td>\n",
" <td>48.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Lady</td>\n",
" <td>48.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Dr</td>\n",
" <td>43.571429</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Rev</td>\n",
" <td>41.250000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Don</td>\n",
" <td>40.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Dona</td>\n",
" <td>39.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Jonkheer</td>\n",
" <td>38.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Mrs</td>\n",
" <td>36.994118</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Countess</td>\n",
" <td>33.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Mr</td>\n",
" <td>32.252151</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Ms</td>\n",
" <td>28.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Mlle</td>\n",
" <td>24.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Mme</td>\n",
" <td>24.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Miss</td>\n",
" <td>21.774238</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Master</td>\n",
" <td>5.482642</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Title Age\n",
"0 Capt 70.000000\n",
"1 Col 54.000000\n",
"17 Sir 49.000000\n",
"8 Major 48.500000\n",
"7 Lady 48.000000\n",
"5 Dr 43.571429\n",
"16 Rev 41.250000\n",
"3 Don 40.000000\n",
"4 Dona 39.000000\n",
"6 Jonkheer 38.000000\n",
"14 Mrs 36.994118\n",
"2 Countess 33.000000\n",
"13 Mr 32.252151\n",
"15 Ms 28.000000\n",
"11 Mlle 24.000000\n",
"12 Mme 24.000000\n",
"10 Miss 21.774238\n",
"9 Master 5.482642"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Mean age per title (train and test combined), oldest first\n",
"(\n",
"    pd.concat([df_train, df_test], ignore_index=True, sort=False)[['Title', 'Age']]\n",
"    .groupby(['Title'], as_index=False)\n",
"    .mean()\n",
"    .sort_values(by='Age', ascending=False)\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ググる\n",
"#### 女性\n",
"* Mrs: 既婚女性\n",
"* Miss: 未婚女性\n",
"* Ms: 既婚女性または未婚女性 -> Missに統合\n",
"* Mme: 既婚女性(フランス語) -> Mrsに統合\n",
"* Mlle: 未婚女性(フランス語) -> Missに統合\n",
"\n",
"#### 男性\n",
"* Mr: 男性\n",
"* Master: ミスターと呼ぶには若すぎる少年\n",
"\n",
"#### その他(数が少ないのですべて統合)\n",
"##### 男性?\n",
"* Sir: 男性(英語圏)\n",
"* Col: 大佐\n",
"* Major: 少佐\n",
"* Dr: 医者や博士号の資格を持っている人(データではほぼ男性)\n",
"* Jonkheer: オランダとベルギーで用いられる貴族の敬称で、爵位を保有しない貴族に用いられる敬称\n",
"* Don: スペイン語圏とポルトガル語圏で使われる貴人・高位聖職者に対する尊称(男性)\n",
"* Rev: 聖職者の名前の前に付ける敬称\n",
"* Capt: 船長\n",
"\n",
"##### 女性\n",
"* Countess: 女性(伯爵夫人, 女伯爵)\n",
"* Lady: 女性(イギリスで貴族夫人)\n",
"* Dona: スペイン語圏とポルトガル語圏で使われる貴人・高位聖職者に対する尊称(女性)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Collapse the titles into five groups: Ms/Mlle -> Miss, Mme -> Mrs,\n",
"# and every rare title -> 'Other'.\n",
"title_map = {'Ms': 'Miss', 'Mlle': 'Miss', 'Mme': 'Mrs'}\n",
"title_map.update(dict.fromkeys([\n",
"    'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'\n",
"], 'Other'))\n",
"\n",
"# One mapping applied to both frames, so train and test cannot drift apart.\n",
"for frame in (df_train, df_test):\n",
"    frame['Title'] = frame['Title'].replace(title_map)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Title</th>\n",
" <th>Age</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Other</td>\n",
" <td>45.178571</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Mrs</td>\n",
" <td>36.918129</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Mr</td>\n",
" <td>32.252151</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Miss</td>\n",
" <td>21.824366</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Master</td>\n",
" <td>5.482642</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Title Age\n",
"4 Other 45.178571\n",
"3 Mrs 36.918129\n",
"2 Mr 32.252151\n",
"1 Miss 21.824366\n",
"0 Master 5.482642"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Mean age per title (train and test combined), oldest first\n",
"(\n",
"    pd.concat([df_train, df_test], ignore_index=True, sort=False)[['Title', 'Age']]\n",
"    .groupby(['Title'], as_index=False)\n",
"    .mean()\n",
"    .sort_values(by='Age', ascending=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Family</th>\n",
" <th>Survived</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0.303538</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0.552795</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>0.578431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>0.724138</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>5</td>\n",
" <td>0.136364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>6</td>\n",
" <td>0.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>7</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>10</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Family Survived\n",
"0 0 0.303538\n",
"1 1 0.552795\n",
"2 2 0.578431\n",
"3 3 0.724138\n",
"4 4 0.200000\n",
"5 5 0.136364\n",
"6 6 0.333333\n",
"7 7 0.000000\n",
"8 10 0.000000"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build a new feature from related columns: Family = SibSp + Parch.\n",
"# It is added to BOTH frames: gen_submit_data indexes df_test[features], so a feature\n",
"# that exists only in df_train would fail at submission time.\n",
"for frame in (df_train, df_test):\n",
"    frame['Family'] = frame['SibSp'] + frame['Parch']\n",
"df_train[['Family', 'Survived']].groupby(['Family'], as_index=False).mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment