Skip to content

Instantly share code, notes, and snippets.

@FujiHaruka
Created April 22, 2018 12:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save FujiHaruka/88771a4740b67fc548a1eb67b595d77e to your computer and use it in GitHub Desktop.
Save FujiHaruka/88771a4740b67fc548a1eb67b595d77e to your computer and use it in GitHub Desktop.
Titanic(Kaggle)
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Load the training data into a pandas DataFrame.
df = pd.read_csv("./data/train.csv")
# ---------------
# STEP 1
# Preprocessing: handle missing values
# ---------------
# Original author's notes on df.isnull().sum():
#   Age      177
#   Cabin    687  <- not used for training, so it is ignored
#   Embarked   2
# Rows with a missing "Embarked" value are dropped.
df = df.dropna(subset=['Embarked'])
# Missing "Age" values are filled with the column mean (mean imputation).
# This is done with pandas instead of sklearn.preprocessing.Imputer, which
# was deprecated in scikit-learn 0.20 and removed in 0.22; Series.mean()
# skips NaN by default, so the fill value is the mean of the observed ages.
df['Age'] = df['Age'].fillna(df['Age'].mean())
# ---------------
# STEP 2
# Preprocessing: map categorical data to integer codes
# ---------------
def mapping(col):
    """Return a dict giving each distinct value of *col* its index in np.unique(col)."""
    labels = np.unique(col)
    return dict(zip(labels, range(len(labels))))


# Encode "Sex": build the label -> code table, then apply it to the column.
sex_mapping = mapping(df['Sex'])
df['Sex'] = df['Sex'].map(sex_mapping)
# Encode "Embarked" the same way.
embarked_mapping = mapping(df['Embarked'])
df['Embarked'] = df['Embarked'].map(embarked_mapping)
# ---------------
# STEP 3
# Preprocessing: choose the features
# ---------------
# Columns that are not used for training are discarded.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
df = df.drop(labels=unused_columns, axis=1)
# ---------------
# STEP 4
# Split into training data and test data
# ---------------
# Column 0 of the remaining frame is taken as the target and every other
# column as a feature (presumably column 0 is "Survived" -- confirm against
# the CSV layout).
data = df.values
y = data[:, 0]
X = data[:, 1:]
# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)
# While we are at it, standardize the features. The scaler is fitted on the
# training rows only and then applied to both the training and the test rows.
std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)
# ---------------
# STEP 5
# Train the model
# ---------------
# Support vector classifier with its default parameters; fit() returns the
# fitted estimator, so construction and training are chained.
svm = SVC().fit(X_train, y_train)
# ---------------
# STEP 6
# Look at the prediction results
# ---------------
# Predict the held-out rows and print the fraction predicted correctly.
pred = svm.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=pred)
print(test_accuracy)
# Value reported by the original author: 0.846441947565543
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment