monmonmon/titanic.py

## titanic.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the Titanic training set; expects train.csv in the working directory.
ds = pd.read_csv('train.csv')

# Fill missing Age values with the median age of the passengers whose Name
# carries the same honorific (Mr, Mrs, ...).
honorifics = ['Mr', 'Mrs', 'Ms', 'Miss', 'Master', 'Rev', 'Dr']
for h in honorifics:
    # Matches names containing the honorific as a whole word followed by '.'.
    regexp = fr".*\b{h}\..*"
    # na=False: a missing Name counts as "no match" instead of producing a
    # NaN entry in the mask, which cannot be used for boolean indexing.
    has_title = ds['Name'].str.match(regexp, na=False)
    median = ds.loc[has_title, 'Age'].median()
    # Only rows that still lack an Age are written, so a name matching
    # several honorifics keeps the first value it received.
    ds.loc[has_title & ds['Age'].isna(), 'Age'] = median

# Passengers whose honorific is not listed (or whose group has no known age)
# would otherwise keep NaN in Age; fall back to the overall median so the
# feature column is complete.
ds['Age'] = ds['Age'].fillna(ds['Age'].median())

# Encode Sex as a single 0/1 indicator column (1 = female); the male
# indicator is redundant and is not created.
# A direct comparison avoids the KeyError that get_dummies(...)['female']
# raises when no female row is present, and always yields an integer column
# independent of the dummy dtype the installed pandas version produces.
ds['female'] = (ds['Sex'] == 'female').astype(int)

# Only the sex indicator and Age are used as features.
X = ds.loc[:, ['female', 'Age']]
# Target: the Survived column as a 1-D Series. scikit-learn estimators expect
# a 1-D y and emit a DataConversionWarning for a single-column DataFrame.
y = ds['Survived']

# Split into training and test sets first, so the held-out rows cannot
# influence any preprocessing statistics.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Standardization: fit the scaler on the training rows only and apply the
# same transform to the test rows. Fitting on the full data set would leak
# the test set's mean/variance into training.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# Keep X as the standardized full feature matrix (scaled with the training
# statistics), as it was after this step before.
X = sc_X.transform(X)

# Support Vector Machine classifier with scikit-learn's default settings.
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred = svc.predict(X_test)

# Mean accuracy on the test set. Printed explicitly: a bare expression is
# only echoed in an interactive session and is silently discarded when the
# file runs as a script.
print(svc.score(X_test, y_test))
# Recorded output from an earlier run:
# 0.7988826815642458

# Per-class precision / recall / F1 on the test set.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
# Recorded output from an earlier run:
#              precision    recall  f1-score   support
#
#           0       0.84      0.84      0.84       110
#           1       0.74      0.74      0.74        69
#
# avg / total       0.80      0.80      0.80       179
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt

	ds = pd.read_csv('train.csv')

	# Ageの欠損値を Mr, Mrs, etc ごとの中央値で埋める
	noage = ds[ ds['Age'].isna() ]
	honorifics = ['Mr', 'Mrs', 'Ms', 'Miss', 'Master', 'Rev', 'Dr']
	for h in honorifics:
	    regexp = fr".*\b{h}\..*"
	    tn1 = ds[ ds['Name'].str.match(regexp) ]
	    median = tn1['Age'].median()
	    #print(f"{h}: {len(tn1)}, {median}")
	    hoe = noage[ noage['Name'].str.match(regexp) ]
	    ds['Age'] = ds['Age'].fillna( hoe['Age'].fillna(median) )

	# Sex カラムを One Hot Encode する（male カラムは除外）
	ds['female'] = pd.get_dummies(ds['Sex'])['female'].values

	# Sex, Age カラムのみ学習に使う
	X = ds.loc[:, ['female', 'Age']]
	# Survived カラム
	y = ds.loc[:, ['Survived']]

	# 標準化
	from sklearn.preprocessing import StandardScaler
	sc_X = StandardScaler()
	X = sc_X.fit_transform(X)

	# 訓練データとテストデータを分割
	from sklearn.model_selection import train_test_split
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

	# Support Vector Machine
	from sklearn.svm import SVC
	svc = SVC()
	svc.fit(X_train, y_train)

	y_pred = svc.predict(X_test)

	svc.score(X_test, y_test)
	# 0.7988826815642458

	from sklearn.metrics import classification_report
	print(classification_report(y_test, y_pred))
	#              precision    recall  f1-score   support
	#
	#           0       0.84      0.84      0.84       110
	#           1       0.74      0.74      0.74        69
	#
	# avg / total       0.80      0.80      0.80       179