Created
October 21, 2017 08:27
-
-
Save doloopwhile/3fea26abfd98f0fa689d34f5e02f0de3 to your computer and use it in GitHub Desktop.
titanic
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import csv | |
import pandas as pd | |
from pprint import pprint | |
from sklearn import linear_model | |
from sklearn import svm | |
from sklearn.preprocessing import StandardScaler | |
def read_csv(filename):
    """Load a passenger CSV file and prepare it for modelling.

    The 'Name', 'Cabin', 'Embarked' and 'Ticket' columns are dropped, and
    the 'Sex' column is replaced by an integer code.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A pandas DataFrame without the dropped columns and with 'Sex'
        encoded as 1 for 'male', -1 for 'female' and 0 for anything else
        (including missing values).
    """
    unused_columns = ['Name', 'Cabin', 'Embarked', 'Ticket']
    # male: 1, female: -1; every other value falls back to 0.
    sex_codes = {'male': 1, 'female': -1}

    frame = pd.read_csv(filename).drop(unused_columns, axis=1)
    frame['Sex'] = frame['Sex'].map(lambda value: sex_codes.get(value, 0))
    return frame
def regressions():
    """Build the list of candidate estimators to compare.

    Every call creates new, unfitted estimator objects.

    Returns:
        A list of seven scikit-learn estimators, in this fixed order:
        LinearRegression, LogisticRegression, SVC with polynomial kernels
        of degree 2, 3 and 4, SVC with an RBF kernel, and SVC with a
        sigmoid kernel.
    """
    # Keyword arguments shared by every SVC below (currently none).
    svc_options = {}
    return [
        # NOTE: the former `normalize=True` argument is intentionally not
        # passed. It was deprecated in scikit-learn 1.0 and removed in 1.2,
        # where it raises TypeError. The caller in this file standardizes
        # the features with StandardScaler before fitting.
        linear_model.LinearRegression(),
        linear_model.LogisticRegression(),
        svm.SVC(kernel="poly", degree=2, **svc_options),
        svm.SVC(kernel="poly", degree=3, **svc_options),
        svm.SVC(kernel="poly", degree=4, **svc_options),
        svm.SVC(kernel="rbf", **svc_options),
        svm.SVC(kernel="sigmoid", **svc_options),
    ]
def main():
    """Train the candidate models, report hold-out accuracy, write predictions.

    Steps:
      1. Read 'train.csv', fill missing 'Age'/'Fare' values with the
         training means and recode 'Survived' from {0, 1} to {-1, +1}.
      2. Fit every estimator from ``regressions()`` on the first 800 rows
         (after standardizing the features) and print its accuracy on the
         remaining rows.
      3. Predict 'test.csv' with one of the fitted estimators and write
         'PassengerId'/'Survived' (0 or 1) to 'answer.csv'.
    """
    scaler = StandardScaler()

    train_df = read_csv('train.csv').drop('PassengerId', axis=1)
    age_mean = train_df['Age'].mean()
    # Must be the scalar mean: filling with the Series itself would align
    # by row index and copy unrelated training fares into the test data.
    fare_mean = train_df['Fare'].mean()
    train_df['Age'] = train_df['Age'].fillna(age_mean)
    train_df['Fare'] = train_df['Fare'].fillna(fare_mean)

    # Recode the label from {0, 1} to {-1, +1}.
    train_df['Survived'] = train_df['Survived'].map(lambda x: 2 * x - 1)
    train_survived = train_df['Survived']
    train_df = train_df.drop('Survived', axis=1)

    # First 800 rows are used for fitting, the rest for validation.
    train_df1 = train_df[0:800]
    train_survived1 = train_survived[0:800]
    train_df2 = train_df[800:]
    train_survived2 = train_survived[800:]

    # The scaler is fitted on the fitting split only and reused everywhere.
    scaler.fit(train_df1)

    fitted_regs = []
    for reg in regressions():
        print(reg)
        reg.fit(scaler.transform(train_df1), train_survived1)
        estimated = reg.predict(scaler.transform(train_df2))

        # Labels are -1/+1, so a prediction counts as "survived" when it is
        # >= 0 (the same rule used for the final answer below). This also
        # handles the continuous output of LinearRegression.
        actual_survived = train_survived2.values == 1
        predicted_survived = estimated >= 0
        correct = int((actual_survived == predicted_survived).sum())
        print(correct / len(train_survived2.index))
        print()
        fitted_regs.append(reg)

    # Index 5 in the list returned by regressions() is the RBF-kernel SVC.
    submission_model_index = 5
    reg = fitted_regs[submission_model_index]
    print(reg)

    test_df = read_csv('test.csv')
    # Age and Fare can be NaN in the test data; fill them with the
    # training-set means.
    test_df['Age'] = test_df['Age'].fillna(age_mean)
    test_df['Fare'] = test_df['Fare'].fillna(fare_mean)

    ids = test_df['PassengerId']
    test_df = test_df.drop('PassengerId', axis=1)

    survived = reg.predict(scaler.transform(test_df))
    # Convert the -1/+1 predictions back to the 0/1 labels.
    survived = [(1 if x >= 0 else 0) for x in survived]

    answer = pd.DataFrame({'PassengerId': ids, 'Survived': survived})
    answer.to_csv('answer.csv', index=False)
# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment