Created
May 17, 2018 12:37
-
-
Save peroon/ca5c6743e62ffb98149779f6334b474d to your computer and use it in GitHub Desktop.
Kaggle Titanic by Random Forest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
import numpy as np | |
import pandas as pd | |
from sklearn.ensemble import RandomForestClassifier | |
# Column names as they appear in the CSV header.
PassengerId = 'PassengerId'
Survived = 'Survived'
Pclass = 'Pclass'
Name = 'Name'
Sex = 'Sex'
Age = 'Age'
SibSp = 'SibSp'
Parch = 'Parch'
Ticket = 'Ticket'
Fare = 'Fare'
Cabin = 'Cabin'
Embarked = 'Embarked'

# Names of the columns this script derives and adds to the frames.
FamilySize = 'FamilySize'
Title = 'Title'

# Input CSV locations and the output file for predictions.
train_data_path = '../train.csv'
test_data_path = '../test.csv'
predict_save_path = '../predict.csv'
def load_data(data_path):
    """Read a CSV file and apply the encodings shared by train and test data.

    :param data_path: location of the CSV file, passed to ``pd.read_csv``
    :return: frame with ``Sex`` converted by ``sex_to_int``, ``Embarked``
        mapped to 0/1/2 (missing values are treated as ``'S'``) and an added
        ``FamilySize`` column
    :rtype: pd.DataFrame
    """
    df = pd.read_csv(data_path, converters={Sex: sex_to_int})

    # Assign the filled column back instead of calling
    # ``df[Embarked].fillna(..., inplace=True)``: the in-place form works on an
    # intermediate object and does not update ``df`` under pandas
    # Copy-on-Write (and emits a warning on pandas 2.x).
    embarked = df[Embarked].fillna('S')
    df[Embarked] = embarked.map({'S': 0, 'C': 1, 'Q': 2})

    # SibSp + Parch, plus one (presumably to count the passenger themself).
    df[FamilySize] = df[SibSp] + df[Parch] + 1
    return df
def filter_data(data):
    """Select the columns that are fed to the classifier.

    :param data: frame containing at least the feature columns listed below
    :type data: pd.DataFrame
    :return: the Pclass, Sex, Age, Fare, Embarked, FamilySize and Title
        columns of ``data``, in that order
    :rtype: pd.DataFrame
    """
    feature_columns = [Pclass, Sex, Age, Fare, Embarked, FamilySize, Title]
    return data.loc[:, feature_columns]
def sex_to_int(sex):
    """Encode a sex label as an integer.

    :param sex: raw field value
    :return: ``0`` for ``'male'``, ``1`` for any other value
    :rtype: int
    """
    return 0 if sex == 'male' else 1
def age_nan_converter(age):
    """Replace an empty age field with the sentinel ``-1``.

    :param age: raw field value
    :return: ``-1`` when ``age`` is the empty string, otherwise ``age``
        unchanged
    """
    # Compare by value. ``age is ''`` tests object identity, which matches
    # only thanks to an interpreter implementation detail and raises a
    # SyntaxWarning on Python 3.8+.
    if age == '':
        return -1
    return age
def pre_survey():
    """Print a quick overview of the train and test data.

    Loads both CSV files, prints the column dtypes of the training frame and
    then, for each frame, which columns contain at least one missing value.
    """
    train_df = load_data(train_data_path)
    test_df = load_data(test_data_path)

    print('types')
    print(train_df.dtypes)

    for frame in (train_df, test_df):
        print('check NaN')
        # One boolean per column: True when the column has any NaN.
        columns_with_nan = frame.isnull().any(axis=0)
        print(columns_with_nan)
def name_to_title(name):
    """Extract the title from a name formatted like ``"Last, Title. First"``.

    :param name: full name, e.g. ``"Braund, Mr. Owen Harris"``
    :type name: str
    :return: the text between the first two ``,``/``.`` separators, without
        surrounding whitespace (``"Mr"`` for the example above)
    :rtype: str
    :raises ValueError: if ``name`` contains fewer than two separators
    """
    _last_name, title, _first_name = re.split(r"[,.]", name, maxsplit=2)
    # str.strip() returns a new string, so its result must be returned; the
    # bare call previously discarded it and left the leading space in place.
    return title.strip()
def add_title(df, train_titles):
    """Add an integer-encoded ``Title`` column to ``df`` in place.

    Each name in the ``Name`` column is converted with ``name_to_title``; the
    stored code is the position of that title in ``train_titles``, or ``-1``
    when the title is not in the list.

    :param df: frame with a ``Name`` column; receives the new column
    :param train_titles: known titles, whose positions serve as codes
    :type train_titles: list
    :return: None
    """
    extracted_titles = map(name_to_title, df[Name])
    df[Title] = [
        train_titles.index(title) if title in train_titles else -1
        for title in extracted_titles
    ]
def train_and_predict():
    """Fit a random forest on the training CSV and save test-set predictions.

    Steps:

    1. Load ``train_data_path`` and fill missing ``Age`` values with the
       training mean.
    2. Collect the titles found in the training names and encode them with
       ``add_title``.
    3. Fit a ``RandomForestClassifier`` on the columns chosen by
       ``filter_data`` and print each feature's importance.
    4. Load ``test_data_path``, fill missing ``Age``/``Fare`` values with the
       *training* means, encode titles against the training title list and
       predict.
    5. Write ``PassengerId``/``Survived`` rows to ``predict_save_path``.
    """
    train = load_data(train_data_path)

    # Statistics are taken from the training set only and reused for the test
    # set below.
    age_mean = train[Age].mean()
    fare_mean = train[Fare].mean()
    # Assign back rather than fillna(inplace=True) on a selected column, which
    # does not update the frame under pandas Copy-on-Write.
    train[Age] = train[Age].fillna(age_mean)

    # Sort the distinct titles: iteration order of a set of strings changes
    # between interpreter runs (hash randomization), which would make the
    # title codes - and therefore the fitted model - non-reproducible.
    train_titles = sorted({name_to_title(name) for name in train[Name]})
    print(train_titles)

    add_title(train, train_titles)

    X = filter_data(train)
    y = train[Survived].values

    clf = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        max_depth=5,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=0)
    clf.fit(X, y)

    # Importance of each feature column.
    for name, value in zip(X.columns.values, clf.feature_importances_):
        print(name, value)

    print('predict...')
    test = load_data(test_data_path)
    test[Age] = test[Age].fillna(age_mean)
    test[Fare] = test[Fare].fillna(fare_mean)

    # Titles not seen during training are encoded as -1 by add_title.
    add_title(test, train_titles)

    test_X = filter_data(test)
    test_y = clf.predict(test_X)

    # Save the result as a two-column CSV without the index.
    result = pd.DataFrame(data={
        PassengerId: test[PassengerId].values,
        Survived: test_y,
    })
    result.to_csv(predict_save_path, index=False)
# Script entry point: train the model and write the predictions.
# Call pre_survey() here instead to inspect dtypes and missing values.
if __name__ == '__main__':
    train_and_predict()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment