Instantly share code, notes, and snippets.

Embed
What would you like to do?
Kaggle Titanic by Random Forest
# -*- coding: utf-8 -*-
import re
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
# Column names as they appear in the CSV files.
PassengerId = 'PassengerId'
Survived = 'Survived'
Pclass = 'Pclass'
Name = 'Name'
Sex = 'Sex'
Age = 'Age'
SibSp = 'SibSp'
Parch = 'Parch'
Ticket = 'Ticket'
Fare = 'Fare'
Cabin = 'Cabin'
Embarked = 'Embarked'

# Names of the columns this script derives and adds to the data frames.
FamilySize = 'FamilySize'
Title = 'Title'

# Input CSV locations and the output location for the predictions.
train_data_path = '../train.csv'
test_data_path = '../test.csv'
predict_save_path = '../predict.csv'
def load_data(data_path):
    """Read a CSV file and prepare the basic feature columns.

    While parsing, the ``Sex`` column is passed through ``sex_to_int``.
    Missing ``Embarked`` values are replaced with ``'S'`` and the column is
    then encoded as S=0, C=1, Q=2 (any other value becomes NaN).  A
    ``FamilySize`` column equal to ``SibSp + Parch + 1`` is added.

    :param data_path: path of the CSV file to read
    :return: the loaded frame with the columns described above
    :rtype: pd.DataFrame
    """
    df = pd.read_csv(data_path, converters={Sex: sex_to_int})

    # Assign the filled column back rather than calling
    # ``fillna(..., inplace=True)`` on the selected column: the in-place form
    # operates on an intermediate object and does not update ``df`` when
    # pandas copy-on-write is active.
    embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
    df[Embarked] = df[Embarked].fillna('S').map(embarked_codes)

    df[FamilySize] = df[SibSp] + df[Parch] + 1
    return df
def filter_data(data):
    """Select the feature columns that are fed to the classifier.

    :param data: frame containing at least the feature columns listed below
    :return: ``data`` restricted to Pclass, Sex, Age, Fare, Embarked,
        FamilySize and Title, in that order
    :rtype: pd.DataFrame
    """
    feature_columns = [Pclass, Sex, Age, Fare, Embarked, FamilySize, Title]
    return data.loc[:, feature_columns]
def sex_to_int(sex):
    """Encode a sex label as an integer.

    :param sex: raw value of the ``Sex`` column
    :return: 0 when ``sex`` equals ``'male'``, 1 for every other value
    :rtype: int
    """
    return 0 if sex == 'male' else 1
def age_nan_converter(age):
    """Replace an empty age field with the sentinel ``-1``.

    :param age: raw value of the ``Age`` field
    :return: ``-1`` when ``age`` is the empty string, otherwise ``age``
        unchanged
    """
    # Compare by value: ``is ''`` tests object identity, which only works by
    # accident of string interning and triggers a SyntaxWarning.
    if age == '':
        return -1
    return age
def pre_survey():
    """Print a quick overview of the training and test data.

    Shows the column dtypes of the training frame and then, for the training
    and test frames in turn, which columns contain missing values.
    """
    frames = (load_data(train_data_path), load_data(test_data_path))

    print('types')
    print(frames[0].dtypes)

    for frame in frames:
        print('check NaN')
        print(frame.isnull().any(axis=0))
def name_to_title(name):
    """Extract the title from a name of the form ``"Last, Title. First"``.

    The name is split at its first two ``,``/``.`` separators and the middle
    piece is returned with surrounding whitespace removed.

    :param name: passenger name string
    :return: the title, e.g. ``'Mr'`` for ``'Braund, Mr. Owen Harris'``
    :rtype: str
    :raises ValueError: if ``name`` contains fewer than two separators
    """
    _last_name, title, _first_name = re.split(r"[,.]", name, maxsplit=2)
    # str.strip() returns a new string; the result has to be returned,
    # otherwise the title keeps the space that follows the comma.
    return title.strip()
def add_title(df, train_titles):
    """Add an integer ``Title`` column to ``df`` in place.

    Each name in the ``Name`` column is converted to a title with
    ``name_to_title`` and replaced by the position of that title in
    ``train_titles``; titles not found in the list are encoded as ``-1``.

    :param df: frame with a ``Name`` column; modified in place
    :param train_titles: known titles, whose list positions are the codes
    :type train_titles: list
    :return: None
    """
    def encode(name):
        title = name_to_title(name)
        return train_titles.index(title) if title in train_titles else -1

    df[Title] = [encode(name) for name in df[Name]]
def train_and_predict():
    """Train a random forest on the training CSV and save test predictions.

    The training data is loaded, missing ``Age``/``Fare`` values are replaced
    by the training means, and the titles found in the training names are
    turned into an integer ``Title`` feature.  A ``RandomForestClassifier``
    is fitted on the columns chosen by ``filter_data`` and its feature
    importances are printed.  The test data is prepared with the same
    training means and title list, predicted, and the result is written to
    ``predict_save_path`` as a CSV with ``PassengerId`` and ``Survived``
    columns.
    """
    train = load_data(train_data_path)

    # Imputation values are computed on the training set only and reused for
    # the test set below.
    age_mean = train[Age].mean()
    fare_mean = train[Fare].mean()
    # Assign back instead of ``fillna(..., inplace=True)`` on the selected
    # column, which does not update the frame under pandas copy-on-write.
    train[Age] = train[Age].fillna(age_mean)
    train[Fare] = train[Fare].fillna(fare_mean)

    # Sort the title vocabulary so the title -> index encoding is identical
    # on every run; the iteration order of a set of strings is not stable
    # across interpreter runs.
    train_titles = sorted({name_to_title(name) for name in train[Name]})
    print(train_titles)

    add_title(train, train_titles)

    X = filter_data(train)
    y = np.ravel(train.loc[:, [Survived]])

    clf = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        max_depth=5,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=0)
    clf.fit(X, y)

    # Importance of each feature column.
    for name, value in zip(X.columns.values, clf.feature_importances_):
        print(name, value)

    print('predict...')
    test = load_data(test_data_path)
    test[Age] = test[Age].fillna(age_mean)
    test[Fare] = test[Fare].fillna(fare_mean)

    # Titles unseen during training are encoded as -1 by add_title.
    add_title(test, train_titles)

    test_X = filter_data(test)
    test_y = clf.predict(test_X)

    # Save the result.
    passenger_ids = np.ravel(test.loc[:, [PassengerId]])
    result = pd.DataFrame(data={PassengerId: passenger_ids, Survived: test_y})
    result.to_csv(predict_save_path, index=False)
if __name__ == '__main__':
    # Optional data overview (dtypes and NaN checks); disabled by default.
    # pre_survey()
    train_and_predict()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment