Skip to content

Instantly share code, notes, and snippets.

@EikeDehling
Last active March 11, 2017 11:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save EikeDehling/90404aed7de3746162595be161109ef3 to your computer and use it in GitHub Desktop.
Save EikeDehling/90404aed7de3746162595be161109ef3 to your computer and use it in GitHub Desktop.
Some experiments for the Kaggle Titanic survivors machine-learning competition (https://www.kaggle.com/c/titanic)
import pandas
from sklearn import linear_model, svm, tree, naive_bayes
from sklearn.model_selection import cross_val_score
import numpy as np
# Load the training data; the relative path means train.csv must be in the current working directory.
data = pandas.read_csv('train.csv')
# Maps the raw honorific parsed from a passenger name to a coarser title group.
_TITLE_GROUPS = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess": "Royalty",
    "Dona": "Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
}

# Explicit category lists make the one-hot encoding emit the same columns for
# every input frame, even when a value never occurs in that frame.
_TITLE_CATEGORIES = ["Master", "Miss", "Mr", "Mrs", "Officer", "Royalty"]
_SEX_CATEGORIES = ["female", "male"]
_PCLASS_CATEGORIES = [1, 2, 3]


def _title_group(name):
    """Return the title group for a 'Surname, Title. Given names' string, or None."""
    if not isinstance(name, str):
        return None
    # Text between the first and second comma, cut at the first period.
    after_surname = name.partition(',')[2]
    raw_title = after_surname.partition(',')[0].partition('.')[0].strip()
    return _TITLE_GROUPS.get(raw_title)


def preprocess(data):
    """Return a cleaned, one-hot encoded copy of a passenger DataFrame.

    The input frame is left untouched. On the copy:

    * missing ``Fare`` and ``Age`` values are replaced by that column's mean
      within this frame;
    * a ``Title`` group is derived from ``Name``;
    * ``Sex``, ``Title`` and ``Pclass`` are replaced by indicator columns
      (``Sex_female``, ``Sex_male``, ``Title_<group>``, ``Pclass_1..3``).

    The indicator columns are always all present. A row whose value is
    missing, unrecognised, or (for ``Name``) unparseable gets zeros in every
    indicator column of that feature.

    Parameters:
        data: DataFrame with at least the columns ``Fare``, ``Age``, ``Name``,
            ``Sex`` and ``Pclass``.

    Returns:
        A new DataFrame with the encoded columns appended after the
        remaining original columns.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    data = data.copy()

    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
    data['Age'] = data['Age'].fillna(data['Age'].mean())

    data['Title'] = pandas.Categorical(
        data['Name'].map(_title_group), categories=_TITLE_CATEGORIES)
    data['Sex'] = pandas.Categorical(data['Sex'], categories=_SEX_CATEGORIES)
    data['Pclass'] = pandas.Categorical(
        data['Pclass'], categories=_PCLASS_CATEGORIES)

    # Do the one-hot encoding
    return pandas.get_dummies(data, columns=['Sex', 'Title', 'Pclass'])
# Replace the raw training frame with its preprocessed form (Fare/Age gaps filled, Sex/Title/Pclass one-hot encoded).
data = preprocess(data)
# The bare string below is disabled exploratory code, not a docstring: it is evaluated as an
# expression and discarded, so none of the describe() summaries are printed.
"""
print(data['Sex_male'].astype('category').describe())
print()
print(data['Sex_female'].astype('category').describe())
print()
print(data['Survived'].describe())
print()
print(data['Age'].describe())
print()
"""
# Columns of the preprocessed frame that are fed to the classifiers, in matrix column order.
_FEATURE_COLUMNS = [
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Sex_male', 'Sex_female',
    'Age', 'Parch', 'SibSp', 'Fare',
    'Title_Officer', 'Title_Royalty', 'Title_Master',
    'Title_Mr', 'Title_Mrs', 'Title_Miss',
]


def getX(data):
    """Return the model feature matrix for a preprocessed passenger frame.

    Parameters:
        data: DataFrame containing every column in ``_FEATURE_COLUMNS``;
            any other columns are ignored.

    Returns:
        A 2-D float NumPy array with one row per row of ``data`` and one
        column per entry of ``_FEATURE_COLUMNS``, in that order.

    Raises:
        KeyError: if any feature column is missing from ``data``.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0, so select the columns
    # and take .values instead. The explicit float cast keeps the result
    # numeric when indicator columns are bool; mixed bool/float columns would
    # otherwise come back as an object array.
    return data[_FEATURE_COLUMNS].astype(float).values
# Candidate models, each paired with the label printed next to its score.
classifiers = [
    ('Logistic regression', linear_model.LogisticRegression()),
    ('SVM classifier', svm.SVC(kernel='linear')),
    ('Decision tree classifier', tree.DecisionTreeClassifier()),
    ('Naive Bayes classifier', naive_bayes.GaussianNB()),
]

# The feature matrix and labels are the same for every candidate, so build
# them once instead of on every loop iteration.
features = getX(data)
labels = data['Survived']

# Print each candidate's mean accuracy over 5-fold cross-validation.
for name, candidate in classifiers:
    scores = cross_val_score(candidate, features, labels, cv=5, scoring='accuracy')
    print(name, np.mean(scores))

# The bare string below is disabled code (evaluated and discarded, never run).
# As written it would fit a logistic regression on the full training frame,
# preprocess test.csv, predict 'Survived' and write the PassengerId/Survived
# columns to submission.csv.
"""
model = linear_model.LogisticRegression()
model.fit(getX(data), data['Survived'])
data = pandas.read_csv('test.csv')
data = preprocess(data)
data['Survived'] = model.predict(getX(data))
data.to_csv('submission.csv', index=False, columns=['PassengerId', 'Survived'])
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment