Skip to content

Instantly share code, notes, and snippets.

@ryllada
Created October 11, 2016 11:40
Show Gist options
  • Save ryllada/12562cb903c3cf27563e6c9b785fe11d to your computer and use it in GitHub Desktop.
Save ryllada/12562cb903c3cf27563e6c9b785fe11d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# encoding: utf-8
import re
__author__ = 'Rodolfo Yllada (ryllada@gmail.com)'
# Numeric codes for the honorifics extracted from passenger names; several
# titles deliberately share one code.
_TITLE_CODES = {
    "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
    "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Dona": 9,
    "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7,
    "Ms": 2,
}
_SEX_CODES = {"female": 1, "male": 2}
_EMBARKED_CODES = {"S": 0, "C": 1, "Q": 2}
# Raw columns that are dropped from the frame once the derived features exist.
_DROPPED_COLUMNS = ("Name", "SibSp", "Parch", "Ticket", "Cabin")


def _encode(series, codes):
    """Replace every value found in ``codes`` by its code; keep the rest."""
    return series.map(lambda value: codes.get(value, value))


def preprocess(data, predictors_base):
    """Clean a passenger table and derive the extra feature columns.

    The column names (Fare, Age, Sex, Embarked, SibSp, Parch, Name, Ticket,
    Cabin) suggest the Kaggle Titanic data set -- confirm against the caller.

    Note that ``data`` is modified in place (missing values filled, columns
    recoded and added) before the raw columns are dropped into the returned
    frame.

    Args:
        data: pandas DataFrame holding the columns listed above.
        predictors_base: list of predictor column names. It is copied, not
            mutated, and must contain "PassengerId", "Name", "SibSp",
            "Parch", "Ticket" and "Cabin".

    Returns:
        A ``(data, predictors)`` tuple: the frame without the raw columns and
        the predictor list with the derived features appended and the
        identifier / raw columns removed.

    Raises:
        ValueError: if ``predictors_base`` lacks one of the names that have
            to be removed from it.
    """
    predictors = predictors_base[:]

    # Fill gaps with the column median; ages are then truncated to integers.
    data["Fare"] = data["Fare"].fillna(data["Fare"].median())
    data["Age"] = data["Age"].fillna(data["Age"].median()).astype(int)

    data["Sex"] = _encode(data["Sex"], _SEX_CODES)
    data["Person"] = data.apply(get_person, axis=1)
    predictors.append("Person")

    # Missing ports of embarkation are filled with "S" before encoding.
    data["Embarked"] = _encode(data["Embarked"].fillna("S"), _EMBARKED_CODES)

    data["FamilySize"] = data["SibSp"] + data["Parch"]
    predictors.append("FamilySize")

    data["NameLength"] = data["Name"].apply(len)
    predictors.append("NameLength")

    # Titles missing from the mapping are kept as the extracted string.
    data["Title"] = _encode(data["Name"].apply(get_title), _TITLE_CODES)
    predictors.append("Title")

    family_ids = data.apply(get_family_id, axis=1)
    # There are a lot of family ids, so we'll compress all of the families
    # under 3 members into one code.
    family_ids[data["FamilySize"] < 3] = -1
    data["FamilyId"] = family_ids
    predictors.append("FamilyId")

    # Count whitespace-separated cabin labels. A missing cabin must count as
    # 0: casting NaN to str yields "nan", which would otherwise be counted
    # as one cabin.
    cabins = data["Cabin"]
    cabin_counts = cabins.astype(str).apply(lambda label: len(label.split()))
    data["Number_of_cabins"] = cabin_counts.where(cabins.notnull(), 0)
    predictors.append("Number_of_cabins")

    # PassengerId stays in the frame but is not used as a predictor.
    predictors.remove("PassengerId")
    for column in _DROPPED_COLUMNS:
        predictors.remove(column)
    # ``axis`` must be passed by keyword: the positional form was removed in
    # pandas 2.0.
    data = data.drop(list(_DROPPED_COLUMNS), axis=1)

    return data, predictors
def get_person(row):
    """Return 0 when the row's "Age" is 16 or less, else the row's "Sex"."""
    return 0 if row["Age"] <= 16 else row["Sex"]
# A title is a run of ASCII letters preceded by a space and followed by a
# period. Written as a raw string so that ``\.`` is not an invalid string
# escape, and compiled once instead of on every call.
_TITLE_PATTERN = re.compile(r" ([A-Za-z]+)\.")


def get_title(name):
    """Extract the title (e.g. "Mr", "Mrs") from a passenger name.

    Args:
        name: full name string such as "Braund, Mr. Owen Harris".

    Returns:
        The first space-preceded, period-terminated word in ``name`` without
        the period, or "" when the name contains no such word.
    """
    match = _TITLE_PATTERN.search(name)
    if match is None:
        return ""
    return match.group(1)
# Family ids handed out so far, keyed by "<last name><family size>". This has
# to live at module level: a mapping created inside the function would be
# empty on every call and every row would receive the id 1.
_FAMILY_ID_MAPPING = {}


def get_family_id(row):
    """Return a numeric id shared by all rows of the same family.

    A family is identified by the last name (the part of "Name" before the
    first comma) combined with "FamilySize". Ids start at 1, are assigned in
    order of first appearance and are remembered for the lifetime of the
    module, so repeated calls yield consistent ids.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing "Name" and
            "FamilySize".

    Returns:
        The integer id of the row's family.
    """
    # Find the last name by splitting on a comma
    last_name = row["Name"].split(",")[0]
    # Create the family key
    family_key = "{0}{1}".format(last_name, row["FamilySize"])
    if family_key not in _FAMILY_ID_MAPPING:
        # Ids are consecutive from 1, so the next free id is the count + 1.
        _FAMILY_ID_MAPPING[family_key] = len(_FAMILY_ID_MAPPING) + 1
    return _FAMILY_ID_MAPPING[family_key]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment