Created
October 11, 2016 11:40
-
-
Save ryllada/12562cb903c3cf27563e6c9b785fe11d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
import re | |
__author__ = 'Rodolfo Yllada (ryllada@gmail.com)' | |
def preprocess(data, predictors_base):
    """Clean a passenger DataFrame and derive extra feature columns.

    The frame is expected to contain the columns "Fare", "Age", "Sex",
    "Embarked", "SibSp", "Parch", "Name", "Ticket" and "Cabin".

    NOTE: the filled, encoded and newly added columns are written into the
    ``data`` object that was passed in; only the final column drop produces
    a new frame, which is the one returned.

    Args:
        data: pandas DataFrame with the columns listed above.
        predictors_base: sequence of column names to start the predictor
            list from. It is copied, never modified.

    Returns:
        A ``(data, predictors)`` tuple: the frame without the "Name",
        "SibSp", "Parch", "Ticket" and "Cabin" columns, and the predictor
        list extended with the derived columns and stripped of the dropped
        columns and of "PassengerId".
    """
    predictors = list(predictors_base)

    # Missing numeric values are replaced by the column median.
    data["Fare"] = data["Fare"].fillna(data["Fare"].median())
    data["Age"] = data["Age"].fillna(data["Age"].median())
    data["Age"] = data["Age"].astype(int)

    # Sex must be encoded before "Person" is computed, because get_person
    # returns the (already numeric) "Sex" value for everyone older than 16.
    data.loc[data["Sex"] == "female", "Sex"] = 1
    data.loc[data["Sex"] == "male", "Sex"] = 2
    data["Person"] = data.apply(get_person, axis=1)
    predictors.append("Person")

    # Missing ports of embarkation are filled with "S" before encoding.
    data["Embarked"] = data["Embarked"].fillna("S")
    data.loc[data["Embarked"] == "S", "Embarked"] = 0
    data.loc[data["Embarked"] == "C", "Embarked"] = 1
    data.loc[data["Embarked"] == "Q", "Embarked"] = 2

    data["FamilySize"] = data["SibSp"] + data["Parch"]
    predictors.append("FamilySize")

    data["NameLength"] = data["Name"].apply(len)
    predictors.append("NameLength")

    # Several titles deliberately share one code.
    title_mapping = {
        "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
        "Major": 7,
        "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Dona": 9, "Lady": 10,
        "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
    titles = data["Name"].apply(get_title)
    # Single pass over the titles; a title without a code is kept as is.
    data["Title"] = titles.map(lambda title: title_mapping.get(title, title))
    predictors.append("Title")

    family_ids = data.apply(get_family_id, axis=1)
    # There are a lot of family ids, so we'll compress all of the families
    # under 3 members into one code.
    family_ids[data["FamilySize"] < 3] = -1
    data["FamilyId"] = family_ids
    predictors.append("FamilyId")

    # A missing cabin is not a string and counts as zero cabins. Casting the
    # column to str first would turn a missing value into "nan" and count it
    # as one cabin.
    data["Number_of_cabins"] = data["Cabin"].apply(
        lambda cabin: len(cabin.split()) if isinstance(cabin, str) else 0)
    predictors.append("Number_of_cabins")

    # Raw columns that have been replaced by derived features.
    dropped_columns = ["Name", "SibSp", "Parch", "Ticket", "Cabin"]
    # Keyword form: the positional axis argument of drop() is not accepted
    # by recent pandas versions.
    data = data.drop(columns=dropped_columns)

    # "PassengerId" stays in the frame but is not used as a predictor.
    # Names that are absent from the predictor list are simply skipped.
    excluded = set(dropped_columns)
    excluded.add("PassengerId")
    predictors = [name for name in predictors if name not in excluded]

    return data, predictors
def get_person(row):
    """Return the person category for one passenger row.

    Args:
        row: mapping-like object with "Age" and "Sex" entries.

    Returns:
        0 when ``row["Age"]`` is 16 or less, otherwise ``row["Sex"]``
        unchanged.
    """
    is_child = row["Age"] <= 16
    return 0 if is_child else row["Sex"]
# A title is a run of ASCII letters preceded by a space and followed by a
# period. Raw string: "\." is not a valid escape in a normal string literal.
_TITLE_PATTERN = re.compile(r" ([A-Za-z]+)\.")


def get_title(name):
    """Extract the title (e.g. "Mr", "Miss") from a passenger name.

    Args:
        name: the name string to search.

    Returns:
        The first run of letters that follows a space and ends with a
        period, without the period; "" when there is no such run.
    """
    title_search = _TITLE_PATTERN.search(name)
    if title_search:
        return title_search.group(1)
    return ""
# Family key ("<last name><family size>") -> numeric id. Kept at module level
# so that it persists across calls; a mapping created inside the function
# would be empty on every call and every row would receive the id 1.
_family_id_mapping = {}


def get_family_id(row):
    """Return a numeric id for the family a passenger row belongs to.

    A family is identified by the part of ``row["Name"]`` before the first
    comma combined with ``row["FamilySize"]``. Ids are handed out
    sequentially starting at 1 in order of first appearance, and the same
    family always gets the same id for the lifetime of the module.

    Args:
        row: mapping-like object with "Name" and "FamilySize" entries.

    Returns:
        The integer id of the row's family.
    """
    # Find the last name by splitting on a comma
    last_name = row["Name"].split(",")[0]
    # Create the family id
    family_key = "{0}{1}".format(last_name, row["FamilySize"])
    # Look up the id in the mapping
    if family_key not in _family_id_mapping:
        # Ids are consecutive and never removed, so the next free id is
        # always one more than the number of families seen so far.
        _family_id_mapping[family_key] = len(_family_id_mapping) + 1
    return _family_id_mapping[family_key]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment