ryllada/Feature preprocess

## Feature preprocess
#!/usr/bin/env python
# encoding: utf-8
import re


__author__ = 'Rodolfo Yllada (ryllada@gmail.com)'


def preprocess(data, predictors_base):
    """Impute, encode and derive features, and update the predictor list.

    Missing "Fare" and "Age" values are filled with the column median,
    "Sex" is recoded (female=1, male=2) and "Embarked" is recoded
    (S=0, C=1, Q=2, missing treated as "S"). The derived columns "Person",
    "FamilySize", "NameLength", "Title", "FamilyId" and "Number_of_cabins"
    are added, and the raw "Name", "SibSp", "Parch", "Ticket" and "Cabin"
    columns are dropped.

    Args:
        data: DataFrame holding the columns named above. The imputed,
            encoded and derived columns are written onto this object in
            place; only the final column drops produce a new frame.
        predictors_base: sequence of predictor column names. It is copied,
            never modified.

    Returns:
        A ``(data, predictors)`` tuple: the frame without the dropped raw
        columns, and the predictor list with the derived columns added and
        "PassengerId" plus the dropped columns removed.

    Raises:
        ValueError: if "PassengerId" or one of the dropped column names is
            not present in ``predictors_base``.
    """
    # Work on a copy so the caller's predictor list is left untouched.
    predictors = list(predictors_base)

    data["Fare"] = data["Fare"].fillna(data["Fare"].median())
    data["Age"] = data["Age"].fillna(data["Age"].median()).astype(int)

    data.loc[data["Sex"] == "female", "Sex"] = 1
    data.loc[data["Sex"] == "male", "Sex"] = 2

    data["Person"] = data.apply(get_person, axis=1)
    predictors.append("Person")

    data["Embarked"] = data["Embarked"].fillna("S")
    data.loc[data["Embarked"] == "S", "Embarked"] = 0
    data.loc[data["Embarked"] == "C", "Embarked"] = 1
    data.loc[data["Embarked"] == "Q", "Embarked"] = 2

    # Relatives travelling with the passenger (the passenger is not counted).
    data["FamilySize"] = data["SibSp"] + data["Parch"]
    predictors.append("FamilySize")

    data["NameLength"] = data["Name"].apply(len)
    predictors.append("NameLength")

    title_mapping = {
        "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
        "Major": 7,
        "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Dona": 9, "Lady": 10,
        "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
    # Titles missing from the mapping are kept as their original string.
    data["Title"] = data["Name"].apply(get_title).map(
        lambda title: title_mapping.get(title, title))
    predictors.append("Title")

    family_ids = data.apply(get_family_id, axis=1)

    # There are a lot of family ids, so we'll compress all of the families
    # under 3 members into one code.
    family_ids[data["FamilySize"] < 3] = -1
    data["FamilyId"] = family_ids
    predictors.append("FamilyId")

    # Fill missing cabins with "" *before* the str cast: otherwise NaN turns
    # into the text "nan" and is counted as one cabin instead of zero.
    data["Number_of_cabins"] = data["Cabin"].fillna("").astype(str).apply(
        lambda cabins: len(cabins.split()))
    predictors.append("Number_of_cabins")

    predictors.remove("PassengerId")

    # ``axis`` must be passed by keyword: pandas >= 2.0 rejects it
    # positionally.
    for column in ("Name", "SibSp", "Parch", "Ticket", "Cabin"):
        data = data.drop(column, axis=1)
        predictors.remove(column)
    return data, predictors


def get_person(row):
    """Return the person category for a passenger row.

    Passengers whose "Age" is 16 or less map to 0; everyone else maps to
    the value stored under "Sex".
    """
    is_child = row["Age"] <= 16
    return 0 if is_child else row["Sex"]


def get_title(name):
    """Extract the honorific title from a passenger name.

    The title is the first run of ASCII letters that is preceded by a
    space and followed by a period, e.g. "Mr" in "Braund, Mr. Owen Harris".

    Args:
        name: the passenger name string.

    Returns:
        The title without the trailing period, or "" when none is found.
    """
    # Raw string: "\." in a normal literal is an invalid escape sequence
    # that newer Python versions warn about.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search is None:
        return ""
    return title_search.group(1)


# Shared family-id registry. It has to outlive a single call: a mapping
# created inside the function would be empty every time, giving every row
# the id 1.
_FAMILY_ID_MAPPING = {}


def get_family_id(row, family_id_mapping=None):
    """Return a numeric id shared by all rows of the same family.

    A family is keyed by the last name (the text before the first comma of
    "Name") concatenated with "FamilySize". The first time a key is seen it
    receives the next unused id (starting at 1); later rows with the same
    key get the same id back.

    Args:
        row: mapping-like row exposing "Name" and "FamilySize".
        family_id_mapping: optional dict used as the key -> id registry.
            When omitted, a module-level registry shared by all calls is
            used.

    Returns:
        The integer id assigned to the row's family.
    """
    if family_id_mapping is None:
        family_id_mapping = _FAMILY_ID_MAPPING

    # Find the last name by splitting on a comma
    last_name = row["Name"].split(",")[0]
    family_id = "{0}{1}".format(last_name, row["FamilySize"])

    if family_id not in family_id_mapping:
        # Next id is one past the largest id handed out so far.
        if family_id_mapping:
            next_id = max(family_id_mapping.values()) + 1
        else:
            next_id = 1
        family_id_mapping[family_id] = next_id
    return family_id_mapping[family_id]
	#!/usr/bin/env python
	# encoding: utf-8
	import re


	__author__ = 'Rodolfo Yllada (ryllada@gmail.com)'


	def preprocess(data, predictors_base):
	    """Impute, encode and derive features, and update the predictor list.

	    Missing "Fare" and "Age" values are filled with the column median,
	    "Sex" is recoded (female=1, male=2) and "Embarked" is recoded
	    (S=0, C=1, Q=2, missing treated as "S"). The derived columns "Person",
	    "FamilySize", "NameLength", "Title", "FamilyId" and "Number_of_cabins"
	    are added, and the raw "Name", "SibSp", "Parch", "Ticket" and "Cabin"
	    columns are dropped.

	    Args:
	        data: DataFrame holding the columns named above. The imputed,
	            encoded and derived columns are written onto this object in
	            place; only the final column drops produce a new frame.
	        predictors_base: sequence of predictor column names. It is copied,
	            never modified.

	    Returns:
	        A ``(data, predictors)`` tuple: the frame without the dropped raw
	        columns, and the predictor list with the derived columns added and
	        "PassengerId" plus the dropped columns removed.

	    Raises:
	        ValueError: if "PassengerId" or one of the dropped column names is
	            not present in ``predictors_base``.
	    """
	    # Work on a copy so the caller's predictor list is left untouched.
	    predictors = list(predictors_base)

	    data["Fare"] = data["Fare"].fillna(data["Fare"].median())
	    data["Age"] = data["Age"].fillna(data["Age"].median()).astype(int)

	    data.loc[data["Sex"] == "female", "Sex"] = 1
	    data.loc[data["Sex"] == "male", "Sex"] = 2

	    data["Person"] = data.apply(get_person, axis=1)
	    predictors.append("Person")

	    data["Embarked"] = data["Embarked"].fillna("S")
	    data.loc[data["Embarked"] == "S", "Embarked"] = 0
	    data.loc[data["Embarked"] == "C", "Embarked"] = 1
	    data.loc[data["Embarked"] == "Q", "Embarked"] = 2

	    # Relatives travelling with the passenger (the passenger is not counted).
	    data["FamilySize"] = data["SibSp"] + data["Parch"]
	    predictors.append("FamilySize")

	    data["NameLength"] = data["Name"].apply(len)
	    predictors.append("NameLength")

	    title_mapping = {
	        "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
	        "Major": 7,
	        "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Dona": 9, "Lady": 10,
	        "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
	    # Titles missing from the mapping are kept as their original string.
	    data["Title"] = data["Name"].apply(get_title).map(
	        lambda title: title_mapping.get(title, title))
	    predictors.append("Title")

	    family_ids = data.apply(get_family_id, axis=1)

	    # There are a lot of family ids, so we'll compress all of the families
	    # under 3 members into one code.
	    family_ids[data["FamilySize"] < 3] = -1
	    data["FamilyId"] = family_ids
	    predictors.append("FamilyId")

	    # Fill missing cabins with "" *before* the str cast: otherwise NaN turns
	    # into the text "nan" and is counted as one cabin instead of zero.
	    data["Number_of_cabins"] = data["Cabin"].fillna("").astype(str).apply(
	        lambda cabins: len(cabins.split()))
	    predictors.append("Number_of_cabins")

	    predictors.remove("PassengerId")

	    # ``axis`` must be passed by keyword: pandas >= 2.0 rejects it
	    # positionally.
	    for column in ("Name", "SibSp", "Parch", "Ticket", "Cabin"):
	        data = data.drop(column, axis=1)
	        predictors.remove(column)
	    return data, predictors


	def get_person(row):
	    """Return the person category for a passenger row.

	    Passengers whose "Age" is 16 or less map to 0; everyone else maps to
	    the value stored under "Sex".
	    """
	    if row["Age"] <= 16:
	        return 0
	    return row["Sex"]


	def get_title(name):
	    """Extract the honorific title from a passenger name.

	    The title is the first run of ASCII letters that is preceded by a
	    space and followed by a period, e.g. "Mr" in "Braund, Mr. Owen Harris".

	    Args:
	        name: the passenger name string.

	    Returns:
	        The title without the trailing period, or "" when none is found.
	    """
	    # Raw string: "\." in a normal literal is an invalid escape sequence
	    # that newer Python versions warn about.
	    title_search = re.search(r' ([A-Za-z]+)\.', name)
	    # If the title exists, extract and return it.
	    if title_search:
	        return title_search.group(1)
	    return ""


	# Shared family-id registry. It has to outlive a single call: a mapping
	# created inside the function would be empty every time, giving every row
	# the id 1.
	_FAMILY_ID_MAPPING = {}


	def get_family_id(row, family_id_mapping=None):
	    """Return a numeric id shared by all rows of the same family.

	    A family is keyed by the last name (the text before the first comma of
	    "Name") concatenated with "FamilySize". The first time a key is seen it
	    receives the next unused id (starting at 1); later rows with the same
	    key get the same id back.

	    Args:
	        row: mapping-like row exposing "Name" and "FamilySize".
	        family_id_mapping: optional dict used as the key -> id registry.
	            When omitted, a module-level registry shared by all calls is
	            used.

	    Returns:
	        The integer id assigned to the row's family.
	    """
	    if family_id_mapping is None:
	        family_id_mapping = _FAMILY_ID_MAPPING

	    # Find the last name by splitting on a comma
	    last_name = row["Name"].split(",")[0]
	    family_id = "{0}{1}".format(last_name, row["FamilySize"])

	    if family_id not in family_id_mapping:
	        # Next id is one past the largest id handed out so far.
	        if family_id_mapping:
	            next_id = max(family_id_mapping.values()) + 1
	        else:
	            next_id = 1
	        family_id_mapping[family_id] = next_id
	    return family_id_mapping[family_id]