Created
June 5, 2017 10:58
-
-
Save jay-trivedi/4ab63ebf09313094e8e97376d6f254ad to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Code courtesy: Manav Sehgal (https://www.kaggle.com/startupsci)
## Link to Original Code: https://www.kaggle.com/startupsci/titanic-data-science-solutions | |
# data analysis and wrangling | |
import pandas as pd | |
import numpy as np | |
import random as rnd | |
# Honorifics that are collapsed into the single "Rare" category.
_RARE_TITLES = ('Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                'Rev', 'Sir', 'Jonkheer', 'Dona')

# Numeric code for every honorific we recognise. Synonyms (Mlle/Ms -> Miss,
# Mme -> Mrs) and the rare titles are folded in, so one lookup replaces the
# replace-then-map chain.
_TITLE_CODES = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
_TITLE_CODES.update({'Mlle': 2, 'Ms': 2, 'Mme': 3})
_TITLE_CODES.update({title: 5 for title in _RARE_TITLES})


def _impute_age(df):
    """Fill missing ``Age`` values in place and cast the column to int.

    A missing age is replaced by the median age of passengers with the same
    ``Sex`` (0/1) and ``Pclass`` (1/2/3), rounded to the nearest 0.5.  Only
    groups that actually contain a missing age are looked at, so a data set
    that lacks some Sex/Pclass combination no longer fails.  If a group has
    no known age at all, the median over the whole frame is used instead.

    Raises:
        ValueError: if an age must be imputed but the frame has no known
            ages, or if missing ages remain (rows outside the six groups)
            when the column is cast to int.
    """
    missing = df['Age'].isnull()
    if missing.any():
        overall_median = df['Age'].median()
        for sex in (0, 1):
            for pclass in (1, 2, 3):
                in_group = (df['Sex'] == sex) & (df['Pclass'] == pclass)
                to_fill = missing & in_group
                if not to_fill.any():
                    continue
                guess = df.loc[in_group, 'Age'].median()
                if pd.isnull(guess):
                    guess = overall_median
                if pd.isnull(guess):
                    raise ValueError(
                        "cannot impute Age: no known ages in the data")
                # Round the median to the nearest .5 age.
                df.loc[to_fill, 'Age'] = int(guess / 0.5 + 0.5) * 0.5
    df['Age'] = df['Age'].astype(int)


def _engineer_features(df):
    """Return a cleaned copy of a raw Titanic frame.

    Drops ``Ticket``, ``Cabin`` and ``Name``; adds ``Title`` (numeric code of
    the honorific found in ``Name``, 0 when unrecognised), ``FamilySize`` and
    ``IsAlone``; encodes ``Sex`` as 1 (female) / 0 (male); imputes ``Age``.
    """
    df = df.drop(['Ticket', 'Cabin'], axis=1)

    # The honorific is the word that precedes the first '.' in the name.
    honorific = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = honorific.map(_TITLE_CODES).fillna(0)
    df = df.drop(['Name'], axis=1)

    df['Sex'] = df['Sex'].map({'female': 1, 'male': 0}).astype(int)
    _impute_age(df)

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    return df


def titanic(train, test):
    """Load the Titanic CSV files and return the feature-engineered train set.

    Args:
        train: path or file-like object accepted by ``pd.read_csv`` holding
            the training data.
        test: path or file-like object accepted by ``pd.read_csv`` holding
            the test data.

    Returns:
        The cleaned training ``DataFrame`` (``PassengerId``, ``Name``,
        ``Ticket`` and ``Cabin`` removed; ``Title``, ``FamilySize`` and
        ``IsAlone`` added; ``Sex`` numeric; ``Age`` and ``Embarked`` filled).

    The test frame is read and cleaned the same way (it keeps
    ``PassengerId`` and gets its missing ``Fare`` values filled with the
    median) but it is not returned.
    """
    raw_train = pd.read_csv(train)
    raw_test = pd.read_csv(test)

    train_df = _engineer_features(raw_train).drop(['PassengerId'], axis=1)
    test_df = _engineer_features(raw_test)

    # Missing ports are filled with the most frequent port of the train set.
    freq_port = train_df.Embarked.dropna().mode()[0]
    train_df['Embarked'] = train_df['Embarked'].fillna(freq_port)
    test_df['Embarked'] = test_df['Embarked'].fillna(freq_port)

    # Assign the result back: an in-place fillna on a column selection is
    # not guaranteed to modify the frame.
    test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].dropna().median())

    return train_df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment