Build, train and test a Binomial Regression model on the Titanic dataset using Python, pandas, and statsmodels
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# Load the data set into a pandas DataFrame; the first CSV row holds the
# column names.
df = pd.read_csv('titanic_dataset.csv', header=0)
# print() is used so the preview also shows up when this runs as a plain
# script; a bare `df.head(10)` is only displayed by an interactive notebook.
print(df.head(10))

# Drop the columns that the model will not use.
df = df.drop(
    columns=['Name', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']
)
print(df.head(10))

# Bucket the Age column into 5-year bins.
# Bin edges: 0, 5, 10, ..., 80. pd.cut closes intervals on the right by
# default, so the buckets are (0, 5], (5, 10], ..., (75, 80]; any age outside
# (0, 80] ends up as NaN in Age_Range.
age_range_bins = list(range(0, 85, 5))
# One label per bin (num labels = num bins - 1): each bucket is labelled with
# its upper edge.
age_range_labels = age_range_bins[1:]
# Cut the ages into the bins and store the bucket label in a new column.
df['Age_Range'] = pd.cut(df['Age'], age_range_bins, labels=age_range_labels)
print(df.head(10))

# The raw Age column is redundant once Age_Range exists.
df = df.drop(columns=['Age'])
# Group the passengers by class, sex and age bucket.
# observed=True keeps only the combinations that actually occur in the data.
# Age_Range is categorical, and without this flag pandas can emit a row for
# every unobserved combination with a count of 0; such rows are not removed by
# dropna() and would produce zero-trial binomial observations and 0/0 rates.
groups = df.groupby(['Pclass', 'Sex', 'Age_Range'], observed=True)

# Aggregate the Survived column per group (assumed to be a 0/1 flag -- TODO
# confirm against the data set):
#   count -> number of passengers in the group  (Total)
#   sum   -> number of survivors in the group   (Survived)
# reset_index() turns the (Pclass, Sex, Age_Range) index levels back into
# ordinary columns, which replaces the earlier write-to-CSV / read-back step.
df_grouped = (
    groups['Survived']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'Total', 'sum': 'Survived'})
    .reset_index()
)

# Store Age_Range as a plain integer so the regression formula treats it as a
# numeric regressor rather than as a categorical factor.
df_grouped['Age_Range'] = df_grouped['Age_Range'].astype(int)

# Add a column containing the number who died.
df_grouped['Died'] = df_grouped['Total'] - df_grouped['Survived']
print(df_grouped.head(20))

df_grouped = df_grouped.dropna()

# Replace the 'female' and 'male' strings with the integers 1 and 2.
# Only the Sex column is touched; astype(int) fails loudly if the column holds
# any value other than these two instead of silently changing the model.
df_grouped = df_grouped.assign(
    Sex=df_grouped['Sex'].map({'female': 1, 'male': 2}).astype(int)
)
import numpy as np
from patsy import dmatrices

# Separate out the training and test sets: each row lands in the training set
# with probability 0.85. No random seed is set here, so the split differs
# from run to run.
mask = np.random.rand(len(df_grouped)) < 0.85
# .copy() gives each split its own data, so adding columns to df_train or
# df_test later does not raise pandas' SettingWithCopyWarning.
df_train = df_grouped[mask].copy()
df_test = df_grouped[~mask].copy()

# Binomial model's regression formula in Patsy syntax. The two-column
# left-hand side supplies (number of successes, number of failures) per group.
formula = 'Survived + Died ~ Pclass + Age_Range + Sex'

# Carve out the y and X design matrices from the training data frame.
y_train, X_train = dmatrices(formula, df_train, return_type='dataframe')
# Carve out the y and X design matrices from the testing data frame.
y_test, X_test = dmatrices(formula, df_test, return_type='dataframe')
import statsmodels.api as sm

# Feed X_train and y_train into a GLM with a Binomial family and train it.
# y_train has two columns (Survived, Died), as built by the regression formula.
binom_model = sm.GLM(y_train, X_train, family=sm.families.Binomial())
binom_model_results = binom_model.fit()
# Print the fitted model's summary.
print(binom_model_results.summary())
import matplotlib.pyplot as plt

# Add the observed survival rate (survivors / passengers) of each test group.
# assign() builds a new frame instead of writing into df_test in place, which
# avoids SettingWithCopyWarning when df_test is a slice of another frame.
df_test = df_test.assign(Pcnt_Survived=df_test['Survived'] / df_test['Total'])

# Ask the fitted model for the predicted survival rate of each test group.
predicted_survival_rate = binom_model_results.predict(X_test)

# Plot the actual versus predicted survival rate.
plt.xlabel('Actual Survival Rate')
plt.ylabel('Predicted Survival Rate')
plt.scatter(df_test['Pcnt_Survived'], predicted_survival_rate, color='blue')
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment