Skip to content

Instantly share code, notes, and snippets.

@sachinsdate
Last active February 25, 2020 14:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sachinsdate/5fae9fb94053ecef44426d026c471620 to your computer and use it in GitHub Desktop.
Save sachinsdate/5fae9fb94053ecef44426d026c471620 to your computer and use it in GitHub Desktop.
Build, train and test a Binomial Regression model on the Titanic dataset using Python, pandas, and statsmodels
import pandas as pd
#Read the Titanic passenger records into a Pandas data frame; row 0 of the
#CSV file supplies the column names
df = pd.read_csv('titanic_dataset.csv', header=0)
#Preview the first 10 rows (a bare expression: it is only echoed when run
#interactively, e.g. in a notebook cell)
df.head(10)
#Columns the model will not use
unused_columns = [
    'Name',
    'Siblings/Spouses Aboard',
    'Parents/Children Aboard',
    'Fare',
]
#Drop them from the data frame
df = df.drop(columns=unused_columns)
#Preview the first 10 rows of the trimmed data frame
df.head(10)
#Bucket the Age column into 5-year wide bins
#Bin edges: 0, 5, 10, ..., 80
age_range_bins = list(range(0, 85, 5))
#Each bin is labelled with its upper edge, so Num labels = Num bins - 1
age_range_labels = age_range_bins[1:]
#Assign every age to its bin and store the bin label in a new Age_Range column
df['Age_Range'] = pd.cut(df['Age'], bins=age_range_bins, labels=age_range_labels)
#Preview the result
df.head(10)
#The raw Age column has been replaced by Age_Range, so drop it
df = df.drop(columns=['Age'])
#Group by ['Pclass', 'Sex', 'Age_Range'].
#Age_Range is a categorical column, so observed=True is passed to keep only
#the combinations that actually occur in the data. Without it, recent pandas
#versions also emit a row with a count of 0 for every empty combination; such
#rows are not removed by dropna() and would later give a 0/0 survival rate.
groups = df.groupby(['Pclass', 'Sex', 'Age_Range'], observed=True)
#Get the size of each group: count() tallies the non-null Survived entries,
#i.e. the number of passengers in the group
df_grouped_counts = groups.count()
#Get the number of survivors in each group by summing the Survived column
#(assumes Survived is a 0/1 indicator)
df_grouped_survived = groups.sum()
#Turn the (Pclass, Sex, Age_Range) group index back into ordinary columns.
#reset_index() does this in memory, so the intermediate files
#df_grouped_counts.csv and df_grouped_survived.csv are no longer written.
df_grouped_counts_1 = df_grouped_counts.reset_index()
df_grouped_survived_1 = df_grouped_survived.reset_index()
#Create a new Data Frame
df_grouped = pd.DataFrame()
#Copy over the Pclass, Sex and Age Range columns
df_grouped['Pclass'] = df_grouped_counts_1['Pclass']
df_grouped['Sex'] = df_grouped_counts_1['Sex']
#Age_Range is still categorical here; cast it to plain integers so it is used
#as a numeric regressor (this is what reading it back from CSV produced)
df_grouped['Age_Range'] = df_grouped_counts_1['Age_Range'].astype('int64')
#Copy over the num passengers from the counts grouped Data Frame
df_grouped['Total'] = df_grouped_counts_1['Survived']
#Copy over the num survivors from the summation grouped Data Frame
df_grouped['Survived'] = df_grouped_survived_1['Survived']
#Add a column containing the number who died
df_grouped['Died'] = df_grouped['Total'] - df_grouped['Survived']
df_grouped.head(20)
#Safety net: discard any row that still contains a missing value
df_grouped = df_grouped.dropna()
#Replace the 'female' and 'male' strings with integers 1 and 2.
#Only the Sex column is mapped, value by value, so the result is a genuinely
#numeric column. A whole-frame replace() relies on pandas silently downcasting
#the object column to integers, which is deprecated behaviour. Any value other
#than 'female'/'male' is left unchanged.
sex_codes = {'female': 1, 'male': 2}
df_grouped = df_grouped.assign(
    Sex=df_grouped['Sex'].map(lambda sex: sex_codes.get(sex, sex))
)
#Separate out the training and test sets: each row goes to the training set
#with probability 0.85, otherwise to the test set
import numpy as np
mask = np.random.rand(len(df_grouped)) < 0.85
#Take explicit copies so that adding columns to either subset later does not
#raise pandas' SettingWithCopyWarning
df_train = df_grouped[mask].copy()
df_test = df_grouped[~mask].copy()
#patsy builds the design matrices, statsmodels supplies the GLM
from patsy import dmatrices
import statsmodels.api as sm
#The Binomial model's regression formula in Patsy syntax: the two-column
#response (Survived, Died) is regressed on Pclass, Age_Range and Sex
formula = 'Survived + Died ~ Pclass + Age_Range + Sex'
#Build the y (response) and X (regressor) matrices for the training set
y_train, X_train = dmatrices(formula, data=df_train, return_type='dataframe')
#Build the same pair of matrices for the test set
y_test, X_test = dmatrices(formula, data=df_test, return_type='dataframe')
#Set up a GLM with a Binomial family on the training matrices and fit it
binom_family = sm.families.Binomial()
binom_model = sm.GLM(endog=y_train, exog=X_train, family=binom_family)
binom_model_results = binom_model.fit()
#Print the fitted model's summary
model_summary = binom_model_results.summary()
print(model_summary)
#Observed survival rate of each test group (survivors divided by group size);
#this is the value the model is asked to predict
df_test['Pcnt_Survived'] = df_test['Survived'].div(df_test['Total'])
#Predicted survival rate for each test group from the fitted model
predicted_survival_rate = binom_model_results.predict(X_test)
#Scatter plot of the actual versus predicted survival rate
import matplotlib.pyplot as plt
plt.scatter(x=df_test['Pcnt_Survived'], y=predicted_survival_rate, color='blue')
plt.xlabel('Actual Survival Rate')
plt.ylabel('Predicted Survival Rate')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment