# sachinsdate/binomial_regression.py

## binomial_regression.py
#All imports are gathered here so that a missing dependency fails before any data is read
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices

#load the data set into a Pandas data frame, and print out the first few rows
#(a bare df.head() only displays something inside a notebook, so print it explicitly)
df = pd.read_csv('titanic_dataset.csv', header=0)
print(df.head(10))

#Drop the columns that our model will not use
df = df.drop(['Name','Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare'], axis=1)

#print the top 10 rows
print(df.head(10))

#Bucket the Age column into bins

#define the bin edges. With pd.cut's default settings each bin is closed on the right: (0,5], (5,10], ...
age_range_bins=[0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80]

#define the label for each bin: its upper edge. Num labels = Num bin edges - 1
age_range_labels=age_range_bins[1:]

#Cut up the age range into multiple bins and stuff them into a new Age_Range column
df['Age_Range']=pd.cut(df['Age'],age_range_bins,labels=age_range_labels)

#Print the output
print(df.head(10))

#Drop the age column
df = df.drop(['Age'],axis=1)

#Group by ['Pclass', 'Sex', 'Age_Range']. Age_Range is categorical, so observed=True keeps only
#the combinations that actually contain passengers. Empty groups carry no information and would
#otherwise reach the Binomial model as rows with zero trials
groups = df.groupby(['Pclass', 'Sex', 'Age_Range'], observed=True)

#Get the counts for each group. This is the size (number of passengers) of each group
df_grouped_counts = groups.count()

#Sum the Survived column within each group. This is the number of passengers in each group
#who have survived (assumes Survived is coded 0/1)
df_grouped_survived = groups.sum()

#Turn the (Pclass, Sex, Age_Range) index of each grouped data frame back into ordinary columns.
#Both frames come from the same groupby, so their rows line up one-to-one
df_grouped_counts_1 = df_grouped_counts.reset_index()
df_grouped_survived_1 = df_grouped_survived.reset_index()

#Create a new Data Frame holding the Pclass, Sex and Age Range columns
df_grouped = df_grouped_counts_1[['Pclass', 'Sex', 'Age_Range']].copy()

#The bin labels are integers. Store Age_Range as int (not categorical) so that the regression
#formula treats it as a single numeric regressor
df_grouped['Age_Range'] = df_grouped['Age_Range'].astype(int)

#Copy over the num passengers from the counts grouped Data Frame
df_grouped['Total'] = df_grouped_counts_1['Survived']

#Copy over the num survivors from the summation grouped Data Frame
df_grouped['Survived'] = df_grouped_survived_1['Survived']

#Add a column containing the number who died
df_grouped['Died'] = df_grouped['Total'] - df_grouped['Survived']

print(df_grouped.head(20))

#Drop any group row that still contains a missing value
df_grouped = df_grouped.dropna()

#replace the 'female' and 'male' strings with integers 1 and 2.
#The mapping is applied to the Sex column only; any other label makes astype(int) raise
#instead of slipping through as a string
df_grouped = df_grouped.assign(Sex=df_grouped['Sex'].map({'female': 1, 'male': 2}).astype(int))

#Separate out the training and test sets. The mask is random and unseeded, so roughly 85% of
#the groups land in the training set and the exact split differs from run to run
mask = np.random.rand(len(df_grouped)) < 0.85

#.copy() makes each set an independent frame, so adding a column to df_test further down
#does not write into a slice of df_grouped
df_train = df_grouped[mask].copy()
df_test = df_grouped[~mask].copy()

#Construct the Binomial model's regression formula in Patsy syntax.
#The left hand side yields a two column response: number survived and number died
formula = 'Survived + Died ~ Pclass + Age_Range + Sex'

#carve out the X and y design matrices from the training and testing data frames

#Carve out the training matrices from the training data frame using the regression formula
y_train, X_train = dmatrices(formula, df_train, return_type='dataframe')

#Carve out the testing matrices from the testing data frame using the regression formula
y_test, X_test = dmatrices(formula, df_test, return_type='dataframe')

#feed X_train and y_train into an instance of the Binomial Regression model class and train the model
binom_model = sm.GLM(y_train, X_train, family=sm.families.Binomial())
binom_model_results = binom_model.fit()

print(binom_model_results.summary())

#add a Percentage Survived column to the test data frame whose value we'll ask our model to predict
df_test['Pcnt_Survived'] = df_test['Survived']/df_test['Total']

#use the .predict() method on the results object and pass the test data set to get the predicted survival rate
predicted_survival_rate = binom_model_results.predict(X_test)

#plot the actual versus predicted survival rate
plt.xlabel('Actual Survival Rate')
plt.ylabel('Predicted Survival Rate')
plt.scatter(df_test['Pcnt_Survived'], predicted_survival_rate, color = 'blue')
plt.show()
	#NOTE(review): this tab-indented section repeats the script at the top of the file statement
	#for statement. It looks like a paste/extraction artifact, and as indented Python rejects it
	#with "unexpected indent" - confirm and remove the duplicate

	#All imports are gathered here so that a missing dependency fails before any data is read
	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	import statsmodels.api as sm
	from patsy import dmatrices

	#load the data set into a Pandas data frame, and print out the first few rows
	#(a bare df.head() only displays something inside a notebook, so print it explicitly)
	df = pd.read_csv('titanic_dataset.csv', header=0)
	print(df.head(10))

	#Drop the columns that our model will not use
	df = df.drop(['Name','Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare'], axis=1)

	#print the top 10 rows
	print(df.head(10))

	#Bucket the Age column into bins

	#define the bin edges. With pd.cut's default settings each bin is closed on the right: (0,5], (5,10], ...
	age_range_bins=[0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80]

	#define the label for each bin: its upper edge. Num labels = Num bin edges - 1
	age_range_labels=age_range_bins[1:]

	#Cut up the age range into multiple bins and stuff them into a new Age_Range column
	df['Age_Range']=pd.cut(df['Age'],age_range_bins,labels=age_range_labels)

	#Print the output
	print(df.head(10))

	#Drop the age column
	df = df.drop(['Age'],axis=1)

	#Group by ['Pclass', 'Sex', 'Age_Range']. Age_Range is categorical, so observed=True keeps only
	#the combinations that actually contain passengers. Empty groups carry no information and would
	#otherwise reach the Binomial model as rows with zero trials
	groups = df.groupby(['Pclass', 'Sex', 'Age_Range'], observed=True)

	#Get the counts for each group. This is the size (number of passengers) of each group
	df_grouped_counts = groups.count()

	#Sum the Survived column within each group. This is the number of passengers in each group
	#who have survived (assumes Survived is coded 0/1)
	df_grouped_survived = groups.sum()

	#Turn the (Pclass, Sex, Age_Range) index of each grouped data frame back into ordinary columns.
	#Both frames come from the same groupby, so their rows line up one-to-one
	df_grouped_counts_1 = df_grouped_counts.reset_index()
	df_grouped_survived_1 = df_grouped_survived.reset_index()

	#Create a new Data Frame holding the Pclass, Sex and Age Range columns
	df_grouped = df_grouped_counts_1[['Pclass', 'Sex', 'Age_Range']].copy()

	#The bin labels are integers. Store Age_Range as int (not categorical) so that the regression
	#formula treats it as a single numeric regressor
	df_grouped['Age_Range'] = df_grouped['Age_Range'].astype(int)

	#Copy over the num passengers from the counts grouped Data Frame
	df_grouped['Total'] = df_grouped_counts_1['Survived']

	#Copy over the num survivors from the summation grouped Data Frame
	df_grouped['Survived'] = df_grouped_survived_1['Survived']

	#Add a column containing the number who died
	df_grouped['Died'] = df_grouped['Total'] - df_grouped['Survived']

	print(df_grouped.head(20))

	#Drop any group row that still contains a missing value
	df_grouped = df_grouped.dropna()

	#replace the 'female' and 'male' strings with integers 1 and 2.
	#The mapping is applied to the Sex column only; any other label makes astype(int) raise
	#instead of slipping through as a string
	df_grouped = df_grouped.assign(Sex=df_grouped['Sex'].map({'female': 1, 'male': 2}).astype(int))

	#Separate out the training and test sets. The mask is random and unseeded, so roughly 85% of
	#the groups land in the training set and the exact split differs from run to run
	mask = np.random.rand(len(df_grouped)) < 0.85

	#.copy() makes each set an independent frame, so adding a column to df_test further down
	#does not write into a slice of df_grouped
	df_train = df_grouped[mask].copy()
	df_test = df_grouped[~mask].copy()

	#Construct the Binomial model's regression formula in Patsy syntax.
	#The left hand side yields a two column response: number survived and number died
	formula = 'Survived + Died ~ Pclass + Age_Range + Sex'

	#carve out the X and y design matrices from the training and testing data frames

	#Carve out the training matrices from the training data frame using the regression formula
	y_train, X_train = dmatrices(formula, df_train, return_type='dataframe')

	#Carve out the testing matrices from the testing data frame using the regression formula
	y_test, X_test = dmatrices(formula, df_test, return_type='dataframe')

	#feed X_train and y_train into an instance of the Binomial Regression model class and train the model
	binom_model = sm.GLM(y_train, X_train, family=sm.families.Binomial())
	binom_model_results = binom_model.fit()

	print(binom_model_results.summary())

	#add a Percentage Survived column to the test data frame whose value we'll ask our model to predict
	df_test['Pcnt_Survived'] = df_test['Survived']/df_test['Total']

	#use the .predict() method on the results object and pass the test data set to get the predicted survival rate
	predicted_survival_rate = binom_model_results.predict(X_test)

	#plot the actual versus predicted survival rate
	plt.xlabel('Actual Survival Rate')
	plt.ylabel('Predicted Survival Rate')
	plt.scatter(df_test['Pcnt_Survived'], predicted_survival_rate, color = 'blue')
	plt.show()