sachinsdate/bimodal_residuals.py

## bimodal_residuals.py
import pandas as pd
from patsy import dmatrices
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.stattools as st
import matplotlib.pyplot as plt

#create a pandas DataFrame for the counts data set
df = pd.read_csv('bike_sharing_dataset_daywise.csv', header=0, parse_dates=['dteday'], infer_datetime_format=True)

#create the training and testing data sets
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask]
df_test = df[~mask]
print('Training data set length='+str(len(df_train)))
print('Testing data set length='+str(len(df_test)))

#Create the regression expression in Patsy syntax. We are saying that registered_user_count is the dependent variable and it depends on all the variables mentioned on the right side of ~\
expr = 'registered_user_count ~ season + mnth + holiday + weekday + workingday + weathersit + temp + atemp + hum + windspeed'

#Set up the X and y matrices for the training and testing data sets
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

#Using the statsmodels GLM class, train the Poisson regression model on the training data set
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()

#print out the training summary
print(poisson_training_results.summary())

#print out the Skewness and the Kurtosis of  4 residuals to see which one is the most normally distributed
raw_residual_skewness = st.robust_skewness(poisson_training_results.resid_response)[0]
pearson_residual_skewness = st.robust_skewness(poisson_training_results.resid_pearson)[0]
anscobe_residual_skewness = st.robust_skewness(poisson_training_results.resid_anscombe)[0]
deviance_residual_skewness = st.robust_skewness(poisson_training_results.resid_deviance)[0]

raw_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_response)[0]
pearson_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_pearson)[0]
anscobe_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_anscombe)[0]
deviance_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_deviance)[0]

residual_stats = [['Raw residual', raw_residual_skewness, raw_residual_kurtosis],
                  ['Pearson\'s residual', pearson_residual_skewness, pearson_residual_kurtosis],
                  ['Anscombe residual', anscobe_residual_skewness, anscobe_residual_kurtosis],
                  ['Deviance residual', deviance_residual_skewness, deviance_residual_kurtosis]
                ]

residual_stats_df = pd.DataFrame(residual_stats, columns=['Residual', 'Skewness', 'Kurtosis'])

print(residual_stats_df)

#Let's plot it's frequency distribution of the Pearson's residuals
poisson_training_results.resid_pearson.hist(bins=50)
plt.show()

#plot registered_user_count against yr
df.plot.scatter('yr', 'registered_user_count')
plt.show()

#let's add yr into the regression expression and build and train the model again
#Set up the regression expression. This time, include yr.
expr = 'registered_user_count ~ yr + season + mnth + holiday + weekday + workingday + weathersit + temp + atemp + hum + windspeed'

#Set up the X and y matrices for the training and testing data sets
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

#Using the statsmodels GLM class, train the Poisson regression model on the training data set
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()

#print out the training summary
print(poisson_training_results.summary())

#Once again, let's inspect all 4 kinds of residuals of the revised model:
deviance_residual_skewness = st.robust_skewness(poisson_training_results.resid_deviance)[0]

raw_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_response)[0]
pearson_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_pearson)[0]
anscobe_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_anscombe)[0]
deviance_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_deviance)[0]

residual_stats = [
['Raw residual', raw_residual_skewness, raw_residual_kurtosis],['Pearson\'s residual', pearson_residual_skewness, pearson_residual_kurtosis],
['Anscombe residual', anscobe_residual_skewness, anscobe_residual_kurtosis],
['Deviance residual', deviance_residual_skewness, deviance_residual_kurtosis]
                ]

residual_stats_df = pd.DataFrame(residual_stats, columns=['Residual', 'Skewness', 'Kurtosis'])

print(residual_stats_df)

#Let's plot the frequency distribution of the Pearson's residual of the revised model
poisson_training_results.resid_pearson.hist(bins=50)
plt.show()

#Make some predictions on the test data set.
poisson_predictions = poisson_training_results.get_prediction(X_test)
#.summary_frame() returns a pandas DataFrame
predictions_summary_frame = poisson_predictions.summary_frame()
print(predictions_summary_frame)

predicted_counts=predictions_summary_frame['mean']
actual_counts = y_test['registered_user_count']

#Plot the predicted counts versus the actual counts for the test data.
fig = plt.figure()
fig.suptitle('Predicted versus actual user counts')
predicted, = plt.plot(X_test.index, predicted_counts, 'go-', label='Predicted counts')
actual, = plt.plot(X_test.index, actual_counts, 'ro-', label='Actual counts')
plt.legend(handles=[predicted, actual])
plt.show()
	import pandas as pd
	from patsy import dmatrices
	import numpy as np
	import statsmodels.api as sm
	import statsmodels.formula.api as smf
	import statsmodels.stats.stattools as st
	import matplotlib.pyplot as plt

	#create a pandas DataFrame for the counts data set
	df = pd.read_csv('bike_sharing_dataset_daywise.csv', header=0, parse_dates=['dteday'], infer_datetime_format=True)

	#create the training and testing data sets
	mask = np.random.rand(len(df)) < 0.8
	df_train = df[mask]
	df_test = df[~mask]
	print('Training data set length='+str(len(df_train)))
	print('Testing data set length='+str(len(df_test)))

	#Create the regression expression in Patsy syntax. We are saying that registered_user_count is the dependent variable and it depends on all the variables mentioned on the right side of ~\
	expr = 'registered_user_count ~ season + mnth + holiday + weekday + workingday + weathersit + temp + atemp + hum + windspeed'

	#Set up the X and y matrices for the training and testing data sets
	y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
	y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

	#Using the statsmodels GLM class, train the Poisson regression model on the training data set
	poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()

	#print out the training summary
	print(poisson_training_results.summary())

	#print out the Skewness and the Kurtosis of 4 residuals to see which one is the most normally distributed
	raw_residual_skewness = st.robust_skewness(poisson_training_results.resid_response)[0]
	pearson_residual_skewness = st.robust_skewness(poisson_training_results.resid_pearson)[0]
	anscobe_residual_skewness = st.robust_skewness(poisson_training_results.resid_anscombe)[0]
	deviance_residual_skewness = st.robust_skewness(poisson_training_results.resid_deviance)[0]

	raw_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_response)[0]
	pearson_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_pearson)[0]
	anscobe_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_anscombe)[0]
	deviance_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_deviance)[0]

	residual_stats = [['Raw residual', raw_residual_skewness, raw_residual_kurtosis],
	['Pearson\'s residual', pearson_residual_skewness, pearson_residual_kurtosis],
	['Anscombe residual', anscobe_residual_skewness, anscobe_residual_kurtosis],
	['Deviance residual', deviance_residual_skewness, deviance_residual_kurtosis]
	]

	residual_stats_df = pd.DataFrame(residual_stats, columns=['Residual', 'Skewness', 'Kurtosis'])

	print(residual_stats_df)

	#Let's plot it's frequency distribution of the Pearson's residuals
	poisson_training_results.resid_pearson.hist(bins=50)
	plt.show()

	#plot registered_user_count against yr
	df.plot.scatter('yr', 'registered_user_count')
	plt.show()

	#let's add yr into the regression expression and build and train the model again
	#Set up the regression expression. This time, include yr.
	expr = 'registered_user_count ~ yr + season + mnth + holiday + weekday + workingday + weathersit + temp + atemp + hum + windspeed'

	#Set up the X and y matrices for the training and testing data sets
	y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
	y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

	#Using the statsmodels GLM class, train the Poisson regression model on the training data set
	poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()

	#print out the training summary
	print(poisson_training_results.summary())

	#Once again, let's inspect all 4 kinds of residuals of the revised model:
	deviance_residual_skewness = st.robust_skewness(poisson_training_results.resid_deviance)[0]

	raw_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_response)[0]
	pearson_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_pearson)[0]
	anscobe_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_anscombe)[0]
	deviance_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_deviance)[0]

	residual_stats = [
	['Raw residual', raw_residual_skewness, raw_residual_kurtosis],['Pearson\'s residual', pearson_residual_skewness, pearson_residual_kurtosis],
	['Anscombe residual', anscobe_residual_skewness, anscobe_residual_kurtosis],
	['Deviance residual', deviance_residual_skewness, deviance_residual_kurtosis]
	]

	residual_stats_df = pd.DataFrame(residual_stats, columns=['Residual', 'Skewness', 'Kurtosis'])

	print(residual_stats_df)

	#Let's plot the frequency distribution of the Pearson's residual of the revised model
	poisson_training_results.resid_pearson.hist(bins=50)
	plt.show()

	#Make some predictions on the test data set.
	poisson_predictions = poisson_training_results.get_prediction(X_test)
	#.summary_frame() returns a pandas DataFrame
	predictions_summary_frame = poisson_predictions.summary_frame()
	print(predictions_summary_frame)

	predicted_counts=predictions_summary_frame['mean']
	actual_counts = y_test['registered_user_count']

	#Plot the predicted counts versus the actual counts for the test data.
	fig = plt.figure()
	fig.suptitle('Predicted versus actual user counts')
	predicted, = plt.plot(X_test.index, predicted_counts, 'go-', label='Predicted counts')
	actual, = plt.plot(X_test.index, actual_counts, 'ro-', label='Actual counts')
	plt.legend(handles=[predicted, actual])
	plt.show()