What to do when residuals have a bimodal distribution
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
from patsy import dmatrices
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.stattools as st
import matplotlib.pyplot as plt

# Load the day-wise bike sharing counts data set, parsing 'dteday' as datetimes.
# FIX: dropped infer_datetime_format=True — it is deprecated since pandas 2.0
# (format inference is now the default behavior of parse_dates).
df = pd.read_csv('bike_sharing_dataset_daywise.csv', header=0, parse_dates=['dteday'])

# Randomly assign ~80% of the rows to the training set and the rest to testing.
# NOTE(review): the split is unseeded, so train/test membership (and every
# downstream statistic) changes on each run — seed np.random for reproducibility.
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask]
df_test = df[~mask]
print('Training data set length=' + str(len(df_train)))
print('Testing data set length=' + str(len(df_test)))
# Patsy regression formula: registered_user_count is the dependent variable,
# regressed on the seasonal, calendar and weather columns to the right of '~'.
expr = 'registered_user_count ~ season + mnth + holiday + weekday + workingday + weathersit + temp + atemp + hum + windspeed'

# Carve the (y, X) design matrices out of each split as DataFrames.
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

# Specify a Poisson GLM, fit it on the training split, and show the summary.
poisson_glm = sm.GLM(y_train, X_train, family=sm.families.Poisson())
poisson_training_results = poisson_glm.fit()
print(poisson_training_results.summary())
# Gauge how close each of the 4 GLM residual types is to a normal distribution
# by comparing robust skewness and kurtosis; [0] picks the first of the
# several robust measures each stattools function returns.
raw_residual_skewness = st.robust_skewness(poisson_training_results.resid_response)[0]
pearson_residual_skewness = st.robust_skewness(poisson_training_results.resid_pearson)[0]
anscobe_residual_skewness = st.robust_skewness(poisson_training_results.resid_anscombe)[0]
deviance_residual_skewness = st.robust_skewness(poisson_training_results.resid_deviance)[0]
raw_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_response)[0]
pearson_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_pearson)[0]
anscobe_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_anscombe)[0]
deviance_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_deviance)[0]

# Tabulate the statistics side by side for easy comparison.
residual_stats_df = pd.DataFrame({
    'Residual': ['Raw residual', "Pearson's residual", 'Anscombe residual', 'Deviance residual'],
    'Skewness': [raw_residual_skewness, pearson_residual_skewness, anscobe_residual_skewness, deviance_residual_skewness],
    'Kurtosis': [raw_residual_kurtosis, pearson_residual_kurtosis, anscobe_residual_kurtosis, deviance_residual_kurtosis],
})
print(residual_stats_df)

# Frequency distribution of the Pearson residuals (inspect for bimodality).
poisson_training_results.resid_pearson.hist(bins=50)
plt.show()
# Scatter registered_user_count against yr to see whether the year variable
# separates the data into the two residual modes.
df.plot.scatter(x='yr', y='registered_user_count')
plt.show()
# Rebuild and retrain the model, this time adding yr to the regression
# expression.
expr = 'registered_user_count ~ yr + season + mnth + holiday + weekday + workingday + weathersit + temp + atemp + hum + windspeed'

# Regenerate the (y, X) design matrices for both splits under the new formula.
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

# Fit the revised Poisson GLM on the training split and print its summary.
revised_glm = sm.GLM(y_train, X_train, family=sm.families.Poisson())
poisson_training_results = revised_glm.fit()
print(poisson_training_results.summary())
# Once again, inspect all 4 kinds of residuals of the revised model.
# BUG FIX: the original recomputed only the deviance skewness and the four
# kurtosis values here, silently reusing the raw/Pearson/Anscombe skewness
# computed for the FIRST model — so the printed table mixed statistics from
# two different fits. All eight statistics are recomputed for the revised
# model below. (Also fixes the 'anscobe' -> 'anscombe' typo in local names.)
raw_residual_skewness = st.robust_skewness(poisson_training_results.resid_response)[0]
pearson_residual_skewness = st.robust_skewness(poisson_training_results.resid_pearson)[0]
anscombe_residual_skewness = st.robust_skewness(poisson_training_results.resid_anscombe)[0]
deviance_residual_skewness = st.robust_skewness(poisson_training_results.resid_deviance)[0]
raw_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_response)[0]
pearson_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_pearson)[0]
anscombe_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_anscombe)[0]
deviance_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_deviance)[0]
residual_stats = [
    ['Raw residual', raw_residual_skewness, raw_residual_kurtosis],
    ['Pearson\'s residual', pearson_residual_skewness, pearson_residual_kurtosis],
    ['Anscombe residual', anscombe_residual_skewness, anscombe_residual_kurtosis],
    ['Deviance residual', deviance_residual_skewness, deviance_residual_kurtosis]
]
residual_stats_df = pd.DataFrame(residual_stats, columns=['Residual', 'Skewness', 'Kurtosis'])
print(residual_stats_df)

# Frequency distribution of the Pearson residuals of the revised model.
poisson_training_results.resid_pearson.hist(bins=50)
plt.show()
# Generate predictions for the held-out test set.
poisson_predictions = poisson_training_results.get_prediction(X_test)

# .summary_frame() returns a pandas DataFrame; its 'mean' column holds the
# predicted counts.
predictions_summary_frame = poisson_predictions.summary_frame()
print(predictions_summary_frame)

predicted_counts = predictions_summary_frame['mean']
actual_counts = y_test['registered_user_count']

# Overlay predicted and actual user counts across the test-set index.
fig = plt.figure()
fig.suptitle('Predicted versus actual user counts')
predicted, = plt.plot(X_test.index, predicted_counts, 'go-', label='Predicted counts')
actual, = plt.plot(X_test.index, actual_counts, 'ro-', label='Actual counts')
plt.legend(handles=[predicted, actual])
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment