Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
What to do when residuals have a bimodal distribution
import pandas as pd
from patsy import dmatrices
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.stattools as st
import matplotlib.pyplot as plt
#create a pandas DataFrame for the counts data set
df = pd.read_csv('bike_sharing_dataset_daywise.csv', header=0, parse_dates=['dteday'], infer_datetime_format=True)
#create the training and testing data sets
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask]
df_test = df[~mask]
print('Training data set length='+str(len(df_train)))
print('Testing data set length='+str(len(df_test)))
#Create the regression expression in Patsy syntax. We are saying that registered_user_count is the dependent variable and it depends on all the variables mentioned on the right side of ~\
expr = 'registered_user_count ~ season + mnth + holiday + weekday + workingday + weathersit + temp + atemp + hum + windspeed'
#Set up the X and y matrices for the training and testing data sets
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')
#Using the statsmodels GLM class, train the Poisson regression model on the training data set
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
#print out the training summary
print(poisson_training_results.summary())
#print out the Skewness and the Kurtosis of 4 residuals to see which one is the most normally distributed
raw_residual_skewness = st.robust_skewness(poisson_training_results.resid_response)[0]
pearson_residual_skewness = st.robust_skewness(poisson_training_results.resid_pearson)[0]
anscobe_residual_skewness = st.robust_skewness(poisson_training_results.resid_anscombe)[0]
deviance_residual_skewness = st.robust_skewness(poisson_training_results.resid_deviance)[0]
raw_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_response)[0]
pearson_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_pearson)[0]
anscobe_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_anscombe)[0]
deviance_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_deviance)[0]
residual_stats = [['Raw residual', raw_residual_skewness, raw_residual_kurtosis],
['Pearson\'s residual', pearson_residual_skewness, pearson_residual_kurtosis],
['Anscombe residual', anscobe_residual_skewness, anscobe_residual_kurtosis],
['Deviance residual', deviance_residual_skewness, deviance_residual_kurtosis]
]
residual_stats_df = pd.DataFrame(residual_stats, columns=['Residual', 'Skewness', 'Kurtosis'])
print(residual_stats_df)
#Let's plot it's frequency distribution of the Pearson's residuals
poisson_training_results.resid_pearson.hist(bins=50)
plt.show()
#plot registered_user_count against yr
df.plot.scatter('yr', 'registered_user_count')
plt.show()
#let's add yr into the regression expression and build and train the model again
#Set up the regression expression. This time, include yr.
expr = 'registered_user_count ~ yr + season + mnth + holiday + weekday + workingday + weathersit + temp + atemp + hum + windspeed'
#Set up the X and y matrices for the training and testing data sets
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')
#Using the statsmodels GLM class, train the Poisson regression model on the training data set
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
#print out the training summary
print(poisson_training_results.summary())
#Once again, let's inspect all 4 kinds of residuals of the revised model:
deviance_residual_skewness = st.robust_skewness(poisson_training_results.resid_deviance)[0]
raw_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_response)[0]
pearson_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_pearson)[0]
anscobe_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_anscombe)[0]
deviance_residual_kurtosis = st.robust_kurtosis(poisson_training_results.resid_deviance)[0]
residual_stats = [
['Raw residual', raw_residual_skewness, raw_residual_kurtosis],['Pearson\'s residual', pearson_residual_skewness, pearson_residual_kurtosis],
['Anscombe residual', anscobe_residual_skewness, anscobe_residual_kurtosis],
['Deviance residual', deviance_residual_skewness, deviance_residual_kurtosis]
]
residual_stats_df = pd.DataFrame(residual_stats, columns=['Residual', 'Skewness', 'Kurtosis'])
print(residual_stats_df)
#Let's plot the frequency distribution of the Pearson's residual of the revised model
poisson_training_results.resid_pearson.hist(bins=50)
plt.show()
#Make some predictions on the test data set.
poisson_predictions = poisson_training_results.get_prediction(X_test)
#.summary_frame() returns a pandas DataFrame
predictions_summary_frame = poisson_predictions.summary_frame()
print(predictions_summary_frame)
predicted_counts=predictions_summary_frame['mean']
actual_counts = y_test['registered_user_count']
#Plot the predicted counts versus the actual counts for the test data.
fig = plt.figure()
fig.suptitle('Predicted versus actual user counts')
predicted, = plt.plot(X_test.index, predicted_counts, 'go-', label='Predicted counts')
actual, = plt.plot(X_test.index, actual_counts, 'ro-', label='Actual counts')
plt.legend(handles=[predicted, actual])
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment