Last active
October 13, 2023 17:20
-
-
Save sachinsdate/7c4bb6b82009a130466c48e0957ffb52 to your computer and use it in GitHub Desktop.
Negative Binomial Regression using the GLM class of statsmodels
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Negative Binomial (NB2) regression on the NYC Brooklyn Bridge bicyclist
# counts data set: fit a Poisson GLM, estimate the dispersion parameter
# alpha via Cameron & Trivedi's auxiliary OLS regression, then fit and
# evaluate the NB2 model with statsmodels' GLM class.
import pandas as pd
from patsy import dmatrices
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# Create a pandas DataFrame for the counts data set; column 0 (the date)
# is parsed and becomes the DatetimeIndex.
# NOTE: infer_datetime_format was removed in pandas 2.x (it is the default
# behavior anyway), so it is deliberately not passed here.
df = pd.read_csv('nyc_bb_bicyclist_counts.csv', header=0, parse_dates=[0], index_col=[0])

# Add a few calendar regressors derived from the datetime index.
ds = df.index.to_series()
df['MONTH'] = ds.dt.month
df['DAY_OF_WEEK'] = ds.dt.dayofweek
df['DAY'] = ds.dt.day

# Random 80/20 train/test split. .copy() gives each split its own frame so
# the column assignments below do not trigger SettingWithCopyWarning (and
# keep working under pandas copy-on-write).
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask].copy()
df_test = df[~mask].copy()
print('Training data set length='+str(len(df_train)))
print('Testing data set length='+str(len(df_test)))

# Regression specification in patsy notation: BB_COUNT is the dependent
# variable; DAY, DAY_OF_WEEK, MONTH, HIGH_T, LOW_T and PRECIP are regressors.
expr = """BB_COUNT ~ DAY + DAY_OF_WEEK + MONTH + HIGH_T + LOW_T + PRECIP"""

# Set up the X and y matrices for the training and testing data sets.
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

# Train the Poisson regression model on the training data set.
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
# Print out the training summary.
print(poisson_training_results.summary())
# Print out the fitted rate vector (lambda_i for each training observation).
print(poisson_training_results.mu)

# Add the fitted lambda vector as a new column 'BB_LAMBDA' to the training frame.
df_train['BB_LAMBDA'] = poisson_training_results.mu

# Dependent variable of the auxiliary OLS regression:
#   ((y - lambda)^2 - lambda) / lambda
# computed vectorized instead of a slow row-wise apply(axis=1) — identical values.
df_train['AUX_OLS_DEP'] = ((df_train['BB_COUNT'] - df_train['BB_LAMBDA']) ** 2 - df_train['BB_LAMBDA']) / df_train['BB_LAMBDA']

# Auxiliary OLS regression through the origin: AUX_OLS_DEP = alpha * BB_LAMBDA.
ols_expr = """AUX_OLS_DEP ~ BB_LAMBDA - 1"""
# Configure and fit the OLSR model.
aux_olsr_results = smf.ols(ols_expr, df_train).fit()
# Print the regression params (the single slope is the alpha estimate).
print(aux_olsr_results.params)

# Train the NB2 model on the training data set using the estimated alpha.
# .iloc[0] is used because positional indexing with params[0] on a labeled
# Series is deprecated in pandas.
nb2_training_results = sm.GLM(y_train, X_train, family=sm.families.NegativeBinomial(alpha=aux_olsr_results.params.iloc[0])).fit()
# Print the training summary.
print(nb2_training_results.summary())

# Make some predictions on the test set using the trained NB2 model.
nb2_predictions = nb2_training_results.get_prediction(X_test)
# Print out the predictions.
predictions_summary_frame = nb2_predictions.summary_frame()
print(predictions_summary_frame)

# Plot the predicted counts versus the actual counts for the test data.
predicted_counts = predictions_summary_frame['mean']
actual_counts = y_test['BB_COUNT']
fig = plt.figure()
fig.suptitle('Predicted versus actual bicyclist counts on the Brooklyn bridge')
predicted, = plt.plot(X_test.index, predicted_counts, 'go-', label='Predicted counts')
actual, = plt.plot(X_test.index, actual_counts, 'ro-', label='Actual counts')
plt.legend(handles=[predicted, actual])
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment