Poisson Regression model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from patsy import dmatrices | |
import numpy as np | |
import statsmodels.api as sm | |
import matplotlib.pyplot as plt | |
#Create a pandas DataFrame for the counts data set. | |
df = pd.read_csv('nyc_bb_bicyclist_counts.csv', header=0, infer_datetime_format=True, parse_dates=[0], index_col=[0]) | |
#Add a few derived regression variables. | |
ds = df.index.to_series() | |
df['MONTH'] = ds.dt.month | |
df['DAY_OF_WEEK'] = ds.dt.dayofweek | |
df['DAY'] = ds.dt.day | |
#Create the training and testing data sets. | |
mask = np.random.rand(len(df)) < 0.8 | |
df_train = df[mask] | |
df_test = df[~mask] | |
print('Training data set length='+str(len(df_train))) | |
print('Testing data set length='+str(len(df_test))) | |
#Setup the regression expression in patsy notation. We are telling patsy that BB_COUNT is our dependent variable and | |
# it depends on the regression variables: DAY, DAY_OF_WEEK, MONTH, HIGH_T, LOW_T and PRECIP. | |
expr = """BB_COUNT ~ DAY + DAY_OF_WEEK + MONTH + HIGH_T + LOW_T + PRECIP""" | |
#Set up the X and y matrices | |
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe') | |
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe') | |
#Using the statsmodels GLM class, train the Poisson regression model on the training data set. | |
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit() | |
#Print the training summary. | |
print(poisson_training_results.summary()) | |
#Make some predictions on the test data set. | |
poisson_predictions = poisson_training_results.get_prediction(X_test) | |
#.summary_frame() returns a pandas DataFrame | |
predictions_summary_frame = poisson_predictions.summary_frame() | |
print(predictions_summary_frame) | |
predicted_counts=predictions_summary_frame['mean'] | |
actual_counts = y_test['BB_COUNT'] | |
#Mlot the predicted counts versus the actual counts for the test data. | |
fig = plt.figure() | |
fig.suptitle('Predicted versus actual bicyclist counts on the Brooklyn bridge') | |
predicted, = plt.plot(X_test.index, predicted_counts, 'go-', label='Predicted counts') | |
actual, = plt.plot(X_test.index, actual_counts, 'ro-', label='Actual counts') | |
plt.legend(handles=[predicted, actual]) | |
plt.show() | |
#Show scatter plot of Actual versus Predicted counts | |
plt.clf() | |
fig = plt.figure() | |
fig.suptitle('Scatter plot of Actual versus Predicted counts') | |
plt.scatter(x=predicted_counts, y=actual_counts, marker='.') | |
plt.xlabel('Predicted counts') | |
plt.ylabel('Actual counts') | |
plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment