Skip to content

Instantly share code, notes, and snippets.

@sachinsdate
Created April 4, 2020 18:25
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Embed
What would you like to do?
import pandas as pd
from patsy import dmatrices
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
#Create a pandas DataFrame for the counts data set.
df = pd.read_csv('nyc_bb_bicyclist_counts.csv', header=0, infer_datetime_format=True, parse_dates=[0], index_col=[0])
#We'll add a few derived regression variables to the X matrix.
ds = df.index.to_series()
df['MONTH'] = ds.dt.month
df['DAY_OF_WEEK'] = ds.dt.dayofweek
df['DAY'] = ds.dt.day
#Let's print out the first few rows of our data set to see how it looks like
print(df.head(10))
#Let's create the training and testing data sets.
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask]
df_test = df[~mask]
print('Training data set length='+str(len(df_train)))
print('Testing data set length='+str(len(df_test)))
#Setup the regression expression in Patsy notation.
#We are telling patsy that BB_COUNT is our dependent variable y and it depends on the regression variables X:
#DAY, DAY_OF_WEEK, MONTH, HIGH_T, LOW_T and PRECIP.
expr = 'BB_COUNT ~ DAY + DAY_OF_WEEK + MONTH + HIGH_T + LOW_T + PRECIP'
#Let's use Patsy to carve out the X and y matrices for the training and testing data sets:
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')
#Using the statsmodels GLM class, train the Poisson regression model on the training data set.
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
#Print the training summary.
print(poisson_training_results.summary())
#Let's print out the variance and mean of the data set
print('variance='+str(df['BB_COUNT'].var()))
print('mean='+str(df['BB_COUNT'].mean()))
#Build Consul's Generalized Poison regression model, know as GP-1
gen_poisson_gp1 = sm.GeneralizedPoisson(y_train, X_train, p=1)
#Fit the model
gen_poisson_gp1_results = gen_poisson_gp1.fit()
#print the results
print(gen_poisson_gp1_results.summary())
#Get the model's predictions on the test data set
gen_poisson_gp1_predictions = gen_poisson_gp1_results.predict(X_test)
predicted_counts=gen_poisson_gp1_predictions
actual_counts = y_test['BB_COUNT']
fig = plt.figure()
fig.suptitle('Predicted versus actual bicyclist counts on the Brooklyn bridge')
predicted, = plt.plot(X_test.index, predicted_counts, 'go-', label='Predicted counts')
actual, = plt.plot(X_test.index, actual_counts, 'ro-', label='Actual counts')
plt.legend(handles=[predicted, actual])
plt.show()
#Build Famoye's Restricted Generalized Poison regression model, know as GP-2
gen_poisson_gp2 = sm.GeneralizedPoisson(y_train, X_train, p=2)
#Fit the model
gen_poisson_gp2_results = gen_poisson_gp2.fit()
#print the results
print(gen_poisson_gp2_results.summary())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment