Last active
October 13, 2023 17:20
-
-
Save sachinsdate/7c4bb6b82009a130466c48e0957ffb52 to your computer and use it in GitHub Desktop.
Negative Binomial Regression using the GLM class of statsmodels
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Negative Binomial (NB2) regression on the NYC Brooklyn Bridge bicyclist
# counts data set: fit a Poisson GLM, estimate the dispersion parameter
# alpha via Cameron & Trivedi's auxiliary OLS regression, then fit and
# evaluate the NB2 model with statsmodels' GLM class.
import pandas as pd
from patsy import dmatrices
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# Create a pandas DataFrame for the counts data set; column 0 (the date)
# is parsed and becomes the DatetimeIndex.
# NOTE: infer_datetime_format was removed in pandas 2.x (it is the default
# behavior anyway), so it is deliberately not passed here.
df = pd.read_csv('nyc_bb_bicyclist_counts.csv', header=0, parse_dates=[0], index_col=[0])

# Add a few calendar regressors derived from the datetime index.
ds = df.index.to_series()
df['MONTH'] = ds.dt.month
df['DAY_OF_WEEK'] = ds.dt.dayofweek
df['DAY'] = ds.dt.day

# Random 80/20 train/test split. .copy() gives each split its own frame so
# the column assignments below do not trigger SettingWithCopyWarning (and
# keep working under pandas copy-on-write).
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask].copy()
df_test = df[~mask].copy()
print('Training data set length='+str(len(df_train)))
print('Testing data set length='+str(len(df_test)))

# Regression specification in patsy notation: BB_COUNT is the dependent
# variable; DAY, DAY_OF_WEEK, MONTH, HIGH_T, LOW_T and PRECIP are regressors.
expr = """BB_COUNT ~ DAY + DAY_OF_WEEK + MONTH + HIGH_T + LOW_T + PRECIP"""

# Set up the X and y matrices for the training and testing data sets.
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

# Train the Poisson regression model on the training data set.
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
# Print out the training summary.
print(poisson_training_results.summary())
# Print out the fitted rate vector (lambda_i for each training observation).
print(poisson_training_results.mu)

# Add the fitted lambda vector as a new column 'BB_LAMBDA' to the training frame.
df_train['BB_LAMBDA'] = poisson_training_results.mu

# Dependent variable of the auxiliary OLS regression:
#   ((y - lambda)^2 - lambda) / lambda
# computed vectorized instead of a slow row-wise apply(axis=1) — identical values.
df_train['AUX_OLS_DEP'] = ((df_train['BB_COUNT'] - df_train['BB_LAMBDA']) ** 2 - df_train['BB_LAMBDA']) / df_train['BB_LAMBDA']

# Auxiliary OLS regression through the origin: AUX_OLS_DEP = alpha * BB_LAMBDA.
ols_expr = """AUX_OLS_DEP ~ BB_LAMBDA - 1"""
# Configure and fit the OLSR model.
aux_olsr_results = smf.ols(ols_expr, df_train).fit()
# Print the regression params (the single slope is the alpha estimate).
print(aux_olsr_results.params)

# Train the NB2 model on the training data set using the estimated alpha.
# .iloc[0] is used because positional indexing with params[0] on a labeled
# Series is deprecated in pandas.
nb2_training_results = sm.GLM(y_train, X_train, family=sm.families.NegativeBinomial(alpha=aux_olsr_results.params.iloc[0])).fit()
# Print the training summary.
print(nb2_training_results.summary())

# Make some predictions on the test set using the trained NB2 model.
nb2_predictions = nb2_training_results.get_prediction(X_test)
# Print out the predictions.
predictions_summary_frame = nb2_predictions.summary_frame()
print(predictions_summary_frame)

# Plot the predicted counts versus the actual counts for the test data.
predicted_counts = predictions_summary_frame['mean']
actual_counts = y_test['BB_COUNT']
fig = plt.figure()
fig.suptitle('Predicted versus actual bicyclist counts on the Brooklyn bridge')
predicted, = plt.plot(X_test.index, predicted_counts, 'go-', label='Predicted counts')
actual, = plt.plot(X_test.index, actual_counts, 'ro-', label='Actual counts')
plt.legend(handles=[predicted, actual])
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment