sachinsdate/poisson_regression.py

## poisson_regression.py
import pandas as pd
from patsy import dmatrices
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt


#Create a pandas DataFrame for the counts data set.
df = pd.read_csv('nyc_bb_bicyclist_counts.csv', header=0, infer_datetime_format=True, parse_dates=[0], index_col=[0])

#Add a few derived regression variables.
ds = df.index.to_series()
df['MONTH'] = ds.dt.month
df['DAY_OF_WEEK'] = ds.dt.dayofweek
df['DAY'] = ds.dt.day

#Create the training and testing data sets.
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask]
df_test = df[~mask]
print('Training data set length='+str(len(df_train)))
print('Testing data set length='+str(len(df_test)))

#Setup the regression expression in patsy notation. We are telling patsy that BB_COUNT is our dependent variable and
# it depends on the regression variables: DAY, DAY_OF_WEEK, MONTH, HIGH_T, LOW_T and PRECIP.
expr = """BB_COUNT ~ DAY  + DAY_OF_WEEK + MONTH + HIGH_T + LOW_T + PRECIP"""

#Set up the X and y matrices
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

#Using the statsmodels GLM class, train the Poisson regression model on the training data set.
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()

#Print the training summary.
print(poisson_training_results.summary())

#Make some predictions on the test data set.
poisson_predictions = poisson_training_results.get_prediction(X_test)
#.summary_frame() returns a pandas DataFrame
predictions_summary_frame = poisson_predictions.summary_frame()
print(predictions_summary_frame)

predicted_counts=predictions_summary_frame['mean']
actual_counts = y_test['BB_COUNT']

#Mlot the predicted counts versus the actual counts for the test data.
fig = plt.figure()
fig.suptitle('Predicted versus actual bicyclist counts on the Brooklyn bridge')
predicted, = plt.plot(X_test.index, predicted_counts, 'go-', label='Predicted counts')
actual, = plt.plot(X_test.index, actual_counts, 'ro-', label='Actual counts')
plt.legend(handles=[predicted, actual])
plt.show()

#Show scatter plot of Actual versus Predicted counts
plt.clf()
fig = plt.figure()
fig.suptitle('Scatter plot of Actual versus Predicted counts')
plt.scatter(x=predicted_counts, y=actual_counts, marker='.')
plt.xlabel('Predicted counts')
plt.ylabel('Actual counts')
plt.show()
	import pandas as pd
	from patsy import dmatrices
	import numpy as np
	import statsmodels.api as sm
	import matplotlib.pyplot as plt


	#Create a pandas DataFrame for the counts data set.
	df = pd.read_csv('nyc_bb_bicyclist_counts.csv', header=0, infer_datetime_format=True, parse_dates=[0], index_col=[0])

	#Add a few derived regression variables.
	ds = df.index.to_series()
	df['MONTH'] = ds.dt.month
	df['DAY_OF_WEEK'] = ds.dt.dayofweek
	df['DAY'] = ds.dt.day

	#Create the training and testing data sets.
	mask = np.random.rand(len(df)) < 0.8
	df_train = df[mask]
	df_test = df[~mask]
	print('Training data set length='+str(len(df_train)))
	print('Testing data set length='+str(len(df_test)))

	#Setup the regression expression in patsy notation. We are telling patsy that BB_COUNT is our dependent variable and
	# it depends on the regression variables: DAY, DAY_OF_WEEK, MONTH, HIGH_T, LOW_T and PRECIP.
	expr = """BB_COUNT ~ DAY + DAY_OF_WEEK + MONTH + HIGH_T + LOW_T + PRECIP"""

	#Set up the X and y matrices
	y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
	y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

	#Using the statsmodels GLM class, train the Poisson regression model on the training data set.
	poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()

	#Print the training summary.
	print(poisson_training_results.summary())

	#Make some predictions on the test data set.
	poisson_predictions = poisson_training_results.get_prediction(X_test)
	#.summary_frame() returns a pandas DataFrame
	predictions_summary_frame = poisson_predictions.summary_frame()
	print(predictions_summary_frame)

	predicted_counts=predictions_summary_frame['mean']
	actual_counts = y_test['BB_COUNT']

	#Mlot the predicted counts versus the actual counts for the test data.
	fig = plt.figure()
	fig.suptitle('Predicted versus actual bicyclist counts on the Brooklyn bridge')
	predicted, = plt.plot(X_test.index, predicted_counts, 'go-', label='Predicted counts')
	actual, = plt.plot(X_test.index, actual_counts, 'ro-', label='Actual counts')
	plt.legend(handles=[predicted, actual])
	plt.show()

	#Show scatter plot of Actual versus Predicted counts
	plt.clf()
	fig = plt.figure()
	fig.suptitle('Scatter plot of Actual versus Predicted counts')
	plt.scatter(x=predicted_counts, y=actual_counts, marker='.')
	plt.xlabel('Predicted counts')
	plt.ylabel('Actual counts')
	plt.show()