sachinsdate/olsr_for_counts_based_data.py

## olsr_for_counts_based_data.py
import pandas as pd
from matplotlib import pyplot as plt

#load the data into a pandas data frame and plot the BB_COUNT variable
df = pd.read_csv('nyc_bb_bicyclist_counts.csv', header=0, infer_datetime_format=True, parse_dates=[0], index_col=[0])
fig = plt.figure()
fig.suptitle('Bicyclist counts on the Brooklyn bridge')
plt.xlabel('Date')
plt.ylabel('Count')
actual, = plt.plot(df.index, df['BB_COUNT'], 'go-', label='Count of bicyclists')
plt.legend(handles=[actual])
plt.show()

#create a histogram plot to see how normally distributed is this data set
plt.hist(df['BB_COUNT'], bins=20)
plt.show()

#print out a few descriptive stats: the mean, median, skewness, and kurtosis.
#For skewness and kurtosis, we'll run the Jarque-Bera normality test on the BB_COUNT variable
from statsmodels.stats.stattools import jarque_bera as jb
from statsmodels.stats.stattools import omni_normtest as omb
from statsmodels.compat import lzip

print('Mean='+str(round(df['BB_COUNT'].mean(), 2)))
print('Median='+str(round(df['BB_COUNT'].median(), 2)))
name = ['Jarque-Bera', 'Chi^2 two-tail probability', 'Skewness', 'Kurtosis']
test_results = jb(df['BB_COUNT'])
lzip(name, test_results)

#Try to fix the skewness by transforming the dependent variable
#Add two new columns to the data frame: a LOG(BB_COUNT) column and a  SQRT(BB_COUNT)
import numpy as np

#Add a column to the Data Frame that contains log(BB_COUNT):
df['LOG_BB_COUNT'] = np.log(df['BB_COUNT'])
#All another column containing sqrt(BB_COUNT)
df['SQRT_BB_COUNT'] = np.sqrt(df['BB_COUNT'])

#run the Jarque-Bera again on the LOG and SQRT columns to see if there is any improvement in normality
name = ['Jarque-Bera', 'Chi^2 two-tail probability', 'Skewness', 'Kurtosis']
test_results = omb(df['LOG_BB_COUNT'])
lzip(name, test_results)
test_results = omb(df['SQRT_BB_COUNT'])
lzip(name, test_results)

#revert to using the original BB_COUNT. Build, trainand test an OLSR model on the counts data

#start  with importing all the required packages
import pandas as pd
from patsy import dmatrices
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

#add a few derived regression variables to the X matrix
ds = df.index.to_series()
df['MONTH'] = ds.dt.month
df['DAY_OF_WEEK'] = ds.dt.dayofweek
df['DAY'] = ds.dt.day

#create the training and testing data sets
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask]
df_test = df[~mask]
print('Training data set length='+str(len(df_train)))
print('Testing data set length='+str(len(df_test)))

#Setup the regression expression in patsy notation. We are telling patsy that BB_COUNT is our dependent variable and it depends on the regression variables: DAY, DAY_OF_WEEK, MONTH, HIGH_T, LOW_T and PRECIP
expr = 'BB_COUNT ~ DAY  + DAY_OF_WEEK + MONTH + HIGH_T + LOW_T + PRECIP'

#Set up the X and y matrices for the training and testing data sets
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

#Configure and fit the OLSR model
olsr_results = smf.ols(expr, df_train).fit()

#Print the regression output
print(olsr_results.summary())

#generate the model's predictions on the test data
olsr_predictions = olsr_results.get_prediction(X_test)

#let's print them out
predictions_summary_frame = olsr_predictions.summary_frame()
print(predictions_summary_frame)

#plot the predicted and the actual values
predicted_counts=predictions_summary_frame['mean']
actual_counts = y_test['BB_COUNT']
fig = plt.figure()
fig.suptitle('Predicted versus actual bicyclist counts on the Brooklyn bridge')
predicted, = plt.plot(X_test.index, predicted_counts, 'go-', label='Predicted counts')
actual, = plt.plot(X_test.index, actual_counts, 'ro-', label='Actual counts')
plt.legend(handles=[predicted, actual])
plt.show()

#plot histogram of the residual errors
plt.hist(olsr_results.resid, bins=20)
plt.show()

#plot the Q-Q plot of the residual errors
fig = sm.qqplot(olsr_results.resid)
plt.show()

#plot the ACF plot of residual errors
from statsmodels.graphics.tsaplots import plot_acf

plot_acf(olsr_results.resid, title='ACF of residual errors')
plt.show()
	import pandas as pd
	from matplotlib import pyplot as plt

	#load the data into a pandas data frame and plot the BB_COUNT variable
	df = pd.read_csv('nyc_bb_bicyclist_counts.csv', header=0, infer_datetime_format=True, parse_dates=[0], index_col=[0])
	fig = plt.figure()
	fig.suptitle('Bicyclist counts on the Brooklyn bridge')
	plt.xlabel('Date')
	plt.ylabel('Count')
	actual, = plt.plot(df.index, df['BB_COUNT'], 'go-', label='Count of bicyclists')
	plt.legend(handles=[actual])
	plt.show()

	#create a histogram plot to see how normally distributed is this data set
	plt.hist(df['BB_COUNT'], bins=20)
	plt.show()

	#print out a few descriptive stats: the mean, median, skewness, and kurtosis.
	#For skewness and kurtosis, we'll run the Jarque-Bera normality test on the BB_COUNT variable
	from statsmodels.stats.stattools import jarque_bera as jb
	from statsmodels.stats.stattools import omni_normtest as omb
	from statsmodels.compat import lzip

	print('Mean='+str(round(df['BB_COUNT'].mean(), 2)))
	print('Median='+str(round(df['BB_COUNT'].median(), 2)))
	name = ['Jarque-Bera', 'Chi^2 two-tail probability', 'Skewness', 'Kurtosis']
	test_results = jb(df['BB_COUNT'])
	lzip(name, test_results)

	#Try to fix the skewness by transforming the dependent variable
	#Add two new columns to the data frame: a LOG(BB_COUNT) column and a SQRT(BB_COUNT)
	import numpy as np

	#Add a column to the Data Frame that contains log(BB_COUNT):
	df['LOG_BB_COUNT'] = np.log(df['BB_COUNT'])
	#All another column containing sqrt(BB_COUNT)
	df['SQRT_BB_COUNT'] = np.sqrt(df['BB_COUNT'])

	#run the Jarque-Bera again on the LOG and SQRT columns to see if there is any improvement in normality
	name = ['Jarque-Bera', 'Chi^2 two-tail probability', 'Skewness', 'Kurtosis']
	test_results = omb(df['LOG_BB_COUNT'])
	lzip(name, test_results)
	test_results = omb(df['SQRT_BB_COUNT'])
	lzip(name, test_results)

	#revert to using the original BB_COUNT. Build, trainand test an OLSR model on the counts data

	#start with importing all the required packages
	import pandas as pd
	from patsy import dmatrices
	import numpy as np
	import statsmodels.api as sm
	import statsmodels.formula.api as smf
	import matplotlib.pyplot as plt

	#add a few derived regression variables to the X matrix
	ds = df.index.to_series()
	df['MONTH'] = ds.dt.month
	df['DAY_OF_WEEK'] = ds.dt.dayofweek
	df['DAY'] = ds.dt.day

	#create the training and testing data sets
	mask = np.random.rand(len(df)) < 0.8
	df_train = df[mask]
	df_test = df[~mask]
	print('Training data set length='+str(len(df_train)))
	print('Testing data set length='+str(len(df_test)))

	#Setup the regression expression in patsy notation. We are telling patsy that BB_COUNT is our dependent variable and it depends on the regression variables: DAY, DAY_OF_WEEK, MONTH, HIGH_T, LOW_T and PRECIP
	expr = 'BB_COUNT ~ DAY + DAY_OF_WEEK + MONTH + HIGH_T + LOW_T + PRECIP'

	#Set up the X and y matrices for the training and testing data sets
	y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
	y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

	#Configure and fit the OLSR model
	olsr_results = smf.ols(expr, df_train).fit()

	#Print the regression output
	print(olsr_results.summary())

	#generate the model's predictions on the test data
	olsr_predictions = olsr_results.get_prediction(X_test)

	#let's print them out
	predictions_summary_frame = olsr_predictions.summary_frame()
	print(predictions_summary_frame)

	#plot the predicted and the actual values
	predicted_counts=predictions_summary_frame['mean']
	actual_counts = y_test['BB_COUNT']
	fig = plt.figure()
	fig.suptitle('Predicted versus actual bicyclist counts on the Brooklyn bridge')
	predicted, = plt.plot(X_test.index, predicted_counts, 'go-', label='Predicted counts')
	actual, = plt.plot(X_test.index, actual_counts, 'ro-', label='Actual counts')
	plt.legend(handles=[predicted, actual])
	plt.show()

	#plot histogram of the residual errors
	plt.hist(olsr_results.resid, bins=20)
	plt.show()

	#plot the Q-Q plot of the residual errors
	fig = sm.qqplot(olsr_results.resid)
	plt.show()

	#plot the ACF plot of residual errors
	from statsmodels.graphics.tsaplots import plot_acf

	plot_acf(olsr_results.resid, title='ACF of residual errors')
	plt.show()