Skip to content

Instantly share code, notes, and snippets.

@sachinsdate
Created November 30, 2019 15:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sachinsdate/de66cfd22caa5f871f495fc50c924c3c to your computer and use it in GitHub Desktop.
Save sachinsdate/de66cfd22caa5f871f495fc50c924c3c to your computer and use it in GitHub Desktop.
OLS regression model fitted to the bicyclist counts data set
import pandas as pd
from matplotlib import pyplot as plt
#load the data into a pandas data frame and plot the BB_COUNT variable
df = pd.read_csv('nyc_bb_bicyclist_counts.csv', header=0, infer_datetime_format=True, parse_dates=[0], index_col=[0])
fig = plt.figure()
fig.suptitle('Bicyclist counts on the Brooklyn bridge')
plt.xlabel('Date')
plt.ylabel('Count')
actual, = plt.plot(df.index, df['BB_COUNT'], 'go-', label='Count of bicyclists')
plt.legend(handles=[actual])
plt.show()
#create a histogram plot to see how normally distributed is this data set
plt.hist(df['BB_COUNT'], bins=20)
plt.show()
#print out a few descriptive stats: the mean, median, skewness, and kurtosis.
#For skewness and kurtosis, we'll run the Jarque-Bera normality test on the BB_COUNT variable
from statsmodels.stats.stattools import jarque_bera as jb
from statsmodels.stats.stattools import omni_normtest as omb
from statsmodels.compat import lzip
print('Mean='+str(round(df['BB_COUNT'].mean(), 2)))
print('Median='+str(round(df['BB_COUNT'].median(), 2)))
name = ['Jarque-Bera', 'Chi^2 two-tail probability', 'Skewness', 'Kurtosis']
test_results = jb(df['BB_COUNT'])
lzip(name, test_results)
#Try to fix the skewness by transforming the dependent variable
#Add two new columns to the data frame: a LOG(BB_COUNT) column and a SQRT(BB_COUNT)
import numpy as np
#Add a column to the Data Frame that contains log(BB_COUNT):
df['LOG_BB_COUNT'] = np.log(df['BB_COUNT'])
#All another column containing sqrt(BB_COUNT)
df['SQRT_BB_COUNT'] = np.sqrt(df['BB_COUNT'])
#run the Jarque-Bera again on the LOG and SQRT columns to see if there is any improvement in normality
name = ['Jarque-Bera', 'Chi^2 two-tail probability', 'Skewness', 'Kurtosis']
test_results = omb(df['LOG_BB_COUNT'])
lzip(name, test_results)
test_results = omb(df['SQRT_BB_COUNT'])
lzip(name, test_results)
#revert to using the original BB_COUNT. Build, trainand test an OLSR model on the counts data
#start with importing all the required packages
import pandas as pd
from patsy import dmatrices
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
#add a few derived regression variables to the X matrix
ds = df.index.to_series()
df['MONTH'] = ds.dt.month
df['DAY_OF_WEEK'] = ds.dt.dayofweek
df['DAY'] = ds.dt.day
#create the training and testing data sets
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask]
df_test = df[~mask]
print('Training data set length='+str(len(df_train)))
print('Testing data set length='+str(len(df_test)))
#Setup the regression expression in patsy notation. We are telling patsy that BB_COUNT is our dependent variable and it depends on the regression variables: DAY, DAY_OF_WEEK, MONTH, HIGH_T, LOW_T and PRECIP
expr = 'BB_COUNT ~ DAY + DAY_OF_WEEK + MONTH + HIGH_T + LOW_T + PRECIP'
#Set up the X and y matrices for the training and testing data sets
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')
#Configure and fit the OLSR model
olsr_results = smf.ols(expr, df_train).fit()
#Print the regression output
print(olsr_results.summary())
#generate the model's predictions on the test data
olsr_predictions = olsr_results.get_prediction(X_test)
#let's print them out
predictions_summary_frame = olsr_predictions.summary_frame()
print(predictions_summary_frame)
#plot the predicted and the actual values
predicted_counts=predictions_summary_frame['mean']
actual_counts = y_test['BB_COUNT']
fig = plt.figure()
fig.suptitle('Predicted versus actual bicyclist counts on the Brooklyn bridge')
predicted, = plt.plot(X_test.index, predicted_counts, 'go-', label='Predicted counts')
actual, = plt.plot(X_test.index, actual_counts, 'ro-', label='Actual counts')
plt.legend(handles=[predicted, actual])
plt.show()
#plot histogram of the residual errors
plt.hist(olsr_results.resid, bins=20)
plt.show()
#plot the Q-Q plot of the residual errors
fig = sm.qqplot(olsr_results.resid)
plt.show()
#plot the ACF plot of residual errors
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(olsr_results.resid, title='ACF of residual errors')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment