Select the best linear regression time series model using AIC score as the criterion by performing a brute force search through a sample space of candidate models
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from patsy import dmatrices | |
from collections import OrderedDict | |
import itertools | |
import statsmodels.formula.api as smf | |
import sys | |
import matplotlib.pyplot as plt | |
#Read the data set into a pandas DataFrame | |
df = pd.read_csv('boston_daily_temps_1978_2019.csv', header=0, infer_datetime_format=True, parse_dates=[0], index_col=[0]) | |
#resample at a month level | |
df_resampled = df.resample('M').mean() | |
#Plot the data set | |
fig = plt.figure() | |
fig.suptitle('Monthly Average temperatures in Boston, MA from 1978 to 2019') | |
actual, = plt.plot(df_resampled.index, df_resampled['TAVG'], 'go-', label='Monthly Average Temperature') | |
plt.xlabel('Date') | |
plt.ylabel('Temperature') | |
plt.legend(handles=[actual]) | |
plt.show() | |
print(df_resampled.head()) | |
#Take a copy | |
df_lagged = df_resampled.copy() | |
#Add time lagged columns to the data set | |
for i in range(1, 13, 1): | |
df_lagged['TAVG_LAG_' + str(i)] = df_lagged['TAVG'].shift(i) | |
print(df_lagged.head(13)) | |
#Drop the NaN rows | |
for i in range(0, 12, 1): | |
df_lagged = df_lagged.drop(df_lagged.index[0]) | |
print(df_lagged.head()) | |
#Carve out the test and the training data sets | |
split_index = round(len(df_lagged)*0.8) | |
split_date = df_lagged.index[split_index] | |
df_train = df_lagged.loc[df_lagged.index <= split_date].copy() | |
df_test = df_lagged.loc[df_lagged.index > split_date].copy() | |
#Generate and store away, all possible combinations of the list [1,2,3,4,5,6,7,8,9,10,11,12] | |
lag_combinations = OrderedDict() | |
l = list(range(1,13,1)) | |
for i in range(1, 13, 1): | |
for combination in itertools.combinations(l, i): | |
lag_combinations[combination] = 0.0 | |
print('Number of combinations to be tested: ' + str(len(lag_combinations))) | |
expr_prefix = 'TAVG ~ ' | |
min_aic = sys.float_info.max | |
best_expr = '' | |
best_olsr_model_results = None | |
#Iterate over each combination | |
for combination in lag_combinations: | |
expr = expr_prefix | |
i = 1 | |
#Setup the model expression using patsy syntax | |
for lag_num in combination: | |
if i < len(combination): | |
expr = expr + 'TAVG_LAG_' + str(lag_num) + ' + ' | |
else: | |
expr = expr + 'TAVG_LAG_' + str(lag_num) | |
i += 1 | |
print('Building model for expr: ' + expr) | |
#Carve out the X,y vectors using patsy. We will use X_test, y_test later for testing the model. | |
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe') | |
#Build and train the OLSR model on the training data set | |
olsr_results = smf.ols(expr, df_train).fit() | |
#Store it's AIC value | |
lag_combinations[combination] = olsr_results.aic | |
#Keep track of the best model (the one with the lowest AIC score) seen so far | |
if olsr_results.aic < min_aic: | |
min_aic = olsr_results.aic | |
best_expr = expr | |
best_olsr_model_results = olsr_results | |
print('AIC='+str(lag_combinations[combination])) | |
#Print out the model expression, AIC score and model summary of the best model | |
print('Best expr=' + best_expr) | |
print('min AIC=' + str(min_aic)) | |
print(best_olsr_model_results.summary()) | |
#Generate predictions for TAVG on the test data set | |
olsr_predictions = best_olsr_model_results.get_prediction(X_test) | |
olsr_predictions_summary_frame = olsr_predictions.summary_frame() | |
print(olsr_predictions_summary_frame.head(10)) | |
actual_temps = y_test['TAVG'] | |
predicted_temps=olsr_predictions_summary_frame['mean'] | |
#Plot the actual versus predicted values of TAVG on the test data set | |
fig = plt.figure() | |
fig.suptitle('Predicted versus actual monthly average temperatures') | |
predicted, = plt.plot(X_test.index, predicted_temps, 'go-', label='Predicted monthly average temp') | |
actual, = plt.plot(X_test.index, actual_temps, 'ro-', label='Actual monthly average temp') | |
plt.legend(handles=[predicted, actual]) | |
plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment