Skip to content

Instantly share code, notes, and snippets.

@sachinsdate
Last active Mar 10, 2020
Embed
What would you like to do?
Select the best linear regression time series model using AIC score as the criterion by performing a brute force search through a sample space of candidate models
import pandas as pd
from patsy import dmatrices
from collections import OrderedDict
import itertools
import statsmodels.formula.api as smf
import sys
import matplotlib.pyplot as plt
#Read the data set into a pandas DataFrame
df = pd.read_csv('boston_daily_temps_1978_2019.csv', header=0, infer_datetime_format=True, parse_dates=[0], index_col=[0])
#resample at a month level
df_resampled = df.resample('M').mean()
#Plot the data set
fig = plt.figure()
fig.suptitle('Monthly Average temperatures in Boston, MA from 1978 to 2019')
actual, = plt.plot(df_resampled.index, df_resampled['TAVG'], 'go-', label='Monthly Average Temperature')
plt.xlabel('Date')
plt.ylabel('Temperature')
plt.legend(handles=[actual])
plt.show()
print(df_resampled.head())
#Take a copy
df_lagged = df_resampled.copy()
#Add time lagged columns to the data set
for i in range(1, 13, 1):
df_lagged['TAVG_LAG_' + str(i)] = df_lagged['TAVG'].shift(i)
print(df_lagged.head(13))
#Drop the NaN rows
for i in range(0, 12, 1):
df_lagged = df_lagged.drop(df_lagged.index[0])
print(df_lagged.head())
#Carve out the test and the training data sets
split_index = round(len(df_lagged)*0.8)
split_date = df_lagged.index[split_index]
df_train = df_lagged.loc[df_lagged.index <= split_date].copy()
df_test = df_lagged.loc[df_lagged.index > split_date].copy()
#Generate and store away, all possible combinations of the list [1,2,3,4,5,6,7,8,9,10,11,12]
lag_combinations = OrderedDict()
l = list(range(1,13,1))
for i in range(1, 13, 1):
for combination in itertools.combinations(l, i):
lag_combinations[combination] = 0.0
print('Number of combinations to be tested: ' + str(len(lag_combinations)))
expr_prefix = 'TAVG ~ '
min_aic = sys.float_info.max
best_expr = ''
best_olsr_model_results = None
#Iterate over each combination
for combination in lag_combinations:
expr = expr_prefix
i = 1
#Setup the model expression using patsy syntax
for lag_num in combination:
if i < len(combination):
expr = expr + 'TAVG_LAG_' + str(lag_num) + ' + '
else:
expr = expr + 'TAVG_LAG_' + str(lag_num)
i += 1
print('Building model for expr: ' + expr)
#Carve out the X,y vectors using patsy. We will use X_test, y_test later for testing the model.
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')
#Build and train the OLSR model on the training data set
olsr_results = smf.ols(expr, df_train).fit()
#Store it's AIC value
lag_combinations[combination] = olsr_results.aic
#Keep track of the best model (the one with the lowest AIC score) seen so far
if olsr_results.aic < min_aic:
min_aic = olsr_results.aic
best_expr = expr
best_olsr_model_results = olsr_results
print('AIC='+str(lag_combinations[combination]))
#Print out the model expression, AIC score and model summary of the best model
print('Best expr=' + best_expr)
print('min AIC=' + str(min_aic))
print(best_olsr_model_results.summary())
#Generate predictions for TAVG on the test data set
olsr_predictions = best_olsr_model_results.get_prediction(X_test)
olsr_predictions_summary_frame = olsr_predictions.summary_frame()
print(olsr_predictions_summary_frame.head(10))
actual_temps = y_test['TAVG']
predicted_temps=olsr_predictions_summary_frame['mean']
#Plot the actual versus predicted values of TAVG on the test data set
fig = plt.figure()
fig.suptitle('Predicted versus actual monthly average temperatures')
predicted, = plt.plot(X_test.index, predicted_temps, 'go-', label='Predicted monthly average temp')
actual, = plt.plot(X_test.index, actual_temps, 'ro-', label='Actual monthly average temp')
plt.legend(handles=[predicted, actual])
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment