Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
A tutorial on how to solve a system of regression equations. The assumption is that the residuals from individual regression models are correlated across different models but only for the same time period. The data set is in the file 90day_RAR_on_assets.csv
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
from patsy import dmatrices
from matplotlib import pyplot as plt
import numpy as np
asset_names = ['Chevron', 'Halliburton', 'Alcoa', 'Nucor', 'Ford', 'Tesla', 'Google', 'Microsoft']
#M = number of equations
M = len(asset_names)
#Import the 90-day returns file into a Pandas DataFrame. The file has 90-day
# risk-adjusted-returns data on 8 stock assets: Chevron, Halliburton, Alcoa, Nucor, Ford, Tesla,
# Google, Microsoft
df_90day_RAR = pd.read_csv('90day_RAR_on_assets.csv', header=0, parse_dates=['Date'], index_col=0)
#T = number of data samples per equation
T = len(df_90day_RAR)
#Using the asset pricing model of economics, create 8 independent linear models, one for each
# asset in the file and fit them using OLS
regression_equations = ['RAR_Chevron ~ RAR_Energy', 'RAR_Halliburton ~ RAR_Energy',
'RAR_Alcoa ~ RAR_Metals', 'RAR_Nucor ~ RAR_Metals', 'RAR_Ford ~ RAR_Auto',
'RAR_Tesla ~ RAR_Auto', 'RAR_Google ~ RAR_Technology', 'RAR_Microsoft ~ RAR_Technology']
#Create a Dataframe of residuals. Each column will contain the T residuals from one of the M
# regression models
df_OLS_resid = pd.DataFrame()
y_matrices = []
X_matrices = []
i=0
#Build and train the M linear models, print out the training summary, accumulate residuals in a
# new column of the residuals data frame, and accumulate corresponding y, X design matrices
for expr in regression_equations:
olsr_model = smf.ols(formula=expr, data=df_90day_RAR)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())
df_OLS_resid[asset_names[i]] = olsr_model_results.resid
y, X = dmatrices(expr, df_90day_RAR, return_type='dataframe')
y_matrices.append(y)
X_matrices.append(X)
i = i + 1
#print the table of correlations between the residuals of the 8 models to judge whether the
# residuals are correlated across equations
print(df_OLS_resid.corr())
#We'll now build the stacked X and y matrices. X has the following format:
#X = [X0 Z Z Z Z Z Z Z
# Z X1 Z Z Z Z Z Z
# Z Z X2 Z Z Z Z Z
# Z Z Z X3 Z Z Z Z
# Z Z Z Z X4 Z Z Z
# Z Z Z Z Z X5 Z Z
# Z Z Z Z Z Z X6 Z
# Z Z Z Z Z Z Z X7]
Z = np.zeros((T,2))
R1 = np.hstack((X_matrices[0], Z,Z,Z,Z,Z,Z,Z))
R2 = np.hstack((Z, X_matrices[1], Z,Z,Z,Z,Z,Z))
R3 = np.hstack((Z,Z, X_matrices[2], Z,Z,Z,Z,Z))
R4 = np.hstack((Z,Z,Z, X_matrices[3], Z,Z,Z,Z))
R5 = np.hstack((Z,Z,Z,Z, X_matrices[4], Z,Z,Z))
R6 = np.hstack((Z,Z,Z,Z,Z, X_matrices[5], Z,Z))
R7 = np.hstack((Z,Z,Z,Z,Z,Z, X_matrices[6], Z))
R8 = np.hstack((Z,Z,Z,Z,Z,Z,Z, X_matrices[7]))
X = np.vstack((R1, R2, R3, R4, R5, R6, R7, R8))
#y has the following format:
#y = [y0
# y1
# y2
# y3
# y4
# y5
# y6
# y7]
y = np.vstack((y_matrices[0], y_matrices[1], y_matrices[2], y_matrices[3], y_matrices[4],
y_matrices[5], y_matrices[6], y_matrices[7]))
#We'll now build the Sigma matrix, which we will later use to estimate the (MT X MT) Omega
# matrix of correlations between residuals from the M regressions.
#Our working assumption is that residuals are correlated across equations for the same time
# period but not across different time periods i.e. Corr(e_ij, e_kl) = 0 if i != k
#We'll start by creating an M X M matrix of zeros
S = np.zeros((M,M))
#The S matrix has the following structure:
# [e_11 e_12 e_13 ... e_1M
# e_21 e_22 e_23 ... e_2M
# ... ... ... ... e_2M
# e_M1 e_M2 e_M3 ... e_MM]
#Think of the S matrix as an element-wise summation of T layers, each of size [M x M]. Each [M x M]
# 'layer' is a matrix product of two vectors, e_t and e_t_transpose. e_t_transpose is the t-th
# row of the residuals Dataframe and so e_t is a column vector of size [1 x M]. e_t is of size
# [M x 1]. Hence matmul(e_t, e_t_prime) is of size [M x M]
for i in range(len(df_OLS_resid)):
e_t_prime = np.array(df_OLS_resid[i:i+1])
e_t = np.transpose(e_t_prime)
S = np.add(S, np.matmul(e_t, e_t_prime))
#Divide S by T
S = S/T
#The Sigma matrix is the estimated value of the Omega matrix of residual correlations. Sigma (or
# Omega) has the following structure
#Sigma = [s_11*I s_12*I s_13*I ... s_1M*I
# s_21*I s_22*I s_23*I ... s_2M*I
# ... ... ... ... ...
# s_M1*I s_M2*I ... ... s_MM*I]
sigma = []
for row in range(M):
sigma_row = []
for col in range(M):
I = np.identity(T)
sigma_row.append(S[row][col]*I)
sigma.append(sigma_row)
sigma_rows= []
for row in range(M):
sigma_rows.append(np.hstack(sigma[row]))
sigma_matrix = np.vstack(sigma_rows)
#Build and train the GLS model
gls_model = sm.GLS(endog=y, exog=X, sigma=sigma_matrix)
gls_model_results = gls_model.fit()
print(gls_model_results.summary())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment