Created
November 20, 2022 11:04
-
-
Save sachinsdate/d4ed5865666e157677ddab0f970e6569 to your computer and use it in GitHub Desktop.
A tutorial on how to solve a system of regression equations. The working assumption is that the residuals from the individual regression models are correlated across models within the same time period, but not across different time periods. The data set is in the file 90day_RAR_on_assets.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import statsmodels.formula.api as smf | |
import statsmodels.api as sm | |
from patsy import dmatrices | |
from matplotlib import pyplot as plt | |
import numpy as np | |
#The 8 stock assets under study, and the sector index whose risk-adjusted return serves as the
# single regressor in each asset's pricing equation
asset_names = ['Chevron', 'Halliburton', 'Alcoa', 'Nucor', 'Ford', 'Tesla', 'Google', 'Microsoft']
sector_names = ['Energy', 'Energy', 'Metals', 'Metals', 'Auto', 'Auto', 'Technology', 'Technology']
#M = number of equations (one regression equation per asset)
M = len(asset_names)
#Import the 90-day returns file into a Pandas DataFrame. The file has 90-day
# risk-adjusted-returns data on the 8 stock assets listed above, plus the sector indices
df_90day_RAR = pd.read_csv('90day_RAR_on_assets.csv', header=0, parse_dates=['Date'], index_col=0)
#T = number of data samples per equation
T = len(df_90day_RAR)
#Using the asset pricing model of economics, create 8 independent linear model formulas, one per
# asset: each asset's risk-adjusted return regressed on its sector's risk-adjusted return
regression_equations = ['RAR_{} ~ RAR_{}'.format(asset, sector)
                        for asset, sector in zip(asset_names, sector_names)]
#Create a Dataframe of residuals. Each column will contain the T residuals from one of the M
# regression models
df_OLS_resid = pd.DataFrame()
y_matrices = []
X_matrices = []
#Build and train the M linear models, print out the training summary, accumulate residuals in a
# new column of the residuals data frame, and accumulate corresponding y, X design matrices.
# zip() pairs each asset name with its regression formula, replacing the manual i counter.
for asset_name, expr in zip(asset_names, regression_equations):
    olsr_model = smf.ols(formula=expr, data=df_90day_RAR)
    olsr_model_results = olsr_model.fit()
    print(olsr_model_results.summary())
    df_OLS_resid[asset_name] = olsr_model_results.resid
    #Keep the per-equation response vector and design matrix for the stacked system built below
    y, X = dmatrices(expr, df_90day_RAR, return_type='dataframe')
    y_matrices.append(y)
    X_matrices.append(X)
#print the table of correlations between the residuals of the M models to judge whether the
# residuals are correlated across equations
print(df_OLS_resid.corr())
#We'll now build the stacked block-diagonal X matrix and the stacked y vector:
#X = [X0 Z  ... Z            y = [y0
#     Z  X1 ... Z                 y1
#     ...  ...  ...               ...
#     Z  Z  ... X(M-1)]           y(M-1)]
#This generalizes the construction to any number of equations and any per-equation regressor
# count (the original hand-stacked exactly 8 rows with zero blocks hard-coded to width 2).
col_widths = [X_m.shape[1] for X_m in X_matrices]
X = np.zeros((M * T, sum(col_widths)))
col_start = 0
for i, X_m in enumerate(X_matrices):
    #Place equation i's design matrix on the block diagonal; everything else stays zero
    X[i * T:(i + 1) * T, col_start:col_start + X_m.shape[1]] = X_m
    col_start += X_m.shape[1]
#Stack the M response vectors one below the other into a single (MT x 1) column vector
y = np.vstack(y_matrices)
#We'll now build the S matrix, which we will later use to estimate the (MT X MT) Omega
# matrix of correlations between residuals from the M regressions.
#Our working assumption is that residuals are correlated across equations for the same time
# period but not across different time periods i.e. Corr(e_ij, e_kl) = 0 if i != k
#The S matrix has the following structure:
# [e_11 e_12 e_13 ... e_1M
#  e_21 e_22 e_23 ... e_2M
#  ...  ...  ...  ... ...
#  e_M1 e_M2 e_M3 ... e_MM]
#where S[i][j] = (1/T) * sum_t e_ti * e_tj is the estimated contemporaneous covariance between
# the residuals of equations i and j. The sum over t of the outer products e_t @ e_t' is
# exactly E'E, where E is the (T x M) matrix of residuals, so a single matrix product
# replaces the original Python loop over rows of the residuals DataFrame.
E = df_OLS_resid.to_numpy()
S = (E.T @ E) / T
#The Sigma matrix is the estimated value of the Omega matrix of residual correlations. Sigma (or
# Omega) has the following structure
#Sigma = [s_11*I s_12*I s_13*I ... s_1M*I
#         s_21*I s_22*I s_23*I ... s_2M*I
#         ...    ...    ...    ... ...
#         s_M1*I s_M2*I ...    ... s_MM*I]
#This is exactly the Kronecker product S (x) I_T, so np.kron builds the full (MT x MT) matrix
# in one call. The original built it with nested Python loops, re-creating the T x T identity
# matrix M*M times before hstack/vstack-ing the pieces together.
sigma_matrix = np.kron(S, np.identity(T))
#Build and train the GLS model on the stacked system, using the estimated error covariance
gls_model = sm.GLS(endog=y, exog=X, sigma=sigma_matrix)
gls_model_results = gls_model.fit()
print(gls_model_results.summary())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment