@sachinsdate · Created December 14, 2022
A tutorial on solving a linear system of regression equations using Generalized Least Squares
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
from patsy import dmatrices
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sb
#Create a list of the assets whose capital asset pricing models will make up the
# system of equations
asset_names = ['Chevron', 'Halliburton', 'Alcoa', 'US Steel', 'Ford', 'Tesla', 'Google',
               'Microsoft']
#M = number of equations
M = len(asset_names)
#Import the 90-day returns file into a Pandas DataFrame. The file contains 90-day
# Risk-Adjusted-Returns (RAR) data on 8 stock assets over the period 02-Jan-2019 through
# 30-Jan-2020, along with the 90-day RAR data for the corresponding S&P market indices.
df_90day_RAR = pd.read_csv('90day_RAR_on_assets.csv', header=0, parse_dates=['Date'], index_col=0)
#N = number of data samples per equation
N = len(df_90day_RAR)
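#An optional sanity check on the loaded data: the column names printed here should match the
# RAR columns referenced in the regression equations defined below:
print(df_90day_RAR.shape)
print(df_90day_RAR.columns.tolist())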
#Using the capital asset pricing model (CAPM), create 8 independent linear models, one for
# each asset in the file, and fit them using OLS. We'll use the Patsy syntax for defining each
# model's equation. An intercept is implied in each model, so we do not need to state it
# explicitly in the equation:
regression_equations = ['RAR_Chevron ~ RAR_Energy', 'RAR_Halliburton ~ RAR_Energy',
                        'RAR_Alcoa ~ RAR_Metals', 'RAR_USSteel ~ RAR_Metals',
                        'RAR_Ford ~ RAR_Auto', 'RAR_Tesla ~ RAR_Auto',
                        'RAR_Google ~ RAR_Technology', 'RAR_Microsoft ~ RAR_Technology']
#Create a DataFrame of residuals. Each column will contain the N residuals from one of the M
# regression models
df_OLS_resid = pd.DataFrame()
#Build and train the M linear models, print out each training summary, and accumulate the
# residuals in a new column of the residuals DataFrame. Along the way, collect the
# corresponding y and X design matrices for each model, which we'll soon use to build the
# stacked (block) model:
y_matrices = []
X_matrices = []
for i, expr in enumerate(regression_equations):
    olsr_model = smf.ols(formula=expr, data=df_90day_RAR)
    olsr_model_results = olsr_model.fit()
    print(olsr_model_results.summary())
    df_OLS_resid[asset_names[i]] = olsr_model_results.resid
    y, X = dmatrices(expr, df_90day_RAR, return_type='dataframe')
    y_matrices.append(y)
    X_matrices.append(X)
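#As an optional check, the residuals DataFrame should now hold one [N x 1] column of
# residuals per equation:
assert df_OLS_resid.shape == (N, M)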
#Display the heatmap of correlations between the residuals of the 8 models to judge whether the
# residuals are correlated across equations
sb.heatmap(df_OLS_resid.corr())
plt.show()
#Also print the table of correlations
print(df_OLS_resid.corr())
#We'll now build the stacked X and y matrices. X is block-diagonal, where each Z below is an
# (N x 2) matrix of zeros:
#X = [X0  Z  Z  Z  Z  Z  Z  Z
#      Z X1  Z  Z  Z  Z  Z  Z
#      Z  Z X2  Z  Z  Z  Z  Z
#      Z  Z  Z X3  Z  Z  Z  Z
#      Z  Z  Z  Z X4  Z  Z  Z
#      Z  Z  Z  Z  Z X5  Z  Z
#      Z  Z  Z  Z  Z  Z X6  Z
#      Z  Z  Z  Z  Z  Z  Z X7]
Z = np.zeros((N,2))
R1 = np.hstack((X_matrices[0], Z,Z,Z,Z,Z,Z,Z))
R2 = np.hstack((Z, X_matrices[1], Z,Z,Z,Z,Z,Z))
R3 = np.hstack((Z,Z, X_matrices[2], Z,Z,Z,Z,Z))
R4 = np.hstack((Z,Z,Z, X_matrices[3], Z,Z,Z,Z))
R5 = np.hstack((Z,Z,Z,Z, X_matrices[4], Z,Z,Z))
R6 = np.hstack((Z,Z,Z,Z,Z, X_matrices[5], Z,Z))
R7 = np.hstack((Z,Z,Z,Z,Z,Z, X_matrices[6], Z))
R8 = np.hstack((Z,Z,Z,Z,Z,Z,Z, X_matrices[7]))
X = np.vstack((R1, R2, R3, R4, R5, R6, R7, R8))
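#As an optional aside (assuming SciPy is available), the same block-diagonal X can be built
# more compactly with scipy.linalg.block_diag; the assert verifies that the two constructions
# agree:
from scipy.linalg import block_diag
X_alt = block_diag(*[np.asarray(X_i) for X_i in X_matrices])
assert np.allclose(X, X_alt)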
#y is the corresponding stacked vector of dependent variables:
#y = [y0
#     y1
#     y2
#     y3
#     y4
#     y5
#     y6
#     y7]
y = np.vstack(y_matrices)
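#An optional sanity check on the stacked shapes: X should be [MN x 2M] (an intercept and one
# slope per equation) and y should be [MN x 1]:
assert X.shape == (M*N, 2*M)
assert y.shape == (M*N, 1)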
#We'll now build the S matrix, an estimate of Sigma, which we will later use to construct the
# (MN x MN) Omega matrix of covariances between the residuals from the M regressions.
#Our working assumption is that the residuals are correlated across equations within the same
# time period but not across time periods, i.e. Corr(e_ti, e_sj) = 0 if t != s
#We'll start by creating an (M x M) matrix of zeros
S = np.zeros((M,M))
#The S matrix has the following structure, where s_ij is the estimated covariance between the
# residuals of equations i and j:
# [s_11 s_12 s_13 ... s_1M
#  s_21 s_22 s_23 ... s_2M
#  ...  ...  ...  ... ...
#  s_M1 s_M2 s_M3 ... s_MM]
#Think of the S matrix as an element-wise summation of N layers, each of size [M x M]. Each
# [M x M] 'layer' is the outer product of two vectors, e_t and e_t_transpose. e_t_transpose
# is the t-th row of the residuals DataFrame, i.e. a [1 x M] row vector, so e_t is an
# [M x 1] column vector. Hence matmul(e_t, e_t_transpose) is of size [M x M]. The complete
# formula for S (ref: Greene 2018, p. 334) is
# S = (1/N) * sum_{t=1..N} (e_t * e_t_transpose) = (1/N)E'E, where E is the [N x M] matrix
# of residual errors
E = np.array(df_OLS_resid)
S = (1/N)*np.matmul(np.transpose(E), E)
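#Since every equation includes an intercept, the OLS residuals have (numerically) zero mean,
# so S should agree with the population covariance of the residuals computed by Pandas. This
# cross-check is optional:
assert np.allclose(S, df_OLS_resid.cov(ddof=0))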
#Next, we estimate Omega as the Kronecker product of S and I where I is an Identity matrix of
# size (N x N)
I = np.identity(N)
estimated_omega = np.kron(S, I)
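#estimated_omega is [MN x MN]. Because y stacks the M equations one after another, block
# (i,j) of Omega is S[i,j]*I, which encodes the assumption that residuals are correlated
# across equations only within the same time period. Two optional spot checks of that
# structure:
assert estimated_omega.shape == (M*N, M*N)
assert estimated_omega[0, N] == S[0, 1]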
#Build and train the GLS model, supplying the estimated Omega matrix to it. Note that
# statsmodels calls this parameter sigma
gls_model = sm.GLS(endog=y, exog=X, sigma=estimated_omega)
gls_model_results = gls_model.fit()
print(gls_model_results.summary())
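#The stacked parameter vector is ordered block by block as [intercept_0, slope_0,
# intercept_1, slope_1, ...], following the column order of the hstack-ed X above (Patsy
# places the intercept first). As an optional sketch, here is one way to read off the
# per-asset CAPM estimates:
for i, asset in enumerate(asset_names):
    alpha, beta = gls_model_results.params[2*i], gls_model_results.params[2*i + 1]
    print(f'{asset}: alpha = {alpha:.6f}, beta = {beta:.6f}')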