Created
November 20, 2022 11:04
-
-
Save sachinsdate/d4ed5865666e157677ddab0f970e6569 to your computer and use it in GitHub Desktop.
A tutorial on how to solve a system of regression equations. The working assumption is that the residuals from the individual regression models are correlated across models within the same time period, but not across different time periods. The data set is in the file 90day_RAR_on_assets.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import statsmodels.formula.api as smf | |
import statsmodels.api as sm | |
from patsy import dmatrices | |
from matplotlib import pyplot as plt | |
import numpy as np | |
#The 8 stock assets under study, and the sector index whose risk-adjusted return serves as the
# single regressor in each asset's pricing equation
asset_names = ['Chevron', 'Halliburton', 'Alcoa', 'Nucor', 'Ford', 'Tesla', 'Google', 'Microsoft']
sector_names = ['Energy', 'Energy', 'Metals', 'Metals', 'Auto', 'Auto', 'Technology', 'Technology']
#M = number of equations (one regression equation per asset)
M = len(asset_names)
#Import the 90-day returns file into a Pandas DataFrame. The file has 90-day
# risk-adjusted-returns data on the 8 stock assets listed above, plus the sector indices
df_90day_RAR = pd.read_csv('90day_RAR_on_assets.csv', header=0, parse_dates=['Date'], index_col=0)
#T = number of data samples per equation
T = len(df_90day_RAR)
#Using the asset pricing model of economics, create 8 independent linear model formulas, one per
# asset: each asset's risk-adjusted return regressed on its sector's risk-adjusted return
regression_equations = ['RAR_{} ~ RAR_{}'.format(asset, sector)
                        for asset, sector in zip(asset_names, sector_names)]
#Create a Dataframe of residuals. Each column will contain the T residuals from one of the M
# regression models
df_OLS_resid = pd.DataFrame()
y_matrices = []
X_matrices = []
#Build and train the M linear models, print out the training summary, accumulate residuals in a
# new column of the residuals data frame, and accumulate corresponding y, X design matrices.
# zip() pairs each asset name with its regression formula, replacing the manual i counter.
for asset_name, expr in zip(asset_names, regression_equations):
    olsr_model = smf.ols(formula=expr, data=df_90day_RAR)
    olsr_model_results = olsr_model.fit()
    print(olsr_model_results.summary())
    df_OLS_resid[asset_name] = olsr_model_results.resid
    #Keep the per-equation response vector and design matrix for the stacked system built below
    y, X = dmatrices(expr, df_90day_RAR, return_type='dataframe')
    y_matrices.append(y)
    X_matrices.append(X)
#print the table of correlations between the residuals of the M models to judge whether the
# residuals are correlated across equations
print(df_OLS_resid.corr())
#We'll now build the stacked block-diagonal X matrix and the stacked y vector:
#X = [X0 Z  ... Z            y = [y0
#     Z  X1 ... Z                 y1
#     ...  ...  ...               ...
#     Z  Z  ... X(M-1)]           y(M-1)]
#This generalizes the construction to any number of equations and any per-equation regressor
# count (the original hand-stacked exactly 8 rows with zero blocks hard-coded to width 2).
col_widths = [X_m.shape[1] for X_m in X_matrices]
X = np.zeros((M * T, sum(col_widths)))
col_start = 0
for i, X_m in enumerate(X_matrices):
    #Place equation i's design matrix on the block diagonal; everything else stays zero
    X[i * T:(i + 1) * T, col_start:col_start + X_m.shape[1]] = X_m
    col_start += X_m.shape[1]
#Stack the M response vectors one below the other into a single (MT x 1) column vector
y = np.vstack(y_matrices)
#We'll now build the S matrix, which we will later use to estimate the (MT X MT) Omega
# matrix of correlations between residuals from the M regressions.
#Our working assumption is that residuals are correlated across equations for the same time
# period but not across different time periods i.e. Corr(e_ij, e_kl) = 0 if i != k
#The S matrix has the following structure:
# [e_11 e_12 e_13 ... e_1M
#  e_21 e_22 e_23 ... e_2M
#  ...  ...  ...  ... ...
#  e_M1 e_M2 e_M3 ... e_MM]
#where S[i][j] = (1/T) * sum_t e_ti * e_tj is the estimated contemporaneous covariance between
# the residuals of equations i and j. The sum over t of the outer products e_t @ e_t' is
# exactly E'E, where E is the (T x M) matrix of residuals, so a single matrix product
# replaces the original Python loop over rows of the residuals DataFrame.
E = df_OLS_resid.to_numpy()
S = (E.T @ E) / T
#The Sigma matrix is the estimated value of the Omega matrix of residual correlations. Sigma (or
# Omega) has the following structure
#Sigma = [s_11*I s_12*I s_13*I ... s_1M*I
#         s_21*I s_22*I s_23*I ... s_2M*I
#         ...    ...    ...    ... ...
#         s_M1*I s_M2*I ...    ... s_MM*I]
#This is exactly the Kronecker product S (x) I_T, so np.kron builds the full (MT x MT) matrix
# in one call. The original built it with nested Python loops, re-creating the T x T identity
# matrix M*M times before hstack/vstack-ing the pieces together.
sigma_matrix = np.kron(S, np.identity(T))
#Build and train the GLS model on the stacked system, using the estimated error covariance
gls_model = sm.GLS(endog=y, exog=X, sigma=sigma_matrix)
gls_model_results = gls_model.fit()
print(gls_model_results.summary())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment