The Random Effects regression model is used to estimate the effect of unit-specific characteristics that are intrinsic to each unit and cannot be directly measured.
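Concretely, for unit (country) i in time period t, the model fitted in the script below is y_it = β0 + β1*x_it + u_i + ε_it, where u_i is the unobserved unit-specific random effect with variance σ²_u and ε_it is the idiosyncratic error with variance σ²_ε. The script estimates σ²_ε and σ²_u, uses them to compute θ, quasi-demeans y and X by subtracting θ times the group-specific means, and then fits OLS on the transformed data, which yields the feasible GLS (Random Effects) estimates.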
import math
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt
import seaborn as sns
colors_master = ['blue', 'red', 'orange', 'lime', 'yellow', 'cyan', 'violet', 'pink',
    'sandybrown', 'silver']
#Define the units (countries) of interest
unit_names = ['Belgium', 'CzechRepublic', 'France', 'Ireland', 'Portugal', 'UK', 'USA']
unit_names.sort()
colors = colors_master[:len(unit_names)]
unit_col_name='COUNTRY'
time_period_col_name='YEAR'
#Define the y and X variable names
y_var_name = 'GDP_PCAP_GWTH_PCNT'
X_var_names = ['GCF_GWTH_PCNT']
#Load the panel data set of World Bank published development indicators into a Pandas Dataframe
df_panel = pd.read_csv('wb_data_panel_2ind_7units_1992_2014.csv', header=0)
#Set up the variables needed for calculating sigma2_u and sigma2_epsilon
#n=number of groups
n=len(unit_names)
#T=number of time periods per unit
T=df_panel.shape[0]/n
#N=total number of rows in the panel data set
N=n*T
#k=number of regression variables of the Pooled OLS model
k=len(X_var_names)
plot_against_X_index=0
#Use Seaborn to plot GDP growth over all time periods and across all countries versus gross
# capital formation growth:
sns.scatterplot(x=df_panel[X_var_names[plot_against_X_index]], y=df_panel[y_var_name],
hue=df_panel[unit_col_name], palette=colors).set(title=
'Y-o-Y % Change in per-capita GDP versus Y-o-Y % Change in Gross capital formation')
plt.show()
#Print out the first 30 rows
print(df_panel.head(30))
############### Estimation procedure for the Random Effects model ###############
##################################################################################################
#STEP 1: In this step, using statsmodels, we train a Pooled OLS regression model on the
# panel data set. The Pooled OLSR model is simply an OLS regression model built on the
# flattened (pooled) version of the panel data set:
##################################################################################################
pooled_y=df_panel[y_var_name]
pooled_X=df_panel[X_var_names]
pooled_X = sm.add_constant(pooled_X)
pooled_olsr_model = sm.OLS(endog=pooled_y, exog=pooled_X)
pooled_olsr_model_results = pooled_olsr_model.fit()
print('===============================================================================')
print('============================== Pooled OLSR model ==============================')
print(pooled_olsr_model_results.summary())
print('residuals of the \'Pooled OLSR\' model:')
print(pooled_olsr_model_results.resid)
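#Note: the Pooled OLSR residuals computed above are reused twice later in the script: their
# sum of squares yields sigma2_pooled in STEP 2, and their group-wise means drive the LM test
# at the end of the script.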
##################################################################################################
#STEP 2: Calculation of variance components σ²_ϵ and σ²_u
##################################################################################################
#Build and fit a LSDV model on the panel data set so that we can access its SSE later on
#Create the dummy variables, one for each country
df_dummies = pd.get_dummies(df_panel[unit_col_name])
#Join the dummies Dataframe with the panel data set
df_panel_with_dummies = df_panel.join(df_dummies)
print(df_panel_with_dummies)
#Construct the regression equation for the LSDV model. Note that we are leaving out one dummy
# variable so as to avoid perfect multicollinearity among the 7 dummy variables. The regression
# model's intercept will represent the effect of the omitted (baseline) dummy variable.
lsdv_expr = y_var_name + ' ~ '
i = 0
for X_var_name in X_var_names:
    if i > 0:
        lsdv_expr = lsdv_expr + ' + ' + X_var_name
    else:
        lsdv_expr = lsdv_expr + X_var_name
    i = i + 1
for dummy_name in unit_names[:-1]:
    lsdv_expr = lsdv_expr + ' + ' + dummy_name
print('Regression expression for OLS with dummies=' + lsdv_expr)
#Build and train the LSDV model
lsdv_model = smf.ols(formula=lsdv_expr, data=df_panel_with_dummies)
lsdv_model_results = lsdv_model.fit()
print('===============================================================================')
print('============================== OLSR With Dummies ==============================')
print(lsdv_model_results.summary())
#Calculate sigma-square-epsilon
sigma2_epsilon = lsdv_model_results.ssr/(n*T-(n+k+1))
print('sigma2_epsilon = ' + str(sigma2_epsilon))
#Calculate sigma-square-pooled
sigma2_pooled = pooled_olsr_model_results.ssr/(n*T-(k+1))
print('sigma2_pooled = ' + str(sigma2_pooled))
#Calculate sigma-square-u
sigma2_u = sigma2_pooled - sigma2_epsilon
print('sigma2_u = ' + str(sigma2_u))
#Calculate theta
theta = 1 - math.sqrt(sigma2_epsilon/(sigma2_epsilon + T*sigma2_u))
print('theta = ' + str(theta))
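#theta governs how much of each group mean gets subtracted in STEP 4: as sigma2_u approaches 0,
# theta approaches 0 and the model collapses to Pooled OLS; as T*sigma2_u dominates
# sigma2_epsilon, theta approaches 1 and the model approaches the Fixed Effects (within) estimator.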
##################################################################################################
#STEP 3: Calculating the means of the y_i and X_i values for each group (i.e. each unit i) in the
# data panel
##################################################################################################
df_panel_group_means = df_panel.groupby(unit_col_name).mean()
#Append a column for storing the regression intercept to this Dataframe. We'll use it when
# quasi-demeaning the 'const' column in STEP 4:
df_panel_group_means['const'] = 1.0
#Print out the group means
print(df_panel_group_means)
##################################################################################################
#STEP 4: Calculation of the quasi-demeaned (θ-scaled, mean-subtracted) data panel
##################################################################################################
#Prepare the data set for mean centering the y and X columns:
pooled_y_with_unit_name = pd.concat([df_panel[unit_col_name], pooled_y], axis=1)
pooled_X_with_unit_name = pd.concat([df_panel[unit_col_name], pooled_X], axis=1)
#Center each X value using the θ-scaled group-specific mean:
unit_name = ''
for row_index, row in pooled_X_with_unit_name.iterrows():
    for column_name, cell_value in row.items():
        if column_name == unit_col_name:
            unit_name = pooled_X_with_unit_name.at[row_index, column_name]
        else:
            pooled_X_group_mean = df_panel_group_means.loc[unit_name][column_name]
            pooled_X_with_unit_name.at[row_index, column_name] = pooled_X_with_unit_name.at[
                row_index, column_name] - theta*pooled_X_group_mean
#Center each y value using the θ-scaled group-specific mean:
for row_index, row in pooled_y_with_unit_name.iterrows():
    for column_name, cell_value in row.items():
        if column_name == unit_col_name:
            unit_name = pooled_y_with_unit_name.at[row_index, column_name]
        else:
            pooled_y_group_mean = df_panel_group_means.loc[unit_name][column_name]
            pooled_y_with_unit_name.at[row_index, column_name] = pooled_y_with_unit_name.at[
                row_index, column_name] - theta*pooled_y_group_mean
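#The two loops above can also be expressed in vectorized form using groupby/transform. The
# lines below are only an illustrative sketch (the re_X_check and re_y_check names are
# introduced here and are not used by the rest of the script); they should reproduce the
# quasi-demeaned values computed by the loops:
X_group_means = df_panel.groupby(unit_col_name)[X_var_names].transform('mean')
y_group_means = df_panel.groupby(unit_col_name)[y_var_name].transform('mean')
re_X_check = sm.add_constant(df_panel[X_var_names]) - theta*sm.add_constant(X_group_means)
re_y_check = df_panel[y_var_name] - theta*y_group_means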
##################################################################################################
#STEP 5: Fit an OLS model on the quasi-demeaned data set to obtain the Random Effects estimates
##################################################################################################
#Carve out the y and X matrices:
re_y=pooled_y_with_unit_name[list(pooled_y_with_unit_name.columns[1:])]
re_X=pooled_X_with_unit_name[list(pooled_X_with_unit_name.columns[1:])]
#Build and train the model
re_model = sm.OLS(endog=re_y, exog=re_X)
re_model_results = re_model.fit()
print('===============================================================================')
print('================================== RE Model ===================================')
print(re_model_results.summary())
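#Optional cross-check (assumption: the linearmodels package is installed). linearmodels'
# RandomEffects estimator uses a Swamy-Arora style variance decomposition, so its coefficient
# estimates should be close to, though not necessarily identical to, the ones obtained above:
try:
    from linearmodels.panel import RandomEffects
    df_idx = df_panel.set_index([unit_col_name, time_period_col_name])
    re_check_model = RandomEffects(df_idx[y_var_name], sm.add_constant(df_idx[X_var_names]))
    print(re_check_model.fit())
except ImportError:
    print('linearmodels not installed; skipping the Random Effects cross-check')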
#Calculate the Breusch-Pagan LM statistic to test for the significance of the random effect
df_pooled_olsr_resid_with_unitnames = pd.concat([df_panel[unit_col_name],pooled_olsr_model_results.resid], axis=1)
df_pooled_olsr_resid_group_means = df_pooled_olsr_resid_with_unitnames.groupby(unit_col_name).mean()
ssr_grouped_means=(df_pooled_olsr_resid_group_means[0]**2).sum()
ssr_pooled_olsr=pooled_olsr_model_results.ssr
LM_statistic = (n*T)/(2*(T-1))*math.pow(((T*T*ssr_grouped_means)/ssr_pooled_olsr - 1),2)
print('LM Statistic='+str(LM_statistic))
alpha=0.05
chi2_critical_value=st.chi2.ppf(q=(1.0-alpha), df=1)
print('chi2_critical_value='+str(chi2_critical_value))
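#Interpret the test using the usual rejection rule for the chi-squared(1) LM test: a statistic
# larger than the critical value indicates significant unit-specific (random) effects.
if LM_statistic > chi2_critical_value:
    print('Reject H0: the variance of the unit-specific effect is significantly different from zero.')
else:
    print('Fail to reject H0 at alpha=0.05: no evidence of a significant random effect.')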