Last active
March 22, 2022 12:46
-
-
Save sachinsdate/0ceae4423d66c1dd7951a7059957e06d to your computer and use it in GitHub Desktop.
The Random Effects regression model is used to estimate the effect of characteristics of individuals that are intrinsic and unmeasurable.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import pandas as pd | |
import scipy.stats as st | |
import statsmodels.api as sm | |
import statsmodels.formula.api as smf | |
from matplotlib import pyplot as plt | |
import seaborn as sns | |
colors_master = ['blue', 'red', 'orange', 'lime', 'yellow', 'cyan', 'violet', 'yellow', | |
'sandybrown', 'silver'] | |
#Define the units (countries) of interest | |
unit_names = ['Belgium', 'CzechRepublic', 'France', 'Ireland', 'Portugal', 'UK', 'USA'] | |
unit_names.sort() | |
colors = colors_master[:len(unit_names)] | |
unit_col_name='COUNTRY' | |
time_period_col_name='YEAR' | |
#Define the y and X variable names | |
y_var_name = 'GDP_PCAP_GWTH_PCNT' | |
X_var_names = ['GCF_GWTH_PCNT'] | |
#Load the panel data set of World Bank published development indicators into a Pandas Dataframe | |
df_panel = pd.read_csv('wb_data_panel_2ind_7units_1992_2014.csv', header=0) | |
#Setup the variables for performing the calculations of sigma2u and sigma2v | |
#n=number of groups | |
n=len(unit_names) | |
#T=number of time periods per unit | |
T=df_panel.shape[0]/n | |
#N=total number of rows in the panel data set | |
N=n*T | |
#k=number of regression variables of the Pooled OLS model | |
k=len(X_var_names) | |
plot_against_X_index=0 | |
#Use Seaborn to plot GDP growth over all time periods and across all countries versus gross | |
# capital formation growth: | |
sns.scatterplot(x=df_panel[X_var_names[plot_against_X_index]], y=df_panel[y_var_name], | |
hue=df_panel[unit_col_name], palette=colors).set(title= | |
'Y-o-Y % Change in per-capita GDP versus Y-o-Y % Change in Gross capital formation') | |
plt.show() | |
#Print out the first 30 rows | |
print(df_panel.head(30)) | |
############### Estimation procedure for the Random Effects model ############### | |
################################################################################################## | |
#STEP 1: In this step, using statsmodels, we train a Pooled OLS regression model model on the | |
# panel data set. The Pooled OLSR model is basically an OLS regression model that is built on the | |
# flattened version of the panel data set: | |
################################################################################################## | |
pooled_y=df_panel[y_var_name] | |
pooled_X=df_panel[X_var_names] | |
pooled_X = sm.add_constant(pooled_X) | |
pooled_olsr_model = sm.OLS(endog=pooled_y, exog=pooled_X) | |
pooled_olsr_model_results = pooled_olsr_model.fit() | |
print('===============================================================================') | |
print('============================== Pooled OLSR model ==============================') | |
print(pooled_olsr_model_results.summary()) | |
print('residuals of the \'Pooled OLSR\' model:') | |
print(pooled_olsr_model_results.resid) | |
################################################################################################## | |
#STEP 2: Calculation of variance components σ²_ϵ and σ²_u | |
################################################################################################## | |
#Build and fit a LSDV model on the panel data set so that we can access it's SSE later on | |
#Create the dummy variables, one for each country | |
df_dummies = pd.get_dummies(df_panel[unit_col_name]) | |
#Join the dummies Dataframe with the panel data set | |
df_panel_with_dummies = df_panel.join(df_dummies) | |
print(df_panel_with_dummies) | |
#Construct the regression equation for the LSDV model. Note that we are leaving out one dummy | |
# variable so as to avoid perfect multi-colinearity between the 7 dummy variables. The regression | |
# model's intercept will contain the value of the coefficient for the omitted dummy variable. | |
lsdv_expr = y_var_name + ' ~ ' | |
i = 0 | |
for X_var_name in X_var_names: | |
if i > 0: | |
lsdv_expr = lsdv_expr + ' + ' + X_var_name | |
else: | |
lsdv_expr = lsdv_expr + X_var_name | |
i = i + 1 | |
for dummy_name in unit_names[:-1]: | |
lsdv_expr = lsdv_expr + ' + ' + dummy_name | |
print('Regression expression for OLS with dummies=' + lsdv_expr) | |
#Build and train the LSDV model | |
lsdv_model = smf.ols(formula=lsdv_expr, data=df_panel_with_dummies) | |
lsdv_model_results = lsdv_model.fit() | |
print('===============================================================================') | |
print('============================== OLSR With Dummies ==============================') | |
print(lsdv_model_results.summary()) | |
#Calculate sigma-square-epsilon | |
sigma2_epsilon = lsdv_model_results.ssr/(n*T-(n+k+1)) | |
print('sigma2_epsilon = ' + str(sigma2_epsilon)) | |
#Calculate sigma-square-pooled | |
sigma2_pooled = pooled_olsr_model_results.ssr/(n*T-(k+1)) | |
print('sigma2_pooled = ' + str(sigma2_pooled)) | |
#Calculate sigma-square-u | |
sigma2_u = sigma2_pooled - sigma2_epsilon | |
print('sigma2_u = ' + str(sigma2_u)) | |
#Calculate theta | |
theta = 1 - math.sqrt(sigma2_epsilon/(sigma2_epsilon + T*sigma2_u)) | |
print('theta = ' + str(theta)) | |
################################################################################################## | |
#STEP 3: Calculating the means of the y_i and X_i values for each group (i.e. each unit i) in the | |
# data panel | |
################################################################################################## | |
df_panel_group_means = df_panel.groupby(unit_col_name).mean() | |
#Append a column for storing the regression intercept to this Dataframe. We'll use that column | |
# in STEP 5: | |
df_panel_group_means['const'] = 1.0 | |
#Print out the group means | |
print(df_panel_group_means) | |
################################################################################################## | |
#STEP 4: Calculation of the centered (mean-subtracted) data panel | |
################################################################################################## | |
#Prepare the data set for mean centering the y and X columns: | |
pooled_y_with_unit_name = pd.concat([df_panel[unit_col_name], pooled_y], axis=1) | |
pooled_X_with_unit_name = pd.concat([df_panel[unit_col_name], pooled_X], axis=1) | |
#Center each X value using the θ-scaled group-specific mean: | |
unit_name = '' | |
for row_index, row in pooled_X_with_unit_name.iterrows(): | |
for column_name, cell_value in row.items(): | |
if column_name == unit_col_name: | |
unit_name = pooled_X_with_unit_name.at[row_index, column_name] | |
else: | |
pooled_X_group_mean = df_panel_group_means.loc[unit_name][column_name] | |
pooled_X_with_unit_name.at[row_index, column_name] = pooled_X_with_unit_name.at[ | |
row_index, column_name] - theta*pooled_X_group_mean | |
#Center each y value using the θ-scaled group-specific mean: | |
for row_index, row in pooled_y_with_unit_name.iterrows(): | |
for column_name, cell_value in row.items(): | |
if column_name == unit_col_name: | |
unit_name = pooled_y_with_unit_name.at[row_index, column_name] | |
else: | |
pooled_y_group_mean = df_panel_group_means.loc[unit_name][column_name] | |
pooled_y_with_unit_name.at[row_index, column_name] = pooled_y_with_unit_name.at[ | |
row_index, column_name] - theta*pooled_y_group_mean | |
################################################################################################## | |
#STEP 5: Calculation of the centered (mean-subtracted) data panel | |
################################################################################################## | |
#Carve out the y and X matrices: | |
re_y=pooled_y_with_unit_name[list(pooled_y_with_unit_name.columns[1:])] | |
re_X=pooled_X_with_unit_name[list(pooled_X_with_unit_name.columns[1:])] | |
#Build and train the model | |
re_model = sm.OLS(endog=re_y, exog=re_X) | |
re_model_results = re_model.fit() | |
print('===============================================================================') | |
print('================================== RE Model ===================================') | |
print(re_model_results.summary()) | |
#Calculate the LM statistic to test for the significance of the Random Effect | |
df_pooled_olsr_resid_with_unitnames = pd.concat([df_panel[unit_col_name],pooled_olsr_model_results.resid], axis=1) | |
df_pooled_olsr_resid_group_means = df_pooled_olsr_resid_with_unitnames.groupby(unit_col_name).mean() | |
ssr_grouped_means=(df_pooled_olsr_resid_group_means[0]**2).sum() | |
ssr_pooled_olsr=pooled_olsr_model_results.ssr | |
LM_statistic = (n*T)/(2*(T-1))*math.pow(((T*T*ssr_grouped_means)/ssr_pooled_olsr - 1),2) | |
print('LM Statistic='+str(LM_statistic)) | |
alpha=0.05 | |
chi2_critical_value=st.chi2.ppf(q=(1.0-alpha), df=1) | |
print('chi2_critical_value='+str(chi2_critical_value)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment