The Random Effects regression model is used to estimate the effect of unit-specific characteristics that are intrinsic to each unit and cannot be directly measured.
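Concretely, for unit (country) i in time period t, the model fitted in the script below is y_it = β0 + β1*x_it + u_i + ε_it, where u_i is the unobserved unit-specific random effect with variance σ²_u and ε_it is the idiosyncratic error with variance σ²_ε. The script estimates σ²_ε and σ²_u, uses them to compute θ, quasi-demeans y and X by subtracting θ times the group-specific means, and then fits OLS on the transformed data, which yields the feasible GLS (Random Effects) estimates.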
import math
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt
import seaborn as sns
colors_master = ['blue', 'red', 'orange', 'lime', 'yellow', 'cyan', 'violet', 'pink',
    'sandybrown', 'silver']
#Define the units (countries) of interest
unit_names = ['Belgium', 'CzechRepublic', 'France', 'Ireland', 'Portugal', 'UK', 'USA']
unit_names.sort()
colors = colors_master[:len(unit_names)]
unit_col_name='COUNTRY'
time_period_col_name='YEAR'
#Define the y and X variable names
y_var_name = 'GDP_PCAP_GWTH_PCNT'
X_var_names = ['GCF_GWTH_PCNT']
#Load the panel data set of World Bank published development indicators into a Pandas Dataframe
df_panel = pd.read_csv('wb_data_panel_2ind_7units_1992_2014.csv', header=0)
#Set up the variables needed for calculating sigma2_u and sigma2_epsilon
#n=number of groups
n=len(unit_names)
#T=number of time periods per unit
T=df_panel.shape[0]/n
#N=total number of rows in the panel data set
N=n*T
#k=number of regression variables of the Pooled OLS model
k=len(X_var_names)
plot_against_X_index=0
#Use Seaborn to plot GDP growth over all time periods and across all countries versus gross
# capital formation growth:
sns.scatterplot(x=df_panel[X_var_names[plot_against_X_index]], y=df_panel[y_var_name],
hue=df_panel[unit_col_name], palette=colors).set(title=
'Y-o-Y % Change in per-capita GDP versus Y-o-Y % Change in Gross capital formation')
plt.show()
#Print out the first 30 rows
print(df_panel.head(30))
############### Estimation procedure for the Random Effects model ###############
##################################################################################################
#STEP 1: In this step, using statsmodels, we train a Pooled OLS regression model on the
# panel data set. The Pooled OLSR model is simply an OLS regression model built on the
# flattened (pooled) version of the panel data set:
##################################################################################################
pooled_y=df_panel[y_var_name]
pooled_X=df_panel[X_var_names]
pooled_X = sm.add_constant(pooled_X)
pooled_olsr_model = sm.OLS(endog=pooled_y, exog=pooled_X)
pooled_olsr_model_results = pooled_olsr_model.fit()
print('===============================================================================')
print('============================== Pooled OLSR model ==============================')
print(pooled_olsr_model_results.summary())
print('residuals of the \'Pooled OLSR\' model:')
print(pooled_olsr_model_results.resid)
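#Note: the Pooled OLSR residuals computed above are reused twice later in the script: their
# sum of squares yields sigma2_pooled in STEP 2, and their group-wise means drive the LM test
# at the end of the script.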
##################################################################################################
#STEP 2: Calculation of variance components σ²_ϵ and σ²_u
##################################################################################################
#Build and fit a LSDV model on the panel data set so that we can access its SSE later on
#Create the dummy variables, one for each country
df_dummies = pd.get_dummies(df_panel[unit_col_name])
#Join the dummies Dataframe with the panel data set
df_panel_with_dummies = df_panel.join(df_dummies)
print(df_panel_with_dummies)
#Construct the regression equation for the LSDV model. Note that we are leaving out one dummy
# variable so as to avoid perfect multicollinearity among the 7 dummy variables. The regression
# model's intercept will represent the effect of the omitted (baseline) dummy variable.
lsdv_expr = y_var_name + ' ~ '
i = 0
for X_var_name in X_var_names:
    if i > 0:
        lsdv_expr = lsdv_expr + ' + ' + X_var_name
    else:
        lsdv_expr = lsdv_expr + X_var_name
    i = i + 1
for dummy_name in unit_names[:-1]:
    lsdv_expr = lsdv_expr + ' + ' + dummy_name
print('Regression expression for OLS with dummies=' + lsdv_expr)
#Build and train the LSDV model
lsdv_model = smf.ols(formula=lsdv_expr, data=df_panel_with_dummies)
lsdv_model_results = lsdv_model.fit()
print('===============================================================================')
print('============================== OLSR With Dummies ==============================')
print(lsdv_model_results.summary())
#Calculate sigma-square-epsilon
sigma2_epsilon = lsdv_model_results.ssr/(n*T-(n+k+1))
print('sigma2_epsilon = ' + str(sigma2_epsilon))
#Calculate sigma-square-pooled
sigma2_pooled = pooled_olsr_model_results.ssr/(n*T-(k+1))
print('sigma2_pooled = ' + str(sigma2_pooled))
#Calculate sigma-square-u
sigma2_u = sigma2_pooled - sigma2_epsilon
print('sigma2_u = ' + str(sigma2_u))
#Calculate theta
theta = 1 - math.sqrt(sigma2_epsilon/(sigma2_epsilon + T*sigma2_u))
print('theta = ' + str(theta))
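#theta governs how much of each group mean gets subtracted in STEP 4: as sigma2_u approaches 0,
# theta approaches 0 and the model collapses to Pooled OLS; as T*sigma2_u dominates
# sigma2_epsilon, theta approaches 1 and the model approaches the Fixed Effects (within) estimator.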
##################################################################################################
#STEP 3: Calculating the means of the y_i and X_i values for each group (i.e. each unit i) in the
# data panel
##################################################################################################
df_panel_group_means = df_panel.groupby(unit_col_name).mean()
#Append a column for storing the regression intercept to this Dataframe. We'll use it when
# quasi-demeaning the 'const' column in STEP 4:
df_panel_group_means['const'] = 1.0
#Print out the group means
print(df_panel_group_means)
##################################################################################################
#STEP 4: Calculation of the quasi-demeaned (θ-scaled, mean-subtracted) data panel
##################################################################################################
#Prepare the data set for mean centering the y and X columns:
pooled_y_with_unit_name = pd.concat([df_panel[unit_col_name], pooled_y], axis=1)
pooled_X_with_unit_name = pd.concat([df_panel[unit_col_name], pooled_X], axis=1)
#Center each X value using the θ-scaled group-specific mean:
unit_name = ''
for row_index, row in pooled_X_with_unit_name.iterrows():
    for column_name, cell_value in row.items():
        if column_name == unit_col_name:
            unit_name = pooled_X_with_unit_name.at[row_index, column_name]
        else:
            pooled_X_group_mean = df_panel_group_means.loc[unit_name][column_name]
            pooled_X_with_unit_name.at[row_index, column_name] = pooled_X_with_unit_name.at[
                row_index, column_name] - theta*pooled_X_group_mean
#Center each y value using the θ-scaled group-specific mean:
for row_index, row in pooled_y_with_unit_name.iterrows():
    for column_name, cell_value in row.items():
        if column_name == unit_col_name:
            unit_name = pooled_y_with_unit_name.at[row_index, column_name]
        else:
            pooled_y_group_mean = df_panel_group_means.loc[unit_name][column_name]
            pooled_y_with_unit_name.at[row_index, column_name] = pooled_y_with_unit_name.at[
                row_index, column_name] - theta*pooled_y_group_mean
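#The two loops above can also be expressed in vectorized form using groupby/transform. The
# lines below are only an illustrative sketch (the re_X_check and re_y_check names are
# introduced here and are not used by the rest of the script); they should reproduce the
# quasi-demeaned values computed by the loops:
X_group_means = df_panel.groupby(unit_col_name)[X_var_names].transform('mean')
y_group_means = df_panel.groupby(unit_col_name)[y_var_name].transform('mean')
re_X_check = sm.add_constant(df_panel[X_var_names]) - theta*sm.add_constant(X_group_means)
re_y_check = df_panel[y_var_name] - theta*y_group_means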
##################################################################################################
#STEP 5: Fit an OLS model on the quasi-demeaned data set to obtain the Random Effects estimates
##################################################################################################
#Carve out the y and X matrices:
re_y=pooled_y_with_unit_name[list(pooled_y_with_unit_name.columns[1:])]
re_X=pooled_X_with_unit_name[list(pooled_X_with_unit_name.columns[1:])]
#Build and train the model
re_model = sm.OLS(endog=re_y, exog=re_X)
re_model_results = re_model.fit()
print('===============================================================================')
print('================================== RE Model ===================================')
print(re_model_results.summary())
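#Optional cross-check (assumption: the linearmodels package is installed). linearmodels'
# RandomEffects estimator uses a Swamy-Arora style variance decomposition, so its coefficient
# estimates should be close to, though not necessarily identical to, the ones obtained above:
try:
    from linearmodels.panel import RandomEffects
    df_idx = df_panel.set_index([unit_col_name, time_period_col_name])
    re_check_model = RandomEffects(df_idx[y_var_name], sm.add_constant(df_idx[X_var_names]))
    print(re_check_model.fit())
except ImportError:
    print('linearmodels not installed; skipping the Random Effects cross-check')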
#Calculate the Breusch-Pagan LM statistic to test for the significance of the random effect
df_pooled_olsr_resid_with_unitnames = pd.concat([df_panel[unit_col_name],pooled_olsr_model_results.resid], axis=1)
df_pooled_olsr_resid_group_means = df_pooled_olsr_resid_with_unitnames.groupby(unit_col_name).mean()
ssr_grouped_means=(df_pooled_olsr_resid_group_means[0]**2).sum()
ssr_pooled_olsr=pooled_olsr_model_results.ssr
LM_statistic = (n*T)/(2*(T-1))*math.pow(((T*T*ssr_grouped_means)/ssr_pooled_olsr - 1),2)
print('LM Statistic='+str(LM_statistic))
alpha=0.05
chi2_critical_value=st.chi2.ppf(q=(1.0-alpha), df=1)
print('chi2_critical_value='+str(chi2_critical_value))
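#Interpret the test using the usual rejection rule for the chi-squared(1) LM test: a statistic
# larger than the critical value indicates significant unit-specific (random) effects.
if LM_statistic > chi2_critical_value:
    print('Reject H0: the variance of the unit-specific effect is significantly different from zero.')
else:
    print('Fail to reject H0 at alpha=0.05: no evidence of a significant random effect.')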