The Fixed Effects regression model is used to estimate the effect of intrinsic characteristics of individuals in a panel data set. This gist builds and trains an FE model on the World Bank dataset available at https://gist.github.com/sachinsdate/c40651e9e4bc13a696780462209f1992.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import scipy.stats as st | |
import statsmodels.api as sm | |
import statsmodels.formula.api as smf | |
from matplotlib import pyplot as plt | |
import seaborn as sns | |
colors_master = ['blue', 'red', 'orange', 'lime', 'yellow', 'cyan', 'violet', 'yellow', | |
'sandybrown', 'silver'] | |
#Define the units (countries) of interest | |
unit_names = ['Belgium', 'CzechRepublic', 'France', 'Ireland', 'Portugal', 'UK', 'USA'] | |
unit_names.sort() | |
colors = colors_master[:len(unit_names)] | |
unit_col_name='COUNTRY' | |
time_period_col_name='YEAR' | |
#Define the y and X variable names | |
y_var_name = 'GDP_PCAP_GWTH_PCNT' | |
X_var_names = ['GCF_GWTH_PCNT'] | |
#Load the panel data set of World Bank published development indicators into a Pandas Dataframe | |
df_panel = pd.read_csv('wb_data_panel_2ind_7units_1992_2014.csv', header=0) | |
print(df_panel.corr()) | |
plot_against_X_index=0 | |
#Use Seaborn to plot GDP growth over all time periods and across all countries versus gross | |
# capital formation growth: | |
sns.scatterplot(x=df_panel[X_var_names[plot_against_X_index]], y=df_panel[y_var_name], | |
hue=df_panel[unit_col_name], palette=colors).set(title= | |
'Y-o-Y % Change in per-capita GDP versus Y-o-Y % Change in Gross capital formation') | |
plt.show() | |
#Print out the first 30 rows | |
print(df_panel.head(30)) | |
#Create the dummy variables, one for each country | |
df_dummies = pd.get_dummies(df_panel[unit_col_name]) | |
#Join the dummies Dataframe with the panel data set | |
df_panel_with_dummies = df_panel.join(df_dummies) | |
print(df_panel_with_dummies) | |
#Construct the regression equation. Note that we are leaving out one dummy variable so as to | |
# avoid perfect multi-colinearity between the 7 dummy variables. The regression model's intercept | |
# will contain the value of the coefficient for the omitted dummy variable. | |
lsdv_expr = y_var_name + ' ~ ' | |
i = 0 | |
for X_var_name in X_var_names: | |
if i > 0: | |
lsdv_expr = lsdv_expr + ' + ' + X_var_name | |
else: | |
lsdv_expr = lsdv_expr + X_var_name | |
i = i + 1 | |
for dummy_name in unit_names[:-1]: | |
lsdv_expr = lsdv_expr + ' + ' + dummy_name | |
print('Regression expression for OLS with dummies=' + lsdv_expr) | |
lsdv_model = smf.ols(formula=lsdv_expr, data=df_panel_with_dummies) | |
lsdv_model_results = lsdv_model.fit() | |
print('===============================================================================') | |
print('============================== OLSR With Dummies ==============================') | |
print(lsdv_model_results.summary()) | |
print('LSDV='+str(lsdv_model_results.ssr)) | |
#Compare the goodness-of-fit of the LSDV FE model with that of the Pooled OLSR model | |
#First build and fit a Poole OLSR model on the panel data set so that we can access it's SSE | |
pooled_y=df_panel[y_var_name] | |
pooled_X=df_panel[X_var_names] | |
pooled_X = sm.add_constant(pooled_X) | |
pooled_olsr_model = sm.OLS(endog=pooled_y, exog=pooled_X) | |
pooled_olsr_model_results = pooled_olsr_model.fit() | |
#Setup the variables for calculating the F-test | |
#n=number of groups | |
n=len(unit_names) | |
#T=number of time periods per unit | |
T=df_panel.shape[0]/n | |
#N=total number of rows in the panel data set | |
N=n*T | |
#k=number of regression variables of the Pooled OLS model | |
k=len(X_var_names)+1 | |
#Get the Residual Sum of Squares for the Pooled OLS model | |
ssr_restricted_model = pooled_olsr_model_results.ssr | |
#Get the Residual Sum of Squares for the Fixed Effects model | |
ssr_unrestricted_model = lsdv_model_results.ssr | |
#Get the degrees of freedom of the Pooled OLSR model | |
k1 = len(pooled_olsr_model_results.params) | |
#Get the degrees of freedom of the Fixed Effects model | |
k2 = len(lsdv_model_results.params) | |
#Calculate the F statistic | |
f_statistic = ((ssr_restricted_model - ssr_unrestricted_model)/ssr_unrestricted_model)*((N-k2)/( | |
k2-k1)) | |
print('F-statistic for FE model='+str(f_statistic)) | |
#Calculate the critical value at alpha=.05 | |
alpha=0.05 | |
f_critical_value=st.f.ppf((1.0-alpha), (k2-k1), (N-k2)) | |
print('F test critical value at alpha=0.05='+str(f_critical_value)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment