Skip to content

Instantly share code, notes, and snippets.

@sachinsdate
Last active December 15, 2023 13:52
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sachinsdate/20620e1bd3d38d7476d894cffb4c8b6b to your computer and use it in GitHub Desktop.
Save sachinsdate/20620e1bd3d38d7476d894cffb4c8b6b to your computer and use it in GitHub Desktop.
A Pooled OLS regression model for panel data sets using Python and statsmodels, alongwith a detailed analysis of its goodness of fit.
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
import statsmodels.graphics.tsaplots as tsap
from statsmodels.compat import lzip
from statsmodels.stats.diagnostic import het_white
from matplotlib import pyplot as plt
import seaborn as sns
#Load the panel data set of World Bank published development indicators into a Pandas Dataframe
df_panel = pd.read_csv('wb_data_panel_2ind_7units_1992_2014.csv', header=0)
#Use Seaborn to plot GDP growth over all time periods and across all countries versus gross
# capital formation growth:
sns.scatterplot(x=df_panel['GCF_GWTH_PCNT'], y=df_panel['GDP_PCAP_GWTH_PCNT'],
hue=df_panel['COUNTRY']).set(title=
'Y-o-Y % Change in per-capita GDP versus Y-o-Y % Change in Gross capital formation')
plt.show()
#Define the variables
y_var_name = 'GDP_PCAP_GWTH_PCNT'
X_var_names = ['GCF_GWTH_PCNT']
#Carve out the pooled Y
pooled_y=df_panel[y_var_name]
#Carve out the pooled X
pooled_X=df_panel[X_var_names]
#Add the placeholder for the regression intercept. When the model is fitted, the coefficient of
# this variable is the regression model's intercept β_0.
pooled_X = sm.add_constant(pooled_X)
#Build the OLS model
pooled_olsr_model = sm.OLS(endog=pooled_y, exog=pooled_X)
#Train the model and fetch the results
pooled_olsr_model_results = pooled_olsr_model.fit()
#Print the results summary
print('===============================================================================')
print('================================= Pooled OLS ==================================')
print(pooled_olsr_model_results.summary())
#Let's plot the Q-Q plot of the residual errors:
sm.qqplot(data=pooled_olsr_model_results.resid, line='45')
plt.show()
#Print out the mean value of the residual errors
print('Mean value of residual errors='+str(pooled_olsr_model_results.resid.mean()))
#Plot the residual errors against X to check for heteroskedasticity of residuals
fig, ax = plt.subplots()
fig.suptitle('Raw residuals of Pooled OLS versus X')
plt.ylabel('Residual (y - mu)')
plt.xlabel('X='+str(X_var_names[0]))
ax.scatter(pooled_X[X_var_names[0]], pooled_olsr_model_results.resid, s=4, c='black', label='Residual Error')
plt.show()
#Confirm presence of heteroskedasticity of residuals using the White test:
keys = ['Lagrange Multiplier statistic:', 'LM test\'s p-value:',
'F-statistic:', 'F-test\'s ' 'p-value:']
results = het_white(resid=pooled_olsr_model_results.resid, exog=pooled_X)
print('Results of the White test for heteroskedasticity of residual errors ===> ')
print(lzip(keys,results))
#plot the residuals against y to see if they are correlated with the response variable
fig, ax = plt.subplots()
fig.suptitle('Raw residuals of Pooled OLS versus y')
plt.ylabel('Residual (y - mu)')
plt.xlabel('y')
ax.scatter(pooled_y, pooled_olsr_model_results.resid, s=4, c='black', label='Residual Error')
plt.show()
#Calculate the Pearson's r between residuals and y
keys = ['Pearson\'s r:', 'p-value:']
results = st.pearsonr(x=pooled_y, y=pooled_olsr_model_results.resid)
print('Results of the Pearson\'s r test of correlation between the residual errors and the '
'response variable y ===>')
print(lzip(keys,results))
#Check if the residuals are auto-correlated
tsap.plot_acf(x=pooled_olsr_model_results.resid)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment