sachinsdate/pooled_ols_regression_model.py

## pooled_ols_regression_model.py
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
import statsmodels.graphics.tsaplots as tsap
from statsmodels.compat import lzip
from statsmodels.stats.diagnostic import het_white
from matplotlib import pyplot as plt
import seaborn as sns


#Load the panel data set of World Bank published development indicators into a Pandas Dataframe
df_panel = pd.read_csv('wb_data_panel_2ind_7units_1992_2014.csv', header=0)

#Use Seaborn to plot GDP growth over all time periods and across all countries versus gross
# capital formation growth:
sns.scatterplot(x=df_panel['GCF_GWTH_PCNT'], y=df_panel['GDP_PCAP_GWTH_PCNT'],
                hue=df_panel['COUNTRY']).set(title=
                'Y-o-Y % Change in per-capita GDP versus Y-o-Y % Change in Gross capital formation')
plt.show()

#Define the variables
y_var_name = 'GDP_PCAP_GWTH_PCNT'
X_var_names = ['GCF_GWTH_PCNT']

#Carve out the pooled Y
pooled_y=df_panel[y_var_name]
#Carve out the pooled X
pooled_X=df_panel[X_var_names]
#Add the placeholder for the regression intercept. When the model is fitted, the coefficient of
# this variable is the regression model's intercept β_0.
pooled_X = sm.add_constant(pooled_X)
#Build the OLS model
pooled_olsr_model = sm.OLS(endog=pooled_y, exog=pooled_X)
#Train the model and fetch the results
pooled_olsr_model_results = pooled_olsr_model.fit()
#Print the results summary
print('===============================================================================')
print('================================= Pooled OLS ==================================')
print(pooled_olsr_model_results.summary())

#Let's plot the Q-Q plot of the residual errors:
sm.qqplot(data=pooled_olsr_model_results.resid, line='45')
plt.show()

#Print out the mean value of the residual errors
print('Mean value of residual errors='+str(pooled_olsr_model_results.resid.mean()))

#Plot the residual errors against X to check for heteroskedasticity of residuals
fig, ax = plt.subplots()
fig.suptitle('Raw residuals of Pooled OLS versus X')
plt.ylabel('Residual (y - mu)')
plt.xlabel('X='+str(X_var_names[0]))
ax.scatter(pooled_X[X_var_names[0]], pooled_olsr_model_results.resid, s=4, c='black', label='Residual Error')
plt.show()

#Confirm presence of heteroskedasticity of residuals using the White test:
keys = ['Lagrange Multiplier statistic:', 'LM test\'s p-value:',
        'F-statistic:', 'F-test\'s ' 'p-value:']
results = het_white(resid=pooled_olsr_model_results.resid, exog=pooled_X)
print('Results of the White test for heteroskedasticity of residual errors ===> ')
print(lzip(keys,results))

#plot the residuals against y to see if they are correlated with the response variable
fig, ax = plt.subplots()
fig.suptitle('Raw residuals of Pooled OLS versus y')
plt.ylabel('Residual (y - mu)')
plt.xlabel('y')
ax.scatter(pooled_y, pooled_olsr_model_results.resid, s=4, c='black', label='Residual Error')
plt.show()

#Calculate the Pearson's r between residuals and y
keys = ['Pearson\'s r:', 'p-value:']
results = st.pearsonr(x=pooled_y, y=pooled_olsr_model_results.resid)
print('Results of the Pearson\'s r test of correlation between the residual errors and the '
      'response variable y ===>')
print(lzip(keys,results))


#Check if the residuals are auto-correlated
tsap.plot_acf(x=pooled_olsr_model_results.resid)
plt.show()
	import pandas as pd
	import scipy.stats as st
	import statsmodels.api as sm
	import statsmodels.graphics.tsaplots as tsap
	from statsmodels.compat import lzip
	from statsmodels.stats.diagnostic import het_white
	from matplotlib import pyplot as plt
	import seaborn as sns


	#Load the panel data set of World Bank published development indicators into a Pandas Dataframe
	df_panel = pd.read_csv('wb_data_panel_2ind_7units_1992_2014.csv', header=0)

	#Use Seaborn to plot GDP growth over all time periods and across all countries versus gross
	# capital formation growth:
	sns.scatterplot(x=df_panel['GCF_GWTH_PCNT'], y=df_panel['GDP_PCAP_GWTH_PCNT'],
	hue=df_panel['COUNTRY']).set(title=
	'Y-o-Y % Change in per-capita GDP versus Y-o-Y % Change in Gross capital formation')
	plt.show()

	#Define the variables
	y_var_name = 'GDP_PCAP_GWTH_PCNT'
	X_var_names = ['GCF_GWTH_PCNT']

	#Carve out the pooled Y
	pooled_y=df_panel[y_var_name]
	#Carve out the pooled X
	pooled_X=df_panel[X_var_names]
	#Add the placeholder for the regression intercept. When the model is fitted, the coefficient of
	# this variable is the regression model's intercept β_0.
	pooled_X = sm.add_constant(pooled_X)
	#Build the OLS model
	pooled_olsr_model = sm.OLS(endog=pooled_y, exog=pooled_X)
	#Train the model and fetch the results
	pooled_olsr_model_results = pooled_olsr_model.fit()
	#Print the results summary
	print('===============================================================================')
	print('================================= Pooled OLS ==================================')
	print(pooled_olsr_model_results.summary())

	#Let's plot the Q-Q plot of the residual errors:
	sm.qqplot(data=pooled_olsr_model_results.resid, line='45')
	plt.show()

	#Print out the mean value of the residual errors
	print('Mean value of residual errors='+str(pooled_olsr_model_results.resid.mean()))

	#Plot the residual errors against X to check for heteroskedasticity of residuals
	fig, ax = plt.subplots()
	fig.suptitle('Raw residuals of Pooled OLS versus X')
	plt.ylabel('Residual (y - mu)')
	plt.xlabel('X='+str(X_var_names[0]))
	ax.scatter(pooled_X[X_var_names[0]], pooled_olsr_model_results.resid, s=4, c='black', label='Residual Error')
	plt.show()

	#Confirm presence of heteroskedasticity of residuals using the White test:
	keys = ['Lagrange Multiplier statistic:', 'LM test\'s p-value:',
	'F-statistic:', 'F-test\'s ' 'p-value:']
	results = het_white(resid=pooled_olsr_model_results.resid, exog=pooled_X)
	print('Results of the White test for heteroskedasticity of residual errors ===> ')
	print(lzip(keys,results))

	#plot the residuals against y to see if they are correlated with the response variable
	fig, ax = plt.subplots()
	fig.suptitle('Raw residuals of Pooled OLS versus y')
	plt.ylabel('Residual (y - mu)')
	plt.xlabel('y')
	ax.scatter(pooled_y, pooled_olsr_model_results.resid, s=4, c='black', label='Residual Error')
	plt.show()

	#Calculate the Pearson's r between residuals and y
	keys = ['Pearson\'s r:', 'p-value:']
	results = st.pearsonr(x=pooled_y, y=pooled_olsr_model_results.resid)
	print('Results of the Pearson\'s r test of correlation between the residual errors and the '
	'response variable y ===>')
	print(lzip(keys,results))


	#Check if the residuals are auto-correlated
	tsap.plot_acf(x=pooled_olsr_model_results.resid)
	plt.show()