import pandas as pd
import numpy as np
from patsy import dmatrices
from matplotlib import pyplot as plt
from scipy.stats import poisson
import scipy.stats as stats
import statsmodels.api as sm
#Read the Takeover Bids data set
df = pd.read_csv('takeover_bids_dataset.csv', header=0, index_col=[0])
#Display the top 15 rows
print(df.head(15))
#Print out the summary statistics for the dependent variable: NUMBIDS
stats_labels = ['Mean NUMBIDS', 'Variance NUMBIDS', 'Minimum NUMBIDS', 'Maximum NUMBIDS']
stats_values = [round(df['NUMBIDS'].mean(), 2), round(df['NUMBIDS'].var(), 2), df['NUMBIDS'].min(), df['NUMBIDS'].max()]
print(dict(zip(stats_labels, stats_values)))
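#Quick sanity check (an illustrative addition, not part of the original gist): for a Poisson
#distributed variable the variance equals the mean (equidispersion), so a variance-to-mean
#ratio far from 1.0 already hints that NUMBIDS may not be Poisson distributed
print('Variance-to-mean ratio of NUMBIDS='+str(round(df['NUMBIDS'].var()/df['NUMBIDS'].mean(), 2)))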
#Create a grouped data set containing the observed frequency of each NUMBIDS value
grouped_df = df.groupby('NUMBIDS').size().to_frame('NUMBIDS_OBS_FREQ')
#Print the grouped dataset
print(grouped_df['NUMBIDS_OBS_FREQ'])
#We see that the frequencies for NUMBIDS >= 5 are very small.
#The Chi-squared test is not very accurate for bins with very small frequencies.
#To get around this issue, we'll pool the frequencies of all NUMBIDS >= 5 into the NUMBIDS=5 bin
grouped_df.at[5, 'NUMBIDS_OBS_FREQ'] = grouped_df.loc[grouped_df.index >= 5, 'NUMBIDS_OBS_FREQ'].sum()
#Drop the rows for NUMBIDS > 5 as the NUMBIDS=5 bin now captures the frequencies of all NUMBIDS >= 5
grouped_df = grouped_df[grouped_df.index <= 5]
#Calculate and store away the observed probabilities of NUMBIDS
grouped_df['NUMBIDS_OBS_PROBA'] = grouped_df['NUMBIDS_OBS_FREQ']/len(df)
#Calculate and store the expected probabilities of NUMBIDS assuming that NUMBIDS are Poisson distributed.
grouped_df['NUMBIDS_POISSON_PMF'] = poisson.pmf(k=grouped_df.index, mu=df['NUMBIDS'].mean())
#For the pooled NUMBIDS >= 5 bin, use the Poisson Survival Function: sf(k=4) = P(NUMBIDS > 4) = P(NUMBIDS >= 5)
grouped_df.at[5, 'NUMBIDS_POISSON_PMF'] = poisson.sf(k=4, mu=df['NUMBIDS'].mean())
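#Sanity check (an illustrative addition): since the >= 5 bin absorbs the entire upper tail of the
#Poisson distribution, the six expected bin probabilities should now sum to ~1.0
print('Sum of expected probabilities='+str(round(grouped_df['NUMBIDS_POISSON_PMF'].sum(), 4)))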
#Calculate the Poisson distributed expected frequency E_i of each NUMBIDS
grouped_df['NUMBIDS_POISSON_FREQ'] = grouped_df['NUMBIDS_POISSON_PMF']*len(df)
#Print the observed and expected values
print(grouped_df[['NUMBIDS_OBS_PROBA', 'NUMBIDS_OBS_FREQ', 'NUMBIDS_POISSON_PMF', 'NUMBIDS_POISSON_FREQ']])
#Plot the O_i and E_i for all i
labels=['0', '1', '2', '3', '4', '>=5']
x = np.arange(len(labels))
width = 0.35
fig, ax = plt.subplots()
ax.set_title('Poisson Predicted versus Observed Frequencies of NUMBIDS')
ax.set_xlabel('NUMBIDS')
ax.set_ylabel('Frequency of NUMBIDS')
ax.set_xticks(x)
ax.set_xticklabels(labels)
bar1 = ax.bar(x=x - width/2, height=list(grouped_df['NUMBIDS_OBS_FREQ']), width=width, label='Observed Frequency')
bar2 = ax.bar(x=x + width/2, height=list(grouped_df['NUMBIDS_POISSON_FREQ']), width=width, label='Expected Frequency')
ax.legend()
fig.tight_layout()
plt.show()
#Now let's calculate the Chi-squared test statistic: Sigma [ (O_i-E_i)^2 / E_i ]
grouped_df['CHI_SQUARED_STAT'] = np.power(grouped_df['NUMBIDS_OBS_FREQ']-grouped_df['NUMBIDS_POISSON_FREQ'], 2)/grouped_df['NUMBIDS_POISSON_FREQ']
chi_squared_value = grouped_df['CHI_SQUARED_STAT'].sum()
print('chi_squared_value='+str(chi_squared_value))
#A Chi-squared goodness of fit test with k bins starts out with k-1 degrees of freedom
total_degrees_of_freedom = len(grouped_df) - 1
#Reduce the degrees of freedom further by p, where p=the number of parameters of the Poisson
#distribution that we estimated from the data (just the mean). So p=1
reduced_degrees_of_freedom = total_degrees_of_freedom - 1
#Get the p-value of the Chi-squared test statistic with k-1-p degrees of freedom
chi_squared_p_value = stats.chi2.sf(x=chi_squared_value, df=reduced_degrees_of_freedom)
#Get the test statistic value corresponding to a critical alpha of 0.05 (95% confidence level)
#We use the Inverse Survival Function for getting this value.
#The Survival Function for any probability distribution gives you the probability of observing a value that is greater than a certain value.
#i.e. S(X=x) = Pr(X > x)
#The Inverse of S(x) gives you the X=x such that the probability of observing any X > x is the given q value (e.g. 0.05 or 5%)
critical_chi_squared_value_at_95p = stats.chi2.isf(q=0.05, df=reduced_degrees_of_freedom)
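#A quick illustration (an addition, not part of the original gist) of the SF/ISF relationship:
#since isf() is the inverse of sf(), sf(isf(q)) returns the original q
q = stats.chi2.sf(x=critical_chi_squared_value_at_95p, df=reduced_degrees_of_freedom)
print('sf(isf(0.05))='+str(round(q, 4))) #should print 0.05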
#Print out all these values
stats_labels=['Degrees of freedom', 'Chi-squared test statistic', 'p-value', 'Maximum allowed Chi-squared test statistic for H0 at alpha=0.05']
stats_values=[reduced_degrees_of_freedom, chi_squared_value, chi_squared_p_value, critical_chi_squared_value_at_95p]
print(dict(zip(stats_labels, stats_values)))
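#Make the test decision explicit (an illustrative addition): reject H0 that NUMBIDS is Poisson
#distributed if the test statistic exceeds the critical value (equivalently, if p-value < 0.05)
if chi_squared_value > critical_chi_squared_value_at_95p:
    print('Reject H0 at alpha=0.05: NUMBIDS does not appear to be Poisson distributed')
else:
    print('Fail to reject H0 at alpha=0.05')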
#Let's also plot the Chi-squared distribution with degrees of freedom=reduced_degrees_of_freedom.
#It will help us visualize how the above values fit into the big picture
plt.xlabel('Chi-squared distributed random number')
plt.ylabel('Chi-squared(df='+str(reduced_degrees_of_freedom)+') probability density')
rv = stats.chi2.rvs(df=reduced_degrees_of_freedom, size=100000)
plt.hist(x=rv, bins=100, density=True, histtype='stepfilled')
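#Mark the test statistic and the critical value on the plot (an illustrative addition)
plt.axvline(x=chi_squared_value, color='red', linestyle='--', label='Chi-squared test statistic')
plt.axvline(x=critical_chi_squared_value_at_95p, color='black', linestyle=':', label='Critical value at alpha=0.05')
plt.legend()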
plt.show()
#Scipy gives you a method to run the Chi-squared test on your data set, so you don't have to do some of the calculations that we did earlier.
#Note: scipy's ddof parameter is the *delta* degrees of freedom: the p-value is computed with
#k-1-ddof degrees of freedom, so we pass ddof=1 (the number of estimated Poisson parameters),
#not the full reduced degrees of freedom.
print(stats.chisquare(f_obs=grouped_df['NUMBIDS_OBS_FREQ'], f_exp=grouped_df['NUMBIDS_POISSON_FREQ'], ddof=1))
#################################################################################
# Using the Chi-squared test to measure the goodness of fit of a regression model
#################################################################################
#Form the regression expression in Patsy format. NUMBIDS is our dependent variable.
#The variables on the RHS are the explanatory a.k.a. regression variables.
#Patsy will automatically add the regression intercept.
expr = 'NUMBIDS ~ LEGLREST + REALREST + FINREST + WHITEKNT + BIDPREM + INSTHOLD + SIZE + SIZESQ + REGULATN'
#Using Patsy, carve out the X and y matrices
y_train, X_train = dmatrices(expr, df, return_type='dataframe')
#Build and fit a Poisson regression model on the training data set
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
#Print the model summary
print(poisson_training_results.summary())
#Print the predictions of the Poisson model on the training data set. These predictions are stored in the mu variable.
#There are as many predictions as the number of rows in X (or y).
#These mu values are also known as the conditional expectations of the Poisson rate.
#They are expectations because each value is the expected (a.k.a. mean) rate.
#They are conditional because each expected value is conditioned upon the values of the corresponding regression variables.
print(poisson_training_results.mu)
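#Illustrative check (not in the original gist): a Poisson GLM with the canonical log link and an
#intercept reproduces the sample mean, so the average of the fitted conditional means should
#match the observed mean of NUMBIDS
print('Mean of fitted mu='+str(round(np.mean(poisson_training_results.mu), 4))+', observed mean of NUMBIDS='+str(round(df['NUMBIDS'].mean(), 4)))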
#For each value of NUMBIDS in the grouped data set, calculate the Poisson probability of observing
#it using each of the fitted conditional means as the Poisson rate parameter, and average those
#probabilities over all training observations to get the model-implied probability for that bin
def get_poisson_conditional_probability(row):
    num_bids = int(row.name)
    if num_bids < 5:
        return np.mean(poisson.pmf(k=num_bids, mu=poisson_training_results.mu))
    else:
        #As before, sf(k=num_bids-1) gives P(NUMBIDS >= num_bids) for the pooled >=5 bin
        return np.mean(poisson.sf(k=num_bids - 1, mu=poisson_training_results.mu))
grouped_df['PM_NUMBIDS_POISSON_PMF'] = grouped_df.apply(get_poisson_conditional_probability, axis=1)
#Calculate and store the expected frequencies.
grouped_df['PM_NUMBIDS_POISSON_FREQ'] = grouped_df['PM_NUMBIDS_POISSON_PMF']*len(df)
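#Print the observed versus the regression model's expected frequencies (an illustrative addition)
print(grouped_df[['NUMBIDS_OBS_FREQ', 'PM_NUMBIDS_POISSON_FREQ']])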
#Once again run the Chi-squared test of goodness of fit, passing ddof=1 as before
print(stats.chisquare(f_obs=grouped_df['NUMBIDS_OBS_FREQ'], f_exp=grouped_df['PM_NUMBIDS_POISSON_FREQ'], ddof=1))