Last active
March 29, 2021 17:01
-
-
Save sachinsdate/cf88838674eeac0d9d113fe4ff64d567 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from patsy import dmatrices | |
from matplotlib import pyplot as plt | |
from scipy.stats import poisson | |
import scipy.stats as stats | |
import statsmodels.api as sm | |
#Read the Takeover Bids data set | |
df = pd.read_csv('takeover_bids_dataset.csv', header=0, index_col=[0]) | |
#Display the top 15 rows | |
df.head(15) | |
#Print out the summary statistics for the dependent variable: NUMBIDS | |
stats_labels = ['Mean NUMBIDS', 'Variance NUMBIDS', 'Minimum NUMBIDS', 'Maximum NUMBIDS'] | |
stats_values = [round(df['NUMBIDS'].mean(), 2), round(df['NUMBIDS'].var(), 2), df['NUMBIDS'].min(), df['NUMBIDS'].max()] | |
print(set(zip(stats_labels, stats_values))) | |
#Create a grouped data set, grouping by frequency of NUMBIDS and plot the histogram | |
grouped_df = df.groupby('NUMBIDS').count().sort_values(by='NUMBIDS') | |
grouped_df['NUMBIDS_OBS_FREQ'] = grouped_df['SIZESQ'] | |
#Print the grouped dataset | |
print(grouped_df['NUMBIDS_OBS_FREQ']) | |
#We see that the frequencies for NUMBIDS >= 5 are very less | |
#The Chi-squared test is not very accurate for bins with very small frequencies. | |
#To get around this issue, we'll group together frequencies for all NUMBIDS >= 5 | |
grouped_df.at[5, 'NUMBIDS_OBS_FREQ'] = 5 | |
#Let's drop the rows for NUMBIDS > 5 as NUMBID=5 captures frequencies for all MUMBIDS >=5 | |
grouped_df = grouped_df.drop([6,7,10]) | |
#Calculate and store away the observed probabilities of NUMBIDS | |
grouped_df['NUMBIDS_OBS_PROBA'] = grouped_df['NUMBIDS_OBS_FREQ']/len(df) | |
#Calculate and store the expected probabilities of NUMBIDS assuming that NUMBIDS are Poisson distributed. | |
grouped_df['NUMBIDS_POISSON_PMF'] = poisson.pmf(k=grouped_df.index, mu=df['NUMBIDS'].mean()) | |
#For NUMBIDS >=5, we will use the Poisson Survival Function which will give us the probability of seeing NUMBIDS >=5 | |
grouped_df.at[5, 'NUMBIDS_POISSON_PMF'] = poisson.sf(k=4, mu=df['NUMBIDS'].mean()) | |
#Calculate the Poisson distributed expected frequency E_i of each NUMBIDS | |
grouped_df['NUMBIDS_POISSON_FREQ'] = grouped_df['NUMBIDS_POISSON_PMF']*len(df) | |
#Print the observed and expected values | |
print(grouped_df[['NUMBIDS_OBS_PROBA', 'NUMBIDS_OBS_FREQ', 'NUMBIDS_POISSON_PMF', 'NUMBIDS_POISSON_FREQ']]) | |
#Plot the O_i and E_i for all i | |
labels=['0', '1', '2', '3', '4', ' >=5'] | |
x = np.arange(len(labels)) | |
width = 0.35 | |
fig, ax = plt.subplots() | |
ax.set_title('Poisson Predicted versus Observed Frequencies of NUMBIDS') | |
ax.set_xlabel('NUMBIDS') | |
ax.set_ylabel('Frequency of NUMBIDS') | |
ax.set_xticks(x) | |
ax.set_xticklabels(labels) | |
bar1 = ax.bar(x=x - width/2, height=list(grouped_df['NUMBIDS_OBS_FREQ']), width=width, label='Observed Frequency') | |
bar2 = ax.bar(x=x + width/2, height=list(grouped_df['NUMBIDS_POISSON_FREQ']), width=width, label='Expected Frequency') | |
ax.legend() | |
fig.tight_layout() | |
plt.show() | |
#Now let's calculate the Chi-squared test statistic: Sigma [ (O_i-E_i)^2 / E_i ] | |
grouped_df['CHI_SQUARED_STAT'] = np.power(grouped_df['NUMBIDS_OBS_FREQ']-grouped_df['NUMBIDS_POISSON_FREQ'], 2)/grouped_df['NUMBIDS_POISSON_FREQ'] | |
chi_squared_value = grouped_df['CHI_SQUARED_STAT'].sum() | |
print('chi_squared_value='+str(chi_squared_value)) | |
total_degrees_of_freedom = len(grouped_df) | |
#Reduce the degrees of freedom by p where p=number of params of the Poisson distribution. So p=1 | |
reduced_degrees_of_freedom = total_degrees_of_freedom - 1 | |
#Get the p-value of the Chi-squared test statistic with N-p degrees of freedom | |
chi_squared_p_value = stats.chi2.sf(x=chi_squared_value, df=reduced_degrees_of_freedom) | |
#Get the test statistic value corresponding to a critical alpha of 0.05 (95% confidence level) | |
#We use the Inverse Survival Function for getting this value. | |
#The Survival Function for any probability distribution gives you the probability of observing a value that is greater than a certain value. | |
#i.e. S(X=x) = Pr(X > x) | |
#The Inverse of S(x) gives you the X=x such that the probability of observing any X > x is the given q value (e.g. 0.05 or 5%) | |
critical_chi_squared_value_at_95p = stats.chi2.isf(q=0.05, df=reduced_degrees_of_freedom) | |
#Print out all these values | |
stats_labels=['Degrees of freedom', 'Chi-squared test statistic', 'p-value', 'Maximum allowed Chi-squared test statistic for H0 at alpha=0.05'] | |
stats_values=[reduced_degrees_of_freedom, chi_squared_value, chi_squared_p_value, critical_chi_squared_value_at_95p] | |
print(set(zip(stats_labels, stats_values))) | |
#Let's also plot the Chi-squared distribution with degrees of freedom=len(df). | |
#It will help us to visualize how the above values fit into the big picture | |
plt.xlabel('Chi-squared distributed random number') | |
plt.ylabel('Chi-squared(df='+str(reduced_degrees_of_freedom)+') probability density') | |
rv = stats.chi2.rvs(df=reduced_degrees_of_freedom, size=100000) | |
plt.hist(x=rv, bins=100, density=True, histtype='stepfilled') | |
plt.show() | |
#Scipy gives you a method to run the Chi-squared test on your data sets. So you don't have to do some of the calculations that we did earlier | |
stats.chisquare(f_obs=grouped_df['NUMBIDS_OBS_FREQ'], f_exp=grouped_df['NUMBIDS_POISSON_FREQ'], ddof=reduced_degrees_of_freedom) | |
################################################################################# | |
# Using the Chi-squared test to measure the goodness of fit of a regression model | |
################################################################################# | |
#Form the regression expression in Patsy format. NUMBIDS is out dependent variable. | |
#The variables on the RHS are the explanatory a.k.a. regression variables. | |
#Patsy will automatically add an intercept of regression. | |
expr = 'NUMBIDS ~ LEGLREST + REALREST + FINREST + WHITEKNT + BIDPREM + INSTHOLD + SIZE + SIZESQ + REGULATN' | |
#Using Patsy, carve out the X and y matrics | |
y_train, X_train = dmatrices(expr, df, return_type='dataframe') | |
#Build and fit a Poisson regression model on the training data set | |
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit() | |
#Print the model summary | |
print(poisson_training_results.summary()) | |
#Print the predictions of the Poisson model on the training data set. These predictions are stored in the mu variable. | |
#There are as many predictions as the number of rows in X (or y). | |
# These mu values are also known as the conditional expectations of the Poisson rate. | |
# They are Expectations because each value is the expected (a.k.a. mean) rate. | |
# They are conditionals because each expected value is conditioned upon the values of the corresponding regression variables. because each value is conditioned | |
print(poisson_training_results.mu) | |
#For each observation y in training set, calculate the Poisson probability of observing this y using each value of | |
#predicted mean y from the Poisson model as the Poisson rate parameter, taking the average of all probabilities calculated | |
# in this manner | |
def get_poisson_conditional_probability(row): | |
num_bids = int(row.name) | |
if num_bids < 5: | |
return np.mean(poisson.pmf(k=num_bids, mu=poisson_training_results.mu)) | |
else: | |
return np.mean(poisson.sf(k=num_bids, mu=poisson_training_results.mu)) | |
grouped_df['PM_NUMBIDS_POISSON_PMF'] = grouped_df.apply(get_poisson_conditional_probability, axis=1) | |
#Calculate and store the expected frequencies. | |
grouped_df['PM_NUMBIDS_POISSON_FREQ'] = grouped_df['PM_NUMBIDS_POISSON_PMF']*len(df) | |
#Once again run the Chi-squared test of goodness of fit. | |
stats.chisquare(f_obs=grouped_df['NUMBIDS_OBS_FREQ'], f_exp=grouped_df['PM_NUMBIDS_POISSON_FREQ'], ddof=reduced_degrees_of_freedom) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment