import pandas as pd
import numpy as np
from patsy import dmatrices
from matplotlib import pyplot as plt
from scipy.stats import poisson
import scipy.stats as stats
import statsmodels.api as sm
#Read the Takeover Bids data set
df = pd.read_csv('takeover_bids_dataset.csv', header=0, index_col=[0])
#Display the top 15 rows
print(df.head(15))
#Print out the summary statistics for the dependent variable: NUMBIDS
stats_labels = ['Mean NUMBIDS', 'Variance NUMBIDS', 'Minimum NUMBIDS', 'Maximum NUMBIDS']
stats_values = [round(df['NUMBIDS'].mean(), 2), round(df['NUMBIDS'].var(), 2), df['NUMBIDS'].min(), df['NUMBIDS'].max()]
print(dict(zip(stats_labels, stats_values)))
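#Quick sanity check (an illustrative addition, not part of the original gist): for a Poisson
#distributed variable the variance equals the mean (equidispersion), so a variance-to-mean
#ratio far from 1.0 already hints that NUMBIDS may not be Poisson distributed
print('Variance-to-mean ratio of NUMBIDS='+str(round(df['NUMBIDS'].var()/df['NUMBIDS'].mean(), 2)))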
#Create a grouped data set containing the observed frequency of each NUMBIDS value
grouped_df = df.groupby('NUMBIDS').size().to_frame('NUMBIDS_OBS_FREQ')
#Print the grouped dataset
print(grouped_df['NUMBIDS_OBS_FREQ'])
#We see that the frequencies for NUMBIDS >= 5 are very small.
#The Chi-squared test is not very accurate for bins with very small frequencies.
#To get around this issue, we'll pool the frequencies of all NUMBIDS >= 5 into the NUMBIDS=5 bin
grouped_df.at[5, 'NUMBIDS_OBS_FREQ'] = grouped_df.loc[grouped_df.index >= 5, 'NUMBIDS_OBS_FREQ'].sum()
#Drop the rows for NUMBIDS > 5 as the NUMBIDS=5 bin now captures the frequencies of all NUMBIDS >= 5
grouped_df = grouped_df[grouped_df.index <= 5]
#Calculate and store away the observed probabilities of NUMBIDS
grouped_df['NUMBIDS_OBS_PROBA'] = grouped_df['NUMBIDS_OBS_FREQ']/len(df)
#Calculate and store the expected probabilities of NUMBIDS assuming that NUMBIDS are Poisson distributed.
grouped_df['NUMBIDS_POISSON_PMF'] = poisson.pmf(k=grouped_df.index, mu=df['NUMBIDS'].mean())
#For the pooled NUMBIDS >= 5 bin, use the Poisson Survival Function: sf(k=4) = P(NUMBIDS > 4) = P(NUMBIDS >= 5)
grouped_df.at[5, 'NUMBIDS_POISSON_PMF'] = poisson.sf(k=4, mu=df['NUMBIDS'].mean())
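#Sanity check (an illustrative addition): since the >= 5 bin absorbs the entire upper tail of the
#Poisson distribution, the six expected bin probabilities should now sum to ~1.0
print('Sum of expected probabilities='+str(round(grouped_df['NUMBIDS_POISSON_PMF'].sum(), 4)))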
#Calculate the Poisson distributed expected frequency E_i of each NUMBIDS
grouped_df['NUMBIDS_POISSON_FREQ'] = grouped_df['NUMBIDS_POISSON_PMF']*len(df)
#Print the observed and expected values
print(grouped_df[['NUMBIDS_OBS_PROBA', 'NUMBIDS_OBS_FREQ', 'NUMBIDS_POISSON_PMF', 'NUMBIDS_POISSON_FREQ']])
#Plot the O_i and E_i for all i
labels=['0', '1', '2', '3', '4', '>=5']
x = np.arange(len(labels))
width = 0.35
fig, ax = plt.subplots()
ax.set_title('Poisson Predicted versus Observed Frequencies of NUMBIDS')
ax.set_xlabel('NUMBIDS')
ax.set_ylabel('Frequency of NUMBIDS')
ax.set_xticks(x)
ax.set_xticklabels(labels)
bar1 = ax.bar(x=x - width/2, height=list(grouped_df['NUMBIDS_OBS_FREQ']), width=width, label='Observed Frequency')
bar2 = ax.bar(x=x + width/2, height=list(grouped_df['NUMBIDS_POISSON_FREQ']), width=width, label='Expected Frequency')
ax.legend()
fig.tight_layout()
plt.show()
#Now let's calculate the Chi-squared test statistic: Sigma [ (O_i-E_i)^2 / E_i ]
grouped_df['CHI_SQUARED_STAT'] = np.power(grouped_df['NUMBIDS_OBS_FREQ']-grouped_df['NUMBIDS_POISSON_FREQ'], 2)/grouped_df['NUMBIDS_POISSON_FREQ']
chi_squared_value = grouped_df['CHI_SQUARED_STAT'].sum()
print('chi_squared_value='+str(chi_squared_value))
#A Chi-squared goodness of fit test with k bins starts out with k-1 degrees of freedom
total_degrees_of_freedom = len(grouped_df) - 1
#Reduce the degrees of freedom further by p, where p=the number of parameters of the Poisson
#distribution that we estimated from the data (just the mean). So p=1
reduced_degrees_of_freedom = total_degrees_of_freedom - 1
#Get the p-value of the Chi-squared test statistic with k-1-p degrees of freedom
chi_squared_p_value = stats.chi2.sf(x=chi_squared_value, df=reduced_degrees_of_freedom)
#Get the test statistic value corresponding to a critical alpha of 0.05 (95% confidence level)
#We use the Inverse Survival Function for getting this value.
#The Survival Function for any probability distribution gives you the probability of observing a value that is greater than a certain value.
#i.e. S(X=x) = Pr(X > x)
#The Inverse of S(x) gives you the X=x such that the probability of observing any X > x is the given q value (e.g. 0.05 or 5%)
critical_chi_squared_value_at_95p = stats.chi2.isf(q=0.05, df=reduced_degrees_of_freedom)
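#A quick illustration (an addition, not part of the original gist) of the SF/ISF relationship:
#since isf() is the inverse of sf(), sf(isf(q)) returns the original q
q = stats.chi2.sf(x=critical_chi_squared_value_at_95p, df=reduced_degrees_of_freedom)
print('sf(isf(0.05))='+str(round(q, 4))) #should print 0.05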
#Print out all these values
stats_labels=['Degrees of freedom', 'Chi-squared test statistic', 'p-value', 'Maximum allowed Chi-squared test statistic for H0 at alpha=0.05']
stats_values=[reduced_degrees_of_freedom, chi_squared_value, chi_squared_p_value, critical_chi_squared_value_at_95p]
print(dict(zip(stats_labels, stats_values)))
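#Make the test decision explicit (an illustrative addition): reject H0 that NUMBIDS is Poisson
#distributed if the test statistic exceeds the critical value (equivalently, if p-value < 0.05)
if chi_squared_value > critical_chi_squared_value_at_95p:
    print('Reject H0 at alpha=0.05: NUMBIDS does not appear to be Poisson distributed')
else:
    print('Fail to reject H0 at alpha=0.05')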
#Let's also plot the Chi-squared distribution with degrees of freedom=reduced_degrees_of_freedom.
#It will help us visualize how the above values fit into the big picture
plt.xlabel('Chi-squared distributed random number')
plt.ylabel('Chi-squared(df='+str(reduced_degrees_of_freedom)+') probability density')
rv = stats.chi2.rvs(df=reduced_degrees_of_freedom, size=100000)
plt.hist(x=rv, bins=100, density=True, histtype='stepfilled')
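#Mark the test statistic and the critical value on the plot (an illustrative addition)
plt.axvline(x=chi_squared_value, color='red', linestyle='--', label='Chi-squared test statistic')
plt.axvline(x=critical_chi_squared_value_at_95p, color='black', linestyle=':', label='Critical value at alpha=0.05')
plt.legend()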
plt.show()
#Scipy gives you a method to run the Chi-squared test on your data set, so you don't have to do some of the calculations that we did earlier.
#Note: scipy's ddof parameter is the *delta* degrees of freedom: the p-value is computed with
#k-1-ddof degrees of freedom, so we pass ddof=1 (the number of estimated Poisson parameters),
#not the full reduced degrees of freedom.
print(stats.chisquare(f_obs=grouped_df['NUMBIDS_OBS_FREQ'], f_exp=grouped_df['NUMBIDS_POISSON_FREQ'], ddof=1))
#################################################################################
# Using the Chi-squared test to measure the goodness of fit of a regression model
#################################################################################
#Form the regression expression in Patsy format. NUMBIDS is our dependent variable.
#The variables on the RHS are the explanatory a.k.a. regression variables.
#Patsy will automatically add the regression intercept.
expr = 'NUMBIDS ~ LEGLREST + REALREST + FINREST + WHITEKNT + BIDPREM + INSTHOLD + SIZE + SIZESQ + REGULATN'
#Using Patsy, carve out the X and y matrices
y_train, X_train = dmatrices(expr, df, return_type='dataframe')
#Build and fit a Poisson regression model on the training data set
poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
#Print the model summary
print(poisson_training_results.summary())
#Print the predictions of the Poisson model on the training data set. These predictions are stored in the mu variable.
#There are as many predictions as the number of rows in X (or y).
#These mu values are also known as the conditional expectations of the Poisson rate.
#They are expectations because each value is the expected (a.k.a. mean) rate.
#They are conditional because each expected value is conditioned upon the values of the corresponding regression variables.
print(poisson_training_results.mu)
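#Illustrative check (not in the original gist): a Poisson GLM with the canonical log link and an
#intercept reproduces the sample mean, so the average of the fitted conditional means should
#match the observed mean of NUMBIDS
print('Mean of fitted mu='+str(round(np.mean(poisson_training_results.mu), 4))+', observed mean of NUMBIDS='+str(round(df['NUMBIDS'].mean(), 4)))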
#For each value of NUMBIDS in the grouped data set, calculate the Poisson probability of observing
#it using each of the fitted conditional means as the Poisson rate parameter, and average those
#probabilities over all training observations to get the model-implied probability for that bin
def get_poisson_conditional_probability(row):
    num_bids = int(row.name)
    if num_bids < 5:
        return np.mean(poisson.pmf(k=num_bids, mu=poisson_training_results.mu))
    else:
        #As before, sf(k=num_bids-1) gives P(NUMBIDS >= num_bids) for the pooled >=5 bin
        return np.mean(poisson.sf(k=num_bids - 1, mu=poisson_training_results.mu))
grouped_df['PM_NUMBIDS_POISSON_PMF'] = grouped_df.apply(get_poisson_conditional_probability, axis=1)
#Calculate and store the expected frequencies.
grouped_df['PM_NUMBIDS_POISSON_FREQ'] = grouped_df['PM_NUMBIDS_POISSON_PMF']*len(df)
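#Print the observed versus the regression model's expected frequencies (an illustrative addition)
print(grouped_df[['NUMBIDS_OBS_FREQ', 'PM_NUMBIDS_POISSON_FREQ']])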
#Once again run the Chi-squared test of goodness of fit, passing ddof=1 as before
print(stats.chisquare(f_obs=grouped_df['NUMBIDS_OBS_FREQ'], f_exp=grouped_df['PM_NUMBIDS_POISSON_FREQ'], ddof=1))