Created
December 4, 2016 19:39
-
-
Save joeleonjr/cf4df48a680fa19e030e5d5f06a95cf2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Chi-square goodness-of-fit test on the number of open credit lines.

Downloads the Lending Club loans data set, counts how many loans fall into
each ``Open.CREDIT.Lines`` category, and tests whether the categories are
equally likely -- once on all categories and once after discarding the
categories that were observed fewer than ``MIN_CATEGORY_COUNT`` times.
"""
import collections

import pandas as pd
from scipy import stats

LOANS_CSV_URL = 'https://github.com/Thinkful-Ed/curric-data-001-data-sets/raw/master/loans/loansData.csv'
CREDIT_LINES_COLUMN = 'Open.CREDIT.Lines'

# Chi-square tests are not considered valid when observed or expected
# frequencies are below about 5, so categories rarer than this are dropped
# before the second test.
MIN_CATEGORY_COUNT = 5


def rare_categories(freq, min_count=MIN_CATEGORY_COUNT):
    """Return the keys of *freq* whose count is strictly below *min_count*.

    Args:
        freq: Mapping of category -> observed count (e.g. a ``Counter``).
        min_count: Smallest count a category needs in order to be kept.

    Returns:
        A list of the too-rare categories, in the mapping's iteration order.
    """
    return [key for key, count in freq.items() if count < min_count]


def uniform_chisquare(freq):
    """Run a chi-square test of *freq* against equally likely categories.

    Only the observed frequencies are supplied; ``scipy.stats.chisquare``
    then defaults the expected frequencies to "all categories equally likely".

    Args:
        freq: Mapping of category -> observed count.

    Returns:
        The ``scipy.stats.chisquare`` result (statistic, p-value).
    """
    # A dict view is not array-like on Python 3, so it must be materialised
    # into a list before SciPy/NumPy can treat it as a numeric array.
    return stats.chisquare(list(freq.values()))


if __name__ == "__main__":
    loansData = pd.read_csv(LOANS_CSV_URL)
    loansData.dropna(inplace=True)

    # First pass: test every category, including the sparsely populated ones.
    freq = collections.Counter(loansData[CREDIT_LINES_COLUMN])
    print(uniform_chisquare(freq))

    # Second pass: drop the rows belonging to rare categories, then re-test.
    # NOTE(review): discarding rare categories changes the population being
    # tested; pooling them into a single "other" bin is an alternative --
    # confirm which one the analysis actually calls for.
    values_to_remove = rare_categories(freq)
    # `~` is the boolean-mask inversion operator (unary `-` is not).
    loansData = loansData[~loansData[CREDIT_LINES_COLUMN].isin(values_to_remove)]
    freq = collections.Counter(loansData[CREDIT_LINES_COLUMN])
    print(uniform_chisquare(freq))

# Interpreting the result:
# The p-value is the probability, assuming the null hypothesis is true, of
# obtaining a test statistic at least as extreme as the one observed. The
# null hypothesis here is that every category is equally likely. If the
# p-value is below the chosen significance level (commonly 0.05), results
# like these would be unlikely under the null hypothesis, so it is rejected.
# The closer the p-value is to 0, the stronger the evidence against it.
#
# A great tutorial on performing a chi-squared test manually:
# http://hamelg.blogspot.com/2015/11/python-for-data-analysis-part-25-chi.html
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment