# joeleonjr/chi_squared.py

## chi_squared.py
from scipy import stats
import collections
import pandas as pd

# Source of the Lending Club loans data set and the column whose category
# frequencies are tested.
LOANS_DATA_URL = 'https://github.com/Thinkful-Ed/curric-data-001-data-sets/raw/master/loans/loansData.csv'
CATEGORY_COLUMN = 'Open.CREDIT.Lines'

# Chi-squared tests are not considered valid when an observed or expected
# frequency is below 5, so categories rarer than this are dropped.
MIN_FREQUENCY = 5


def category_frequencies(frame, column=CATEGORY_COLUMN):
    """Count how many rows of *frame* fall into each distinct value of *column*.

    Returns a ``collections.Counter`` mapping value -> number of rows.
    """
    return collections.Counter(frame[column])


def drop_rare_categories(frame, column=CATEGORY_COLUMN, min_count=MIN_FREQUENCY):
    """Return the rows of *frame* whose *column* value occurs at least *min_count* times.

    The input frame is not modified; a filtered frame is returned.

    NOTE: this simply discards the rare categories. The original author left
    it as an open question whether that is a sound way to satisfy the
    "no frequency under 5" rule, so treat the result with that caveat.
    """
    counts = category_frequencies(frame, column)
    rare_values = [value for value, count in counts.items() if count < min_count]
    # `~` is the element-wise boolean NOT; it keeps every row that is not rare.
    return frame[~frame[column].isin(rare_values)]


def uniform_chi_squared(frequencies):
    """Run a one-way chi-squared test on the counts in *frequencies*.

    *frequencies* is a mapping of category -> observed count (for example a
    ``Counter``). Only the observed frequencies are supplied, so
    ``scipy.stats.chisquare`` uses its default expectation that all
    categories are equally likely.

    Returns SciPy's result object: the test statistic followed by the p-value.
    """
    # Materialise the counts as a list: on Python 3 ``.values()`` is a view
    # object, not a list, and SciPy needs an array-like of numbers.
    return stats.chisquare(list(frequencies.values()))


if __name__ == "__main__":
    loansData = pd.read_csv(LOANS_DATA_URL)

    loansData.dropna(inplace=True)

    # Test on every category, including the rare ones.
    freq = category_frequencies(loansData)
    print(uniform_chi_squared(freq))

    # Repeat the test after removing categories seen fewer than 5 times.
    loansData = drop_rare_categories(loansData)
    freq = category_frequencies(loansData)
    print(uniform_chi_squared(freq))

# The p-value is the probability of observing a test statistic at least as
# extreme as the one computed, given that the null hypothesis is true. If the
# null hypothesis really held, there would be less than a 5% chance of seeing
# such results; since we did see them, we reject the null hypothesis.
# The closer the p-value is to 0, the more statistically significant the result.

# http://hamelg.blogspot.com/2015/11/python-for-data-analysis-part-25-chi.html
# Great tutorial on manually performing a chi-squared test.