johannah/statistics_for_hackers.py

## statistics_for_hackers.py
import matplotlib.pyplot as plt
import numpy as np

# example taken from Jake Vanderplas' talk "Statistics for Hackers"

def is_statistically_significant_shuffle(s1,s2,trials=10000,do_plot=False,null_hyp=.05):
    # find difference of means to test null hyp
    odiff = np.mean(s1)-np.mean(s2)
    # join the lists together
    a = np.array(s1+s2)
    asize = a.shape[0]
    inds = np.zeros(asize)
    diffs = []
    for t in range(trials):
        # set random indexes to zero
        inds[:] = 0
        # generate random indexes to simulate s2
        g2inds = np.random.randint(low=0, high=asize, size=len(s2))
        inds[g2inds] = 1
        s1sm = np.mean(a[inds==0])
        s2sm = np.mean(a[inds==1])
        d = s1sm-s2sm
        diffs.append(d)
    num_greater = np.sum([np.array(diffs)>odiff])
    greater_than_null = num_greater/float(trials)
    is_significant = greater_than_null<null_hyp
    if do_plot:
        plt.title("Obs diff: %s  s1>s2: %s at p==%s" %(odiff, is_significant, null_hyp))
        plt.hist(diffs, bins=50)
        plt.axvline(odiff, color='r', linestyle='dashed', linewidth=2)
        plt.show()

    return is_significant, greater_than_null

def bootstrap_resampling(stack, num_samples=10000):
    # idea: simulate the distribution by drawing samples with replacement
    # motivation: the data estimates its own distribution
    stack = np.array(stack)
    stack_size = stack.shape[0]
    sample_size = int(stack_size*.5)
    xbar = np.zeros(num_samples)
    for i in range(num_samples):
        ind = np.random.randint(low=0,high=stack_size,size=sample_size)
        sample = stack[ind]
        xbar[i] = np.mean(sample)

    print('mean', np.mean(xbar), '+-', np.std(xbar))
    return np.mean(xbar), np.std(xbar)

if __name__ == '__main__':
    # test scores. Is s1 smarter than s2?
    s1 = [84, 57, 63, 99, 72, 46, 76, 91]
    s2 = [81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69]
    #is_significant, gtn = is_statistically_significant_shuffle(s1,s2,trials=10000,do_plot=True)
    #print("s1 is statistically better than s2:", is_significant)

    yurtles = [48,24,51,12,21,41,25,23,32,61,19,24,29,21,23,13,32,18,42,18]
    # if given infinite time, what is mean of turtles that yurtle would stack?
    # what is uncertainty on that estimate?
    bootstrap_resampling(yurtles)
	import matplotlib.pyplot as plt
	import numpy as np

	# example taken from Jake Vanderplas' talk "Statistics for Hackers"

	def is_statistically_significant_shuffle(s1,s2,trials=10000,do_plot=False,null_hyp=.05):
	# find difference of means to test null hyp
	odiff = np.mean(s1)-np.mean(s2)
	# join the lists together
	a = np.array(s1+s2)
	asize = a.shape[0]
	inds = np.zeros(asize)
	diffs = []
	for t in range(trials):
	# set random indexes to zero
	inds[:] = 0
	# generate random indexes to simulate s2
	g2inds = np.random.randint(low=0, high=asize, size=len(s2))
	inds[g2inds] = 1
	s1sm = np.mean(a[inds==0])
	s2sm = np.mean(a[inds==1])
	d = s1sm-s2sm
	diffs.append(d)
	num_greater = np.sum([np.array(diffs)>odiff])
	greater_than_null = num_greater/float(trials)
	is_significant = greater_than_null<null_hyp
	if do_plot:
	plt.title("Obs diff: %s s1>s2: %s at p==%s" %(odiff, is_significant, null_hyp))
	plt.hist(diffs, bins=50)
	plt.axvline(odiff, color='r', linestyle='dashed', linewidth=2)
	plt.show()

	return is_significant, greater_than_null

	def bootstrap_resampling(stack, num_samples=10000):
	# idea: simulate the distribution by drawing samples with replacement
	# motivation: the data estimates its own distribution
	stack = np.array(stack)
	stack_size = stack.shape[0]
	sample_size = int(stack_size*.5)
	xbar = np.zeros(num_samples)
	for i in range(num_samples):
	ind = np.random.randint(low=0,high=stack_size,size=sample_size)
	sample = stack[ind]
	xbar[i] = np.mean(sample)

	print('mean', np.mean(xbar), '+-', np.std(xbar))
	return np.mean(xbar), np.std(xbar)

	if __name__ == '__main__':
	# test scores. Is s1 smarter than s2?
	s1 = [84, 57, 63, 99, 72, 46, 76, 91]
	s2 = [81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69]
	#is_significant, gtn = is_statistically_significant_shuffle(s1,s2,trials=10000,do_plot=True)
	#print("s1 is statistically better than s2:", is_significant)

	yurtles = [48,24,51,12,21,41,25,23,32,61,19,24,29,21,23,13,32,18,42,18]
	# if given infinite time, what is mean of turtles that yurtle would stack?
	# what is uncertainty on that estimate?
	bootstrap_resampling(yurtles)