Skip to content

Instantly share code, notes, and snippets.

@johannah
Last active July 7, 2017 18:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save johannah/6d8bd3584e0c498136477740dfbfdd20 to your computer and use it in GitHub Desktop.
Save johannah/6d8bd3584e0c498136477740dfbfdd20 to your computer and use it in GitHub Desktop.
stats for hackers
import matplotlib.pyplot as plt
import numpy as np
# example taken from Jake Vanderplas' talk "Statistics for Hackers"
def is_statistically_significant_shuffle(s1,s2,trials=10000,do_plot=False,null_hyp=.05):
# find difference of means to test null hyp
odiff = np.mean(s1)-np.mean(s2)
# join the lists together
a = np.array(s1+s2)
asize = a.shape[0]
inds = np.zeros(asize)
diffs = []
for t in range(trials):
# set random indexes to zero
inds[:] = 0
# generate random indexes to simulate s2
g2inds = np.random.randint(low=0, high=asize, size=len(s2))
inds[g2inds] = 1
s1sm = np.mean(a[inds==0])
s2sm = np.mean(a[inds==1])
d = s1sm-s2sm
diffs.append(d)
num_greater = np.sum([np.array(diffs)>odiff])
greater_than_null = num_greater/float(trials)
is_significant = greater_than_null<null_hyp
if do_plot:
plt.title("Obs diff: %s s1>s2: %s at p==%s" %(odiff, is_significant, null_hyp))
plt.hist(diffs, bins=50)
plt.axvline(odiff, color='r', linestyle='dashed', linewidth=2)
plt.show()
return is_significant, greater_than_null
def bootstrap_resampling(stack, num_samples=10000):
# idea: simulate the distribution by drawing samples with replacement
# motivation: the data estimates its own distribution
stack = np.array(stack)
stack_size = stack.shape[0]
sample_size = int(stack_size*.5)
xbar = np.zeros(num_samples)
for i in range(num_samples):
ind = np.random.randint(low=0,high=stack_size,size=sample_size)
sample = stack[ind]
xbar[i] = np.mean(sample)
print('mean', np.mean(xbar), '+-', np.std(xbar))
return np.mean(xbar), np.std(xbar)
if __name__ == '__main__':
# test scores. Is s1 smarter than s2?
s1 = [84, 57, 63, 99, 72, 46, 76, 91]
s2 = [81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69]
#is_significant, gtn = is_statistically_significant_shuffle(s1,s2,trials=10000,do_plot=True)
#print("s1 is statistically better than s2:", is_significant)
yurtles = [48,24,51,12,21,41,25,23,32,61,19,24,29,21,23,13,32,18,42,18]
# if given infinite time, what is mean of turtles that yurtle would stack?
# what is uncertainty on that estimate?
bootstrap_resampling(yurtles)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment