Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Sample Python code that repeatedly shuffles and resamples two populations to compare their mean scores
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline
### scores
# Scores are averaged within each (dba, street, inspection_date) group first,
# and the group means are then averaged, for both populations.
inspection_keys = ['dba', 'street', 'inspection_date']

craft_mean = np.mean(frsq_311.groupby(inspection_keys)['score'].mean())
print(craft_mean)
# Out:
# 12.890173410404625

# Comparison population: manHealth rows whose camis does not occur in frsq_311.
other_rows = manHealth[~manHealth.camis.isin(frsq_311.camis)]
other_mean = np.mean(other_rows.groupby(inspection_keys)['score'].mean())
print(other_mean)
# Out:
# 14.67555658952399

# Observed difference in means (other minus craft).
score_diff = other_mean - craft_mean
score_diff  # bare expression: only displayed when it ends a notebook cell
# Out:
# 1.785383179119366

# NOTE(review): scores_craft and scores_other are not defined anywhere in the
# visible code -- presumably the per-group mean scores of the two populations
# computed above; confirm they are assigned before this point.
print(len(scores_craft))
# Pool both samples into one array, craft scores first, then the others.
scores_all = np.append(scores_craft, scores_other)
print(scores_all)
# Out (array printout omitted; the value shown is len(scores_craft)):
# 177
### shuffle, sample, diff
# Permutation test: repeatedly shuffle the pooled scores, split them into two
# groups with the same sizes as the real samples, and record the difference in
# group means. The spread of those differences shows how large a gap arises
# from random assignment alone.
n_simulations = 10000

# scores_all is built as craft scores followed by the other scores, so the
# split point is the craft sample size (177 in the recorded run) rather than a
# hard-coded number that silently goes stale when the data changes.
n_craft = len(scores_craft)

mean_diff_list = []
for _ in range(n_simulations):
    np.random.shuffle(scores_all)  # shuffles in place; scores_all stays permuted
    x = scores_all[n_craft:]  # stand-in for the "other" group
    y = scores_all[:n_craft]  # stand-in for the "craft" group
    # Same orientation as score_diff (other minus craft).
    mean_diff_list.append(x.mean() - y.mean())

# One-sided empirical p-value: the share of simulated differences that are
# strictly greater than the observed difference.
print(np.sum(np.array(mean_diff_list) > score_diff) / float(len(mean_diff_list)))
# Out:
# 0.0082
### plot
# Histogram of the simulated mean differences, with the observed difference
# marked by a vertical line.
canvas_size = (10, 5)
plt.figure(figsize=canvas_size)
plt.hist(mean_diff_list, bins=50)

# Vertical marker at the observed difference, drawn from y=0 up to y=700.
marker_x = [score_diff, score_diff]
marker_y = [0, 700]
plt.plot(marker_x, marker_y)

plt.title("Ten Thousand Shuffle & Compare Simulations")
plt.xlabel("Difference in Score Means")

# Faint site credit anchored by its bottom-left corner at (-3.5, 10.0).
credit_style = dict(fontsize=12, color='gray', ha='left', va='bottom', alpha=0.3)
plt.text(-3.5, 10.0, 'endlesspint.com', **credit_style)

### save file locally with high(er) resolution
# plt.savefig('img/diff_hist.png', dpi=300)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.