Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ianozsvald/f373f4278a303bfd5879293831298c45 to your computer and use it in GitHub Desktop.
Save ianozsvald/f373f4278a303bfd5879293831298c45 to your computer and use it in GitHub Desktop.
Take bootstrap samples of an array of items (e.g. strings) to calculate a confidence interval (CI) on the item counts
# take a bootstrap sample to calculate CI on counts of items
def bootstrap_sample_on_array(items, quantiles=(0.025, 0.975), n_bootstrap_samples=1000, random_state=None):
    """Estimate quantiles of per-item counts by bootstrap resampling.

    ``items`` is resampled with replacement ``n_bootstrap_samples`` times, each
    resample having the same length as ``items``. The occurrences of every
    distinct value are counted in each resample and the requested quantiles of
    those counts are returned.

    Args:
        items: Array-like of values (e.g. strings); converted with
            ``np.asarray`` so plain lists are accepted.
        quantiles: Quantiles of the bootstrapped counts to report. The default
            pair brackets a 95% interval.
        n_bootstrap_samples: Number of resamples to draw.
        random_state: Optional seed for a private ``np.random.RandomState``.
            When ``None`` the global ``np.random`` state is used, so callers
            that rely on ``np.random.seed`` keep working.

    Returns:
        A DataFrame with one row per requested quantile and one column per
        distinct value found in ``items``.

    Raises:
        ValueError: If ``items`` is empty.
    """
    items = np.asarray(items)
    n_items = items.shape[0]
    if n_items == 0:
        raise ValueError("items must contain at least one element")

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Every distinct value gets a column, even one that no resample happens
    # to draw (its counts are then all zero).
    labels = np.unique(items)

    all_counts = []
    for _ in range(n_bootstrap_samples):
        sample_ids = rng.randint(low=0, high=n_items, size=n_items)
        uniq, cnts = np.unique(items[sample_ids], return_counts=True)
        all_counts.append(dict(zip(uniq, cnts)))

    # A value absent from a given resample has a count of zero, not NaN.
    all_counts = pd.DataFrame(all_counts).reindex(columns=labels).fillna(0)
    return all_counts.quantile(list(quantiles))
# Demo data: 100 x 'a', 50 x 'b' and 5 x 'c' in a single string array.
items = np.repeat(['a', 'b', 'c'], [100, 50, 5])
# Yields a dataframe holding the 0.025 & 0.975 CI bounds for each item.
bootstrap_sample_on_array(items)
# approximately equivalent to the bootstrap, using a normal approximation
def calculate_ci_on_items(items):
    """Compute a normal-approximation 95% interval on the count of each item.

    For every distinct value in ``items`` the observed proportion ``p`` is
    bracketed by ``p +/- 1.96 * sqrt(p * (1 - p) / n)``, where ``n`` is the
    total number of items, and the bounds are scaled back to counts.

    Args:
        items: Array-like of values (e.g. strings) to count.

    Returns:
        A DataFrame with rows labelled ``'0.025'`` and ``'0.975'`` (strings)
        and one column per distinct value, holding the lower and upper bounds
        on that value's count.
    """
    uniq, cnts = np.unique(items, return_counts=True)
    counts = pd.Series(data=cnts, index=uniq, name='counts')
    total = counts.sum()
    prop = counts / total
    # 1.96 is the two-sided 95% z-score of the standard normal distribution.
    ci_95 = np.sqrt((prop * (1 - prop)) / total) * 1.96
    df_summary = counts.to_frame()
    df_summary['prop'] = prop
    df_summary['prop_0.025'] = prop - ci_95
    df_summary['prop_0.975'] = prop + ci_95
    # Scale the proportion bounds back to counts.
    df_summary['0.025'] = df_summary['prop_0.025'] * total
    df_summary['0.975'] = df_summary['prop_0.975'] * total
    return df_summary.T.loc[['0.025', '0.975']]
calculate_ci_on_items(items)  # expected to give the '0.025' & '0.975' count bounds per item (compare with the bootstrap output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment