Skip to content

Instantly share code, notes, and snippets.

@halloleo
Created November 15, 2019 00:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save halloleo/575ab08f7474a44260b9d803249762a8 to your computer and use it in GitHub Desktop.
Save halloleo/575ab08f7474a44260b9d803249762a8 to your computer and use it in GitHub Desktop.
Sample a given population many times
"""
Sample a given population many times with a fixed confidence level
"""
from pathlib import Path
import math
import csv
import numpy as np
import pandas as pd
import scipy.stats as stats
CACHE_DIR= 'cache'
#
# Utility functions
#
def in_percent(v):
"""Print value v as a percentage"""
print(f"{v*100:.1f}%")
def calc_50percent_df(n):
# Make list `one_lst` of exactly n/2 random indices
take_out_lst = [ i for i in range(n) ]
one_lst = []
for i in range(n//2):
k = len(take_out_lst)
r = np.random.randint(0,k)
one_lst.append(take_out_lst[r])
del(take_out_lst[r])
# Make list and Dataframe where for the indices from `one_lst` are set to 1.
popl = [0] *n
for i in one_lst:
popl[i]=1
df = pd.DataFrame(popl, columns=['vote'])
return df
def create_50percent_df(n=100000, use_cache=True):
"""
Create a random population with exact 50% of the population
having some feature (e.g. said yes to a question).
:param n: population size
:return: DataFrame
"""
cache_path = Path(f'{CACHE_DIR}/pop_{n}.pkl')
if cache_path.exists() and use_cache:
return pd.read_pickle(cache_path)
else:
df = calc_50percent_df(n)
df.to_pickle(cache_path)
return df
def find_zstar(cl):
"""
Calculate zstar from the confidence level
via zscore in scipy stats package
"""
cdf = 0.5+cl/2
zstar = stats.norm.ppf(cdf)
return zstar
def find_CI(sample, cl, colname='vote', f=lambda x: x==1):
"""
Find the confidence interval for a sample
:param sample: Dataframe with one sample column
:param colname: name of column to look into for "yes votes"
:param f: test function for "yes votes"
:param cl: confidence level
:return: Confidence interval as pandas Interval
"""
n = len(sample)
# Count "yes votes"
x = len(sample[f(sample[colname]) == True])
p_hat = x/n
# Margin of Error
sigma_p_hat = math.sqrt(p_hat * (1 - p_hat) / n)
z_star = find_zstar(cl) # from confidence level via scipy
E = z_star * sigma_p_hat # Margin of Error
return pd.Interval(p_hat-E, p_hat+E, closed='both')
def create_many_CIs(popdf, num_of_samples=100, sample_n=25, cl=0.95):
"""
Create an array of CIs from random samples
:param popdf: the population data
:param num_of_samples: number of samples to generate
:param sample_n: sample size
:param cl: confidence level
:return: ndarray of CIs (pandas Intervals)
"""
res = np.array([])
for i in range(num_of_samples):
df = popdf.sample(sample_n)
df = df.reset_index(drop=True)
CI = find_CI(df, cl)
res = np.append(res, CI)
return res
def test_many_CIs(popdf, p, num_of_samples=100, sample_n=25, cl=0.95):
CIs = create_many_CIs(popdf, num_of_samples, sample_n, cl)
num_of_containing_CIs = len([ CI for CI in CIs if p in CI ])
print(f"num_of_samples = {num_of_samples}")
print(f"num_of_containing_CIs = {num_of_containing_CIs}")
print(f"ratio = {num_of_containing_CIs/num_of_samples}")
def write_growing_CI_numbers(popdf, p, max_num_of_samples=1000, sample_n=50,
cl=0.95, csv_stem='growing_CI_nums'):
print(f"Generate {max_num_of_samples} CIs...", end='')
CIs = create_many_CIs(popdf, max_num_of_samples, sample_n, cl)
print("done")
max_exp = round(math.log(max_num_of_samples, 10))
csv_fname = f'{csv_stem}_cl={cl}_n={sample_n}_N={len(popdf)}_mexp={max_exp}.csv'
print(f"Write {csv_fname} ...", end='')
with open(csv_fname, 'w') as csv_f:
writer = csv.writer(csv_f)
writer.writerow(['num_of_samples', 'num_of_containing_CIs', 'ratio'])
cnt=0
for num_of_samples in [pow(10, e) for e in range(1, max_exp)] + [
max_num_of_samples]:
num_of_containing_CIs = len(
[CI for CI in CIs[:num_of_samples] if p in CI])
ratio = num_of_containing_CIs/num_of_samples
writer.writerow([num_of_samples, num_of_containing_CIs, ratio])
csv_f.flush()
cnt += 1
print(f"done ({cnt} lines written)")
if __name__ == '__main__':
print("--- Population ---")
popdf = create_50percent_df()
print(popdf.describe())
p = len(popdf[popdf.vote == 1]) / len(popdf)
print(f"p = {p}")
print("\n--- Growing samples ---")
write_growing_CI_numbers(popdf, p, max_num_of_samples=10000000, csv_stem='growing_CI_nums')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment