halloleo/Sample_a_population.py

## Sample_a_population.py
"""
Sample a given population many times with a fixed confidence level
"""

from pathlib import Path
import math
import csv

import numpy as np
import pandas as pd
import scipy.stats as stats

CACHE_DIR= 'cache'

#
# Utility functions
#
def in_percent(v):
    """Print value v as a percentage"""
    print(f"{v*100:.1f}%")

def calc_50percent_df(n):

    # Make list `one_lst` of exactly n/2 random indices
    take_out_lst = [ i for i in range(n) ]
    one_lst = []
    for i in range(n//2):
        k = len(take_out_lst)
        r = np.random.randint(0,k)
        one_lst.append(take_out_lst[r])
        del(take_out_lst[r])

    # Make list and Dataframe where for the indices from  `one_lst` are set to 1.
    popl = [0] *n
    for i in one_lst:
        popl[i]=1
    df = pd.DataFrame(popl, columns=['vote'])
    return df


def create_50percent_df(n=100000, use_cache=True):
    """
    Create a random population with exact 50% of the population
    having some feature (e.g. said yes to a question).
    :param n: population size
    :return: DataFrame
    """
    cache_path = Path(f'{CACHE_DIR}/pop_{n}.pkl')
    if cache_path.exists() and use_cache:
        return pd.read_pickle(cache_path)
    else:
        df = calc_50percent_df(n)
        df.to_pickle(cache_path)
        return df


def find_zstar(cl):
    """
    Calculate zstar from the confidence level
    via zscore in scipy stats package
    """
    cdf = 0.5+cl/2
    zstar = stats.norm.ppf(cdf)


    return zstar


def find_CI(sample, cl, colname='vote', f=lambda x: x==1):
    """
    Find the confidence interval for a sample
    :param sample: Dataframe with one sample column
    :param colname: name of column to look into for "yes votes"
    :param f: test function for "yes votes"
    :param cl: confidence level
    :return: Confidence interval as pandas Interval
    """
    n = len(sample)

    # Count "yes votes"
    x = len(sample[f(sample[colname]) == True])
    p_hat = x/n

    # Margin of Error
    sigma_p_hat = math.sqrt(p_hat * (1 - p_hat) / n)
    z_star = find_zstar(cl) # from confidence level via scipy
    E = z_star * sigma_p_hat  # Margin of Error

    return pd.Interval(p_hat-E, p_hat+E, closed='both')


def create_many_CIs(popdf, num_of_samples=100, sample_n=25, cl=0.95):
    """
    Create an array of CIs from random samples
    :param popdf: the population data
    :param num_of_samples: number of samples to generate
    :param sample_n: sample size
    :param cl: confidence level
    :return: ndarray of CIs (pandas Intervals)
    """
    res = np.array([])
    for i in range(num_of_samples):
        df = popdf.sample(sample_n)
        df = df.reset_index(drop=True)
        CI = find_CI(df, cl)
        res = np.append(res, CI)

    return res


def test_many_CIs(popdf, p, num_of_samples=100, sample_n=25, cl=0.95):
    CIs = create_many_CIs(popdf, num_of_samples, sample_n, cl)
    num_of_containing_CIs = len([ CI for CI in CIs if p in CI ])
    print(f"num_of_samples = {num_of_samples}")
    print(f"num_of_containing_CIs = {num_of_containing_CIs}")
    print(f"ratio = {num_of_containing_CIs/num_of_samples}")


def write_growing_CI_numbers(popdf, p, max_num_of_samples=1000, sample_n=50,
                             cl=0.95, csv_stem='growing_CI_nums'):
    print(f"Generate {max_num_of_samples} CIs...", end='')
    CIs = create_many_CIs(popdf, max_num_of_samples, sample_n, cl)
    print("done")

    max_exp = round(math.log(max_num_of_samples, 10))
    csv_fname = f'{csv_stem}_cl={cl}_n={sample_n}_N={len(popdf)}_mexp={max_exp}.csv'
    print(f"Write {csv_fname} ...", end='')

    with open(csv_fname, 'w') as csv_f:
        writer = csv.writer(csv_f)
        writer.writerow(['num_of_samples', 'num_of_containing_CIs', 'ratio'])
        cnt=0
        for num_of_samples in [pow(10, e) for e in range(1, max_exp)] + [
            max_num_of_samples]:
            num_of_containing_CIs = len(
                [CI for CI in CIs[:num_of_samples] if p in CI])
            ratio = num_of_containing_CIs/num_of_samples
            writer.writerow([num_of_samples, num_of_containing_CIs, ratio])
            csv_f.flush()
            cnt += 1

    print(f"done ({cnt} lines written)")


if __name__ == '__main__':

    print("--- Population ---")
    popdf = create_50percent_df()
    print(popdf.describe())
    p = len(popdf[popdf.vote == 1]) / len(popdf)
    print(f"p = {p}")

    print("\n--- Growing samples ---")
    write_growing_CI_numbers(popdf, p, max_num_of_samples=10000000, csv_stem='growing_CI_nums')
	"""
	Sample a given population many times with a fixed confidence level
	"""

	from pathlib import Path
	import math
	import csv

	import numpy as np
	import pandas as pd
	import scipy.stats as stats

	CACHE_DIR= 'cache'

	#
	# Utility functions
	#
	def in_percent(v):
	"""Print value v as a percentage"""
	print(f"{v*100:.1f}%")

	def calc_50percent_df(n):

	# Make list `one_lst` of exactly n/2 random indices
	take_out_lst = [ i for i in range(n) ]
	one_lst = []
	for i in range(n//2):
	k = len(take_out_lst)
	r = np.random.randint(0,k)
	one_lst.append(take_out_lst[r])
	del(take_out_lst[r])

	# Make list and Dataframe where for the indices from `one_lst` are set to 1.
	popl = [0] *n
	for i in one_lst:
	popl[i]=1
	df = pd.DataFrame(popl, columns=['vote'])
	return df


	def create_50percent_df(n=100000, use_cache=True):
	"""
	Create a random population with exact 50% of the population
	having some feature (e.g. said yes to a question).
	:param n: population size
	:return: DataFrame
	"""
	cache_path = Path(f'{CACHE_DIR}/pop_{n}.pkl')
	if cache_path.exists() and use_cache:
	return pd.read_pickle(cache_path)
	else:
	df = calc_50percent_df(n)
	df.to_pickle(cache_path)
	return df


	def find_zstar(cl):
	"""
	Calculate zstar from the confidence level
	via zscore in scipy stats package
	"""
	cdf = 0.5+cl/2
	zstar = stats.norm.ppf(cdf)


	return zstar


	def find_CI(sample, cl, colname='vote', f=lambda x: x==1):
	"""
	Find the confidence interval for a sample
	:param sample: Dataframe with one sample column
	:param colname: name of column to look into for "yes votes"
	:param f: test function for "yes votes"
	:param cl: confidence level
	:return: Confidence interval as pandas Interval
	"""
	n = len(sample)

	# Count "yes votes"
	x = len(sample[f(sample[colname]) == True])
	p_hat = x/n

	# Margin of Error
	sigma_p_hat = math.sqrt(p_hat * (1 - p_hat) / n)
	z_star = find_zstar(cl) # from confidence level via scipy
	E = z_star * sigma_p_hat # Margin of Error

	return pd.Interval(p_hat-E, p_hat+E, closed='both')


	def create_many_CIs(popdf, num_of_samples=100, sample_n=25, cl=0.95):
	"""
	Create an array of CIs from random samples
	:param popdf: the population data
	:param num_of_samples: number of samples to generate
	:param sample_n: sample size
	:param cl: confidence level
	:return: ndarray of CIs (pandas Intervals)
	"""
	res = np.array([])
	for i in range(num_of_samples):
	df = popdf.sample(sample_n)
	df = df.reset_index(drop=True)
	CI = find_CI(df, cl)
	res = np.append(res, CI)

	return res


	def test_many_CIs(popdf, p, num_of_samples=100, sample_n=25, cl=0.95):
	CIs = create_many_CIs(popdf, num_of_samples, sample_n, cl)
	num_of_containing_CIs = len([ CI for CI in CIs if p in CI ])
	print(f"num_of_samples = {num_of_samples}")
	print(f"num_of_containing_CIs = {num_of_containing_CIs}")
	print(f"ratio = {num_of_containing_CIs/num_of_samples}")


	def write_growing_CI_numbers(popdf, p, max_num_of_samples=1000, sample_n=50,
	cl=0.95, csv_stem='growing_CI_nums'):
	print(f"Generate {max_num_of_samples} CIs...", end='')
	CIs = create_many_CIs(popdf, max_num_of_samples, sample_n, cl)
	print("done")

	max_exp = round(math.log(max_num_of_samples, 10))
	csv_fname = f'{csv_stem}_cl={cl}_n={sample_n}_N={len(popdf)}_mexp={max_exp}.csv'
	print(f"Write {csv_fname} ...", end='')

	with open(csv_fname, 'w') as csv_f:
	writer = csv.writer(csv_f)
	writer.writerow(['num_of_samples', 'num_of_containing_CIs', 'ratio'])
	cnt=0
	for num_of_samples in [pow(10, e) for e in range(1, max_exp)] + [
	max_num_of_samples]:
	num_of_containing_CIs = len(
	[CI for CI in CIs[:num_of_samples] if p in CI])
	ratio = num_of_containing_CIs/num_of_samples
	writer.writerow([num_of_samples, num_of_containing_CIs, ratio])
	csv_f.flush()
	cnt += 1

	print(f"done ({cnt} lines written)")


	if __name__ == '__main__':

	print("--- Population ---")
	popdf = create_50percent_df()
	print(popdf.describe())
	p = len(popdf[popdf.vote == 1]) / len(popdf)
	print(f"p = {p}")

	print("\n--- Growing samples ---")
	write_growing_CI_numbers(popdf, p, max_num_of_samples=10000000, csv_stem='growing_CI_nums')