# parthi2929/SDSP_helper

## SDSP_helper
# SDSP = Sampling Distribution of Sample Proportions
# This is a helper file for programmatic illustrations of SDSP concepts.

from collections import Counter
from random import shuffle

import pandas as pd

def create_bernoulli_population(N, p):
    """
    Build a shuffled Bernoulli population containing exactly N outcomes.

    N - Population size, eg N=10000
    p - probability of the interested outcome, must lie within [0, 1]
    Returns list of 1s and 0s in random order. 1 - indicates the interested
    outcome, 0 - otherwise. The list holds round(p*N) ones; every remaining
    slot is a 0, so the length is always N.
    Raises ValueError if N is negative or p is outside [0, 1].
    """
    size = int(N)
    if size < 0:
        raise ValueError("N must be non-negative, got {!r}".format(N))
    if not 0 <= p <= 1:
        raise ValueError("p must be within [0, 1], got {!r}".format(p))

    # Round instead of truncating: p*N is a float, so e.g. 0.29*100 evaluates
    # to 28.999999999999996 and int() would silently drop one element.
    interested_count = int(round(p * size))

    # Derive the zeros from the remainder so both counts always add up to N.
    population = [1] * interested_count + [0] * (size - interested_count)
    shuffle(population)
    return population

def get_frequency_df(raw_list):
    """
    Given a raw list, this provides the frequency of each distinct item along
    with its probability (relative frequency), sorted by item value.
    Eg:
    x  n(x)  p(x)
    0  4000  0.4
    1  6000  0.6
    If you assume 1 indicates, say a yellow ball, 0 otherwise, then there are 6000 yellow balls
    in given population list, so p(yellow_balls) = 0.6

    raw_list - list of hashable outcomes
    Returns a pandas DataFrame with columns 'x', 'n(x)' and 'p(x)'.
    """
    # Counter tallies every distinct value in a single pass over the list;
    # calling raw_list.count() per element would rescan the list each time.
    counts = Counter(raw_list)

    freq_df = pd.DataFrame(
        {'x': list(counts.keys()), 'n(x)': list(counts.values())},
        columns=['x', 'n(x)'],
    )

    # p(x) is each count divided by the total number of observations.
    total = freq_df['n(x)'].sum()
    freq_df['p(x)'] = freq_df['n(x)'] / total
    return freq_df.sort_values('x')
	# SDSP = Sampling Distribution of Sample Proportions
	# This is a helper file for programmatic illustrations of SDSP concepts.

	from random import shuffle
	import pandas as pd

	def create_bernoulli_population(N, p):
		"""
		Build a shuffled Bernoulli population containing exactly N outcomes.

		N - Population size, eg N=10000
		p - probability of the interested outcome, must lie within [0, 1]
		Returns list of 1s and 0s in random order. 1 - indicates the interested
		outcome, 0 - otherwise. The list holds round(p*N) ones; every remaining
		slot is a 0, so the length is always N.
		Raises ValueError if N is negative or p is outside [0, 1].
		"""
		size = int(N)
		if size < 0:
			raise ValueError("N must be non-negative, got {!r}".format(N))
		if not 0 <= p <= 1:
			raise ValueError("p must be within [0, 1], got {!r}".format(p))

		# Round instead of truncating: p*N is a float, so e.g. 0.29*100 evaluates
		# to 28.999999999999996 and int() would silently drop one element.
		interested_count = int(round(p * size))

		# Derive the zeros from the remainder so both counts always add up to N.
		population = [1] * interested_count + [0] * (size - interested_count)
		shuffle(population)
		return population

	def get_frequency_df(raw_list):
		"""
		Given a raw list, this provides the frequency of each distinct item along
		with its probability (relative frequency), sorted by item value.
		Eg:
		x  n(x)  p(x)
		0  4000  0.4
		1  6000  0.6
		If you assume 1 indicates, say a yellow ball, 0 otherwise, then there are 6000 yellow balls
		in given population list, so p(yellow_balls) = 0.6

		raw_list - list of hashable outcomes
		Returns a pandas DataFrame with columns 'x', 'n(x)' and 'p(x)'.
		"""
		# Counter tallies every distinct value in a single pass over the list;
		# calling raw_list.count() per element would rescan the list each time.
		counts = Counter(raw_list)

		freq_df = pd.DataFrame(
			{'x': list(counts.keys()), 'n(x)': list(counts.values())},
			columns=['x', 'n(x)'],
		)

		# p(x) is each count divided by the total number of observations.
		total = freq_df['n(x)'].sum()
		freq_df['p(x)'] = freq_df['n(x)'] / total
		return freq_df.sort_values('x')