Last active
September 8, 2018 17:57
-
-
Save parthi2929/716ebdb9a6d952b9d8c61031dde209b1 to your computer and use it in GitHub Desktop.
helper files for visualizing and performing statistical analysis on confidence intervals
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from random import shuffle | |
import matplotlib.pyplot as plt | |
from SDSPSM import get_metrics | |
from random import choices, seed | |
from math import sqrt, pi | |
def create_bernoulli_population(T, p): | |
y_freq = int(p*T) | |
y_pops = [1]*y_freq | |
o_freq = int((1-p)*T) | |
o_pops = [0]*o_freq | |
population = y_pops + o_pops | |
population_freq = [o_freq, y_freq] | |
shuffle(population) | |
return population, population_freq | |
from random import random | |
from math import floor | |
def createRandomPopulation(N, freqMax): | |
""" | |
Create a random distribution for N values. This is to illustrate Sampling Distribution of Sample Means. | |
""" | |
population = [] | |
population_freq = [] | |
for i in range(0,N): | |
temp_freq = (floor(random() * freqMax)) # random frequency for each population | |
temp_list = [i]*temp_freq | |
population += temp_list | |
population_freq.append(temp_freq) | |
shuffle(population) | |
return population, population_freq | |
def mini_plot_SDSP(raw_list, ax1,ax2,ax3, norm_off=False, width=0.1): | |
ax1.hist(raw_list) | |
ax1.set_title('Distribution') | |
_, bins,_ = ax2.hist(raw_list, density=True) | |
ax2.set_title('Density') | |
# probability mass | |
dummy_dict = {i:raw_list.count(i) for i in raw_list} | |
total = sum(list(dummy_dict.values())) | |
pmf = {key: round(val/total,4) for key, val in dummy_dict.items()} | |
ax3.bar(list(pmf.keys()), list(pmf.values()), width=width) | |
ax3.set_title('PMF') | |
mu, var, sigma = get_metrics(raw_list) | |
metrics_text = '$\mu_x:{}$ \n$\sigma_x:{}$'.format(mu, sigma) | |
ax2.text(0.97, 0.98,metrics_text,ha='right', va='top',transform = ax2.transAxes,fontsize=10,color='red') | |
# normal approx overlay if needed | |
if norm_off == False: # so user wants normal curve overlay | |
import numpy as np | |
X = np.linspace(min(bins),max(bins),10*len(bins)) | |
from math import sqrt, pi | |
Cp = 1/(sigma*sqrt(2*pi)) | |
Ep = -1/2*((X-mu)/sigma)**2 | |
G = Cp*np.exp(Ep) | |
ax2.plot(X, G, color='red') | |
def mini_plot_SDSM(raw_list,ax1,ax2,ax3, bins, width=0.5): | |
ax1.hist(raw_list, bins) | |
ax1.set_title('Distribution') | |
n, bins,_ = ax2.hist(raw_list, bins, density=True) | |
ax2.set_title('Density') | |
# probability mass | |
dummy_dict = {i:raw_list.count(i) for i in raw_list} | |
total = sum(list(dummy_dict.values())) | |
pmf = {key: round(val/total,4) for key, val in dummy_dict.items()} | |
ax3.bar(list(pmf.keys()), list(pmf.values()), width=width) | |
ax3.set_title('PMF') | |
def sample_with_CI(N, n, population, sigma=1, mode='z'): | |
""" | |
N - no of trials/experiments | |
n - sample size | |
sigma - population SD (needed in z mode) | |
""" | |
Y_hat = [] | |
Y_mean_list = [] | |
CI_list = [] | |
for each_experiment in range(N): | |
Y_hat = choices(population, k=n) | |
Y_mean = sum(Y_hat)/len(Y_hat) | |
if mode == 'z': # then use population SD sigma, not typically practical | |
c = 1.96 # which comprises of 95% of data points in std. normal distribution | |
Y_sigma = sigma | |
CI_err = round(c*Y_sigma,4) | |
else: # t distribution, use unbiased variance | |
from scipy import stats | |
c = stats.t.ppf(1-0.025, n-1) # t value varies depending on degrees of freedom | |
Y_variance = sum([(y - Y_mean)**2 for y in Y_hat])/(n-1) # unbiased estimator | |
Y_sigma = round(sqrt(Y_variance), 4) | |
CI_err = round(c*(Y_sigma/sqrt(n)),4) | |
CI_list.append((Y_mean, CI_err)) | |
Y_mean_list.append(Y_mean) | |
return Y_mean_list, CI_list | |
def plot_ci_accuracy_1(ax, CI_list, mu): | |
mean, err = zip(*CI_list) | |
index = range(0,len(mean)) | |
err_count = 0 | |
# each CI interval, check if it contains mu or not | |
for each_ci in index: | |
each_mid = mean[each_ci] | |
each_err = err[each_ci] | |
low_err = each_mid - each_err | |
hig_err = each_mid + each_err | |
c = 'C0' | |
if (hig_err <= mu) or (low_err >= mu): # outliers | |
c = 'C1' | |
err_count += 1 | |
ax.errorbar(each_ci,each_mid, yerr=each_err, fmt='o', color=c) | |
# cosmetics | |
ax.axhline(y=mu, color='r') | |
ax.set_xticks(index) | |
ax.xaxis.grid(True, alpha=0.3) | |
accuracy = (1-round(err_count/len(mean),4))*100 | |
print('CI containing pop.mean:{}%'.format(accuracy)) | |
return accuracy | |
def get_ci_accuracy(CI_list, mu): | |
mean, err = zip(*CI_list) | |
index = range(0,len(mean)) | |
err_count = 0 | |
# each CI interval, check if it contains mu or not | |
for each_ci in index: | |
each_mid = mean[each_ci] | |
each_err = err[each_ci] | |
low_err = each_mid - each_err | |
hig_err = each_mid + each_err | |
if (hig_err <= mu) or (low_err >= mu): # outliers | |
err_count += 1 | |
accuracy = (1-round(err_count/len(mean),4))*100 | |
return accuracy | |
from random import sample | |
def repeated_experiments_with_CI(population,mu, sigma, N_list=[], n_list=[], mode=1, dist=0, format='b'): | |
""" | |
population - raw population from which sample to be taken | |
mu, sigma - population parameters | |
C - 85% CI constant (for eg, 1.96 or 2.093 etc) | |
N_list - Experiment size or no of times experiments to be conducted per trial | |
n_list - Sample size or no of samples per experiment | |
Mode 1 - use population SD | |
Mode 2 - use unbiased sample SD | |
Mode 3 - usebiased sample SD | |
dist - 0 - use Z distribution | |
dist - 1 - use t distribution | |
""" | |
accuracy_list = [] | |
for each_N in N_list: # no of experiments | |
for each_n in n_list: # sample size for each experiment | |
err_count = 0 | |
for each_E in range(each_N): # for each experiment of experiment size | |
Y_hat = sample(population, k=each_n) # pick n samples | |
Y_mean = sum(Y_hat)/len(Y_hat) | |
if dist == 0: | |
C = 1.96 | |
elif dist == 1: | |
from scipy import stats | |
C = stats.t.ppf(1-0.025, each_n-1) # t value varies depending on degrees of freedom which is (no of samples - 1) | |
if mode == 1: | |
Y_sigma = sigma | |
CI_err = round(C*Y_sigma,4) # if population SD, there is no sample size in denominator | |
elif mode == 2: | |
Y_variance = sum([(y - Y_mean)**2 for y in Y_hat])/(each_n-1) # unbiased estimator | |
Y_sigma = round(sqrt(Y_variance), 4) | |
CI_err = round(C*(Y_sigma/sqrt(each_n)),4) | |
elif mode == 3: | |
Y_variance = sum([(y - Y_mean)**2 for y in Y_hat])/(each_n) # biased estimator | |
Y_sigma = round(sqrt(Y_variance), 4) | |
CI_err = round(C*(Y_sigma/sqrt(each_n)),4) | |
else: | |
raise ValueError('Wrong mode') | |
low_err = Y_mean - CI_err | |
hig_err = Y_mean + CI_err | |
#print(CI_err,Y_mean,low_err,hig_err,mu) | |
if (hig_err <= mu) or (low_err >= mu): | |
err_count += 1 | |
accuracy = round((1-err_count/each_N)*100,4) | |
success = 0 if accuracy < 95 else 1 | |
if format == 'b': | |
accuracy_list.append((each_N, each_n, success)) | |
else: # anything other than b | |
accuracy_list.append((each_N, each_n, accuracy)) | |
return accuracy_list | |
def get_mode_label(mode): | |
mode_labels={ | |
1:"population SD", | |
2:"unbiased sample SD", | |
3:"biased sample SD", | |
4:"Wilson Score method" | |
} | |
return mode_labels.get(mode, "invalid mode") | |
def get_dist_label(dist): | |
dist_labels={ | |
0:"Z distribution", | |
1:"T distribution" | |
} | |
return dist_labels.get(dist, "invalid dist") | |
def plot_ci_accuracy_2(ax, accuracy_list): | |
x,y,z=zip(*accuracy_list) | |
labels = ['$< 95\%$', '$ \geq 95\%$'] | |
colors = ['red','green'] | |
for xi, yi, zi in zip(x,y,z): | |
if zi == 1: | |
s = ax.scatter(yi, xi, c=colors[zi], label=labels[zi], s=30, edgecolors='None', alpha=0.75) | |
else: | |
f = ax.scatter(yi, xi, c=colors[zi], label=labels[zi], s=30, edgecolors='None', alpha=0.75) | |
ax.set_ylabel('Experiment Size $N$') | |
ax.set_xlabel('Sample Size $n$') | |
#ax.legend((s,f),(labels[1],labels[0]), loc='lower right', scatterpoints=1) | |
xmin = 0 | |
xmax = ax.get_xlim()[1] | |
from math import ceil,floor | |
xminint = floor(xmin) | |
xmaxint = ceil(xmax) | |
xint = range(xminint, xmaxint, 50) | |
ax.set_xticks(xint) | |
ax.xaxis.set_tick_params(labelsize=7) | |
for tick in ax.get_xticklabels(): | |
tick.set_rotation(90) | |
#sdg |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment