Skip to content

Instantly share code, notes, and snippets.

@pranavgade20
Created November 16, 2021 19:56
Show Gist options
  • Save pranavgade20/537bfa6385e4220e482eba8c7b4f348b to your computer and use it in GitHub Desktop.
Save pranavgade20/537bfa6385e4220e482eba8c7b4f348b to your computer and use it in GitHub Desktop.
statistics_toolkit.py
import scipy.stats as stats
import numpy as np
def pearson_correlation_2tail(x: list[int], y: list[int]) -> (float, float):
"""
Calculates the Pearson correlation coefficient and the p-value. double the p val for 1 tailed
:param x: input array 1
:param y: corresponding input 2
:return: (pearson correlation coeff, p value for testing correlation)
"""
return stats.pearsonr(x, y)
def rank_correlation_2tail(x: list[int], y: list[int]) -> (float, float):
"""
Calculates the Spearman rank-order correlation coefficient and the p-value
:param x: input array 1
:param y: corresponding input 2
:return: (spearman correlation coeff, p value for H0 == no correlation)
"""
return stats.spearmanr(x, y)
def spearman_correlation_2tail(x: list[int], y: list[int]) -> (float, float):
"""
Calculates the Spearman rank-order correlation coefficient and the p-value
:param x: input array 1
:param y: corresponding input 2
:return: (spearman correlation coeff, p value for H0 == no correlation)
"""
return rank_correlation_2tail(x, y)
def linear_regression_2tail(x: list[int], y: list[int]):
"""
Calculates the Spearman rank-order correlation coefficient and the p-value
:param x: input array 1
:param y: corresponding input 2
:return: an object with slope, intercept, rvalue(correlation coeff), pvalue(The p-value for a hypothesis test whose null hypothesis is that the slope is zero, using Wald Test with t-distribution of the test statistic. See alternative above for alternative hypotheses.), etc
"""
return stats.linregress(x, y)
def contingency_table_chi2_test_pval(x: list[int, int]) -> float:
"""
Chi-square test of independence of variables in a contingency table.
:param x: n*m table of values in table
:return: p value of this test
"""
return stats.chi2_contingency(x)[1]
def goodness_fit_chi2(x) -> None:
"""see stats.chisquare https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html"""
raise NotImplementedError
def confidence_interval_mean_normal_2sided(alpha: float, array: list[int], parent_sigma=None):
"""confidence interval for parent distribution of array"""
if parent_sigma is None:
return stats.norm.interval(alpha, loc=np.mean(array), scale=np.std(array) / np.sqrt(len(array)))
else:
return stats.norm.interval(alpha, loc=np.mean(array), scale=parent_sigma)
def confidence_interval_mean_t_distribution_2sided(alpha: float, array: list[int]):
"""confidence interval, use this when sample size is small/parent sigma is unknown"""
return stats.t.interval(
alpha,
len(array) - 1,
loc=np.mean(array),
scale=np.std(array) * np.sqrt(len(array)) / np.sqrt(len(array) - 1))
def confidence_interval_difference_between_means_normal(alpha, x1, x2, sigma1, sigma2, n1, n2):
"""
gives (1-alpha)100% confidence interval for mu1-mu2, the difference between means of parent distributions
assume parent distributions are normal
:param alpha: desired confidence level
:param x1: observed mean of values from 1st distribution
:param x2: observed mean of values from 2nd distribution
:param sigma1: standard deviation of parent of 1st distribution
:param sigma2: standard deviation of parent of 2nd distribution
:param n1: number of values in 1st distribution
:param n2: number of values in 3nd distribution
:return: confidence interval
"""
return stats.norm.interval(alpha, loc=(x1 - x2), scale=(((sigma1 ** 2) / n1) + ((sigma2 ** 2) / n2)))
def confidence_interval_difference_between_means_t_distribution(alpha, x1, x2, sigma1, sigma2, n1, n2):
"""
use when parent sigma is not known
gives (1-alpha)100% confidence interval for mu1-mu2, the difference between means of parent distributions
assume parent distributions are normal
:param alpha: desired confidence level
:param x1: observed mean of values from 1st distribution
:param x2: observed mean of values from 2nd distribution
:param sigma1: observed standard deviation of values from 1st distribution
:param sigma2: observed standard deviation of values from 2nd distribution
:param n1: number of values in 1st distribution
:param n2: number of values in 2nd distribution
:return: confidence interval
"""
sigma1 = (sigma1 * np.sqrt(n1)) / np.sqrt(n1 - 1)
sigma2 = (sigma2 * np.sqrt(n2)) / np.sqrt(n2 - 1)
sp = np.sqrt((((n1 - 1) * sigma1 ** 2) + ((n2 - 1) * sigma2 ** 2)) / (n1 + n2 - 2))
return stats.t.interval(alpha, n1 + n2 - 2, loc=(x1 - x2), scale=(sp * np.sqrt((1 / n1) + (1 / n2))))
def confidence_interval_difference_between_means_t_distribution_from_array(alpha, array1, array2):
"""
use when parent sigma is not known
gives (1-alpha)100% confidence interval for mu1-mu2, the difference between means of parent distributions
assume parent distributions are normal
:param alpha: desired confidence level
:param array2: observations from 1st distribution of length n
:param array1: observations from 2nd distribution of length m
:return: confidence interval
"""
return confidence_interval_difference_between_means_t_distribution(
alpha, np.mean(array1), np.mean(array1), np.std(array1), np.std(array2), len(array1), len(array2))
def confidence_interval_binomial_theta(alpha: float, x: float, n: float):
"""
when number of successes in binomial trials in known, this estimates the parameter theta
:param alpha: desired confidence level
:param x: number of successes
:param n: number of trials
:return: confidence interval
"""
t = x / n
return stats.norm.interval(alpha, loc=t, scale=np.sqrt((t * (1 - t)) / n))
def confidence_interval_binomial_difference_between_theta(alpha: float, x1: float, x2: float, n1: float, n2: float):
"""
when number of successes in two binomial trials in known, this estimates the difference between thetas
:param alpha: desired confidence level
:param x1: number of successes in 1st distribution
:param x2: number of successes in 2nd distribution
:param n1: number of trials in 1st distribution
:param n2: number of trials in 2nd distribution
:return: confidence interval
"""
t1 = x1 / n1
t2 = x2 / n2
return stats.norm.interval(alpha, loc=t1 - t2, scale=np.sqrt(((t1 * (1 - t1)) / n1) + ((t2 * (1 - t2)) / n2)))
def confidence_interval_variance(alpha: float, sigma: float, n: float):
"""
estimate parent standard deviation given sigma from n samples
:param alpha: desired confidence level
:param sigma: observed standard deviation from n samples
:param n: number of samples
:return: confidence interval
"""
left = ((n-1)*(sigma**2))/(stats.chi2.interval(alpha, df=n-1)[0])
right = ((n-1)*(sigma**2))/(stats.chi2.interval(alpha, df=n-1)[1])
return np.sqrt(left), np.sqrt(right)
def confidence_interval_variance_from_array(alpha: float, array: list[float]):
"""
estimate parent standard deviation given sigma from n samples
:param alpha: desired confidence level
:param array: array of samples
:return: confidence interval
"""
return confidence_interval_variance(alpha, np.std(array), len(array))
def confidence_interval_variance_ratio(alpha: float, s1: float, s2: float, n1: float, n2: float):
"""
estimate the ratio of standard deviations of two distributions, given observed standard deviations
:param alpha: desired confidence level
:param s1: observed standard deviation in first distribution
:param s2: observed standard deviation in second distribution
:param n1: number of samples in first distribution
:param n2: number of samples in second distribution
:return: confidence interval
"""
left = (s1**2/s2**2) * (1/stats.f.interval(alpha, dfn=n1-1, dfd=n2-1))
right = (s1**2/s2**2) * (stats.f.interval(alpha, dfn=n1-1, dfd=n2-1))
return np.sqrt(left), np.sqrt(right)
def confidence_interval_variance_ratio_from_arrays(alpha: float, array1: list[float], array2: list[float]):
"""
estimate the ratio of standard deviations of two distributions, given observed values
:param alpha: desired confidence level
:param array1: observed standard deviation in first distribution
:param array2: observed standard deviation in second distribution
:return: confidence interval
"""
return confidence_interval_variance_ratio(alpha, np.std(array1), np.std(array2), len(array1), len(array2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment