Created
November 16, 2021 19:56
-
-
Save pranavgade20/537bfa6385e4220e482eba8c7b4f348b to your computer and use it in GitHub Desktop.
statistics_toolkit.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scipy.stats as stats | |
import numpy as np | |
def pearson_correlation_2tail(x: list[int], y: list[int]) -> (float, float): | |
""" | |
Calculates the Pearson correlation coefficient and the p-value. double the p val for 1 tailed | |
:param x: input array 1 | |
:param y: corresponding input 2 | |
:return: (pearson correlation coeff, p value for testing correlation) | |
""" | |
return stats.pearsonr(x, y) | |
def rank_correlation_2tail(x: list[int], y: list[int]) -> (float, float): | |
""" | |
Calculates the Spearman rank-order correlation coefficient and the p-value | |
:param x: input array 1 | |
:param y: corresponding input 2 | |
:return: (spearman correlation coeff, p value for H0 == no correlation) | |
""" | |
return stats.spearmanr(x, y) | |
def spearman_correlation_2tail(x: list[int], y: list[int]) -> (float, float): | |
""" | |
Calculates the Spearman rank-order correlation coefficient and the p-value | |
:param x: input array 1 | |
:param y: corresponding input 2 | |
:return: (spearman correlation coeff, p value for H0 == no correlation) | |
""" | |
return rank_correlation_2tail(x, y) | |
def linear_regression_2tail(x: list[int], y: list[int]): | |
""" | |
Calculates the Spearman rank-order correlation coefficient and the p-value | |
:param x: input array 1 | |
:param y: corresponding input 2 | |
:return: an object with slope, intercept, rvalue(correlation coeff), pvalue(The p-value for a hypothesis test whose null hypothesis is that the slope is zero, using Wald Test with t-distribution of the test statistic. See alternative above for alternative hypotheses.), etc | |
""" | |
return stats.linregress(x, y) | |
def contingency_table_chi2_test_pval(x: list[int, int]) -> float: | |
""" | |
Chi-square test of independence of variables in a contingency table. | |
:param x: n*m table of values in table | |
:return: p value of this test | |
""" | |
return stats.chi2_contingency(x)[1] | |
def goodness_fit_chi2(x) -> None: | |
"""see stats.chisquare https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html""" | |
raise NotImplementedError | |
def confidence_interval_mean_normal_2sided(alpha: float, array: list[int], parent_sigma=None): | |
"""confidence interval for parent distribution of array""" | |
if parent_sigma is None: | |
return stats.norm.interval(alpha, loc=np.mean(array), scale=np.std(array) / np.sqrt(len(array))) | |
else: | |
return stats.norm.interval(alpha, loc=np.mean(array), scale=parent_sigma) | |
def confidence_interval_mean_t_distribution_2sided(alpha: float, array: list[int]): | |
"""confidence interval, use this when sample size is small/parent sigma is unknown""" | |
return stats.t.interval( | |
alpha, | |
len(array) - 1, | |
loc=np.mean(array), | |
scale=np.std(array) * np.sqrt(len(array)) / np.sqrt(len(array) - 1)) | |
def confidence_interval_difference_between_means_normal(alpha, x1, x2, sigma1, sigma2, n1, n2): | |
""" | |
gives (1-alpha)100% confidence interval for mu1-mu2, the difference between means of parent distributions | |
assume parent distributions are normal | |
:param alpha: desired confidence level | |
:param x1: observed mean of values from 1st distribution | |
:param x2: observed mean of values from 2nd distribution | |
:param sigma1: standard deviation of parent of 1st distribution | |
:param sigma2: standard deviation of parent of 2nd distribution | |
:param n1: number of values in 1st distribution | |
:param n2: number of values in 3nd distribution | |
:return: confidence interval | |
""" | |
return stats.norm.interval(alpha, loc=(x1 - x2), scale=(((sigma1 ** 2) / n1) + ((sigma2 ** 2) / n2))) | |
def confidence_interval_difference_between_means_t_distribution(alpha, x1, x2, sigma1, sigma2, n1, n2): | |
""" | |
use when parent sigma is not known | |
gives (1-alpha)100% confidence interval for mu1-mu2, the difference between means of parent distributions | |
assume parent distributions are normal | |
:param alpha: desired confidence level | |
:param x1: observed mean of values from 1st distribution | |
:param x2: observed mean of values from 2nd distribution | |
:param sigma1: observed standard deviation of values from 1st distribution | |
:param sigma2: observed standard deviation of values from 2nd distribution | |
:param n1: number of values in 1st distribution | |
:param n2: number of values in 2nd distribution | |
:return: confidence interval | |
""" | |
sigma1 = (sigma1 * np.sqrt(n1)) / np.sqrt(n1 - 1) | |
sigma2 = (sigma2 * np.sqrt(n2)) / np.sqrt(n2 - 1) | |
sp = np.sqrt((((n1 - 1) * sigma1 ** 2) + ((n2 - 1) * sigma2 ** 2)) / (n1 + n2 - 2)) | |
return stats.t.interval(alpha, n1 + n2 - 2, loc=(x1 - x2), scale=(sp * np.sqrt((1 / n1) + (1 / n2)))) | |
def confidence_interval_difference_between_means_t_distribution_from_array(alpha, array1, array2): | |
""" | |
use when parent sigma is not known | |
gives (1-alpha)100% confidence interval for mu1-mu2, the difference between means of parent distributions | |
assume parent distributions are normal | |
:param alpha: desired confidence level | |
:param array2: observations from 1st distribution of length n | |
:param array1: observations from 2nd distribution of length m | |
:return: confidence interval | |
""" | |
return confidence_interval_difference_between_means_t_distribution( | |
alpha, np.mean(array1), np.mean(array1), np.std(array1), np.std(array2), len(array1), len(array2)) | |
def confidence_interval_binomial_theta(alpha: float, x: float, n: float): | |
""" | |
when number of successes in binomial trials in known, this estimates the parameter theta | |
:param alpha: desired confidence level | |
:param x: number of successes | |
:param n: number of trials | |
:return: confidence interval | |
""" | |
t = x / n | |
return stats.norm.interval(alpha, loc=t, scale=np.sqrt((t * (1 - t)) / n)) | |
def confidence_interval_binomial_difference_between_theta(alpha: float, x1: float, x2: float, n1: float, n2: float): | |
""" | |
when number of successes in two binomial trials in known, this estimates the difference between thetas | |
:param alpha: desired confidence level | |
:param x1: number of successes in 1st distribution | |
:param x2: number of successes in 2nd distribution | |
:param n1: number of trials in 1st distribution | |
:param n2: number of trials in 2nd distribution | |
:return: confidence interval | |
""" | |
t1 = x1 / n1 | |
t2 = x2 / n2 | |
return stats.norm.interval(alpha, loc=t1 - t2, scale=np.sqrt(((t1 * (1 - t1)) / n1) + ((t2 * (1 - t2)) / n2))) | |
def confidence_interval_variance(alpha: float, sigma: float, n: float): | |
""" | |
estimate parent standard deviation given sigma from n samples | |
:param alpha: desired confidence level | |
:param sigma: observed standard deviation from n samples | |
:param n: number of samples | |
:return: confidence interval | |
""" | |
left = ((n-1)*(sigma**2))/(stats.chi2.interval(alpha, df=n-1)[0]) | |
right = ((n-1)*(sigma**2))/(stats.chi2.interval(alpha, df=n-1)[1]) | |
return np.sqrt(left), np.sqrt(right) | |
def confidence_interval_variance_from_array(alpha: float, array: list[float]): | |
""" | |
estimate parent standard deviation given sigma from n samples | |
:param alpha: desired confidence level | |
:param array: array of samples | |
:return: confidence interval | |
""" | |
return confidence_interval_variance(alpha, np.std(array), len(array)) | |
def confidence_interval_variance_ratio(alpha: float, s1: float, s2: float, n1: float, n2: float): | |
""" | |
estimate the ratio of standard deviations of two distributions, given observed standard deviations | |
:param alpha: desired confidence level | |
:param s1: observed standard deviation in first distribution | |
:param s2: observed standard deviation in second distribution | |
:param n1: number of samples in first distribution | |
:param n2: number of samples in second distribution | |
:return: confidence interval | |
""" | |
left = (s1**2/s2**2) * (1/stats.f.interval(alpha, dfn=n1-1, dfd=n2-1)) | |
right = (s1**2/s2**2) * (stats.f.interval(alpha, dfn=n1-1, dfd=n2-1)) | |
return np.sqrt(left), np.sqrt(right) | |
def confidence_interval_variance_ratio_from_arrays(alpha: float, array1: list[float], array2: list[float]): | |
""" | |
estimate the ratio of standard deviations of two distributions, given observed values | |
:param alpha: desired confidence level | |
:param array1: observed standard deviation in first distribution | |
:param array2: observed standard deviation in second distribution | |
:return: confidence interval | |
""" | |
return confidence_interval_variance_ratio(alpha, np.std(array1), np.std(array2), len(array1), len(array2)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment