pranavgade20/statistics_toolkit.py

## statistics_toolkit.py
import scipy.stats as stats
import numpy as np


def pearson_correlation_2tail(x: list[int], y: list[int]) -> (float, float):
    """
    Calculates the Pearson correlation coefficient and the p-value. double the p val for 1 tailed
    :param x: input array 1
    :param y: corresponding input 2
    :return: (pearson correlation coeff, p value for testing correlation)
    """
    return stats.pearsonr(x, y)


def rank_correlation_2tail(x: list[int], y: list[int]) -> (float, float):
    """
    Calculates the Spearman rank-order correlation coefficient and the p-value
    :param x: input array 1
    :param y: corresponding input 2
    :return: (spearman correlation coeff, p value for H0 == no correlation)
    """
    return stats.spearmanr(x, y)


def spearman_correlation_2tail(x: list[int], y: list[int]) -> (float, float):
    """
    Calculates the Spearman rank-order correlation coefficient and the p-value
    :param x: input array 1
    :param y: corresponding input 2
    :return: (spearman correlation coeff, p value for H0 == no correlation)
    """
    return rank_correlation_2tail(x, y)


def linear_regression_2tail(x: list[int], y: list[int]):
    """
    Calculates the Spearman rank-order correlation coefficient and the p-value
    :param x: input array 1
    :param y: corresponding input 2
    :return: an object with slope, intercept, rvalue(correlation coeff), pvalue(The p-value for a hypothesis test whose null hypothesis is that the slope is zero, using Wald Test with t-distribution of the test statistic. See alternative above for alternative hypotheses.), etc
    """
    return stats.linregress(x, y)


def contingency_table_chi2_test_pval(x: list[int, int]) -> float:
    """
    Chi-square test of independence of variables in a contingency table.
    :param x: n*m table of values in table
    :return: p value of this test
    """
    return stats.chi2_contingency(x)[1]


def goodness_fit_chi2(x) -> None:
    """see stats.chisquare https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html"""
    raise NotImplementedError
def confidence_interval_mean_normal_2sided(alpha: float, array: list[int], parent_sigma=None):
    """confidence interval for parent distribution of array"""
    if parent_sigma is None:
        return stats.norm.interval(alpha, loc=np.mean(array), scale=np.std(array) / np.sqrt(len(array)))
    else:
        return stats.norm.interval(alpha, loc=np.mean(array), scale=parent_sigma)


def confidence_interval_mean_t_distribution_2sided(alpha: float, array: list[int]):
    """confidence interval, use this when sample size is small/parent sigma is unknown"""
    return stats.t.interval(
        alpha,
        len(array) - 1,
        loc=np.mean(array),
        scale=np.std(array) * np.sqrt(len(array)) / np.sqrt(len(array) - 1))


def confidence_interval_difference_between_means_normal(alpha, x1, x2, sigma1, sigma2, n1, n2):
    """
    gives (1-alpha)100% confidence interval for mu1-mu2, the difference between means of parent distributions
    assume parent distributions are normal
    :param alpha: desired confidence level
    :param x1: observed mean of values from 1st distribution
    :param x2: observed mean of values from 2nd distribution
    :param sigma1: standard deviation of parent of 1st distribution
    :param sigma2: standard deviation of parent of 2nd distribution
    :param n1: number of values in 1st distribution
    :param n2: number of values in 3nd distribution
    :return: confidence interval
    """
    return stats.norm.interval(alpha, loc=(x1 - x2), scale=(((sigma1 ** 2) / n1) + ((sigma2 ** 2) / n2)))


def confidence_interval_difference_between_means_t_distribution(alpha, x1, x2, sigma1, sigma2, n1, n2):
    """
    use when parent sigma is not known
    gives (1-alpha)100% confidence interval for mu1-mu2, the difference between means of parent distributions
    assume parent distributions are normal
    :param alpha: desired confidence level
    :param x1: observed mean of values from 1st distribution
    :param x2: observed mean of values from 2nd distribution
    :param sigma1: observed standard deviation of values from 1st distribution
    :param sigma2: observed standard deviation of values from 2nd distribution
    :param n1: number of values in 1st distribution
    :param n2: number of values in 2nd distribution
    :return: confidence interval
    """
    sigma1 = (sigma1 * np.sqrt(n1)) / np.sqrt(n1 - 1)
    sigma2 = (sigma2 * np.sqrt(n2)) / np.sqrt(n2 - 1)
    sp = np.sqrt((((n1 - 1) * sigma1 ** 2) + ((n2 - 1) * sigma2 ** 2)) / (n1 + n2 - 2))
    return stats.t.interval(alpha, n1 + n2 - 2, loc=(x1 - x2), scale=(sp * np.sqrt((1 / n1) + (1 / n2))))


def confidence_interval_difference_between_means_t_distribution_from_array(alpha, array1, array2):
    """
    use when parent sigma is not known
    gives (1-alpha)100% confidence interval for mu1-mu2, the difference between means of parent distributions
    assume parent distributions are normal
    :param alpha: desired confidence level
    :param array2: observations from 1st distribution of length n
    :param array1: observations from 2nd distribution of length m
    :return: confidence interval
    """
    return confidence_interval_difference_between_means_t_distribution(
        alpha, np.mean(array1), np.mean(array1), np.std(array1), np.std(array2), len(array1), len(array2))


def confidence_interval_binomial_theta(alpha: float, x: float, n: float):
    """
    when number of successes in binomial trials in known, this estimates the parameter theta
    :param alpha: desired confidence level
    :param x: number of successes
    :param n: number of trials
    :return: confidence interval
    """
    t = x / n
    return stats.norm.interval(alpha, loc=t, scale=np.sqrt((t * (1 - t)) / n))


def confidence_interval_binomial_difference_between_theta(alpha: float, x1: float, x2: float, n1: float, n2: float):
    """
    when number of successes in two binomial trials in known, this estimates the difference between thetas
    :param alpha: desired confidence level
    :param x1: number of successes in 1st distribution
    :param x2: number of successes in 2nd distribution
    :param n1: number of trials in 1st distribution
    :param n2: number of trials in 2nd distribution
    :return: confidence interval
    """
    t1 = x1 / n1
    t2 = x2 / n2
    return stats.norm.interval(alpha, loc=t1 - t2, scale=np.sqrt(((t1 * (1 - t1)) / n1) + ((t2 * (1 - t2)) / n2)))


def confidence_interval_variance(alpha: float, sigma: float, n: float):
    """
    estimate parent standard deviation given sigma from n samples
    :param alpha: desired confidence level
    :param sigma: observed standard deviation from n samples
    :param n: number of samples
    :return: confidence interval
    """
    left = ((n-1)*(sigma**2))/(stats.chi2.interval(alpha, df=n-1)[0])
    right = ((n-1)*(sigma**2))/(stats.chi2.interval(alpha, df=n-1)[1])
    return np.sqrt(left), np.sqrt(right)


def confidence_interval_variance_from_array(alpha: float, array: list[float]):
    """
    estimate parent standard deviation given sigma from n samples
    :param alpha: desired confidence level
    :param array: array of samples
    :return: confidence interval
    """
    return confidence_interval_variance(alpha, np.std(array), len(array))


def confidence_interval_variance_ratio(alpha: float, s1: float, s2: float, n1: float, n2: float):
    """
    estimate the ratio of standard deviations of two distributions, given observed standard deviations
    :param alpha: desired confidence level
    :param s1: observed standard deviation in first distribution
    :param s2: observed standard deviation in second distribution
    :param n1: number of samples in first distribution
    :param n2: number of samples in second distribution
    :return: confidence interval
    """
    left = (s1**2/s2**2) * (1/stats.f.interval(alpha, dfn=n1-1, dfd=n2-1))
    right = (s1**2/s2**2) * (stats.f.interval(alpha, dfn=n1-1, dfd=n2-1))
    return np.sqrt(left), np.sqrt(right)


def confidence_interval_variance_ratio_from_arrays(alpha: float, array1: list[float], array2: list[float]):
    """
    estimate the ratio of standard deviations of two distributions, given observed values
    :param alpha: desired confidence level
    :param array1: observed standard deviation in first distribution
    :param array2: observed standard deviation in second distribution
    :return: confidence interval
    """
    return confidence_interval_variance_ratio(alpha, np.std(array1), np.std(array2), len(array1), len(array2))
	import scipy.stats as stats
	import numpy as np


	def pearson_correlation_2tail(x: list[int], y: list[int]) -> (float, float):
	"""
	Calculates the Pearson correlation coefficient and the p-value. double the p val for 1 tailed
	:param x: input array 1
	:param y: corresponding input 2
	:return: (pearson correlation coeff, p value for testing correlation)
	"""
	return stats.pearsonr(x, y)


	def rank_correlation_2tail(x: list[int], y: list[int]) -> (float, float):
	"""
	Calculates the Spearman rank-order correlation coefficient and the p-value
	:param x: input array 1
	:param y: corresponding input 2
	:return: (spearman correlation coeff, p value for H0 == no correlation)
	"""
	return stats.spearmanr(x, y)


	def spearman_correlation_2tail(x: list[int], y: list[int]) -> (float, float):
	"""
	Calculates the Spearman rank-order correlation coefficient and the p-value
	:param x: input array 1
	:param y: corresponding input 2
	:return: (spearman correlation coeff, p value for H0 == no correlation)
	"""
	return rank_correlation_2tail(x, y)


	def linear_regression_2tail(x: list[int], y: list[int]):
	"""
	Calculates the Spearman rank-order correlation coefficient and the p-value
	:param x: input array 1
	:param y: corresponding input 2
	:return: an object with slope, intercept, rvalue(correlation coeff), pvalue(The p-value for a hypothesis test whose null hypothesis is that the slope is zero, using Wald Test with t-distribution of the test statistic. See alternative above for alternative hypotheses.), etc
	"""
	return stats.linregress(x, y)


	def contingency_table_chi2_test_pval(x: list[int, int]) -> float:
	"""
	Chi-square test of independence of variables in a contingency table.
	:param x: n*m table of values in table
	:return: p value of this test
	"""
	return stats.chi2_contingency(x)[1]


	def goodness_fit_chi2(x) -> None:
	"""see stats.chisquare https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html"""
	raise NotImplementedError
	def confidence_interval_mean_normal_2sided(alpha: float, array: list[int], parent_sigma=None):
	"""confidence interval for parent distribution of array"""
	if parent_sigma is None:
	return stats.norm.interval(alpha, loc=np.mean(array), scale=np.std(array) / np.sqrt(len(array)))
	else:
	return stats.norm.interval(alpha, loc=np.mean(array), scale=parent_sigma)


	def confidence_interval_mean_t_distribution_2sided(alpha: float, array: list[int]):
	"""confidence interval, use this when sample size is small/parent sigma is unknown"""
	return stats.t.interval(
	alpha,
	len(array) - 1,
	loc=np.mean(array),
	scale=np.std(array) * np.sqrt(len(array)) / np.sqrt(len(array) - 1))


	def confidence_interval_difference_between_means_normal(alpha, x1, x2, sigma1, sigma2, n1, n2):
	"""
	gives (1-alpha)100% confidence interval for mu1-mu2, the difference between means of parent distributions
	assume parent distributions are normal
	:param alpha: desired confidence level
	:param x1: observed mean of values from 1st distribution
	:param x2: observed mean of values from 2nd distribution
	:param sigma1: standard deviation of parent of 1st distribution
	:param sigma2: standard deviation of parent of 2nd distribution
	:param n1: number of values in 1st distribution
	:param n2: number of values in 3nd distribution
	:return: confidence interval
	"""
	return stats.norm.interval(alpha, loc=(x1 - x2), scale=(((sigma1 2) / n1) + ((sigma2 2) / n2)))


	def confidence_interval_difference_between_means_t_distribution(alpha, x1, x2, sigma1, sigma2, n1, n2):
	"""
	use when parent sigma is not known
	gives (1-alpha)100% confidence interval for mu1-mu2, the difference between means of parent distributions
	assume parent distributions are normal
	:param alpha: desired confidence level
	:param x1: observed mean of values from 1st distribution
	:param x2: observed mean of values from 2nd distribution
	:param sigma1: observed standard deviation of values from 1st distribution
	:param sigma2: observed standard deviation of values from 2nd distribution
	:param n1: number of values in 1st distribution
	:param n2: number of values in 2nd distribution
	:return: confidence interval
	"""
	sigma1 = (sigma1 * np.sqrt(n1)) / np.sqrt(n1 - 1)
	sigma2 = (sigma2 * np.sqrt(n2)) / np.sqrt(n2 - 1)
	sp = np.sqrt((((n1 - 1) * sigma1 ** 2) + ((n2 - 1) * sigma2 ** 2)) / (n1 + n2 - 2))
	return stats.t.interval(alpha, n1 + n2 - 2, loc=(x1 - x2), scale=(sp * np.sqrt((1 / n1) + (1 / n2))))


	def confidence_interval_difference_between_means_t_distribution_from_array(alpha, array1, array2):
	"""
	use when parent sigma is not known
	gives (1-alpha)100% confidence interval for mu1-mu2, the difference between means of parent distributions
	assume parent distributions are normal
	:param alpha: desired confidence level
	:param array2: observations from 1st distribution of length n
	:param array1: observations from 2nd distribution of length m
	:return: confidence interval
	"""
	return confidence_interval_difference_between_means_t_distribution(
	alpha, np.mean(array1), np.mean(array1), np.std(array1), np.std(array2), len(array1), len(array2))


	def confidence_interval_binomial_theta(alpha: float, x: float, n: float):
	"""
	when number of successes in binomial trials in known, this estimates the parameter theta
	:param alpha: desired confidence level
	:param x: number of successes
	:param n: number of trials
	:return: confidence interval
	"""
	t = x / n
	return stats.norm.interval(alpha, loc=t, scale=np.sqrt((t * (1 - t)) / n))


	def confidence_interval_binomial_difference_between_theta(alpha: float, x1: float, x2: float, n1: float, n2: float):
	"""
	when number of successes in two binomial trials in known, this estimates the difference between thetas
	:param alpha: desired confidence level
	:param x1: number of successes in 1st distribution
	:param x2: number of successes in 2nd distribution
	:param n1: number of trials in 1st distribution
	:param n2: number of trials in 2nd distribution
	:return: confidence interval
	"""
	t1 = x1 / n1
	t2 = x2 / n2
	return stats.norm.interval(alpha, loc=t1 - t2, scale=np.sqrt(((t1 * (1 - t1)) / n1) + ((t2 * (1 - t2)) / n2)))


	def confidence_interval_variance(alpha: float, sigma: float, n: float):
	"""
	estimate parent standard deviation given sigma from n samples
	:param alpha: desired confidence level
	:param sigma: observed standard deviation from n samples
	:param n: number of samples
	:return: confidence interval
	"""
	left = ((n-1)(sigma*2))/(stats.chi2.interval(alpha, df=n-1)[0])
	right = ((n-1)(sigma*2))/(stats.chi2.interval(alpha, df=n-1)[1])
	return np.sqrt(left), np.sqrt(right)


	def confidence_interval_variance_from_array(alpha: float, array: list[float]):
	"""
	estimate parent standard deviation given sigma from n samples
	:param alpha: desired confidence level
	:param array: array of samples
	:return: confidence interval
	"""
	return confidence_interval_variance(alpha, np.std(array), len(array))


	def confidence_interval_variance_ratio(alpha: float, s1: float, s2: float, n1: float, n2: float):
	"""
	estimate the ratio of standard deviations of two distributions, given observed standard deviations
	:param alpha: desired confidence level
	:param s1: observed standard deviation in first distribution
	:param s2: observed standard deviation in second distribution
	:param n1: number of samples in first distribution
	:param n2: number of samples in second distribution
	:return: confidence interval
	"""
	left = (s12/s22) * (1/stats.f.interval(alpha, dfn=n1-1, dfd=n2-1))
	right = (s12/s22) * (stats.f.interval(alpha, dfn=n1-1, dfd=n2-1))
	return np.sqrt(left), np.sqrt(right)


	def confidence_interval_variance_ratio_from_arrays(alpha: float, array1: list[float], array2: list[float]):
	"""
	estimate the ratio of standard deviations of two distributions, given observed values
	:param alpha: desired confidence level
	:param array1: observed standard deviation in first distribution
	:param array2: observed standard deviation in second distribution
	:return: confidence interval
	"""
	return confidence_interval_variance_ratio(alpha, np.std(array1), np.std(array2), len(array1), len(array2))