kwinkunks/sdhe.py

## sdhe.py
# Properties of the scaled standard deviational hyperellipsoid.
#
# Author: Matt Hall, kwinkunks@gmail.com
# Copyright: 2022, Matt Hall
# Licence: Apache 2.0, https://www.apache.org/licenses/LICENSE-2.0
#
# These small functions implement n-dimensional lookup of the beta-distribution
# approximation to this problem. They answer the questions, "What proportion
# of a multivariate Gaussian distribution is contained by `r` standard
# deviations?" and "How many standard deviations contain a proportion `p` of
# the distribution?". They roughly follow the language of this paper:
#
# Wang B, Shi W, Miao Z (2015) Confidence Analysis of Standard Deviational Ellipse
# and Its Extension into Higher Dimensional Euclidean Space. PLoS ONE 10(3):
# e0118537. doi:10.1371/journal.pone.0118537.
# https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0118537
#
# The proof that these distributions can be modeled by a beta distribution is
# given in this paper:
#
# D. Ververidis and C. Kotropoulos, "Gaussian Mixture Modeling by Exploiting the
# Mahalanobis Distance," in IEEE Transactions on Signal Processing, vol. 56,
# no. 7, pp. 2797-2811, July 2008, doi: 10.1109/TSP.2008.917350.
# http://poseidon.csd.auth.gr/papers/PUBLISHED/JOURNAL/pdf/Ververidis08a.pdf
#
# I got to those papers via this post on Cross Validated:
# https://stats.stackexchange.com/questions/20543
#
# I also found this post on Cross Validated useful:
# https://stats.stackexchange.com/questions/35012

from scipy.stats import beta
from scipy.optimize import fsolve


def stdev_to_proportion(r: float, D: float=1, N: float=1e9) -> float:
    """
    Estimate the confidence level of the scaled standard deviational
    hyperellipsoid (SDHE). This is the proportion of points whose Mahalanobis
    distance is within `r` standard deviations, for the given number of
    dimensions `D`.

    For example, 68.27% of samples lie within ±1 stdev of the mean in the
    univariate normal distribution. For two dimensions, `D` = 2 and 39.35% of
    the samples are within ±1 stdev of the mean.

    This is an approximation good to about 6 significant figures (depending on
    N). It uses the beta distribution to model the true distribution; for more
    about this see the following paper:
    http://poseidon.csd.auth.gr/papers/PUBLISHED/JOURNAL/pdf/Ververidis08a.pdf

    For a table of test cases see Table 1 in:
    https://doi.org/10.1371/journal.pone.0118537

    Args:
        r (float): The number of standard deviations (or 'magnification
            ratio').
        D (float): The number of dimensions.
        N (float): The number of instances; just needs to be large for a
            proportion with decent precision.

    Returns:
        float. The confidence level.

    Example:
        >>> stdev_to_proportion(1)
        0.6826894921370859
        >>> stdev_to_proportion(3)
        0.9973002039367398
        >>> stdev_to_proportion(1, D=2)
        0.39346933952920327
        >>> stdev_to_proportion(5, D=10)
        0.9946544947734935
    """
    return beta.cdf(x=1/N, a=D/2, b=(N-D-1)/2, scale=1/r**2)


def proportion_to_stdev(p: float, D: float=1, N: float=1e9) -> float:
    """
    The inverse of `stdev_to_proportion`.

    Estimate the 'magnification ratio' (number of standard deviations) of the
    scaled standard deviational hyperellipsoid (SDHE) at the given confidence
    level and for the given number of dimensions, `D`.

    This tells us the number of standard deviations containing the given
    proportion of instances. For example, 80% of samples lie within ±1.2816
    standard deviations.

    For more about this and a table of test cases (Table 2) see:
    https://doi.org/10.1371/journal.pone.0118537

    Args:
        p (float): The confidence level as a decimal fraction, e.g. 0.8.
        D (float): The number of dimensions. Default 1 (the univariate Gaussian
            distribution).
        N (float): The number of instances; just needs to be large for a
            proportion with decent precision. `Default 1e9`.

    Returns:
        float. The estimated number of standard deviations ('magnification ratio').

    Examples:
        >>> proportion_to_stdev(0.99, D=1)
        2.575829302496098
        >>> proportion_to_stdev(0.90, D=5)
        3.039137525465009
        >>> stdev_to_proportion(proportion_to_stdev(0.80, D=1))
        0.8000000000000003
    """
    func = lambda r_, D_, N_: stdev_to_proportion(r_, D_, N_) - p
    r_hat, = fsolve(func, x0=2, args=(D, N))
    return r_hat
	# Properties of the scaled standard deviational hyperellipsoid.
	#
	# Author: Matt Hall, kwinkunks@gmail.com
	# Copyright: 2022, Matt Hall
	# Licence: Apache 2.0, https://www.apache.org/licenses/LICENSE-2.0
	#
	# These small functions implement n-dimensional lookup of the beta-distribution
	# approximation to this problem. They answer the questions, "What proportion
	# of a multivariate Gaussian distribution is contained by `r` standard
	# deviations?" and "How many standard deviations contain a proportion `p` of
	# the distribution?". They roughly follow the language of this paper:
	#
	# Wang B, Shi W, Miao Z (2015) Confidence Analysis of Standard Deviational Ellipse
	# and Its Extension into Higher Dimensional Euclidean Space. PLoS ONE 10(3):
	# e0118537. doi:10.1371/journal.pone.0118537.
	# https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0118537
	#
	# The proof that these distributions can be modeled by a beta distribution is
	# given in this paper:
	#
	# D. Ververidis and C. Kotropoulos, "Gaussian Mixture Modeling by Exploiting the
	# Mahalanobis Distance," in IEEE Transactions on Signal Processing, vol. 56,
	# no. 7, pp. 2797-2811, July 2008, doi: 10.1109/TSP.2008.917350.
	# http://poseidon.csd.auth.gr/papers/PUBLISHED/JOURNAL/pdf/Ververidis08a.pdf
	#
	# I got to those papers via this post on Cross Validated:
	# https://stats.stackexchange.com/questions/20543
	#
	# I also found this post on Cross Validated useful:
	# https://stats.stackexchange.com/questions/35012

	from scipy.stats import beta
	from scipy.optimize import fsolve


	def stdev_to_proportion(r: float, D: float=1, N: float=1e9) -> float:
	"""
	Estimate the confidence level of the scaled standard deviational
	hyperellipsoid (SDHE). This is the proportion of points whose Mahalanobis
	distance is within `r` standard deviations, for the given number of
	dimensions `D`.

	For example, 68.27% of samples lie within ±1 stdev of the mean in the
	univariate normal distribution. For two dimensions, `D` = 2 and 39.35% of
	the samples are within ±1 stdev of the mean.

	This is an approximation good to about 6 significant figures (depending on
	N). It uses the beta distribution to model the true distribution; for more
	about this see the following paper:
	http://poseidon.csd.auth.gr/papers/PUBLISHED/JOURNAL/pdf/Ververidis08a.pdf

	For a table of test cases see Table 1 in:
	https://doi.org/10.1371/journal.pone.0118537

	Args:
	r (float): The number of standard deviations (or 'magnification
	ratio').
	D (float): The number of dimensions.
	N (float): The number of instances; just needs to be large for a
	proportion with decent precision.

	Returns:
	float. The confidence level.

	Example:
	>>> stdev_to_proportion(1)
	0.6826894921370859
	>>> stdev_to_proportion(3)
	0.9973002039367398
	>>> stdev_to_proportion(1, D=2)
	0.39346933952920327
	>>> stdev_to_proportion(5, D=10)
	0.9946544947734935
	"""
	return beta.cdf(x=1/N, a=D/2, b=(N-D-1)/2, scale=1/r**2)


	def proportion_to_stdev(p: float, D: float=1, N: float=1e9) -> float:
	"""
	The inverse of `stdev_to_proportion`.

	Estimate the 'magnification ratio' (number of standard deviations) of the
	scaled standard deviational hyperellipsoid (SDHE) at the given confidence
	level and for the given number of dimensions, `D`.

	This tells us the number of standard deviations containing the given
	proportion of instances. For example, 80% of samples lie within ±1.2816
	standard deviations.

	For more about this and a table of test cases (Table 2) see:
	https://doi.org/10.1371/journal.pone.0118537

	Args:
	p (float): The confidence level as a decimal fraction, e.g. 0.8.
	D (float): The number of dimensions. Default 1 (the univariate Gaussian
	distribution).
	N (float): The number of instances; just needs to be large for a
	proportion with decent precision. `Default 1e9`.

	Returns:
	float. The estimated number of standard deviations ('magnification ratio').

	Examples:
	>>> proportion_to_stdev(0.99, D=1)
	2.575829302496098
	>>> proportion_to_stdev(0.90, D=5)
	3.039137525465009
	>>> stdev_to_proportion(proportion_to_stdev(0.80, D=1))
	0.8000000000000003
	"""
	func = lambda r_, D_, N_: stdev_to_proportion(r_, D_, N_) - p
	r_hat, = fsolve(func, x0=2, args=(D, N))
	return r_hat