jseabold/binary_rng.py

## binary_rng.py
"""
Heavily inspired by the R package bindata.
"""
import numpy as np
from scipy import interpolate
from scipy import stats


def corr_to_joint(corr, marginals):
    """
    Create joint probabilities from a correlations and marginal probabilities

    Parameters
    ----------
    corr : array_like
        A correlation matrix
    marginals : array_like
        Marginal probabilities

    Returns
    -------
    joint : ndarray
        An array of joint probabilities
    """
    corr = np.asarray(corr)
    marginals = np.asarray(marginals)
    std = (marginals * (1 - marginals)) ** .5
    return corr * np.outer(std, std) + np.outer(marginals, marginals)


def create_joint_probability_grid(marginals, corr):
    """
    Probability that two random variates are above zero.

    Parameters
    ----------
    marginals : array_like
        1d array containing the grid of marginal probabilities
    corr : array_like
        1d array containing the grid of correlations

    Returns
    -------
    joint : ndarray
        A N x N X P array with N == marginals.shape[0] and P == corr.shape[0]
        Each (i, j, k) entry contains the probability that both components
        of a normal random variable with mean `
        scipy.stats.norm.ppf(marginals[i, j])`  and correlation corr[k] are
        larger than zero.

    Examples
    --------
    >>> marginal_grid = np.linspace(0, 1, 51)
    >>> corr_grid = np.linspace(-1, 1, 101)
    >>> values = create_joint_probability_grid(marginal_grid, corr_grid)
    """
    marginals = np.asarray(marginals)
    corr = np.asarray(corr)

    n_marginals = marginals.shape[0]
    n_corr = corr.shape[0]

    z = np.zeros((n_marginals, n_marginals, n_corr))

    mu = stats.norm.ppf(marginals)

    for k in range(n_corr):
        cov = np.array([[1, corr[k]], [corr[k], 1]])
        for m in range(n_marginals):
            mu1 = mu[m]
            for n in range(m, n_marginals):
                # could take care of the special cases of corr == -1, 0, 1
                # as well
                # if corr[k] == -1:  # this can't ever be non-zero?
                #     z[m, n, k] = max(marginals[m] + marginals[n] - 1, 0)
                # elif corr[k] == 0:
                #     z[m, n, k] = marginals[m] * marginals[n]
                # elif corr[k] == 1:
                #     z[m, n, k] = min(marginals[m], marginals[n])
                if marginals[m] * marginals[n] == 0:
                    z[m, n, k] = 0
                elif marginals[m] == 1:
                    z[m, n, k] = marginals[n]
                elif marginals[n] == 1:
                    z[m, n, k] = marginals[m]
                # 10 is a "large" variable for the upper bound of integration
                else:
                    z[m, n, k], _ = stats.mvn.mvnun(
                        [0, 0], [10, 10], [mu1, mu[n]], cov)
                z[n, m, k] = z[m, n, k]
    return z


def joint_to_cov(joint):
    """
    Convert a joint probability matrix to a multivariate normal covariance

    Parameters
    ----------
    joint : array_like
        An ndarray of joint probabilities for two random variables

    Returns
    -------
    cov : ndarray
        The covariance matrix

    Notes
    -----
    Computes a covariance matrix for a normal distribution correpsonding to
    a binary distribution with marginal probabilities on the diagonal of
    `joint` and pairwise probabilities on the off-diagonal. This uses the
    hard-coded SIMULVALS created by `create_joint_probability_grid` to do
    interpolation.
    """
    marginals = np.diag(joint)  # you might already have these
    # TODO: check joint for pos-def and symmetric
    cov = np.eye(joint.shape[0])

    # alternatively simulvals could be a dataframe so we can keep up with
    # the grid it was generated on
    corr_grid = np.linspace(-1, 1, SIMULVALS.shape[-1])
    marginal_grid = np.linspace(0, 1, SIMULVALS.shape[0])
    simul_grid = (marginal_grid, marginal_grid, corr_grid)

    for m in range(joint.shape[1] - 1):
        for n in range(m + 1, joint.shape[0]):
            # given the observed mu1, mu2, and the joint probabilities
            # back out the covariance that would have given these observations
            x = marginals[np.repeat([m, n], corr_grid.shape[0])]
            x = x.reshape(-1, 2, order='F')
            x = np.column_stack((x, corr_grid))
            # interpolate joint probabilities for these marginals over the
            # correlations
            y = interpolate.interpolate.interpn(simul_grid, SIMULVALS, x)
            # given this get a function that maps joint -> cov and
            # evaluate at joint
            func = interpolate.interp1d(y, corr_grid)
            cov[m, n] = cov[n, m] = func(joint[m, n])
    return cov


def correlated_binary_rvs(marginals, corr, size=0):
    """
    Generate correlated binary random variables

    Parameters
    ----------
    marginals : array_like
        1d array of marginal probabilities
    corr : array_like
        2d array of binary correlation for the variables
    size : int
        The number of observations for the returned variables

    Returns
    -------
    y : ndarray
        The correlated random binary variables
    """
    joint = corr_to_joint(corr, marginals)
    cov = joint_to_cov(joint)

    X = np.random.multivariate_normal(stats.norm.ppf(marginals),
                                      cov=cov, size=size) > 0
    return X.astype(float)


MARGINAL_GRID = np.linspace(0, 1, 51)
CORR_GRID = np.linspace(-1, 1, 101)
SIMULVALS = create_joint_probability_grid(MARGINAL_GRID, CORR_GRID)
	"""
	Heavily inspired by the R package bindata.
	"""
	import numpy as np
	from scipy import interpolate
	from scipy import stats


	def corr_to_joint(corr, marginals):
	"""
	Create joint probabilities from a correlations and marginal probabilities

	Parameters
	----------
	corr : array_like
	A correlation matrix
	marginals : array_like
	Marginal probabilities

	Returns
	-------
	joint : ndarray
	An array of joint probabilities
	"""
	corr = np.asarray(corr)
	marginals = np.asarray(marginals)
	std = (marginals * (1 - marginals)) ** .5
	return corr * np.outer(std, std) + np.outer(marginals, marginals)


	def create_joint_probability_grid(marginals, corr):
	"""
	Probability that two random variates are above zero.

	Parameters
	----------
	marginals : array_like
	1d array containing the grid of marginal probabilities
	corr : array_like
	1d array containing the grid of correlations

	Returns
	-------
	joint : ndarray
	A N x N X P array with N == marginals.shape[0] and P == corr.shape[0]
	Each (i, j, k) entry contains the probability that both components
	of a normal random variable with mean `
	scipy.stats.norm.ppf(marginals[i, j])` and correlation corr[k] are
	larger than zero.

	Examples
	--------
	>>> marginal_grid = np.linspace(0, 1, 51)
	>>> corr_grid = np.linspace(-1, 1, 101)
	>>> values = create_joint_probability_grid(marginal_grid, corr_grid)
	"""
	marginals = np.asarray(marginals)
	corr = np.asarray(corr)

	n_marginals = marginals.shape[0]
	n_corr = corr.shape[0]

	z = np.zeros((n_marginals, n_marginals, n_corr))

	mu = stats.norm.ppf(marginals)

	for k in range(n_corr):
	cov = np.array([[1, corr[k]], [corr[k], 1]])
	for m in range(n_marginals):
	mu1 = mu[m]
	for n in range(m, n_marginals):
	# could take care of the special cases of corr == -1, 0, 1
	# as well
	# if corr[k] == -1: # this can't ever be non-zero?
	# z[m, n, k] = max(marginals[m] + marginals[n] - 1, 0)
	# elif corr[k] == 0:
	# z[m, n, k] = marginals[m] * marginals[n]
	# elif corr[k] == 1:
	# z[m, n, k] = min(marginals[m], marginals[n])
	if marginals[m] * marginals[n] == 0:
	z[m, n, k] = 0
	elif marginals[m] == 1:
	z[m, n, k] = marginals[n]
	elif marginals[n] == 1:
	z[m, n, k] = marginals[m]
	# 10 is a "large" variable for the upper bound of integration
	else:
	z[m, n, k], _ = stats.mvn.mvnun(
	[0, 0], [10, 10], [mu1, mu[n]], cov)
	z[n, m, k] = z[m, n, k]
	return z


	def joint_to_cov(joint):
	"""
	Convert a joint probability matrix to a multivariate normal covariance

	Parameters
	----------
	joint : array_like
	An ndarray of joint probabilities for two random variables

	Returns
	-------
	cov : ndarray
	The covariance matrix

	Notes
	-----
	Computes a covariance matrix for a normal distribution correpsonding to
	a binary distribution with marginal probabilities on the diagonal of
	`joint` and pairwise probabilities on the off-diagonal. This uses the
	hard-coded SIMULVALS created by `create_joint_probability_grid` to do
	interpolation.
	"""
	marginals = np.diag(joint) # you might already have these
	# TODO: check joint for pos-def and symmetric
	cov = np.eye(joint.shape[0])

	# alternatively simulvals could be a dataframe so we can keep up with
	# the grid it was generated on
	corr_grid = np.linspace(-1, 1, SIMULVALS.shape[-1])
	marginal_grid = np.linspace(0, 1, SIMULVALS.shape[0])
	simul_grid = (marginal_grid, marginal_grid, corr_grid)

	for m in range(joint.shape[1] - 1):
	for n in range(m + 1, joint.shape[0]):
	# given the observed mu1, mu2, and the joint probabilities
	# back out the covariance that would have given these observations
	x = marginals[np.repeat([m, n], corr_grid.shape[0])]
	x = x.reshape(-1, 2, order='F')
	x = np.column_stack((x, corr_grid))
	# interpolate joint probabilities for these marginals over the
	# correlations
	y = interpolate.interpolate.interpn(simul_grid, SIMULVALS, x)
	# given this get a function that maps joint -> cov and
	# evaluate at joint
	func = interpolate.interp1d(y, corr_grid)
	cov[m, n] = cov[n, m] = func(joint[m, n])
	return cov


	def correlated_binary_rvs(marginals, corr, size=0):
	"""
	Generate correlated binary random variables

	Parameters
	----------
	marginals : array_like
	1d array of marginal probabilities
	corr : array_like
	2d array of binary correlation for the variables
	size : int
	The number of observations for the returned variables

	Returns
	-------
	y : ndarray
	The correlated random binary variables
	"""
	joint = corr_to_joint(corr, marginals)
	cov = joint_to_cov(joint)

	X = np.random.multivariate_normal(stats.norm.ppf(marginals),
	cov=cov, size=size) > 0
	return X.astype(float)


	MARGINAL_GRID = np.linspace(0, 1, 51)
	CORR_GRID = np.linspace(-1, 1, 101)
	SIMULVALS = create_joint_probability_grid(MARGINAL_GRID, CORR_GRID)