michaelchughes/calc_elbo_super_lda_for_single_doc.py

## calc_elbo_super_lda_for_single_doc.py
import autograd.numpy as np
from autograd.scipy.special import gammaln, digamma
from autograd import grad, hessian

from scipy.optimize import fmin_l_bfgs_b

def L_theta(
        theta_d_K=None,
        N_d_K=None,
        alpha_K=None,
        y_d=None,
        eta_K=None,
        delta_stddev=None,
        ):
    ''' Compute ELBO objective as function of theta, with other terms constant.

    Args
    ----
    N_d_K : 1D array, size K, positive reals
        vector of counts of topic usage in current document
    alpha_K : 1D array, size K, positive reals
        vector of "pseudo-counts" from prior on doc-topic probas
    y_d : None or real scalar
    eta_K : None or 1D array of size K
        regression weight coefficient vector
    delta : None or positive real scalar
        standard deviation

    Returns
    -------
    L_theta : scalar real

    Examples (unsupervised)
    -----------------------
    >>> K = 3
    >>> np.set_printoptions(precision=3, suppress=1, linewidth=80)
    >>> N_d_K = np.asarray([100., 10., 1.])
    >>> alpha_K = np.asarray([1.0, 1.0, 1.0])

    # Compute optimal theta vector
    >>> opt_theta_d_K = N_d_K + alpha_K
    >>> opt_L = L_theta(opt_theta_d_K, N_d_K, alpha_K)

    # Verify elbo scores optimal theta higher than bad theta vector
    >>> bad_theta_d_K = np.ones(K)
    >>> bad_L = L_theta(bad_theta_d_K, N_d_K, alpha_K)
    >>> opt_L > bad_L
    True

    # Verify also better than nearly-optimal theta
    >>> prng = np.random.RandomState(0)
    >>> nearopt_theta_d_K = N_d_K + alpha_K + 0.1 * prng.randn(K)
    >>> nearopt_L = L_theta(nearopt_theta_d_K, N_d_K, alpha_K)
    >>> opt_L > nearopt_L
    True

    # Verify via autograd that gradient is zero at optimal point
    >>> grad_of_L = grad(lambda theta_d_K: L_theta(theta_d_K, N_d_K, alpha_K))
    >>> grad_of_L(opt_theta_d_K)
    array([ 0.,  0.,  0.])

    Examples (supervised)
    -----------------------
    >>> K = 3
    >>> y_d = 0.0
    >>> eta_K = np.asarray([5.0, 5.0, 5.0])
    >>> delta = 1.0

    >>> grad_of_L_super = grad(
    ...     lambda theta_d_K:
    ...         L_theta(theta_d_K, N_d_K, alpha_K, y_d, eta_K, delta))
    >>> grad_of_L_super(opt_theta_d_K).max() < 1e-8
    True
    '''
    sum_theta_d = np.sum(theta_d_K)
    E_log_pi_d_K = digamma(theta_d_K) - digamma(sum_theta_d)
    L_theta_unsuper = (
        np.inner(N_d_K + alpha_K - theta_d_K, E_log_pi_d_K)
        - gammaln(sum_theta_d) + np.sum(gammaln(theta_d_K)))
    if y_d is None:
        L_theta_super = 0.0
    else:
        E_pi_d_K = theta_d_K / np.sum(theta_d_K)
        E_eta_pi = np.inner(eta_K, E_pi_d_K)
        E_etaT_pi_piT_eta = 1.0 / (sum_theta_d + sum_theta_d**2) * (
            np.inner(np.square(eta_K), theta_d_K)
            + np.inner(eta_K, theta_d_K)**2)
        L_theta_super = 1.0 / (delta_stddev * delta_stddev) * (
            y_d * E_eta_pi - 0.5 * E_etaT_pi_piT_eta)
    return L_theta_unsuper + L_theta_super

def calc_theta_d_K_to_optimize_L(
        N_d_K=None,
        alpha_K=None,
        y_d=None,
        eta_K=None,
        delta_stddev=None,
        seed=0,
        ):
    ''' Estimate theta vector via gradient descent to optimize ELBO objective

    Args
    ----
    See L_theta

    Returns
    -------
    theta_d_K : 1D array, size K, positive values
        Represents parameter vector for K-dim Dirichlet posterior
    info_dict : dict
        Contains gory details from optimization procedure

    >>> K = 3
    >>> np.set_printoptions(precision=3, suppress=1, linewidth=80)
    >>> N_d_K = np.asarray([100., 10., 1.])
    >>> alpha_K = np.asarray([1.0, 1.0, 1.0])

    # Compute optimal theta vector by hand
    >>> opt_theta_d_K = N_d_K + alpha_K

    # Compute optimal theta via numerical gradient descent
    >>> est_theta_d_K, _ = calc_theta_d_K_to_optimize_L(N_d_K, alpha_K)

    # Verify they are the same
    >>> np.allclose(opt_theta_d_K, est_theta_d_K)
    True
    '''
    K = alpha_K.size
    prng = np.random.RandomState(seed)

    # Initial guess
    init_theta_d_K = prng.rand(K)
    init_eta_d_K = theta2eta(init_theta_d_K)

    # Define optimization objective in UNCONSTRAINED SPACE
    # Minimize the negative ELBO function
    def calc_loss_wrt_eta(eta_K):
        return -1 * L_theta(
            eta2theta(eta_K),
            N_d_K, alpha_K, y_d, eta_K, delta_stddev)
    grad_of_loss_wrt_eta = grad(calc_loss_wrt_eta)

    # Solve UNCONSTRAINED problem
    opt_eta_d_K, opt_f_val, info_dict = fmin_l_bfgs_b(
        calc_loss_wrt_eta,
        init_eta_d_K,
        grad_of_loss_wrt_eta)

    # Transform back to theta and return
    return eta2theta(opt_eta_d_K), info_dict


def theta2eta(theta_K):
    ''' Transform positive theta vector into unconstrained eta vector

    Args
    ----
    theta_K : 1D array, size K, positive real values

    Returns
    -------
    eta_K : 1D array, size K, unconstrained real values
    '''
    return np.log(theta_K)

def eta2theta(eta_K):
    ''' Transform unconstrained eta vector into positive theta vector

    Args
    ----
    eta_K : 1D array, size K, unconstrained real values

    Returns
    -------
    theta_K : 1D array, size K, positive real values
    '''
    return np.exp(eta_K)


""" IGNORE BELOW HERE
def L_resp(
        resp_K=None,
        log_weights_K=None):
    '''
    >>> K = 3
    >>> prng = np.random.RandomState(0)
    >>> log_weights_K = prng.randn(K)
    >>> opt_resp_K = np.exp(log_weights_K)
    >>> opt_resp_K /= opt_resp_K.sum()

    >>> hess_of_L = hessian(
    ...     lambda resp_K: L_resp(resp_K, log_weights_K))
    >>> np.all(np.linalg.eigvals(hess_of_L(opt_resp_K)) < 0)
    True
    >>> for i in range(100):
    ...     rand_resp_K = prng.rand(K)
    ...     rand_resp_K /= rand_resp_K
    ...     H_KK = hess_of_L(rand_resp_K)
    ...     if not np.all(np.linalg.eigvals(H_KK) < 0):
    ...         print H_KK
    ...         print sorted(np.linalg.eigvals(H_KK))
    '''
    return np.inner(resp_K, log_weights_K) - np.inner(resp_K, np.log(resp_K))
"""
	import autograd.numpy as np
	from autograd.scipy.special import gammaln, digamma
	from autograd import grad, hessian

	from scipy.optimize import fmin_l_bfgs_b

	def L_theta(
	theta_d_K=None,
	N_d_K=None,
	alpha_K=None,
	y_d=None,
	eta_K=None,
	delta_stddev=None,
	):
	''' Compute ELBO objective as function of theta, with other terms constant.

	Args
	----
	N_d_K : 1D array, size K, positive reals
	vector of counts of topic usage in current document
	alpha_K : 1D array, size K, positive reals
	vector of "pseudo-counts" from prior on doc-topic probas
	y_d : None or real scalar
	eta_K : None or 1D array of size K
	regression weight coefficient vector
	delta : None or positive real scalar
	standard deviation

	Returns
	-------
	L_theta : scalar real

	Examples (unsupervised)
	-----------------------
	>>> K = 3
	>>> np.set_printoptions(precision=3, suppress=1, linewidth=80)
	>>> N_d_K = np.asarray([100., 10., 1.])
	>>> alpha_K = np.asarray([1.0, 1.0, 1.0])

	# Compute optimal theta vector
	>>> opt_theta_d_K = N_d_K + alpha_K
	>>> opt_L = L_theta(opt_theta_d_K, N_d_K, alpha_K)

	# Verify elbo scores optimal theta higher than bad theta vector
	>>> bad_theta_d_K = np.ones(K)
	>>> bad_L = L_theta(bad_theta_d_K, N_d_K, alpha_K)
	>>> opt_L > bad_L
	True

	# Verify also better than nearly-optimal theta
	>>> prng = np.random.RandomState(0)
	>>> nearopt_theta_d_K = N_d_K + alpha_K + 0.1 * prng.randn(K)
	>>> nearopt_L = L_theta(nearopt_theta_d_K, N_d_K, alpha_K)
	>>> opt_L > nearopt_L
	True

	# Verify via autograd that gradient is zero at optimal point
	>>> grad_of_L = grad(lambda theta_d_K: L_theta(theta_d_K, N_d_K, alpha_K))
	>>> grad_of_L(opt_theta_d_K)
	array([ 0., 0., 0.])

	Examples (supervised)
	-----------------------
	>>> K = 3
	>>> y_d = 0.0
	>>> eta_K = np.asarray([5.0, 5.0, 5.0])
	>>> delta = 1.0

	>>> grad_of_L_super = grad(
	... lambda theta_d_K:
	... L_theta(theta_d_K, N_d_K, alpha_K, y_d, eta_K, delta))
	>>> grad_of_L_super(opt_theta_d_K).max() < 1e-8
	True
	'''
	sum_theta_d = np.sum(theta_d_K)
	E_log_pi_d_K = digamma(theta_d_K) - digamma(sum_theta_d)
	L_theta_unsuper = (
	np.inner(N_d_K + alpha_K - theta_d_K, E_log_pi_d_K)
	- gammaln(sum_theta_d) + np.sum(gammaln(theta_d_K)))
	if y_d is None:
	L_theta_super = 0.0
	else:
	E_pi_d_K = theta_d_K / np.sum(theta_d_K)
	E_eta_pi = np.inner(eta_K, E_pi_d_K)
	E_etaT_pi_piT_eta = 1.0 / (sum_theta_d + sum_theta_d*2) (
	np.inner(np.square(eta_K), theta_d_K)
	+ np.inner(eta_K, theta_d_K)**2)
	L_theta_super = 1.0 / (delta_stddev * delta_stddev) * (
	y_d * E_eta_pi - 0.5 * E_etaT_pi_piT_eta)
	return L_theta_unsuper + L_theta_super

	def calc_theta_d_K_to_optimize_L(
	N_d_K=None,
	alpha_K=None,
	y_d=None,
	eta_K=None,
	delta_stddev=None,
	seed=0,
	):
	''' Estimate theta vector via gradient descent to optimize ELBO objective

	Args
	----
	See L_theta

	Returns
	-------
	theta_d_K : 1D array, size K, positive values
	Represents parameter vector for K-dim Dirichlet posterior
	info_dict : dict
	Contains gory details from optimization procedure

	>>> K = 3
	>>> np.set_printoptions(precision=3, suppress=1, linewidth=80)
	>>> N_d_K = np.asarray([100., 10., 1.])
	>>> alpha_K = np.asarray([1.0, 1.0, 1.0])

	# Compute optimal theta vector by hand
	>>> opt_theta_d_K = N_d_K + alpha_K

	# Compute optimal theta via numerical gradient descent
	>>> est_theta_d_K, _ = calc_theta_d_K_to_optimize_L(N_d_K, alpha_K)

	# Verify they are the same
	>>> np.allclose(opt_theta_d_K, est_theta_d_K)
	True
	'''
	K = alpha_K.size
	prng = np.random.RandomState(seed)

	# Initial guess
	init_theta_d_K = prng.rand(K)
	init_eta_d_K = theta2eta(init_theta_d_K)

	# Define optimization objective in UNCONSTRAINED SPACE
	# Minimize the negative ELBO function
	def calc_loss_wrt_eta(eta_K):
	return -1 * L_theta(
	eta2theta(eta_K),
	N_d_K, alpha_K, y_d, eta_K, delta_stddev)
	grad_of_loss_wrt_eta = grad(calc_loss_wrt_eta)

	# Solve UNCONSTRAINED problem
	opt_eta_d_K, opt_f_val, info_dict = fmin_l_bfgs_b(
	calc_loss_wrt_eta,
	init_eta_d_K,
	grad_of_loss_wrt_eta)

	# Transform back to theta and return
	return eta2theta(opt_eta_d_K), info_dict


	def theta2eta(theta_K):
	''' Transform positive theta vector into unconstrained eta vector

	Args
	----
	theta_K : 1D array, size K, positive real values

	Returns
	-------
	eta_K : 1D array, size K, unconstrained real values
	'''
	return np.log(theta_K)

	def eta2theta(eta_K):
	''' Transform unconstrained eta vector into positive theta vector

	Args
	----
	eta_K : 1D array, size K, unconstrained real values

	Returns
	-------
	theta_K : 1D array, size K, positive real values
	'''
	return np.exp(eta_K)



	""" IGNORE BELOW HERE
	def L_resp(
	resp_K=None,
	log_weights_K=None):
	'''
	>>> K = 3
	>>> prng = np.random.RandomState(0)
	>>> log_weights_K = prng.randn(K)
	>>> opt_resp_K = np.exp(log_weights_K)
	>>> opt_resp_K /= opt_resp_K.sum()

	>>> hess_of_L = hessian(
	... lambda resp_K: L_resp(resp_K, log_weights_K))
	>>> np.all(np.linalg.eigvals(hess_of_L(opt_resp_K)) < 0)
	True
	>>> for i in range(100):
	... rand_resp_K = prng.rand(K)
	... rand_resp_K /= rand_resp_K
	... H_KK = hess_of_L(rand_resp_K)
	... if not np.all(np.linalg.eigvals(H_KK) < 0):
	... print H_KK
	... print sorted(np.linalg.eigvals(H_KK))
	'''
	return np.inner(resp_K, log_weights_K) - np.inner(resp_K, np.log(resp_K))
	"""