// Gist 887906 by @alextp, created March 26, 2011.
// Mean-field variational inference for a clustered linear-regression model in F#,
// using the MathNet.Numerics library.
open MathNet.Numerics
open System

let a = Fn.Factorial 10

// Elementwise helpers over float arrays.
let elementwiseProduct (a: float array) (b: float array) = [| for i in 0..a.Length-1 -> a.[i]*b.[i] |]
let arraySum (a: float array) (b: float array) = [| for i in 0..a.Length-1 -> a.[i]+b.[i] |]
let scalarProduct (a: float) (b: float array) = [| for i in 0..b.Length-1 -> a*b.[i] |]
// Expectation of the vectors in v under the weights z: sum_i z_i * v_i.
let expectation (z: float array) (v: float array array) =
    let prod = [| for i in 0..v.Length-1 -> scalarProduct z.[i] v.[i] |]
    let mutable l = prod.[0]
    for i in 1..prod.Length-1 do
        l <- arraySum l prod.[i]
    l
let dot (a: float array) (b: float array) = Array.sum (elementwiseProduct a b)
let sqnorm a = dot a a
let arraySub a b = arraySum a (scalarProduct -1.0 b)
let ones k = [| for i in 0..k-1 -> 1.0 |]
// Sum of every entry of a jagged array.
let sumall (v: float array array) =
    let mutable s = 0.0
    for i in 0..v.Length-1 do
        s <- s + Array.sum v.[i]
    s
// Log-normalizer of a symmetric Dirichlet(alpha) in k dimensions.
let logiBeta (alpha: float) (k: int) =
    let K = float k
    Fn.GammaLn(alpha*K) - K*Fn.GammaLn(alpha)

// Log-normalizer of a general Dirichlet with parameter vector alpha.
let logiBetav (alpha: float array) = Fn.GammaLn(Array.sum alpha) - Array.sum (Array.map Fn.GammaLn alpha)
// This is the usual mean-field lower bound for a Dirichlet random variable. You get it by computing
// E_q[\log P(\phi)] - E_q[\log Q(\phi)]. The second term is the standard entropy of a Dirichlet
// distribution, and the first term can be computed rather trivially using the known fact that
// the expectation of the log of a Dirichlet-distributed probability is
// digamma(alpha_i) - digamma(sum_alpha).
let clusBoundPhi (alpha: float) (phi: float array) =
    let k = phi.Length
    let K = float k
    let alpha0 = K*alpha
    let phi0 = Array.sum phi
    let vd a = (Fn.Digamma a) - (Fn.Digamma phi0)
    ((logiBeta alpha k)
     + (Array.sum (elementwiseProduct (scalarProduct (alpha-1.0) (ones k)) (Array.map vd phi)))
     - (logiBetav phi) + (phi0-K)*(Fn.Digamma phi0)
     - (Array.sum (elementwiseProduct (arraySub phi (ones k)) (Array.map Fn.Digamma phi))))
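
// A hedged sanity check (not in the original gist): when the variational parameters
// equal the prior, i.e. phi = [alpha; ...; alpha], the bound is -KL(Q || P) = 0,
// so clusBoundPhi should return (numerically) zero.
let phiBoundAtPrior = clusBoundPhi 1.5 (scalarProduct 1.5 (ones 4)) // expect ~0.0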
// This is the mean-field lower bound for a standard Gaussian variable: it is -0.5*||mu||^2 + C.
let clusBoundMu (mu: float array) = -0.5*(sqnorm mu)
// The bound for z is by far the simplest: each assignment contributes the expected
// log of the probability sampled from phi minus the log of the z variable itself,
// weighted by the responsibility, i.e. sum_ik z_ik*(E[log phi_k] - log z_ik).
let clusBoundZ (z: float array array) (phi: float array) =
    let phi0 = Array.sum phi
    let vd a = (Fn.Digamma a) - (Fn.Digamma phi0)
    let expLog = Array.map vd phi
    Array.sumBy (fun (zi: float array) ->
        Array.sum (elementwiseProduct zi (arraySub expLog (Array.map Math.Log zi)))) z
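
// A hedged illustration (not in the original gist): with uniform responsibilities
// z_i = [1/K; ...; 1/K], each point contributes the average of the E[log phi_k]
// terms plus log K, the log K being the entropy of the uniform assignment.
let zBoundUniform = clusBoundZ [| scalarProduct (1.0/3.0) (ones 3) |] [| 1.0; 2.0; 3.0 |]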
// The lower bound for theta in the cluster model.
// You can get it by computing E_q[\log P(\theta)] - E_q[\log Q(\theta)].
// Note that the second term is a constant (as we're not modeling the variance
// of the variational approximation). The first term is a constant plus
// -0.5 D \log sigma - 0.5 || theta - E_z[mu] ||^2 / sigma (which you can read off
// the expression for the multivariate Gaussian density).
let clusBoundTheta (theta: float array) (mu: float array array) (z: float array) (sigma: float) =
    let mmu = expectation z mu
    -0.5*(float theta.Length)*(Math.Log sigma) - 0.5*(sqnorm (arraySub theta mmu))/sigma
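
// A hedged sanity check (not in the original gist): at theta = E_z[mu] the quadratic
// term vanishes and the bound is exactly -0.5*D*log(sigma).
let thetaBoundAtMean =
    let mu = [| [| 1.0; 0.0 |]; [| 0.0; 1.0 |] |]
    let z = [| 0.5; 0.5 |]
    clusBoundTheta (expectation z mu) mu z 2.0 // expect -0.5*2.0*(log 2.0)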
let clusBoundThetas (theta: float array array) mu (z: float array array) sigma =
    let mutable bound = 0.0
    for i in 0..theta.Length-1 do
        bound <- bound + (clusBoundTheta theta.[i] mu z.[i] sigma)
    bound
// Finally, the lower bound for Y. As we're not approximating the distribution of Y, there
// is no need for an entropy term: this is essentially the log-likelihood of Y|X under a
// specified value of theta. Ignoring constants, it's the sum of -0.5*(Y_i - theta^T X_i)^2.
let clusBoundY (theta: float array) (Y: float array) (X: float array array) =
    let mutable ps = 0.
    for i in 0..Y.Length-1 do
        let pred = dot theta X.[i]
        ps <- ps - 0.5*(Y.[i]-pred)*(Y.[i]-pred)
    ps
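
// A hedged illustration (not in the original gist): with perfect predictions every
// squared residual is zero, so the (constant-free) log-likelihood term is 0.
let yBoundPerfect =
    let theta = [| 1.0; 2.0 |]
    let X = [| [| 1.0; 0.0 |]; [| 0.0; 1.0 |] |]
    clusBoundY theta (Array.map (dot theta) X) X // expect 0.0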
let clusBoundYs (theta: float array array) (X: float array array array) (Y: float array array) =
    let mutable bound = 0.0
    for i in 0..Y.Length-1 do
        bound <- bound + (clusBoundY theta.[i] Y.[i] X.[i])
    bound
// The full variational lower bound: the sum of the per-variable terms above.
let clusLowerBound (T: int) (K: int) (D: int) theta mu sigma alpha phi (z: float array array) X Y =
    (clusBoundPhi alpha phi)
    + (Array.sum (Array.map clusBoundMu mu))
    + (clusBoundZ z phi)
    + (clusBoundThetas theta mu z sigma)
    + (clusBoundYs theta X Y)
// This is the standard mean-field update for a Dirichlet-multinomial: add the expected
// counts to the prior. See Blei, Ng, and Jordan's LDA journal paper for a derivation.
let clusUpdatePhi (alpha: float) K (z: float array array) =
    let phi = Array.init z.[0].Length (fun i -> alpha)
    for i in 0..z.Length-1 do
        let zi = z.[i]
        for j in 0..zi.Length-1 do
            phi.[j] <- phi.[j] + zi.[j]
    phi
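
// A hedged illustration (not in the original gist): with hard assignments the update
// reduces to alpha plus the per-cluster counts.
let phiFromHardZ =
    clusUpdatePhi 0.5 2 [| [| 1.0; 0.0 |]; [| 1.0; 0.0 |]; [| 0.0; 1.0 |] |] // expect [| 2.5; 1.5 |]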
// This is the standard mean-field update for the mean of a mixture of Gaussians when the
// variational distribution doesn't model the variances. Essentially it's an expectation over
// the thetas weighted by the corresponding z variables, with a small regularization (the 1.0
// the denominator starts from) pulling things toward the prior.
let clusUpdateMu (theta: float array array) (z: float array array) =
    let T = theta.Length
    let D = theta.[0].Length
    let K = z.[0].Length
    let mu = Array.init K (fun i -> Array.init D (fun j -> 0.))
    for i in 0..K-1 do
        let num = mu.[i]
        let mutable den = 1.0
        for j in 0..T-1 do
            for d in 0..D-1 do
                num.[d] <- num.[d] + z.[j].[i]*theta.[j].[d]
            den <- den + z.[j].[i]
        mu.[i] <- scalarProduct (1.0/den) num
    mu
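
// A hedged illustration (not in the original gist): with all mass on a single cluster,
// mu.[0] is the mean of the thetas, shrunk toward zero by the 1.0 in the denominator.
let muFromOneCluster =
    clusUpdateMu [| [| 2.0 |]; [| 4.0 |] |] [| [| 1.0 |]; [| 1.0 |] |] // expect [| [| 2.0 |] |]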
// This is the standard exp-digamma update for z: each responsibility is proportional to
// exp(E[log phi_j] - 0.5*||theta_i - mu_j||^2/sigma), and the row is then normalized.
let clusUpdateZ (theta: float array array) (mu: float array array) (phi: float array) sigma =
    let T = theta.Length
    let K = mu.Length
    let z = Array.init T (fun i -> Array.init K (fun j -> 0.))
    for i in 0..T-1 do
        let num = z.[i]
        let mutable sum = 0.0
        for j in 0..K-1 do
            let term = Math.Exp ((Fn.Digamma phi.[j]) - 0.5*(sqnorm (arraySub theta.[i] mu.[j]))/sigma)
            num.[j] <- term
            sum <- sum + term
        for j in 0..K-1 do
            num.[j] <- num.[j] / sum
    z
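
// A hedged sanity check (not in the original gist): each row of z is a normalized
// distribution over the clusters, and closer cluster means get more of the mass.
let zExample =
    clusUpdateZ [| [| 0.0 |] |] [| [| 0.0 |]; [| 3.0 |] |] [| 1.0; 1.0 |] 1.0
    // expect one row summing to 1.0 with z.[0].[0] > z.[0].[1]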
// This might not look trivial, but it's just computing the ridge regression of theta with a
// prior centered at the expected mu according to z: solve
// (I/sigma + sum_j x_j x_j^T) theta = mmu/sigma + sum_j y_j x_j.
let clusUpdateTheta (mu: float array array) (z: float array array) (x: float array array array) (y: float array array) (sigma: float) =
    let T = z.Length
    let D = mu.[0].Length
    let theta = Array.init T (fun i -> Array.init D (fun j -> 0.))
    for i in 0..T-1 do
        let mmu = expectation z.[i] mu
        let mutable num = scalarProduct (1.0/sigma) mmu
        let den = Array2D.init D D (fun i j -> if i = j then 1.0/sigma else 0.0)
        let X = x.[i]
        let Y = y.[i]
        for j in 0..X.Length-1 do
            num <- arraySum num (scalarProduct Y.[j] X.[j])
            for k in 0..D-1 do
                for l in 0..D-1 do
                    den.[k,l] <- den.[k,l] + X.[j].[k] * X.[j].[l]
        let a = LinearAlgebra.Matrix.Create(den)
        let b = LinearAlgebra.Matrix.Create([|num|])
        let c = a.SolveTranspose b
        theta.[i] <- c.GetArray().[0]
    theta
// Random initialization using MathNet.Numerics' Mersenne Twister random source.
let r = RandomSources.MersenneTwisterRandomSource()
let randomVector D = [| for i in 0..D-1 -> r.NextDouble() |]
let initialize (x: float array array array) (y: float array array) (alpha: float) (sigma: float) (K: int) =
    let T = x.Length
    let D = x.[0].[0].Length
    let theta = Array.init T (fun i -> randomVector D)
    let mu = Array.init K (fun i -> randomVector D)
    let z = Array.init T (fun i -> ones K)
    let phi = Array.init K (fun i -> alpha)
    (phi, mu, z, theta)
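
// A minimal sketch (not in the original gist) of how these updates might be wired into
// a coordinate-ascent loop: cycle through the four updates and monitor the lower bound,
// which should be non-decreasing. The name fit, the update order, and the fixed
// iteration count are assumptions, not something the gist specifies.
let fit (x: float array array array) (y: float array array) (alpha: float) (sigma: float) (K: int) (iters: int) =
    let T = x.Length
    let D = x.[0].[0].Length
    let (phi0, mu0, z0, theta0) = initialize x y alpha sigma K
    let mutable phi = phi0
    let mutable mu = mu0
    let mutable z = z0
    let mutable theta = theta0
    for _ in 1..iters do
        z <- clusUpdateZ theta mu phi sigma
        phi <- clusUpdatePhi alpha K z
        mu <- clusUpdateMu theta z
        theta <- clusUpdateTheta mu z x y sigma
        printfn "bound: %f" (clusLowerBound T K D theta mu sigma alpha phi z x y)
    (phi, mu, z, theta)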