@torfjelde
Last active May 17, 2019 11:17
An example of how one can implement Automatic Differentiation Variational Inference (ADVI) for Turing.jl.
using Turing, Bijectors, ForwardDiff, DiffResults, LinearAlgebra, Optim, LineSearches
import StatsBase: sample
import Optim: optimize
#############
# Utilities #
#############
function jac_inv_transform(dist::Distribution, x::T where T<:Real)
ForwardDiff.derivative(x -> invlink(dist, x), x)
end
function jac_inv_transform(dist::Distribution, x::Array{T} where T <: Real)
ForwardDiff.jacobian(x -> invlink(dist, x), x)
end
function center_diag_gaussian(x, μ, σ)
# instead of creating a diagonal matrix, we just do elementwise multiplication
(σ .^(-1)) .* (x - μ)
end
function center_diag_gaussian_inv(η, μ, σ)
(η .* σ) + μ
end
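# Quick sanity check (illustrative values only, not part of the model): the two
# helpers above should be inverses of each other.
let μ = [1.0, -2.0], σ = [0.5, 2.0], x = [0.3, 0.7]
η = center_diag_gaussian(x, μ, σ)
@assert center_diag_gaussian_inv(η, μ, σ) ≈ x
end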
#########################
# Variational Inference #
#########################
abstract type VariationalInference end
"""
sample(vi::VariationalInference, num_samples)
Produces `num_samples` samples from the variational posterior of the given VI method.
"""
function sample(vi::VariationalInference, num_samples) end
"""
elbo(vi::VariationalInference, num_samples)
Computes an empirical estimate of the ELBO for the given VI method using `num_samples` Monte Carlo samples.
"""
function elbo(vi::VariationalInference, num_samples) end
"""
optimize(vi::VariationalInference)
Finds the variational parameters which maximize the ELBO for the given VI method.
"""
function optimize(vi::VariationalInference) end
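# A concrete VI scheme is expected to subtype `VariationalInference` and provide methods
# for `sample`, `elbo` and `optimize`; `ADVI` below is one such implementation.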
"""
ADVI(model::Turing.Model)
Automatic Differentiation Variational Inference (ADVI) for a given model.
"""
struct ADVI{T <: Real} <: VariationalInference
model::Turing.Model
μ::Vector{T}
ω::Vector{T}
end
ADVI(model::Turing.Model) = begin
# setup
var_info = Turing.VarInfo()
model(var_info, Turing.SampleFromUniform())
num_params = size(var_info.vals, 1)
ADVI(model, zeros(num_params), zeros(num_params))
end
function sample(vi::ADVI, num_samples)
# setup
var_info = Turing.VarInfo()
vi.model(var_info, Turing.SampleFromUniform())
num_params = size(var_info.vals, 1)
# convenience
μ, ω = vi.μ, vi.ω
# buffer
samples = zeros(num_samples, num_params)
for i = 1:size(var_info.dists, 1)
prior = var_info.dists[i]
r = var_info.ranges[i]
# initials
μ_i = μ[r]
ω_i = ω[r]
# sample from the variational posterior for this block of parameters
for j = 1:num_samples
η = randn(length(μ_i))
ζ = center_diag_gaussian_inv(η, μ_i, exp.(ω_i))
θ = invlink(prior, ζ)
samples[j, r] = θ
end
end
return samples
end
function optimize(vi::ADVI; samples_per_step = 10, max_iters = 500)
# setup
var_info = Turing.VarInfo()
vi.model(var_info, Turing.SampleFromUniform())
num_params = size(var_info.vals, 1)
function objective(x)
# extract the mean-field Gaussian params
μ, ω = x[1:num_params], x[num_params + 1: end]
- elbo(vi, μ, ω, samples_per_step)
end
# for every parameter we need a mean μ and a log std. dev. ω (so σ = exp(ω))
x = zeros(2 * num_params)
diff_result = DiffResults.GradientResult(x)
# used for the truncated AdaGrad step-size rule suggested in Kucukelbir et al. (2015) [1]
η = 0.1
τ = 1.0
ρ = zeros(2 * num_params)
s = zeros(2 * num_params)
g² = zeros(2 * num_params)
# number of previous gradients to use to compute `s` in adaGrad
stepsize_num_prev = 10
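# Step-size rule used in the loop below (applied elementwise):
#   ρ = η / (τ + √s)
# where `s` accumulates squared gradients. Past the first `stepsize_num_prev` iterations,
# the previous g² is subtracted before the current one is added, so `s` stays bounded and
# the step-size does not shrink towards zero as it would in plain AdaGrad.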
i = 0
while (i < max_iters) # & converged # <= add criterion? A running mean maybe?
# compute gradient
ForwardDiff.gradient!(diff_result, objective, x)
# recursive implementation of updating the step-size
# if beyond the first sequence of steps, we subtract off the previous g² before adding the next
if i > stepsize_num_prev
s -= g²
end
# update parameters for adaGrad
g² .= DiffResults.gradient(diff_result).^2
s += g²
# compute stepsize
@. ρ = η / (τ + sqrt(s))
x .= x - ρ .* DiffResults.gradient(diff_result)
@info "Step $i" ρ DiffResults.value(diff_result) norm(DiffResults.gradient(diff_result))
i += 1
end
μ, ω = x[1:num_params], x[num_params + 1: end]
return μ, ω
end
function elbo(vi::ADVI, μ::Vector{T}, ω::Vector{T}, num_samples) where T <: Real
# setup
var_info = Turing.VarInfo()
# initialize the `VarInfo` object
vi.model(var_info, Turing.SampleFromUniform())
num_params = size(var_info.vals, 1)
elbo_acc = 0.0
for i = 1:num_samples
# iterate through priors, sample and update
for j = 1:size(var_info.dists, 1)
prior = var_info.dists[j]
r = var_info.ranges[j]
# mean-field params for this set of model params
μ_i = μ[r]
ω_i = ω[r]
# obtain samples from mean-field posterior approximation
η = randn(length(μ_i))
ζ = center_diag_gaussian_inv(η, μ_i, exp.(ω_i))
# inverse-transform back to original param space
θ = invlink(prior, ζ)
# update
var_info.vals[r] = θ
# add the log-det-jacobian of inverse transform
elbo_acc += log(abs(det(jac_inv_transform(prior, ζ)))) / num_samples
end
# sample with updated variables
vi.model(var_info)
elbo_acc += var_info.logp / num_samples
end
# add the entropy of the mean-field Gaussian variational posterior; since ω = log σ,
# this is sum(ω) up to an additive constant (0.5 * d * (1 + log(2π))) which does not
# affect the optimization
variational_posterior_entropy = sum(ω)
elbo_acc += variational_posterior_entropy
elbo_acc
end
function elbo(vi::ADVI, num_samples)
# extract the mean-field Gaussian params
μ, ω = vi.μ, vi.ω
elbo(vi, μ, ω, num_samples)
end
##################
# Simple example #
##################
@model demo(x) = begin
s ~ InverseGamma(2,3)
m ~ Normal(0.0, sqrt(s)) # `Normal(μ, σ)` has mean μ and variance σ², i.e. it is parametrized by the std. dev., not the variance
for i = 1:length(x)
x[i] ~ Normal(m, sqrt(s))
end
end
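# Note: this is the conjugate Normal-Inverse-Gamma model, i.e. s ~ InverseGamma(α₀ = 2, β₀ = 3)
# and m | s ~ Normal(μ₀ = 0, √(s / κ₀)) with κ₀ = 1, which is why an exact posterior is
# available further down via `ConjugatePriors.NormalGamma`.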
# generate data
x = randn(1, 1000);
# produce "true" samples using NUTS
m = demo(x)
chain = sample(m, NUTS(2000, 200, 0.65))
# ADVI
m = demo(x)
vi = ADVI(m) # default construction of ADVI
μ, ω = optimize(vi, samples_per_step = 5, max_iters = 5000) # maximize ELBO
vi = ADVI(m, μ, ω) # construct new from optimized values
samples = sample(vi, 2000)
# quick check
println([mean(samples, dims=1), [var(x), mean(x)]])
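# With 1000 observations the posterior concentrates around the sample statistics, so the
# ADVI means of (s, m) should roughly match (var(x), mean(x)).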
# closed form
using ConjugatePriors
# prior
# notation mapping has been verified by explicitly computing expressions
# in "Conjugate Bayesian analysis of the Gaussian distribution" by Murphy
μ₀ = 0.0 # => μ
κ₀ = 1.0 # => ν, which scales the precision of the Normal
α₀ = 2.0 # => "shape"
β₀ = 3.0 # => "rate", which is 1 / θ, where θ is "scale"
pri = NormalGamma(μ₀, κ₀, α₀, β₀)
# posterior
post = posterior(pri, Normal, x)
# marginal distribution of τ = 1 / σ²
# Eq. (90) in "Conjugate Bayesian analysis of the Gaussian distribution" by Murphy
# `scale(post)` = θ
p_τ = Gamma(post.shape, scale(post))
p_σ²_pdf = z -> pdf(p_τ, 1 / z) # τ => 1 / σ²
# marginal of μ
# Eq. (91) in "Conjugate Bayesian analysis of the Gaussian distribution" by Murphy
p_μ = TDist(2 * post.shape)
μₙ = post.mu # μ → μ
κₙ = post.nu # κ → ν
αₙ = post.shape # α → shape
βₙ = post.rate # β → rate
# numerically more stable, but doesn't seem to have an effect; the issue is probably internal to
# `pdf`, which needs to compute ≈ Γ(1000)
p_μ_pdf = z -> exp(logpdf(p_μ, (z - μₙ) * exp(- 0.5 * log(βₙ) + 0.5 * log(αₙ) + 0.5 * log(κₙ))))
# p_μ_pdf1 = z -> pdf(p_μ, (z - μₙ) / √(βₙ / (αₙ * κₙ)))
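# Note: as written both closed-form densities are unnormalized: `p_σ²_pdf` drops the
# Jacobian factor 1/z² of the change of variables τ = 1/σ², and `p_μ_pdf` drops the
# constant scale factor √(αₙ κₙ / βₙ) from the standardization. This is (at least part of)
# why the plotting code below renormalizes with a Riemann sum.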
#################
# Visualization #
#################
# visualize
using Plots, StatsPlots, LaTeXStrings
pyplot()
p1 = plot();
density!(samples[:, 1], label = "s (ADVI)", color = :blue, linestyle = :dash)
histogram!(samples[:, 1], label = "", normed = true, alpha = 0.3, color = :blue);
density!([chain[:s].value...], label = "s (NUTS)", color = :green, linestyle = :dashdot)
histogram!([chain[:s].value...], label = "", normed = true, color = :green, alpha = 0.3)
# normalize using Riemann approx. because of (almost certainly) numerical issues
Δ = 0.001
r = 0.75:Δ:1.25
norm_const = sum(p_σ²_pdf.(r) .* Δ)
plot!(r, z -> p_σ²_pdf(z) / norm_const, label = "s (posterior)", color = :red);
vline!([var(x)], label = "s (data)", linewidth = 1.5, color = :black, alpha = 0.7)
xlims!(0.5, 1.5)
title!(L"$x_i \sim \mathcal{N}(0, 1)$ for $i = 1,\dots,1000$")
p2 = plot()
density!(samples[:, 2], label = "m (ADVI)", color = :blue, linestyle = :dash)
histogram!(samples[:, 2], label = "", normed = true, alpha = 0.3, color = :blue)
density!([chain[:m].value...], label = "m (NUTS)", color = :green, linestyle = :dashdot)
histogram!([chain[:m].value...], label = "", normed = true, color = :green, alpha = 0.3)
# normalize using Riemann approx. because of (almost certainly) numerical issues
Δ = 0.0001
r = -0.1 + mean(x):Δ:0.1 + mean(x)
norm_const = sum(p_μ_pdf.(r) .* Δ)
plot!(r, z -> p_μ_pdf(z) / norm_const, label = "m (posterior)", color = :red);
vline!([mean(x)], label = "m (data)", linewidth = 1.5, color = :black, alpha = 0.7)
xlims!(-0.25, 0.25)
p = plot(p1, p2; layout = (2, 1))
savefig(p, "advi_proper.png")

xukai92 commented Apr 3, 2019

OK. I found it a bit difficult to comment on the code. Do you know if there is a way that I can do the comment thing like during reviewing a PR?


torfjelde commented Apr 5, 2019

Made the suggested changes and addressed the comments you made before.

Also realized I'd put the entropy term for the variational posterior inside the empirical estimate loop; fixed it!

torfjelde commented

I've made it so that we instead get an empirical estimate of the distribution using the ADVI estimate, rather than a point-estimate as before. Added comparison with NUTS.

I'll look into the closed form expression you mentioned as a point of comparison next.


torfjelde commented Apr 9, 2019

Added the closed-form posterior for comparison, though I have to use a temporary work-around for numerical issues, probably due to internal computations using the gamma function for large values. Would be interesting to look into further.

Also, this snippet has gotten to a point where it might be suitable for a standalone project. But oh well.

torfjelde commented

Using Optim.jl with GradientDescent and LineSearches.BackTracking(order=3) would at times fail (the line search would reach its maximum number of iterations). As mentioned before, Optim.jl is not made for stochastic optimization. Though when it did converge, the results were really good.

Therefore, for consistency of experiments, I've now switched to a manual implementation of AdaGrad. This at least produces consistent results across runs, in contrast to using Optim.jl. It will be interesting to explore the choice of optimization algorithm further in the future.
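For reference, one way to wire the negative ELBO into Optim.jl with that setup would be roughly the following (a sketch only, not the exact code that was used; `optimize_with_optim` is just an illustrative name):

# Sketch only: hand the negative ELBO to Optim.jl with GradientDescent + BackTracking.
function optimize_with_optim(vi::ADVI; samples_per_step = 10, max_iters = 500)
    # same setup as `optimize(vi::ADVI; ...)` above
    var_info = Turing.VarInfo()
    vi.model(var_info, Turing.SampleFromUniform())
    num_params = size(var_info.vals, 1)
    # negative ELBO as the objective to be minimized
    objective(x) = -elbo(vi, x[1:num_params], x[num_params + 1:end], samples_per_step)
    x0 = zeros(2 * num_params)
    res = Optim.optimize(
        objective,
        x0,
        GradientDescent(linesearch = LineSearches.BackTracking(order = 3)),
        Optim.Options(iterations = max_iters)
    )
    x = Optim.minimizer(res)
    return x[1:num_params], x[num_params + 1:end]
end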

References:
[1] Kucukelbir, A., Ranganath, R., Gelman, A., & Blei, D. M. (2015). Automatic Variational Inference in Stan. CoRR.
