# OnlineStat for a log-weighted Gaussian
using PositiveFactorizations
using LinearAlgebra
using StatsFuns
using Random
using OnlineStatsBase
using Statistics
const OSB = OnlineStatsBase
const bessel = OSB.bessel
mutable struct Gaussian{T<:Number} <: OnlineStat{Union{Tuple, NamedTuple, AbstractVector}}
    value::Matrix{T}
    A::Matrix{T}    # x'x/n
    b::Vector{T}    # 1'x/n
    log∑w::T        # log of the sum of weights
    log∑w²::T       # log of the sum of squared weights
    L::Matrix{T}    # Lower Cholesky factor
    Lsync::Bool     # Indicates whether L is up to date
    weight
    n::Int
end
# Kish's Effective Sample Size, see https://en.wikipedia.org/wiki/Effective_sample_size
n_eff(o::Gaussian) = exp(2 * o.log∑w - o.log∑w²)
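# A quick check of the log-space formula (a sketch with hypothetical local
# variables, not part of the stat): Kish's ESS is (∑w)² / ∑w², which for
# weights stored in log-space is exp(2 * log∑w - log∑w²).
#
#   w = [0.5, 1.0, 2.0]
#   logw = log.(w)
#   kish = sum(w)^2 / sum(abs2, w)                          # ≈ 2.33
#   kish ≈ exp(2 * logsumexp(logw) - logsumexp(2 .* logw))  # true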
function Gaussian(::Type{T}, p::Int=0; weight = EqualWeight()) where T<:Number
    # Most negative finite T, acting as log(0⁺): the weight sums start near zero
    logε = nextfloat(typemin(T))
    Gaussian(zeros(T,p,p), zeros(T,p,p), zeros(T,p), logε, logε, zeros(T,p,p), false, weight, 0)
end

Gaussian(p::Int=0; weight = EqualWeight()) = Gaussian(Float64, p; weight=weight)
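# For example (a sketch; any OnlineStatsBase weight should work here):
#
#   o32 = Gaussian(Float32, 3)                        # explicit eltype and dimension
#   oexp = Gaussian(weight=ExponentialWeight(0.01))   # dimension inferred at the first fit!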
# The computation for γ,
#
# γ = min(o.weight(o.n += 1) * o.n * logistic(ℓ - o.log∑w), 0.99)
#
# could use some explanation. Let's first look at the first argument inside the
# `min`,
#
# o.weight(o.n += 1) * o.n * logistic(ℓ - o.log∑w)
#
# The first factor,
#
# o.weight(o.n += 1)
#
# is to allow the use of built-in weights from OnlineStats.jl. The last factor,
#
# logistic(ℓ - o.log∑w)
#
# takes into account that each observation has its own log-weight. But now
# we've counted the weights twice, so there's a question of how to correct for
# this. We could take a square root to get the geometric mean of the two
# effects, but this would leave neither of them working as expected.
#
# With OnlineStats.EqualWeight, the weight for the nth observation would be 1/n.
# So we multiply by n (the second factor).
#
# The `logistic` factor is typically O(1/n), but there are no guarantees about
# this. So the `min` is to account for the (hopefully rare) case where
#
# o.weight(o.n += 1) * o.n * logistic(ℓ - o.log∑w) > 1
#
# Finally, allowing γ = 1 causes problems, because it makes the distribution
# collapse to a point. So as a heuristic, we cap γ at 0.99 to prevent this.
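# To see how the factors interact, here's a worked case (not part of the
# implementation): suppose every observation has the same log-weight ℓ. After
# n-1 observations, log∑w = ℓ + log(n-1), so
#
#   logistic(ℓ - o.log∑w) = logistic(-log(n-1)) = 1/n
#
# With EqualWeight, o.weight(n) == 1/n, so the three factors give
#
#   γ = (1/n) * n * (1/n) = 1/n
#
# recovering exactly the unweighted EqualWeight behavior, as we'd hope.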
function OSB._fit!(o::Gaussian{T}, xℓ) where {T}
    o.Lsync = false
    (x, ℓ) = xℓ
    γ = min(o.weight(o.n += 1) * o.n * logistic(ℓ - o.log∑w), 0.99)
    o.log∑w = logaddexp(o.log∑w, ℓ)
    o.log∑w² = logaddexp(o.log∑w², 2ℓ)
    if isempty(o.A)
        # Lazily allocate storage once we see the dimension of the data
        p = length(x)
        o.b = zeros(T, p)
        o.A = zeros(T, p, p)
        o.L = zeros(T, p, p)
        o.value = zeros(T, p, p)
    end
    OSB.smooth!(o.b, x, γ)
    OSB.smooth_syr!(o.A, x, γ)
end
OSB.nvars(o::Gaussian) = size(o.A, 1)
function OSB.value(o::Gaussian)
    o.value[:] = Matrix(Symmetric(o.A - o.b * o.b'))
    o.value
end
function OSB._merge!(o::Gaussian, o2::Gaussian)
    o.Lsync = false
    o.n += o2.n
    # γ is the fraction of the combined weight contributed by o2, so that
    # smooth!(a, b, γ) = a + γ*(b - a) forms the weight-proportional average
    γ = logistic(o2.log∑w - o.log∑w)
    o.log∑w = logaddexp(o.log∑w, o2.log∑w)
    o.log∑w² = logaddexp(o.log∑w², o2.log∑w²)
    OSB.smooth!(o.A, o2.A, γ)
    OSB.smooth!(o.b, o2.b, γ)
end
Statistics.cov(o::Gaussian) = value(o)
Statistics.mean(o::Gaussian) = o.b
Statistics.var(o::Gaussian; kw...) = diag(value(o; kw...))
function Statistics.cor(o::Gaussian; kw...)
    value(o; kw...)
    v = 1.0 ./ sqrt.(diag(o.value))
    rmul!(o.value, Diagonal(v))
    lmul!(Diagonal(v), o.value)
    o.value
end
function LinearAlgebra.cholesky(o::Gaussian)
    # Reuse the cached factor if nothing has changed since the last call
    o.Lsync && return Cholesky(LowerTriangular(o.L), :L, 0)
    copyto!(o.L, value(o))
    C = cholesky!(Positive, o.L)
    o.Lsync = true
    return C
end
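# Cache behavior (a sketch, for an already-fitted `o` like the one in the
# checks below): the factorization is reused until the next fit! or merge!
# flips Lsync back to false.
#
#   C1 = cholesky(o)              # factors value(o) and caches it in o.L
#   C2 = cholesky(o)              # cheap: reuses the cached factor
#   fit!(o, (randn(3), randn()))
#   C3 = cholesky(o)              # refactors, since the stat changed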
# Sample via x = μ + L*z with z ~ N(0, I)
function Random.rand!(rng::AbstractRNG, x::AbstractArray, o::Gaussian{T}) where {T}
    randn!(rng, x)
    L = cholesky(o).L
    lmul!(L, x)
    x .+= mean(o)
    return x
end

function Base.rand(rng::AbstractRNG, o::Gaussian{T}) where {T}
    x = Vector{T}(undef, OSB.nvars(o))
    rand!(rng, x, o)
end
# Some checks
#
# o = Gaussian()
# for j in 1:1000
#     fit!(o, (randn(3), randn()))
# end
# value(o)
# mean(o)
# cov(o)
# cholesky(o)
# rand!(zeros(3), o)
# rand(o)
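# Demo: adaptive importance sampling against a Beta(4,2) target.
#
# `Pullback(t, Beta(4,2))` pulls the Beta density back through the 𝕀 transform,
# so the target lives on all of ℝ, where a Gaussian proposal makes sense.
# `logdensity(p, q, x)` below is then the log density of the target relative to
# the proposal at x, i.e. the log importance weight of a draw x from q.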
using MeasureTheory
using TransformVariables
t = as𝕀
p = Pullback(t, Beta(4,2))
q0 = Normal(0,10)
logdensity(p, q0, randn())
o = Gaussian(weight=EqualWeight())

# Warm up: sample from the initial proposal until we have a few effective samples
while min(nobs(o), n_eff(o)) < 10
    x = rand(q0)
    ℓ = logdensity(p, q0, x)
    fit!(o, (x, ℓ))
end
# Adapt: repeatedly refit the proposal to the current estimate until the
# effective sample size reaches 1000
while n_eff(o) < 1000
    n = n_eff(o)
    μ = mean(o)[1]
    σ = std(o)[1]
    q = Normal(μ, σ)
    # Train in minibatches of 100 effective samples each
    while n_eff(o) < n + 100
        x = rand(q)
        ℓ = logdensity(p, q, x)
        fit!(o, (x, ℓ))
    end
end
# Compare the fitted proposal to the target, both pushed forward to (0,1)
using UnicodePlots
xx = 0.01:0.01:0.99
q = Normal(mean(o)[1], std(o)[1])
μμ = [density(Pushforward(t, p), x) for x in xx];
νν = [density(Pushforward(t, q), x) for x in xx];
factor = sum(μμ) / sum(νν)  # rough rescaling so the two curves are comparable
plt = lineplot(xx, νν * factor);
lineplot!(plt, xx, μμ)
# julia> lineplot!(plt, xx, μμ)
# ┌────────────────────────────────────────┐
# 3 │⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀│
# │⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀│
# │⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀│
# │⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⠀⠀⠀⠀⠀⠀⠀⠀│
# │⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⡤⠒⠛⠥⡉⢢⡀⠀⠀⠀⠀⠀│
# │⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⠤⡟⠉⠀⠀⠀⠀⠈⢢⡑⡄⠀⠀⠀⠀│
# │⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⡠⡗⠊⠀⠀⠀⠀⠀⠀⠀⠀⠑⣵⠀⠀⠀⠀│
# │⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⡮⠊⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠙⡇⠀⠀⠀│
# │⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⡴⠋⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢹⠀⠀⠀│
# │⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣔⠕⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⣇⠀⠀│
# │⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣠⠞⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢻⡄⠀│
# │⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣠⠔⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠸⣧⠀│
# │⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣤⠞⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢿⡀│
# │⠀⠀⠀⠀⠀⠀⠀⠀⢀⣠⡶⠛⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢸⢇│
# 0 │⣀⣀⣀⣀⣀⣤⠴⠞⠋⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⡇│
# └────────────────────────────────────────┘
# 0 1