phipsgabler/simulation.jl

## simulation.jl
using CategoricalArrays
using DataFrames
using Distributions
using DynamicPPL
using MixedModels
using Plots
using StatsFuns
using StatsModels
using StatsPlots
using Turing


function modelsize(X, Z)
    N, K = size(X)
    @assert size(Z, 1) == N
    M = size(Z, 2)
    return N, K, M
end


# See https://arxiv.org/pdf/1201.2375.pdf for mixed-effects beta regression
@model function mixed_effects_beta(
    X,
    Z
)
    N, K, M = modelsize(X, Z)
    y = similar(X, N)

    # priors
    # TODO: use TDist priors for random effects -- outlier users!
    θ_fixed ~ MvNormal(zeros(K), ones(K) * 30)
    θ_random ~ MvNormal(zeros(M), M == 0 ? ones(0, 0) : ones(M) * 30)
    # conditional to circumevent PDMats bug: https://github.com/JuliaStats/PDMats.jl/issues/138

    γ_fixed ~ MvNormal(zeros(K), ones(K) * 30)
    γ_random ~ MvNormal(zeros(M), M == 0 ? ones(0, 0) : ones(M) * 30)

    # observations
    η = X * θ_fixed .+ Z * θ_random
    μ = logistic.(η)

    ψ = X * γ_fixed .+ Z * γ_random
    ϕ = exp.(ψ)

    α = μ .* ϕ
    β = (1 .- μ) .* ϕ
    ϵ = eps(eltype(α))
    y .~ Beta.(clamp.(α, ϵ, Inf), clamp.(β, ϵ, Inf)) # prevent numerically zero α and β
end

apply_formula(f, data) = apply_schema(f, schema(f, data), MixedModel)

function get_modelcols(f, data)
    f = apply_formula(f, data)
    _, M =  modelcols(f, data)

    if M isa Tuple
        return M
    else
        # no random effects -> no Z matrix
        return M, similar(M, size(M, 1), 0)
    end
end


function dummy_data(n_observations, n_stimuli, n_authors)
    stimuli = [Symbol("stimulus_", i) => rand(n_observations) for i = 1:n_stimuli]
    return DataFrame(stimuli...,
                     :author => categorical(rand(1:n_authors, n_observations)))
end

function construct_model(f, data, conditions)
    X, Z = get_modelcols(f, data)

    N, K, M = modelsize(X, Z)
    haskey(conditions, :θ_fixed) && @assert size(conditions.θ_fixed) == (K,)
    haskey(conditions, :γ_fixed) && @assert size(conditions.γ_fixed) == (K,)
    haskey(conditions, :θ_random) && @assert size(conditions.θ_random) == (M,)
    haskey(conditions, :γ_random) && @assert size(conditions.γ_random) == (M,)
    haskey(conditions, :y) && @assert size(conditions.y) == (N,)

    model = mixed_effects_beta(X, Z) | conditions
    return model
end


function simulate(f, n_observations, n_authors, n_samples; conditions...)
    data = simulate_data(n_observations, n_authors)
    return simulate(f, data, n_samples; conditions...)
end

function simulate(f, data, n_samples; conditions...)
    model = construct_model(f, data, NamedTuple(conditions))
    return [model() for _ in 1:n_samples]
end

infer(f::FormulaTerm, data, n_samples; conditions...) = infer(f, data, NUTS(0.65), n_samples; conditions...)
function infer(f::FormulaTerm, data, sampler, n_samples; conditions...)

    model = construct_model(f, data, NamedTuple(conditions))
    return Turing.sample(model, sampler, n_samples)
end


## Example:

# data = dummy_data(10, 2, 2);

# data.response = simulate(
#     @formula(0 ~ 1 + stimulus_1 + stimulus_2 + (1|author)), data, 1,
#     θ_fixed = [-5, 5, 0], γ_fixed = [2, 0, 0], θ_random = zeros(2), γ_random = zeros(2)
# )[1];

# s = infer(@formula(0 ~ 1 + stimulus_1 + stimulus_2 + (1|author)), data, NUTS(0.65), 5000;
#           y = data.response, θ_random = zeros(2), γ_random = zeros(2));

# plot(s)
	using CategoricalArrays
	using DataFrames
	using Distributions
	using DynamicPPL
	using MixedModels
	using Plots
	using StatsFuns
	using StatsModels
	using StatsPlots
	using Turing


	function modelsize(X, Z)
	N, K = size(X)
	@assert size(Z, 1) == N
	M = size(Z, 2)
	return N, K, M
	end


	# See https://arxiv.org/pdf/1201.2375.pdf for mixed-effects beta regression
	@model function mixed_effects_beta(
	X,
	Z
	)
	N, K, M = modelsize(X, Z)
	y = similar(X, N)

	# priors
	# TODO: use TDist priors for random effects -- outlier users!
	θ_fixed ~ MvNormal(zeros(K), ones(K) * 30)
	θ_random ~ MvNormal(zeros(M), M == 0 ? ones(0, 0) : ones(M) * 30)
	# conditional to circumevent PDMats bug: https://github.com/JuliaStats/PDMats.jl/issues/138

	γ_fixed ~ MvNormal(zeros(K), ones(K) * 30)
	γ_random ~ MvNormal(zeros(M), M == 0 ? ones(0, 0) : ones(M) * 30)

	# observations
	η = X * θ_fixed .+ Z * θ_random
	μ = logistic.(η)

	ψ = X * γ_fixed .+ Z * γ_random
	ϕ = exp.(ψ)

	α = μ .* ϕ
	β = (1 .- μ) .* ϕ
	ϵ = eps(eltype(α))
	y .~ Beta.(clamp.(α, ϵ, Inf), clamp.(β, ϵ, Inf)) # prevent numerically zero α and β
	end

	apply_formula(f, data) = apply_schema(f, schema(f, data), MixedModel)

	function get_modelcols(f, data)
	f = apply_formula(f, data)
	_, M = modelcols(f, data)

	if M isa Tuple
	return M
	else
	# no random effects -> no Z matrix
	return M, similar(M, size(M, 1), 0)
	end
	end


	function dummy_data(n_observations, n_stimuli, n_authors)
	stimuli = [Symbol("stimulus_", i) => rand(n_observations) for i = 1:n_stimuli]
	return DataFrame(stimuli...,
	:author => categorical(rand(1:n_authors, n_observations)))
	end

	function construct_model(f, data, conditions)
	X, Z = get_modelcols(f, data)

	N, K, M = modelsize(X, Z)
	haskey(conditions, :θ_fixed) && @assert size(conditions.θ_fixed) == (K,)
	haskey(conditions, :γ_fixed) && @assert size(conditions.γ_fixed) == (K,)
	haskey(conditions, :θ_random) && @assert size(conditions.θ_random) == (M,)
	haskey(conditions, :γ_random) && @assert size(conditions.γ_random) == (M,)
	haskey(conditions, :y) && @assert size(conditions.y) == (N,)

	model = mixed_effects_beta(X, Z) \| conditions
	return model
	end


	function simulate(f, n_observations, n_authors, n_samples; conditions...)
	data = simulate_data(n_observations, n_authors)
	return simulate(f, data, n_samples; conditions...)
	end

	function simulate(f, data, n_samples; conditions...)
	model = construct_model(f, data, NamedTuple(conditions))
	return [model() for _ in 1:n_samples]
	end

	infer(f::FormulaTerm, data, n_samples; conditions...) = infer(f, data, NUTS(0.65), n_samples; conditions...)
	function infer(f::FormulaTerm, data, sampler, n_samples; conditions...)

	model = construct_model(f, data, NamedTuple(conditions))
	return Turing.sample(model, sampler, n_samples)
	end


	## Example:

	# data = dummy_data(10, 2, 2);

	# data.response = simulate(
	# @formula(0 ~ 1 + stimulus_1 + stimulus_2 + (1\|author)), data, 1,
	# θ_fixed = [-5, 5, 0], γ_fixed = [2, 0, 0], θ_random = zeros(2), γ_random = zeros(2)
	# )[1];

	# s = infer(@formula(0 ~ 1 + stimulus_1 + stimulus_2 + (1\|author)), data, NUTS(0.65), 5000;
	# y = data.response, θ_random = zeros(2), γ_random = zeros(2));

	# plot(s)