Nosferican/gist:597bf20e73baf86e66b7ab208715f175

## gistfile1.jl
# Author: José Bayoán Santiago Calderón (@Nosferican)

# Set up the environment for this application

using Distributions: Normal, TDist
using Plots: Plot, PyPlotBackend, plot, density, savefig, plot!, pyplot
using StatPlots: StatPlots, @df
using Random: seed!
using Statistics: mean, quantile
using StatsBase: coef, coefnames, confint, countmap, dof_residual, levelsmap,
                 RegressionModel, sample, shuffle, vcov, weights
using DataFrames: AbstractDataFrame, categorical!, DataFrame
using FixedEffectModels: reg, @model

import StatsBase: sample

pyplot()

# Helpers

"""
    SamplingDesign

Abstract type for sampling designs.
"""
abstract type SamplingDesign end
"""
    RandomSampling

Struct for random sampling (i.e., every observation has equal probability of
being observed). The sampling probability is given by `p`.
"""
struct RandomSampling <: SamplingDesign
    p::Float64
    function RandomSampling(p)
        zero(p) ≤ p ≤ one(p) || throw(ArgumentError("p ∉ [0, 1]"))
        return new(p)
    end
end
"""
    ClusteredSampling

Struct for clustered sampling (i.e., the sampling probability of an observation
is cluster dependent). The sampling probability for each cluster is given by
`p`.
"""
struct ClusteredSampling <: SamplingDesign
    p::Vector{Float64}
    function ClusteredSampling(p::AbstractVector)
        all(elem -> zero(elem) ≤ elem ≤ one(elem), p) ||
        throw(ArgumentError("p contains non valid probability elements"))
        return new(p)
    end
end
function sample(cluster::AbstractVector, design::RandomSampling)
    p = getfield(design, :p)
    sample([true, false], weights([p, one(p) - p]), length(cluster))
end
function sample(cluster::AbstractVector, design::ClusteredSampling)
    design = ClusteredSampling(0.1:0.05:0.3)
    p = Dict(zip(unique(data[:cluster]), shuffle(getfield(design, :p))))
    output = sample.(Ref([true, false]),
                     weights.((p -> [p, one(p) - p]).(get.(Ref(p),
                                                           data[:cluster],
                                                           nothing))))
    return output
end

"""
    linearcombination(model::RegressionModel, subgroups::AbstractVector)

Given a model and subgroups (with a single interaction term), it gives the
parameter estimate and standard error.
"""
function linearcombination(model::RegressionModel, subgroups::AbstractVector)
    Vars = occursin.("&", coefnames(model))
    shares = values(sort(countmap(subgroups)))
    shares = shares ./ sum(shares)
    L = zeros(length(Vars))
    L[Vars] = shares
    β̂₁ = L' * coef(model)
    σ̂₁ = sqrt(L' *  vcov(model) * L)
    tstar = quantile(TDist(dof_residual(model)), 1 - 5e-2 / 2)
    movement = tstar * σ̂₁
    LB = β̂₁ - movement
    UB = β̂₁ + movement
    return [LB UB]
end

"""
    trial(data::AbstractDataFrame,
          sampling_design::SamplingDesign = RandomSampling(1e-1))

Return the estimates for pooling, within, and interactions under the sampling.
"""
function trial(data::AbstractDataFrame,
               sampling_design::SamplingDesign = RandomSampling(1e-1))
    obs = sample(data[:cluster], sampling_design)
    subdata = view(data, obs)
    if sampling_design isa RandomSampling
        pooling = reg(subdata, @model(y ~ x1 + x2))
        within = reg(subdata, @model(y ~ x1 + x2,
                                     fe = cluster))
        interaction = reg(subdata, @model(y ~ x1 & cluster + x2))
    else
        pooling = reg(subdata, @model(y ~ x1 + x2,
                                      vcov = cluster(cluster)))
        within = reg(subdata, @model(y ~ x1 + x2,
                                     fe = cluster,
                                     vcov = cluster(cluster)))
        interaction = reg(subdata, @model(y ~ x1 & cluster + x2))
    end
    ests = vcat(confint(pooling)[findfirst(isequal("x1"), coefnames(pooling)):findfirst(isequal("x1"), coefnames(pooling)),:],
                confint(within)[findfirst(isequal("x1"), coefnames(within)):findfirst(isequal("x1"), coefnames(within)),:],
                linearcombination(interaction, data[:cluster]),
                )

    ests2 = vcat(confint(pooling)[findfirst(isequal("x2"), coefnames(pooling)):findfirst(isequal("x2"), coefnames(pooling)),:],
                confint(within)[findfirst(isequal("x2"), coefnames(within)):findfirst(isequal("x2"), coefnames(within)),:],
                confint(interaction)[findfirst(isequal("x2"), coefnames(interaction)):findfirst(isequal("x2"), coefnames(interaction)),:],
                )
    hcat(ests, ests2)
end

"""
    trials_n(data::AbstractDataFrame,
             sampling_design::SamplingDesign,
             n::Integer = 1000)

Return plots and matrices for distribution of parameter estimates and empirical
coverage rates.
"""
function trials_n(data::AbstractDataFrame,
                  sampling_design::SamplingDesign,
                  n::Integer = 1000)
    seed!(0)
    trials = map(t -> trial(data, sampling_design), 1:n)
    parameters = mapreduce(t -> vcat(mean(t[:,1:2], dims = 2),
                                     mean(t[:,3:4], dims = 2)),
                           hcat,
                           trials)
    data_parameters =
        DataFrame(model = repeat(["Pooling", "Within", "Interaction"],
                                 outer = 2length(trials)),
                  dimension = repeat(repeat(["β", "β₂"],
                                            inner = 3),
                                     outer = size(parameters, 2)),
                  estimate = vec(parameters))
    plt = @df(data_parameters,
              density(:estimate,
                      group  = (:dimension, :model),
                      fill   = (.25, 0),
                      alpha = 0.85,
                      xlab = "Parameter Estimates",
                      ylab = "Density (%)")
             )
    plot!(plt, [β], seriestype = "vline", label =  "β (ATE for X₁)")
    plot!(plt, [0.5], seriestype = "vline", label = "β₂ (ATE for X₂)")
    limits = vec(mean(parameters, dims = 2))

    coverage = mapreduce(t -> compute_coverage(limits, t),
                         vcat,
                         trials)
    coverage = mapreduce(model -> mean(coverage[model:2:end,:], dims = 1),
                         vcat,
                         1:3)
    return (plt, coverage)
end

"""
    compute_coverage(limits::AbstractVector{<:Number},
                     trials::AbstractMatrix{<:Number})

Return indicators for whether the probability limits are within the confidence
intervals.
"""
function compute_coverage(limits::AbstractVector{<:Number},
                          trials::AbstractMatrix{<:Number})
    k = 1:size(trials, 2)
    output = mapreduce(col -> trials[:,col], vcat, filter(isodd, k)) .≤
                 limits .≤
             mapreduce(col -> trials[:,col], vcat, filter(iseven, k))
    reshape(output, 3, 2)
end

"""
    Results(::AbstractDataFrame)

Results hold the plots and matrices which record the results of the trials
for each condition.
"""
struct Results
    plt::Vector{Plot{PyPlotBackend}}
    ecr::Vector{Matrix{Float64}}
    function Results(data::AbstractDataFrame, n::Integer = 1000)
        plt_random, coverage_random =
            trials_n(data, RandomSampling(1e-1), n)
        plt_clustered, coverage_clustered =
            trials_n(data, ClusteredSampling(0.1:0.05:0.3), n)
        plt = [plt_random, plt_clustered]
        ecr = [coverage_random, coverage_clustered]
        return new(plt, ecr)
    end
end

# Population Analysis
seed!(0)
β = mean(1 .+ sample(-0.5:0.25:0.5, weights(rand(5)), Int(1e5)))

# HTE dimension constant_mean_variance
seed!(0)
data = DataFrame(cluster = sample(-0.5:0.25:0.5, weights(rand(5)), Int(1e5)))
data[:x1] = rand(Normal(), size(data, 1))
data[:x2] = rand(Normal(-1, 0.5), size(data, 1))
data[:y]  = -0.2 .+
            (1 .+ data[:cluster]) .* data[:x1] .+
            0.5 .* data[:x2] .+
            rand(Normal(), size(data, 1))
categorical!(data, :cluster)

# Need to run this for some reason
# trials_n(data, RandomSampling(1e-1), 1)

constant_mean_variance = Results(data)
	# Author: José Bayoán Santiago Calderón (@Nosferican)

	# Set up the environment for this application

	using Distributions: Normal, TDist
	using Plots: Plot, PyPlotBackend, plot, density, savefig, plot!, pyplot
	using StatPlots: StatPlots, @df
	using Random: seed!
	using Statistics: mean, quantile
	using StatsBase: coef, coefnames, confint, countmap, dof_residual, levelsmap,
	RegressionModel, sample, shuffle, vcov, weights
	using DataFrames: AbstractDataFrame, categorical!, DataFrame
	using FixedEffectModels: reg, @model

	import StatsBase: sample

	pyplot()

	# Helpers

	"""
	SamplingDesign

	Abstract type for sampling designs.
	"""
	abstract type SamplingDesign end
	"""
	RandomSampling

	Struct for random sampling (i.e., every observation has equal probability of
	being observed). The sampling probability is given by `p`.
	"""
	struct RandomSampling <: SamplingDesign
	p::Float64
	function RandomSampling(p)
	zero(p) ≤ p ≤ one(p) \|\| throw(ArgumentError("p ∉ [0, 1]"))
	return new(p)
	end
	end
	"""
	ClusteredSampling

	Struct for clustered sampling (i.e., the sampling probability of an observation
	is cluster dependent). The sampling probability for each cluster is given by
	`p`.
	"""
	struct ClusteredSampling <: SamplingDesign
	p::Vector{Float64}
	function ClusteredSampling(p::AbstractVector)
	all(elem -> zero(elem) ≤ elem ≤ one(elem), p) \|\|
	throw(ArgumentError("p contains non valid probability elements"))
	return new(p)
	end
	end
	function sample(cluster::AbstractVector, design::RandomSampling)
	p = getfield(design, :p)
	sample([true, false], weights([p, one(p) - p]), length(cluster))
	end
	function sample(cluster::AbstractVector, design::ClusteredSampling)
	design = ClusteredSampling(0.1:0.05:0.3)
	p = Dict(zip(unique(data[:cluster]), shuffle(getfield(design, :p))))
	output = sample.(Ref([true, false]),
	weights.((p -> [p, one(p) - p]).(get.(Ref(p),
	data[:cluster],
	nothing))))
	return output
	end

	"""
	linearcombination(model::RegressionModel, subgroups::AbstractVector)

	Given a model and subgroups (with a single interaction term), it gives the
	parameter estimate and standard error.
	"""
	function linearcombination(model::RegressionModel, subgroups::AbstractVector)
	Vars = occursin.("&", coefnames(model))
	shares = values(sort(countmap(subgroups)))
	shares = shares ./ sum(shares)
	L = zeros(length(Vars))
	L[Vars] = shares
	β̂₁ = L' * coef(model)
	σ̂₁ = sqrt(L' * vcov(model) * L)
	tstar = quantile(TDist(dof_residual(model)), 1 - 5e-2 / 2)
	movement = tstar * σ̂₁
	LB = β̂₁ - movement
	UB = β̂₁ + movement
	return [LB UB]
	end

	"""
	trial(data::AbstractDataFrame,
	sampling_design::SamplingDesign = RandomSampling(1e-1))

	Return the estimates for pooling, within, and interactions under the sampling.
	"""
	function trial(data::AbstractDataFrame,
	sampling_design::SamplingDesign = RandomSampling(1e-1))
	obs = sample(data[:cluster], sampling_design)
	subdata = view(data, obs)
	if sampling_design isa RandomSampling
	pooling = reg(subdata, @model(y ~ x1 + x2))
	within = reg(subdata, @model(y ~ x1 + x2,
	fe = cluster))
	interaction = reg(subdata, @model(y ~ x1 & cluster + x2))
	else
	pooling = reg(subdata, @model(y ~ x1 + x2,
	vcov = cluster(cluster)))
	within = reg(subdata, @model(y ~ x1 + x2,
	fe = cluster,
	vcov = cluster(cluster)))
	interaction = reg(subdata, @model(y ~ x1 & cluster + x2))
	end
	ests = vcat(confint(pooling)[findfirst(isequal("x1"), coefnames(pooling)):findfirst(isequal("x1"), coefnames(pooling)),:],
	confint(within)[findfirst(isequal("x1"), coefnames(within)):findfirst(isequal("x1"), coefnames(within)),:],
	linearcombination(interaction, data[:cluster]),
	)

	ests2 = vcat(confint(pooling)[findfirst(isequal("x2"), coefnames(pooling)):findfirst(isequal("x2"), coefnames(pooling)),:],
	confint(within)[findfirst(isequal("x2"), coefnames(within)):findfirst(isequal("x2"), coefnames(within)),:],
	confint(interaction)[findfirst(isequal("x2"), coefnames(interaction)):findfirst(isequal("x2"), coefnames(interaction)),:],
	)
	hcat(ests, ests2)
	end

	"""
	trials_n(data::AbstractDataFrame,
	sampling_design::SamplingDesign,
	n::Integer = 1000)

	Return plots and matrices for distribution of parameter estimates and empirical
	coverage rates.
	"""
	function trials_n(data::AbstractDataFrame,
	sampling_design::SamplingDesign,
	n::Integer = 1000)
	seed!(0)
	trials = map(t -> trial(data, sampling_design), 1:n)
	parameters = mapreduce(t -> vcat(mean(t[:,1:2], dims = 2),
	mean(t[:,3:4], dims = 2)),
	hcat,
	trials)
	data_parameters =
	DataFrame(model = repeat(["Pooling", "Within", "Interaction"],
	outer = 2length(trials)),
	dimension = repeat(repeat(["β", "β₂"],
	inner = 3),
	outer = size(parameters, 2)),
	estimate = vec(parameters))
	plt = @df(data_parameters,
	density(:estimate,
	group = (:dimension, :model),
	fill = (.25, 0),
	alpha = 0.85,
	xlab = "Parameter Estimates",
	ylab = "Density (%)")
	)
	plot!(plt, [β], seriestype = "vline", label = "β (ATE for X₁)")
	plot!(plt, [0.5], seriestype = "vline", label = "β₂ (ATE for X₂)")
	limits = vec(mean(parameters, dims = 2))

	coverage = mapreduce(t -> compute_coverage(limits, t),
	vcat,
	trials)
	coverage = mapreduce(model -> mean(coverage[model:2:end,:], dims = 1),
	vcat,
	1:3)
	return (plt, coverage)
	end

	"""
	compute_coverage(limits::AbstractVector{<:Number},
	trials::AbstractMatrix{<:Number})

	Return indicators for whether the probability limits are within the confidence
	intervals.
	"""
	function compute_coverage(limits::AbstractVector{<:Number},
	trials::AbstractMatrix{<:Number})
	k = 1:size(trials, 2)
	output = mapreduce(col -> trials[:,col], vcat, filter(isodd, k)) .≤
	limits .≤
	mapreduce(col -> trials[:,col], vcat, filter(iseven, k))
	reshape(output, 3, 2)
	end

	"""
	Results(::AbstractDataFrame)

	Results hold the plots and matrices which record the results of the trials
	for each condition.
	"""
	struct Results
	plt::Vector{Plot{PyPlotBackend}}
	ecr::Vector{Matrix{Float64}}
	function Results(data::AbstractDataFrame, n::Integer = 1000)
	plt_random, coverage_random =
	trials_n(data, RandomSampling(1e-1), n)
	plt_clustered, coverage_clustered =
	trials_n(data, ClusteredSampling(0.1:0.05:0.3), n)
	plt = [plt_random, plt_clustered]
	ecr = [coverage_random, coverage_clustered]
	return new(plt, ecr)
	end
	end

	# Population Analysis
	seed!(0)
	β = mean(1 .+ sample(-0.5:0.25:0.5, weights(rand(5)), Int(1e5)))

	# HTE dimension constant_mean_variance
	seed!(0)
	data = DataFrame(cluster = sample(-0.5:0.25:0.5, weights(rand(5)), Int(1e5)))
	data[:x1] = rand(Normal(), size(data, 1))
	data[:x2] = rand(Normal(-1, 0.5), size(data, 1))
	data[:y] = -0.2 .+
	(1 .+ data[:cluster]) .* data[:x1] .+
	0.5 .* data[:x2] .+
	rand(Normal(), size(data, 1))
	categorical!(data, :cluster)

	# Need to run this for some reason
	# trials_n(data, RandomSampling(1e-1), 1)

	constant_mean_variance = Results(data)