Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Issue with Atom/Juno
# Author: José Bayoán Santiago Calderón (@Nosferican)
# Set up the environment for this application
using Distributions: Normal, TDist
using Plots: Plot, PyPlotBackend, plot, density, savefig, plot!, pyplot
using StatPlots: StatPlots, @df
using Random: seed!
using Statistics: mean, quantile
using StatsBase: coef, coefnames, confint, countmap, dof_residual, levelsmap,
RegressionModel, sample, shuffle, vcov, weights
using DataFrames: AbstractDataFrame, categorical!, DataFrame
using FixedEffectModels: reg, @model
import StatsBase: sample
pyplot()
# Helpers
"""
SamplingDesign
Abstract type for sampling designs.
"""
abstract type SamplingDesign end
"""
RandomSampling
Struct for random sampling (i.e., every observation has equal probability of
being observed). The sampling probability is given by `p`.
"""
struct RandomSampling <: SamplingDesign
p::Float64
function RandomSampling(p)
zero(p) p one(p) || throw(ArgumentError("p ∉ [0, 1]"))
return new(p)
end
end
"""
ClusteredSampling
Struct for clustered sampling (i.e., the sampling probability of an observation
is cluster dependent). The sampling probability for each cluster is given by
`p`.
"""
struct ClusteredSampling <: SamplingDesign
p::Vector{Float64}
function ClusteredSampling(p::AbstractVector)
all(elem -> zero(elem) elem one(elem), p) ||
throw(ArgumentError("p contains non valid probability elements"))
return new(p)
end
end
function sample(cluster::AbstractVector, design::RandomSampling)
p = getfield(design, :p)
sample([true, false], weights([p, one(p) - p]), length(cluster))
end
function sample(cluster::AbstractVector, design::ClusteredSampling)
design = ClusteredSampling(0.1:0.05:0.3)
p = Dict(zip(unique(data[:cluster]), shuffle(getfield(design, :p))))
output = sample.(Ref([true, false]),
weights.((p -> [p, one(p) - p]).(get.(Ref(p),
data[:cluster],
nothing))))
return output
end
"""
linearcombination(model::RegressionModel, subgroups::AbstractVector)
Given a model and subgroups (with a single interaction term), it gives the
parameter estimate and standard error.
"""
function linearcombination(model::RegressionModel, subgroups::AbstractVector)
Vars = occursin.("&", coefnames(model))
shares = values(sort(countmap(subgroups)))
shares = shares ./ sum(shares)
L = zeros(length(Vars))
L[Vars] = shares
β̂₁ = L' * coef(model)
σ̂₁ = sqrt(L' * vcov(model) * L)
tstar = quantile(TDist(dof_residual(model)), 1 - 5e-2 / 2)
movement = tstar * σ̂₁
LB = β̂₁ - movement
UB = β̂₁ + movement
return [LB UB]
end
"""
trial(data::AbstractDataFrame,
sampling_design::SamplingDesign = RandomSampling(1e-1))
Return the estimates for pooling, within, and interactions under the sampling.
"""
function trial(data::AbstractDataFrame,
sampling_design::SamplingDesign = RandomSampling(1e-1))
obs = sample(data[:cluster], sampling_design)
subdata = view(data, obs)
if sampling_design isa RandomSampling
pooling = reg(subdata, @model(y ~ x1 + x2))
within = reg(subdata, @model(y ~ x1 + x2,
fe = cluster))
interaction = reg(subdata, @model(y ~ x1 & cluster + x2))
else
pooling = reg(subdata, @model(y ~ x1 + x2,
vcov = cluster(cluster)))
within = reg(subdata, @model(y ~ x1 + x2,
fe = cluster,
vcov = cluster(cluster)))
interaction = reg(subdata, @model(y ~ x1 & cluster + x2))
end
ests = vcat(confint(pooling)[findfirst(isequal("x1"), coefnames(pooling)):findfirst(isequal("x1"), coefnames(pooling)),:],
confint(within)[findfirst(isequal("x1"), coefnames(within)):findfirst(isequal("x1"), coefnames(within)),:],
linearcombination(interaction, data[:cluster]),
)
ests2 = vcat(confint(pooling)[findfirst(isequal("x2"), coefnames(pooling)):findfirst(isequal("x2"), coefnames(pooling)),:],
confint(within)[findfirst(isequal("x2"), coefnames(within)):findfirst(isequal("x2"), coefnames(within)),:],
confint(interaction)[findfirst(isequal("x2"), coefnames(interaction)):findfirst(isequal("x2"), coefnames(interaction)),:],
)
hcat(ests, ests2)
end
"""
trials_n(data::AbstractDataFrame,
sampling_design::SamplingDesign,
n::Integer = 1000)
Return plots and matrices for distribution of parameter estimates and empirical
coverage rates.
"""
function trials_n(data::AbstractDataFrame,
sampling_design::SamplingDesign,
n::Integer = 1000)
seed!(0)
trials = map(t -> trial(data, sampling_design), 1:n)
parameters = mapreduce(t -> vcat(mean(t[:,1:2], dims = 2),
mean(t[:,3:4], dims = 2)),
hcat,
trials)
data_parameters =
DataFrame(model = repeat(["Pooling", "Within", "Interaction"],
outer = 2length(trials)),
dimension = repeat(repeat(["β", "β₂"],
inner = 3),
outer = size(parameters, 2)),
estimate = vec(parameters))
plt = @df(data_parameters,
density(:estimate,
group = (:dimension, :model),
fill = (.25, 0),
alpha = 0.85,
xlab = "Parameter Estimates",
ylab = "Density (%)")
)
plot!(plt, [β], seriestype = "vline", label = "β (ATE for X₁)")
plot!(plt, [0.5], seriestype = "vline", label = "β₂ (ATE for X₂)")
limits = vec(mean(parameters, dims = 2))
coverage = mapreduce(t -> compute_coverage(limits, t),
vcat,
trials)
coverage = mapreduce(model -> mean(coverage[model:2:end,:], dims = 1),
vcat,
1:3)
return (plt, coverage)
end
"""
compute_coverage(limits::AbstractVector{<:Number},
trials::AbstractMatrix{<:Number})
Return indicators for whether the probability limits are within the confidence
intervals.
"""
function compute_coverage(limits::AbstractVector{<:Number},
trials::AbstractMatrix{<:Number})
k = 1:size(trials, 2)
output = mapreduce(col -> trials[:,col], vcat, filter(isodd, k)) .≤
limits .≤
mapreduce(col -> trials[:,col], vcat, filter(iseven, k))
reshape(output, 3, 2)
end
"""
Results(::AbstractDataFrame)
Results hold the plots and matrices which record the results of the trials
for each condition.
"""
struct Results
plt::Vector{Plot{PyPlotBackend}}
ecr::Vector{Matrix{Float64}}
function Results(data::AbstractDataFrame, n::Integer = 1000)
plt_random, coverage_random =
trials_n(data, RandomSampling(1e-1), n)
plt_clustered, coverage_clustered =
trials_n(data, ClusteredSampling(0.1:0.05:0.3), n)
plt = [plt_random, plt_clustered]
ecr = [coverage_random, coverage_clustered]
return new(plt, ecr)
end
end
# Population Analysis
seed!(0)
β = mean(1 .+ sample(-0.5:0.25:0.5, weights(rand(5)), Int(1e5)))
# HTE dimension constant_mean_variance
seed!(0)
data = DataFrame(cluster = sample(-0.5:0.25:0.5, weights(rand(5)), Int(1e5)))
data[:x1] = rand(Normal(), size(data, 1))
data[:x2] = rand(Normal(-1, 0.5), size(data, 1))
data[:y] = -0.2 .+
(1 .+ data[:cluster]) .* data[:x1] .+
0.5 .* data[:x2] .+
rand(Normal(), size(data, 1))
categorical!(data, :cluster)
# Need to run this for some reason
# trials_n(data, RandomSampling(1e-1), 1)
constant_mean_variance = Results(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment