Skip to content

Instantly share code, notes, and snippets.

@Nosferican
Created September 28, 2018 19:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Nosferican/597bf20e73baf86e66b7ab208715f175 to your computer and use it in GitHub Desktop.
Save Nosferican/597bf20e73baf86e66b7ab208715f175 to your computer and use it in GitHub Desktop.
Issue with Atom/Juno
# Author: José Bayoán Santiago Calderón (@Nosferican)
# Set up the environment for this application
using Distributions: Normal, TDist
using Plots: Plot, PyPlotBackend, plot, density, savefig, plot!, pyplot
using StatPlots: StatPlots, @df
using Random: seed!
using Statistics: mean, quantile
using StatsBase: coef, coefnames, confint, countmap, dof_residual, levelsmap,
RegressionModel, sample, shuffle, vcov, weights
using DataFrames: AbstractDataFrame, categorical!, DataFrame
using FixedEffectModels: reg, @model
import StatsBase: sample
pyplot()
# Helpers
"""
SamplingDesign
Abstract type for sampling designs.
"""
abstract type SamplingDesign end
"""
RandomSampling
Struct for random sampling (i.e., every observation has equal probability of
being observed). The sampling probability is given by `p`.
"""
struct RandomSampling <: SamplingDesign
p::Float64
function RandomSampling(p)
zero(p) ≤ p ≤ one(p) || throw(ArgumentError("p ∉ [0, 1]"))
return new(p)
end
end
"""
ClusteredSampling
Struct for clustered sampling (i.e., the sampling probability of an observation
is cluster dependent). The sampling probability for each cluster is given by
`p`.
"""
struct ClusteredSampling <: SamplingDesign
p::Vector{Float64}
function ClusteredSampling(p::AbstractVector)
all(elem -> zero(elem) ≤ elem ≤ one(elem), p) ||
throw(ArgumentError("p contains non valid probability elements"))
return new(p)
end
end
function sample(cluster::AbstractVector, design::RandomSampling)
p = getfield(design, :p)
sample([true, false], weights([p, one(p) - p]), length(cluster))
end
function sample(cluster::AbstractVector, design::ClusteredSampling)
design = ClusteredSampling(0.1:0.05:0.3)
p = Dict(zip(unique(data[:cluster]), shuffle(getfield(design, :p))))
output = sample.(Ref([true, false]),
weights.((p -> [p, one(p) - p]).(get.(Ref(p),
data[:cluster],
nothing))))
return output
end
"""
linearcombination(model::RegressionModel, subgroups::AbstractVector)
Given a model and subgroups (with a single interaction term), it gives the
parameter estimate and standard error.
"""
function linearcombination(model::RegressionModel, subgroups::AbstractVector)
Vars = occursin.("&", coefnames(model))
shares = values(sort(countmap(subgroups)))
shares = shares ./ sum(shares)
L = zeros(length(Vars))
L[Vars] = shares
β̂₁ = L' * coef(model)
σ̂₁ = sqrt(L' * vcov(model) * L)
tstar = quantile(TDist(dof_residual(model)), 1 - 5e-2 / 2)
movement = tstar * σ̂₁
LB = β̂₁ - movement
UB = β̂₁ + movement
return [LB UB]
end
"""
trial(data::AbstractDataFrame,
sampling_design::SamplingDesign = RandomSampling(1e-1))
Return the estimates for pooling, within, and interactions under the sampling.
"""
function trial(data::AbstractDataFrame,
sampling_design::SamplingDesign = RandomSampling(1e-1))
obs = sample(data[:cluster], sampling_design)
subdata = view(data, obs)
if sampling_design isa RandomSampling
pooling = reg(subdata, @model(y ~ x1 + x2))
within = reg(subdata, @model(y ~ x1 + x2,
fe = cluster))
interaction = reg(subdata, @model(y ~ x1 & cluster + x2))
else
pooling = reg(subdata, @model(y ~ x1 + x2,
vcov = cluster(cluster)))
within = reg(subdata, @model(y ~ x1 + x2,
fe = cluster,
vcov = cluster(cluster)))
interaction = reg(subdata, @model(y ~ x1 & cluster + x2))
end
ests = vcat(confint(pooling)[findfirst(isequal("x1"), coefnames(pooling)):findfirst(isequal("x1"), coefnames(pooling)),:],
confint(within)[findfirst(isequal("x1"), coefnames(within)):findfirst(isequal("x1"), coefnames(within)),:],
linearcombination(interaction, data[:cluster]),
)
ests2 = vcat(confint(pooling)[findfirst(isequal("x2"), coefnames(pooling)):findfirst(isequal("x2"), coefnames(pooling)),:],
confint(within)[findfirst(isequal("x2"), coefnames(within)):findfirst(isequal("x2"), coefnames(within)),:],
confint(interaction)[findfirst(isequal("x2"), coefnames(interaction)):findfirst(isequal("x2"), coefnames(interaction)),:],
)
hcat(ests, ests2)
end
"""
trials_n(data::AbstractDataFrame,
sampling_design::SamplingDesign,
n::Integer = 1000)
Return plots and matrices for distribution of parameter estimates and empirical
coverage rates.
"""
function trials_n(data::AbstractDataFrame,
sampling_design::SamplingDesign,
n::Integer = 1000)
seed!(0)
trials = map(t -> trial(data, sampling_design), 1:n)
parameters = mapreduce(t -> vcat(mean(t[:,1:2], dims = 2),
mean(t[:,3:4], dims = 2)),
hcat,
trials)
data_parameters =
DataFrame(model = repeat(["Pooling", "Within", "Interaction"],
outer = 2length(trials)),
dimension = repeat(repeat(["β", "β₂"],
inner = 3),
outer = size(parameters, 2)),
estimate = vec(parameters))
plt = @df(data_parameters,
density(:estimate,
group = (:dimension, :model),
fill = (.25, 0),
alpha = 0.85,
xlab = "Parameter Estimates",
ylab = "Density (%)")
)
plot!(plt, [β], seriestype = "vline", label = "β (ATE for X₁)")
plot!(plt, [0.5], seriestype = "vline", label = "β₂ (ATE for X₂)")
limits = vec(mean(parameters, dims = 2))
coverage = mapreduce(t -> compute_coverage(limits, t),
vcat,
trials)
coverage = mapreduce(model -> mean(coverage[model:2:end,:], dims = 1),
vcat,
1:3)
return (plt, coverage)
end
"""
compute_coverage(limits::AbstractVector{<:Number},
trials::AbstractMatrix{<:Number})
Return indicators for whether the probability limits are within the confidence
intervals.
"""
function compute_coverage(limits::AbstractVector{<:Number},
trials::AbstractMatrix{<:Number})
k = 1:size(trials, 2)
output = mapreduce(col -> trials[:,col], vcat, filter(isodd, k)) .≤
limits .≤
mapreduce(col -> trials[:,col], vcat, filter(iseven, k))
reshape(output, 3, 2)
end
"""
Results(::AbstractDataFrame)
Results hold the plots and matrices which record the results of the trials
for each condition.
"""
struct Results
plt::Vector{Plot{PyPlotBackend}}
ecr::Vector{Matrix{Float64}}
function Results(data::AbstractDataFrame, n::Integer = 1000)
plt_random, coverage_random =
trials_n(data, RandomSampling(1e-1), n)
plt_clustered, coverage_clustered =
trials_n(data, ClusteredSampling(0.1:0.05:0.3), n)
plt = [plt_random, plt_clustered]
ecr = [coverage_random, coverage_clustered]
return new(plt, ecr)
end
end
# Population Analysis
seed!(0)
β = mean(1 .+ sample(-0.5:0.25:0.5, weights(rand(5)), Int(1e5)))
# HTE dimension constant_mean_variance
seed!(0)
data = DataFrame(cluster = sample(-0.5:0.25:0.5, weights(rand(5)), Int(1e5)))
data[:x1] = rand(Normal(), size(data, 1))
data[:x2] = rand(Normal(-1, 0.5), size(data, 1))
data[:y] = -0.2 .+
(1 .+ data[:cluster]) .* data[:x1] .+
0.5 .* data[:x2] .+
rand(Normal(), size(data, 1))
categorical!(data, :cluster)
# Need to run this for some reason
# trials_n(data, RandomSampling(1e-1), 1)
constant_mean_variance = Results(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment