Created
September 28, 2018 19:24
-
-
Save Nosferican/597bf20e73baf86e66b7ab208715f175 to your computer and use it in GitHub Desktop.
Issue with Atom/Juno
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: José Bayoán Santiago Calderón (@Nosferican) | |
# Set up the environment for this application | |
using Distributions: Normal, TDist | |
using Plots: Plot, PyPlotBackend, plot, density, savefig, plot!, pyplot | |
using StatPlots: StatPlots, @df | |
using Random: seed! | |
using Statistics: mean, quantile | |
using StatsBase: coef, coefnames, confint, countmap, dof_residual, levelsmap, | |
RegressionModel, sample, shuffle, vcov, weights | |
using DataFrames: AbstractDataFrame, categorical!, DataFrame | |
using FixedEffectModels: reg, @model | |
import StatsBase: sample | |
pyplot() | |
# Helpers | |
""" | |
SamplingDesign | |
Abstract type for sampling designs. | |
""" | |
abstract type SamplingDesign end | |
""" | |
RandomSampling | |
Struct for random sampling (i.e., every observation has equal probability of | |
being observed). The sampling probability is given by `p`. | |
""" | |
struct RandomSampling <: SamplingDesign | |
p::Float64 | |
function RandomSampling(p) | |
zero(p) ≤ p ≤ one(p) || throw(ArgumentError("p ∉ [0, 1]")) | |
return new(p) | |
end | |
end | |
""" | |
ClusteredSampling | |
Struct for clustered sampling (i.e., the sampling probability of an observation | |
is cluster dependent). The sampling probability for each cluster is given by | |
`p`. | |
""" | |
struct ClusteredSampling <: SamplingDesign | |
p::Vector{Float64} | |
function ClusteredSampling(p::AbstractVector) | |
all(elem -> zero(elem) ≤ elem ≤ one(elem), p) || | |
throw(ArgumentError("p contains non valid probability elements")) | |
return new(p) | |
end | |
end | |
function sample(cluster::AbstractVector, design::RandomSampling) | |
p = getfield(design, :p) | |
sample([true, false], weights([p, one(p) - p]), length(cluster)) | |
end | |
function sample(cluster::AbstractVector, design::ClusteredSampling) | |
design = ClusteredSampling(0.1:0.05:0.3) | |
p = Dict(zip(unique(data[:cluster]), shuffle(getfield(design, :p)))) | |
output = sample.(Ref([true, false]), | |
weights.((p -> [p, one(p) - p]).(get.(Ref(p), | |
data[:cluster], | |
nothing)))) | |
return output | |
end | |
""" | |
linearcombination(model::RegressionModel, subgroups::AbstractVector) | |
Given a model and subgroups (with a single interaction term), it gives the | |
parameter estimate and standard error. | |
""" | |
function linearcombination(model::RegressionModel, subgroups::AbstractVector) | |
Vars = occursin.("&", coefnames(model)) | |
shares = values(sort(countmap(subgroups))) | |
shares = shares ./ sum(shares) | |
L = zeros(length(Vars)) | |
L[Vars] = shares | |
β̂₁ = L' * coef(model) | |
σ̂₁ = sqrt(L' * vcov(model) * L) | |
tstar = quantile(TDist(dof_residual(model)), 1 - 5e-2 / 2) | |
movement = tstar * σ̂₁ | |
LB = β̂₁ - movement | |
UB = β̂₁ + movement | |
return [LB UB] | |
end | |
""" | |
trial(data::AbstractDataFrame, | |
sampling_design::SamplingDesign = RandomSampling(1e-1)) | |
Return the estimates for pooling, within, and interactions under the sampling. | |
""" | |
function trial(data::AbstractDataFrame, | |
sampling_design::SamplingDesign = RandomSampling(1e-1)) | |
obs = sample(data[:cluster], sampling_design) | |
subdata = view(data, obs) | |
if sampling_design isa RandomSampling | |
pooling = reg(subdata, @model(y ~ x1 + x2)) | |
within = reg(subdata, @model(y ~ x1 + x2, | |
fe = cluster)) | |
interaction = reg(subdata, @model(y ~ x1 & cluster + x2)) | |
else | |
pooling = reg(subdata, @model(y ~ x1 + x2, | |
vcov = cluster(cluster))) | |
within = reg(subdata, @model(y ~ x1 + x2, | |
fe = cluster, | |
vcov = cluster(cluster))) | |
interaction = reg(subdata, @model(y ~ x1 & cluster + x2)) | |
end | |
ests = vcat(confint(pooling)[findfirst(isequal("x1"), coefnames(pooling)):findfirst(isequal("x1"), coefnames(pooling)),:], | |
confint(within)[findfirst(isequal("x1"), coefnames(within)):findfirst(isequal("x1"), coefnames(within)),:], | |
linearcombination(interaction, data[:cluster]), | |
) | |
ests2 = vcat(confint(pooling)[findfirst(isequal("x2"), coefnames(pooling)):findfirst(isequal("x2"), coefnames(pooling)),:], | |
confint(within)[findfirst(isequal("x2"), coefnames(within)):findfirst(isequal("x2"), coefnames(within)),:], | |
confint(interaction)[findfirst(isequal("x2"), coefnames(interaction)):findfirst(isequal("x2"), coefnames(interaction)),:], | |
) | |
hcat(ests, ests2) | |
end | |
""" | |
trials_n(data::AbstractDataFrame, | |
sampling_design::SamplingDesign, | |
n::Integer = 1000) | |
Return plots and matrices for distribution of parameter estimates and empirical | |
coverage rates. | |
""" | |
function trials_n(data::AbstractDataFrame, | |
sampling_design::SamplingDesign, | |
n::Integer = 1000) | |
seed!(0) | |
trials = map(t -> trial(data, sampling_design), 1:n) | |
parameters = mapreduce(t -> vcat(mean(t[:,1:2], dims = 2), | |
mean(t[:,3:4], dims = 2)), | |
hcat, | |
trials) | |
data_parameters = | |
DataFrame(model = repeat(["Pooling", "Within", "Interaction"], | |
outer = 2length(trials)), | |
dimension = repeat(repeat(["β", "β₂"], | |
inner = 3), | |
outer = size(parameters, 2)), | |
estimate = vec(parameters)) | |
plt = @df(data_parameters, | |
density(:estimate, | |
group = (:dimension, :model), | |
fill = (.25, 0), | |
alpha = 0.85, | |
xlab = "Parameter Estimates", | |
ylab = "Density (%)") | |
) | |
plot!(plt, [β], seriestype = "vline", label = "β (ATE for X₁)") | |
plot!(plt, [0.5], seriestype = "vline", label = "β₂ (ATE for X₂)") | |
limits = vec(mean(parameters, dims = 2)) | |
coverage = mapreduce(t -> compute_coverage(limits, t), | |
vcat, | |
trials) | |
coverage = mapreduce(model -> mean(coverage[model:2:end,:], dims = 1), | |
vcat, | |
1:3) | |
return (plt, coverage) | |
end | |
""" | |
compute_coverage(limits::AbstractVector{<:Number}, | |
trials::AbstractMatrix{<:Number}) | |
Return indicators for whether the probability limits are within the confidence | |
intervals. | |
""" | |
function compute_coverage(limits::AbstractVector{<:Number}, | |
trials::AbstractMatrix{<:Number}) | |
k = 1:size(trials, 2) | |
output = mapreduce(col -> trials[:,col], vcat, filter(isodd, k)) .≤ | |
limits .≤ | |
mapreduce(col -> trials[:,col], vcat, filter(iseven, k)) | |
reshape(output, 3, 2) | |
end | |
""" | |
Results(::AbstractDataFrame) | |
Results hold the plots and matrices which record the results of the trials | |
for each condition. | |
""" | |
struct Results | |
plt::Vector{Plot{PyPlotBackend}} | |
ecr::Vector{Matrix{Float64}} | |
function Results(data::AbstractDataFrame, n::Integer = 1000) | |
plt_random, coverage_random = | |
trials_n(data, RandomSampling(1e-1), n) | |
plt_clustered, coverage_clustered = | |
trials_n(data, ClusteredSampling(0.1:0.05:0.3), n) | |
plt = [plt_random, plt_clustered] | |
ecr = [coverage_random, coverage_clustered] | |
return new(plt, ecr) | |
end | |
end | |
# Population Analysis | |
seed!(0) | |
β = mean(1 .+ sample(-0.5:0.25:0.5, weights(rand(5)), Int(1e5))) | |
# HTE dimension constant_mean_variance | |
seed!(0) | |
data = DataFrame(cluster = sample(-0.5:0.25:0.5, weights(rand(5)), Int(1e5))) | |
data[:x1] = rand(Normal(), size(data, 1)) | |
data[:x2] = rand(Normal(-1, 0.5), size(data, 1)) | |
data[:y] = -0.2 .+ | |
(1 .+ data[:cluster]) .* data[:x1] .+ | |
0.5 .* data[:x2] .+ | |
rand(Normal(), size(data, 1)) | |
categorical!(data, :cluster) | |
# Need to run this for some reason | |
# trials_n(data, RandomSampling(1e-1), 1) | |
constant_mean_variance = Results(data) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment