library(tidyverse)
library(Amelia)
library(broom)
# Use the africa dataset from Amelia
data(africa)
set.seed(1234)
imp_amelia <- amelia(x = africa, m = 5, cs = "country", ts = "year", logs = "gdp_pc", p2s = 0)
# Gather all the imputed datasets into one data frame and run a model on each
models_imputed_df <- bind_rows(unclass(imp_amelia$imputations), .id = "m") %>%
group_by(m) %>%
nest() %>%
mutate(model = data %>% map(~ lm(gdp_pc ~ trade + civlib, data = .)))
models_imputed_df
#> # A tibble: 5 x 3
#> # Groups: m [5]
#> m data model
#> <chr> <list> <list>
#> 1 imp1 <tibble [120 × 7]> <lm>
#> 2 imp2 <tibble [120 × 7]> <lm>
#> 3 imp3 <tibble [120 × 7]> <lm>
#> 4 imp4 <tibble [120 × 7]> <lm>
#> 5 imp5 <tibble [120 × 7]> <lm>
# That model column has the model coefficients, etc. for each imputed dataset
#
# You can see individual models like this:
# Model with imputed dataset 1
models_imputed_df$model[[1]]
#>
#> Call:
#> lm(formula = gdp_pc ~ trade + civlib, data = .)
#>
#> Coefficients:
#> (Intercept) trade civlib
#> 114.47 18.09 -631.45
# Model with imputed dataset 2
summary(models_imputed_df$model[[2]])
#>
#> Call:
#> lm(formula = gdp_pc ~ trade + civlib, data = .)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -677.89 -229.92 -98.02 166.79 959.54
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 123.230 96.767 1.273 0.205375
#> trade 18.012 1.238 14.547 < 2e-16 ***
#> civlib -626.083 181.189 -3.455 0.000766 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 348.7 on 117 degrees of freedom
#> Multiple R-squared: 0.6537, Adjusted R-squared: 0.6477
#> F-statistic: 110.4 on 2 and 117 DF, p-value: < 2.2e-16
# Or use map() and broom::tidy() to see all the results at the same time
models_imputed_df %>%
mutate(coefs = map(model, tidy)) %>%
unnest(coefs)
#> # A tibble: 15 x 8
#> # Groups: m [5]
#> m data model term estimate std.error statistic p.value
#> <chr> <list> <list> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 imp1 <tibble [120 × … <lm> (Interce… 114. 97.7 1.17 2.44e- 1
#> 2 imp1 <tibble [120 × … <lm> trade 18.1 1.25 14.4 9.65e-28
#> 3 imp1 <tibble [120 × … <lm> civlib -631. 182. -3.46 7.47e- 4
#> 4 imp2 <tibble [120 × … <lm> (Interce… 123. 96.8 1.27 2.05e- 1
#> 5 imp2 <tibble [120 × … <lm> trade 18.0 1.24 14.5 5.27e-28
#> 6 imp2 <tibble [120 × … <lm> civlib -626. 181. -3.46 7.66e- 4
#> 7 imp3 <tibble [120 × … <lm> (Interce… 114. 96.5 1.18 2.40e- 1
#> 8 imp3 <tibble [120 × … <lm> trade 18.2 1.24 14.7 2.72e-28
#> 9 imp3 <tibble [120 × … <lm> civlib -633. 181. -3.51 6.42e- 4
#> 10 imp4 <tibble [120 × … <lm> (Interce… 119. 95.4 1.25 2.14e- 1
#> 11 imp4 <tibble [120 × … <lm> trade 18.2 1.22 14.9 8.29e-29
#> 12 imp4 <tibble [120 × … <lm> civlib -651. 180. -3.62 4.41e- 4
#> 13 imp5 <tibble [120 × … <lm> (Interce… 132. 95.2 1.39 1.69e- 1
#> 14 imp5 <tibble [120 × … <lm> trade 18.0 1.22 14.7 1.97e-28
#> 15 imp5 <tibble [120 × … <lm> civlib -648. 180. -3.60 4.63e- 4
# The coefficients, standard errors, and p-values for each of the 5 models are
# all there, but combining them is a little trickier, since you can't just take
# the average.
# Zelig has ways of combining things, but I don't like using Zelig in general
# because it's a weird package ecosystem with strange syntax that's not very
# tidyverse friendly, I find it easier to combine and meld results more manually.
# Amelia comes with a function named mi.meld() that uses fancy math to take the
# correct averages of imputed coefficients. To make it easier to work with
# dplyr's mutate(), I often use a wrapper function that takes a small dataset of
# all the same coefficients from different imputations (like all the intercepts,
# etc.) and melds them
meld_stuff <- function(x) {
# x is a data frame with m rows and three columns:
#
# m estimate std.error
# 1 1.05 0.34
# 2 1.09 0.28
# x ... ...
# Meld the coefficients and errors using Rubin's rules
x_melded <- mi.meld(matrix(x$estimate), matrix(x$std.error))
tibble(estimate = as.numeric(x_melded$q.mi),
std.error = as.numeric(x_melded$se.mi))
}
# We can then use that function to meld the coefficients together
melded_coefs <- models_imputed_df %>%
mutate(coefs = map(model, tidy)) %>%
unnest(coefs) %>%
# Group all the terms together and work with them separately (so we have rows
# for interept, trade, and civlib)
group_by(term) %>%
# Put all these columns in a single cell
nest(coefs = c(m, data, model, estimate, std.error, statistic, p.value)) %>%
# Feed the coefs column into the meld_stuff() function so that R can meld them correctly
mutate(melded = map(coefs, meld_stuff)) %>%
unnest(melded)
melded_coefs
#> # A tibble: 3 x 4
#> # Groups: term [3]
#> term coefs estimate std.error
#> <chr> <list> <dbl> <dbl>
#> 1 (Intercept) <tibble [5 × 7]> 121. 96.7
#> 2 trade <tibble [5 × 7]> 18.1 1.24
#> 3 civlib <tibble [5 × 7]> -638. 181.
# The estimate and std.error columns are now correctly melded, but there's no
# t-statistic or p-value. That's because melding won't calculate that for you,
# in part because you can't really take the average t-statistic or
# p-value---that's wrong. Calculating your own, though, is fairly easy. The
# t-statistic is just estimate/std.error:
melded_coefs %>%
mutate(t.statistic = estimate / std.error)
#> # A tibble: 3 x 5
#> # Groups: term [3]
#> term coefs estimate std.error t.statistic
#> <chr> <list> <dbl> <dbl> <dbl>
#> 1 (Intercept) <tibble [5 × 7]> 121. 96.7 1.25
#> 2 trade <tibble [5 × 7]> 18.1 1.24 14.6
#> 3 civlib <tibble [5 × 7]> -638. 181. -3.52
# The p-value comes from the pt() function. We need feed it the t-statistic and
# the degrees of freedom. The formula for a two-tailed test is:
#
# 2 * (1 - pt(abs(t.stat), df))
#
# We already have the t-statistic, but we need to find the residual degrees of
# freedom from the model. The glance() function in broom shows us this. We can
# grab it from one of the imputed models:
glance(models_imputed_df$model[[1]])
#> # A tibble: 1 x 11
#> r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
#> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
#> 1 0.649 0.643 351. 108. 2.50e-27 3 -872. 1752. 1763.
#> # … with 2 more variables: deviance <dbl>, df.residual <int>
df_model <- glance(models_imputed_df$model[[1]])$df.residual
df_model
#> [1] 117
# Now we can use it in mutate() to calculate the p-values:
melded_coefs %>%
mutate(t.statistic = estimate / std.error,
p.value = 2 * (1 - pt(abs(t.statistic), df_model)))
#> # A tibble: 3 x 6
#> # Groups: term [3]
#> term coefs estimate std.error t.statistic p.value
#> <chr> <list> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) <tibble [5 × 7]> 121. 96.7 1.25 0.215
#> 2 trade <tibble [5 × 7]> 18.1 1.24 14.6 0
#> 3 civlib <tibble [5 × 7]> -638. 181. -3.52 0.000613
# That should do it!
Created on 2020-06-19 by the reprex package (v0.3.0)