-
-
Save NErler/0d00375da460dd33839b98faeee2fdab to your computer and use it in GitHub Desktop.
# Compare proportions in categorical variables between observed and imputed data | |
# visually (using ggplot) | |
# | |
# Parameters: | |
# x: mids object (from mice) | |
# formula: formula describing which variables to plot | |
# facet: either "wrap" for facet_wrap or "grid" for facet_grid | |
# ...: additional parameters passed to theme() | |
# | |
# Note: if the formula is not specified, all imputed categorical variables are | |
# plotted. | |
# | |
# A formula has the structure: | |
# categorical variables ~ faceting variables | color variable | |
# | |
# By default, .imp (imputation set identifier) will be used as color variable. | |
# | |
# This function uses the following packages: | |
# - mice | |
# - reshape2 | |
# - RColorBrewer | |
# - ggplot2 | |
propplot <- function(x, formula, facet = "wrap", ...) { | |
library(ggplot2) | |
cd <- data.frame(mice::complete(x, "long", include = TRUE)) | |
cd$.imp <- factor(cd$.imp) | |
r <- as.data.frame(is.na(x$data)) | |
impcat <- x$meth != "" & sapply(x$data, is.factor) | |
vnames <- names(impcat)[impcat] | |
if (missing(formula)) { | |
formula <- as.formula(paste(paste(vnames, collapse = "+", | |
sep = ""), "~1", sep = "")) | |
} | |
tmsx <- terms(formula[-3], data = x$data) | |
xnames <- attr(tmsx, "term.labels") | |
xnames <- xnames[xnames %in% vnames] | |
if (paste(formula[3]) != "1") { | |
wvars <- gsub("[[:space:]]*\\|[[:print:]]*", "", paste(formula)[3]) | |
# wvars <- all.vars(as.formula(paste("~", wvars))) | |
wvars <- attr(terms(as.formula(paste("~", wvars))), "term.labels") | |
if (grepl("\\|", formula[3])) { | |
svars <- gsub("[[:print:]]*\\|[[:space:]]*", "", paste(formula)[3]) | |
svars <- all.vars(as.formula(paste("~", svars))) | |
} else { | |
svars <- ".imp" | |
} | |
} else { | |
wvars <- NULL | |
svars <- ".imp" | |
} | |
for (i in seq_along(xnames)) { | |
xvar <- xnames[i] | |
select <- cd$.imp != 0 & !r[, xvar] | |
cd[select, xvar] <- NA | |
} | |
for (i in which(!wvars %in% names(cd))) { | |
cd[, wvars[i]] <- with(cd, eval(parse(text = wvars[i]))) | |
} | |
meltDF <- reshape2::melt(cd[, c(wvars, svars, xnames)], id.vars = c(wvars, svars)) | |
meltDF <- meltDF[!is.na(meltDF$value), ] | |
wvars <- if (!is.null(wvars)) paste0("`", wvars, "`") | |
a <- plyr::ddply(meltDF, c(wvars, svars, "variable", "value"), plyr::summarize, | |
count = length(value)) | |
b <- plyr::ddply(meltDF, c(wvars, svars, "variable"), plyr::summarize, | |
tot = length(value)) | |
mdf <- merge(a,b) | |
mdf$prop <- mdf$count / mdf$tot | |
plotDF <- merge(unique(meltDF), mdf) | |
plotDF$value <- factor(plotDF$value, | |
levels = unique(unlist(lapply(x$data[, xnames], levels))), | |
ordered = T) | |
p <- ggplot(plotDF, aes(x = value, fill = get(svars), y = prop)) + | |
geom_bar(position = "dodge", stat = "identity") + | |
theme(legend.position = "bottom", ...) + | |
ylab("proportion") + | |
scale_fill_manual(name = "", | |
values = c("black", | |
colorRampPalette( | |
RColorBrewer::brewer.pal(9, "Blues"))(x$m + 3)[1:x$m + 3])) + | |
guides(fill = guide_legend(nrow = 1)) | |
if (facet == "wrap") | |
if (length(xnames) > 1) { | |
print(p + facet_wrap(c("variable", wvars), scales = "free")) | |
} else { | |
if (is.null(wvars)) { | |
print(p) | |
} else { | |
print(p + facet_wrap(wvars, scales = "free")) | |
} | |
} | |
if (facet == "grid") | |
if (!is.null(wvars)) { | |
print(p + facet_grid(paste(paste(wvars, collapse = "+"), "~ variable"), | |
scales = "free")) | |
} | |
} |
@guilhermeparreira , did you ever get an answer? I have the same question!
Here a minimal example:
library("mice")
imp <- mice(nhanes2)
propplot(imp)
Each bar represents the proportion of the data in the category given on the x-axis in one (imputed) dataset. The black bars show the distribution of the original, incomplete data. The blue bars show the corresponding distribution in the imputed values from each of the (5, in this case) imputed datasets.
When the blue bars are approximately as high as the black bar, the distribution of the categorical variable in the imputed values is similar to the distribution in the observed data. Note, however, that here the marginal distributions are compared, but the MAR assumption is that the incomplete cases have the same distribution as the observed cases conditional on the other variables.
So if you do see differences this does not necessarily indicate a problem with the imputation. It could be that differences can be explained by covariates used in the imputation model.
@NErler thank you so much, this is very helpful! One quick followup: what metric do you use to gauge whether the imputed value distribution is similar to the observed value distribution?
Any formal metric for evaluation of the fit of the imputation models should use the conditional distribution used during imputation, but this is not what the plots show. To me, the main point of these plots is to have some help in carefully investigating the imputed values, but the interpretation has to be based on common sense.
Thank you for making this function. I notice bars are missing for imputation iterations that do not vary across the levels of a specific categorical variable. Can that be modified so the bar is at zero?
Hi there, thanks for your tutorial.
I am trying to interpret this graphic (propplot.R), but I don't understand how to interpret the data provided by the "fill" variable in legend (Are those the imputed values?)
Could you please, provide a toy example?
I am doing MICE for ordinal variables for likert scale. And I think that this graphic will help me a lot!
Thanks in advance.