Skip to content

Instantly share code, notes, and snippets.

@vsbuffalo
Created March 1, 2019 19:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vsbuffalo/4218bafee0a651f3f5e5408500516b9e to your computer and use it in GitHub Desktop.
Save vsbuffalo/4218bafee0a651f3f5e5408500516b9e to your computer and use it in GitHub Desktop.
comparing two implementations of covariance with pairwise complete cases
library(tidyverse)
library(MASS)
pcov <- function(x) {
xs <- scale(x, scale=FALSE)
dd <- as.integer(!is.na(x))
dim(dd) <- dim(x)
denom <- (t(dd) %*% dd) - 1L
no_obs <- denom == 0L
xs[is.na(xs)] <- 0
covmat <- t(xs) %*% xs
covmat[no_obs] <- NA
return(covmat/denom)
}
random_covmat <- function(n) {
# see: https://stats.stackexchange.com/questions/215497/how-to-create-an-arbitrary-covariance-matrix
p <- qr.Q(qr(matrix(rnorm(n^2), n)))
return(crossprod(p, p*(n:1)))
}
mvrnorm_na <- function(n, Sigma, prop_na=0, mu=rep(0, ncol(Sigma))) {
samp <- mvrnorm(n, mu=mu, Sigma=Sigma)
nai <- sample(length(samp), floor(prop_na * length(samp)))
samp[nai] <- NA
samp
}
do <- crossing(rep=1:50, dim=c(5, 10), prop_na=seq(0, 0.9, length.out=10), n=c(10, 100)) %>%
mutate(covmat=map(dim, random_covmat)) %>%
mutate(samples=pmap(list(n, covmat, prop_na), mvrnorm_na))
# Absolute Differences Between elements
d <- do %>% mutate(pcov=map_dbl(samples, ~ sum(pcov(.), na.rm=TRUE))) %>%
mutate(rpcov=map_dbl(samples, ~ sum(cov(., use='pairwise.complete'), na.rm=TRUE)))
abs_diff <- function(x, y, na.rm=TRUE) sum(abs(x-y), na.rm=na.rm)
mean_abs_diff <- function(x, y, na.rm=TRUE) mean(abs(x-y), na.rm=na.rm)
ds <- do %>% mutate(pcov=map(samples, pcov)) %>%
mutate(rpcov=map(samples, cov, use='pairwise.complete')) %>%
mutate(`my implementation`=map2_dbl(pcov, covmat, mean_abs_diff),
`R's pairwise complete`=map2_dbl(rpcov, covmat, mean_abs_diff)) %>%
gather(type, bias, `my implementation`, `R's pairwise complete`)
## Looking at Total Variance (summing elements of cov matrix)
ds %>%
mutate(dim = paste0("dim = ", dim), n=paste0("n = ", n)) %>%
ggplot(aes(prop_na, bias, group=interaction(prop_na, type), color=type)) +
geom_boxplot() + facet_grid(dim~n) +
ylab('mean abs. diff between true matrix and sample matrix') +
xlab('proportion NAs')
do %>% mutate(var=map_dbl(covmat, sum, na.rm=TRUE)) %>%
mutate(pcov_var=map_dbl(samples, ~ sum(pcov(.), na.rm=TRUE))) %>%
mutate(rpcov_var=map_dbl(samples, ~ sum(cov(., use='pairwise.complete'), na.rm=TRUE))) %>%
mutate(bias_pcov = pcov_var-var, bias_rpcov=rpcov_var-var) %>% gather(type, bias, bias_pcov:bias_rpcov) %>%
group_by(type, dim, prop_na, n) %>% summarize(mean_bias=mean(bias)) %>%
ggplot(aes(prop_na, mean_bias, color=type)) + geom_point() + facet_grid(dim~n)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment