These functions are in response to StackOverflow questions like this, wanting to fill in missing (NA) values with preceding values, optionally by group.
# Replace every NA in `v` with the closest non-NA value that precedes it.
# Leading NAs have nothing to copy from and are left as NA.
fill_down <- function(v) {
  # Vectors of length 0 or 1 are returned untouched.
  if (length(v) <= 1) {
    return(v)
  }
  # Source positions: the first element unconditionally, then each non-NA.
  is_source <- c(TRUE, !is.na(v[-1]))
  sources <- v[is_source]
  # The running count of source positions gives, for every element, the
  # index (within `sources`) of the most recent source value.
  sources[cumsum(is_source)]
}
# Fill NA values downward separately within each group: ave() splits `v`
# by the levels of `grp`, applies fill_down() to every piece, and puts the
# results back in the original element order.
fill_down_by_group <- function(v, grp) {
  ave(v, grp, FUN = fill_down)
}
# Fill NA values in `v` with the preceding non-NA value, restarting at the
# first element of every group.
#
# Args:
#   v:   vector to fill.
#   grp: group labels, one per element of `v`. Elements of the same group
#        must be adjacent ("grouped"): only the first occurrence of a label
#        acts as a boundary, so a later, separate run of an already-seen
#        label would be filled from whatever value precedes it.
#
# Returns: a vector like `v`. An NA at the start of a group is left as NA
# because there is no earlier value in that group to copy.
fill_down_by_grouped <- function(v, grp) {
  is_repeat <- duplicated(grp)
  stopifnot(length(is_repeat) == length(v))
  if (length(v) == 0) {
    return(v)
  }
  # Keep the first element of each group (even when it is NA, so values never
  # leak across a group boundary) plus every non-NA value. The boundary test
  # has to look at `grp`, not at `v`.
  keep <- !(is_repeat & is.na(v))
  v[keep][cumsum(keep)]
}
They aren't particularly efficient (filling down requires only a single pass through the data, but here we make 4(?) passes), but they: (a) scale linearly with the length of the vector or number of groups; (b) are relatively performant (e.g., <0.5s for a vector of length 10M, <10s for 1M groups); and (c) are not too cryptic to understand.
library(microbenchmark)
# Benchmark fill_down() on vectors of length 1e7 in which 0%, 10%, 50% and
# 90% of the elements (chosen at random) have been set to NA.
n <- 1e7
x0 <- seq_len(n)
x10 <- x0
x50 <- x0
x90 <- x0
x10[sample(n, 0.1 * n)] <- NA
x50[sample(n, 0.5 * n)] <- NA
x90[sample(n, 0.9 * n)] <- NA
microbenchmark( # about 225ms for me, independent of # NAs
  fill_down(x0),
  fill_down(x10),
  fill_down(x50),
  fill_down(x90),
  times = 5
)
# Group labels for consecutive runs of 10, 100 and 1000 elements.
grp10 <- rep(seq_len(n / 10), each = 10)
grp100 <- rep(seq_len(n / 100), each = 100)
grp1000 <- rep(seq_len(n / 1000), each = 1000)
# Benchmark the ave()-based version; grouping by x0 puts every element in
# its own group.
microbenchmark( # about 36, 8, 4, 2s for me
  fill_down_by_group(x0, x0),
  fill_down_by_group(x0, grp10),
  fill_down_by_group(x0, grp100),
  fill_down_by_group(x0, grp1000),
  times = 1
)
# Benchmark the vectorised version that expects already-grouped input.
microbenchmark( # about 300ms for me, independent of grp
  fill_down_by_grouped(x0, x0),
  fill_down_by_grouped(x0, grp10),
  fill_down_by_grouped(x0, grp100),
  times = 10
)