## Load data.table (version 1.9.3 at the time of benchmarking).
## library() errors immediately if the package is missing, unlike
## require(), which only warns and returns FALSE.
library(data.table) ## 1.9.3
set.seed(1L) # fix the RNG so the sampled data (and hence the timings) are reproducible
## 1e8 rows, ~1e3 distinct IDs, 26 GROUP levels -- large enough to
## make per-ID grouping expensive and worth benchmarking.
DT <- data.table(ID = sample(1e3, 1e8, TRUE), GROUP = sample(letters, 1e8, TRUE))
## All benchmarks are the minimum of three consecutive runs.
## Approach 1: count the distinct GROUP values within each ID directly.
system.time(ans1 <- DT[, list(N=length(unique(GROUP))), by=ID])
# user system elapsed
# 8.677 1.939 10.864
## Approach 2: deduplicate (ID, GROUP) pairs first, then simply count
## rows per ID -- each remaining row is one distinct GROUP for that ID.
system.time(ans2 <- unique(DT)[, .N, by=ID])
# user system elapsed
# 7.054 0.948 8.181
identical(ans1, ans2) # [1] TRUE  -- both approaches produce the same result
## dplyr comparison. library() errors if the package is missing,
## unlike require(), which only warns and returns FALSE.
library(dplyr) ## latest commit from github
## Convert DT to a plain data.frame in place (no copy), so that dplyr
## dispatches its data.frame methods rather than data.table's.
setDF(DT)
## group_by() is timed separately: it does real work up front
## (building the group index) before any summarise() runs.
system.time(DT_g <- DT %>% group_by(ID))
# user system elapsed
# 7.688 1.369 9.686
gc() ## needed this for measuring timing correctly. group_by seems to take quite a bit of memory.
## Approach 3: dplyr's n_distinct() on the pre-grouped data.
system.time(ans3 <- DT_g %>% summarise(N = n_distinct(GROUP)))
# user system elapsed
# 16.170 0.050 16.618
## Approach 4: plain length(unique(.)) on the same pre-grouped data.
system.time(ans4 <- DT_g %>% summarise(N = length(unique(GROUP))))
# user system elapsed
# 7.108 2.421 9.705
identical(ans3, ans4) # [1] TRUE  -- both dplyr approaches agree
## Cross-check: data.table's ans1 (sorted, coerced to data.frame)
## matches dplyr's ans3, so all four answers are consistent.
identical(setDF(setorder(ans1)), as.data.frame(ans3)) # [1] TRUE
## n_distinct() seems slower than length(unique(.)) here -- not sure why,
## especially since ?n_distinct says:
##   "This is a faster and more concise equivalent of length(unique(x))"