Skip to content

Instantly share code, notes, and snippets.

@TysonStanley
Created October 12, 2019 17:34
Show Gist options
  • Save TysonStanley/f64ed76e4859199e22904f9a0ee849c9 to your computer and use it in GitHub Desktop.
Save TysonStanley/f64ed76e4859199e22904f9a0ee849c9 to your computer and use it in GitHub Desktop.
Using the `profmem` package to understand memory allocation when summarizing data by group with both `data.table` and `dplyr`
library(bench) # assess speed and memory
library(data.table) # data.table for all of its stuff
library(dplyr) # compare it to data.table
library(profmem) # assess the process of R functions
# Fix the RNG seed so the simulated data are reproducible.
set.seed(84322)

# Example data: one million rows with a two-level grouping factor (`grp`),
# a standard-normal column (`x`) and a uniform column (`y`).
d <- data.table(
  grp = factor(sample(c(1, 2), size = 1e6, replace = TRUE)),
  x = rnorm(1e6),
  y = runif(1e6)
)

# Show the table (only the first and last rows are printed).
d
#> grp x y
#> 1: 1 0.2558379 0.2034364
#> 2: 2 -0.8886153 0.4684875
#> 3: 2 0.4724519 0.6850357
#> 4: 1 0.7360537 0.4890217
#> 5: 1 0.6855063 0.6964860
#> ---
#> 999996: 1 2.1008965 0.3624327
#> 999997: 2 2.2423628 0.2595716
#> 999998: 2 1.5314115 0.1102460
#> 999999: 2 -1.6086973 0.2679477
#> 1000000: 2 1.2518419 0.6566943
# Memory footprint of the whole table, then of the grouping column alone.
lobstr::obj_size(d)
lobstr::obj_size(d[["grp"]])
#> 20,001,752 B
#> 4,000,560 B
# Copy the data into each container class used for the summaries:
# a base data.frame (`df`), a tibble (`tbl`, used by the dplyr runs)
# and a data.table (`dt`, used by the data.table runs).
df <- as.data.frame(copy(d))
tbl <- as_tibble(copy(d))
dt <- copy(d)
# Profile dplyr: log every memory allocation made while computing the
# grouped mean, keeping only the allocation size (`bytes`) and the call
# stack that triggered it (`calls`).
select(
  data.frame(
    profmem::profmem(summarize(group_by(tbl, grp), mean(x)))
  ),
  bytes, calls
)
#> bytes calls
#> 1 1997104 summarize() -> group_by() -> group_by.data.frame() -> grouped_df() -> grouped_df_impl()
#> 2 2003000 summarize() -> group_by() -> group_by.data.frame() -> grouped_df() -> grouped_df_impl()
#>
# Benchmark dplyr: 25 iterations of the grouped mean, reporting only the
# median run time and the memory allocated.
select(
  bench::mark(
    summarize(group_by(tbl, grp), mean(x)),
    iterations = 25
  ),
  median, mem_alloc
)
#> # A tibble: 1 x 2
#> median mem_alloc
#> <bch:tm> <bch:byt>
#> 1 26.7ms 3.81MB
# Profile data.table with GForce: a bare `mean(x)` in `j` goes through
# data.table's gforce() path, which shows up in the allocation call stacks.
select(
  data.frame(
    profmem::profmem(dt[, mean(x), by = grp])
  ),
  bytes, calls
)
#> bytes calls
#> 1 280 [() -> [.data.table() -> new.env()
#> 2 8240 [() -> [.data.table() -> null.data.table() -> setalloccol()
#> 3 8240 [() -> [.data.table() -> null.data.table() -> setalloccol()
#> 4 4000048 [() -> [.data.table() -> forderv()
#> 5 2003000 [() -> [.data.table()
#> 6 8248 [() -> [.data.table()
#> 7 4005944 [() -> [.data.table()
#> 8 8248 [() -> [.data.table()
#> 9 280 [() -> [.data.table() -> new.env()
#> 10 248 [() -> [.data.table()
#> 11 248 [() -> [.data.table()
#> 12 248 [() -> [.data.table()
#> 13 248 [() -> [.data.table()
#> 14 248 [() -> [.data.table()
#> 15 248 [() -> [.data.table()
#> 16 248 [() -> [.data.table()
#> 17 248 [() -> [.data.table()
#> 18 248 [() -> [.data.table()
#> 19 248 [() -> [.data.table()
#> 20 248 [() -> [.data.table()
#> 21 248 [() -> [.data.table()
#> 22 248 [() -> [.data.table()
#> 23 248 [() -> [.data.table()
#> 24 248 [() -> [.data.table()
#> 25 4000056 [() -> [.data.table() -> gforce()
#> 26 2000056 [() -> [.data.table() -> gforce()
#> 27 2000056 [() -> [.data.table() -> gforce()
#> 28 16000056 [() -> [.data.table() -> gforce()
#> 29 8256 [() -> [.data.table() -> setalloccol()
#> 30 8256 [() -> [.data.table() -> setalloccol()
# Benchmark data.table with GForce: 25 iterations, reporting only the
# median run time and the memory allocated.
select(
  bench::mark(
    dt[, mean(x), by = grp],
    iterations = 25
  ),
  median, mem_alloc
)
#> # A tibble: 1 x 2
#> median mem_alloc
#> <bch:tm> <bch:byt>
#> 1 16.8ms 32.5MB
# Profile data.table without GForce: writing the call as `base::mean(x)`
# keeps `j` off the gforce() path, so those allocations disappear from
# the trace.
select(
  data.frame(
    profmem::profmem(dt[, base::mean(x), by = grp])
  ),
  bytes, calls
)
#> bytes calls
#> 1 280 [() -> [.data.table() -> new.env()
#> 2 8240 [() -> [.data.table() -> null.data.table() -> setalloccol()
#> 3 8240 [() -> [.data.table() -> null.data.table() -> setalloccol()
#> 4 4000048 [() -> [.data.table() -> forderv()
#> 5 2003000 [() -> [.data.table()
#> 6 8248 [() -> [.data.table()
#> 7 4005944 [() -> [.data.table()
#> 8 8248 [() -> [.data.table()
#> 9 2003000 [() -> [.data.table()
#> 10 8256 [() -> [.data.table() -> setalloccol()
#> 11 8256 [() -> [.data.table() -> setalloccol()
# Benchmark data.table without GForce: 25 iterations, reporting only the
# median run time and the memory allocated.
select(
  bench::mark(
    dt[, base::mean(x), by = grp],
    iterations = 25
  ),
  median, mem_alloc
)
#> # A tibble: 1 x 2
#> median mem_alloc
#> <bch:tm> <bch:byt>
#> 1 10.5ms 11.5MB
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment