Skip to content

Instantly share code, notes, and snippets.

@jthomasmock
Last active March 16, 2021 18:00
Show Gist options
  • Save jthomasmock/94f4444dd3fd28340d40074e35fc87bc to your computer and use it in GitHub Desktop.
Save jthomasmock/94f4444dd3fd28340d40074e35fc87bc to your computer and use it in GitHub Desktop.
library(tidyverse)
library(tictoc)
library(arrow)
tic()
con <- DBI::dbConnect(duckdb::duckdb(), "data/pbp_db.duckdb")
nfl_pbp <- tbl(con, "nflfastR_pbp")
toc()
# 0.123 sec elapsed
tic()
con_lite <- DBI::dbConnect(RSQLite::SQLite(), "data/pbp_db")
nfl_pbp_lite <- tbl(con_lite, "nflfastR_pbp")
toc()
# 0.141 sec elapsed
tic()
nfl_parq <- open_dataset("nflfastr-parquet")
toc()
# 0.077 sec elapsed
tic()
nfl_parq_shard <- open_dataset("nflfastr-shard", partitioning = schema(season = int32(), week = string()))
toc()
# 0.038 sec elapsed
### Here's a benchmark
sqlite_expr <- function(){
con_lite <- DBI::dbConnect(RSQLite::SQLite(), "data/pbp_db")
nfl_pbp_lite <- tbl(con_lite, "nflfastR_pbp")
out <- nfl_pbp_lite %>%
select(epa, posteam, season, week) %>%
filter(
season %in% c(2017, 2018, 2019),
week %in% c(8:16),
!is.na(posteam)
) %>%
group_by(posteam) %>%
summarise(mean_epa = mean(epa, na.rm = TRUE)) %>%
arrange(desc(mean_epa)) %>%
ungroup() %>%
collect()
duckdb::dbDisconnect(con_lite)
out
}
duck_expr <- function(){
con <- DBI::dbConnect(duckdb::duckdb(), "data/pbp_db.duckdb")
nfl_pbp <- tbl(con, "nflfastR_pbp")
out <- nfl_pbp %>%
select(epa, posteam, season, week) %>%
filter(
season %in% c(2017, 2018, 2019),
week %in% c(8:16),
!is.na(posteam)
) %>%
group_by(posteam) %>%
summarise(mean_epa = mean(epa,na.rm = TRUE)) %>%
ungroup() %>%
arrange(desc(mean_epa)) %>%
collect()
duckdb::dbDisconnect(con)
out
}
parquet_expr <- function(){
nfl_parq %>%
select(epa, posteam, season, week) %>%
filter(
season %in% c(2017, 2018, 2019),
week %in% c(8:16),
!is.na(posteam)
) %>%
collect() %>%
group_by(posteam) %>%
summarise(mean_epa = mean(epa,na.rm = TRUE)) %>%
ungroup() %>%
arrange(desc(mean_epa))
}
parquet_part_expr <- function(){
nfl_parq_shard %>%
select(epa, posteam, season, week) %>%
filter(
season %in% c(2017, 2018, 2019),
week %in% c(8:16),
!is.na(posteam)
) %>%
collect() %>%
group_by(posteam) %>%
summarise(mean_epa = mean(epa,na.rm = TRUE)) %>%
ungroup() %>%
arrange(desc(mean_epa))
}
bench::mark(
min_time = 0.1,
iterations = 10,
sqlite_expr(),
duck_expr(),
parquet_expr(),
parquet_part_expr()
)
# # A tibble: 4 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
# 1 sqlite_expr() 1.15s 1.27s 0.704 3.97MB 0.176 8 2 11.36s
# 2 duck_expr() 192.69ms 211.45ms 3.04 434.47KB 1.30 7 3 2.31s
# 3 parquet_expr() 208.91ms 451.19ms 2.12 9.11MB 0.529 8 2 3.78s
# 4 parquet_part_expr() 228.37ms 254.39ms 3.96 5.11MB 0.440 9 1 2.27s
@jthomasmock
Copy link
Author

carbon-db

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment