Skip to content

Instantly share code, notes, and snippets.

@rkingdc
Created February 28, 2019 20:48
Binning Columns in Remote Tables: run benchmark
run_benchmark <- function(nrows, ncuts, db, times=100, add_index=FALSE){
## setup ##
d <- data.frame(column_to_cut = sample.int(10000, nrows, replace=TRUE),
id = seq(nrows),
k_dummy = 1)
# can't add indexes to temporary tables
dbWriteTable(db, 'data', d, overwrite=TRUE, temporary=!add_index)
# for testing how indexes affect performance
if(add_index){
DBI::dbExecute(db, 'CREATE INDEX idx ON data (column_to_cut)')
DBI::dbExecute(db, 'ANALYZE data')
}
# dplyr and rquery both have remote table pointers
dplyr_data <- dplyr::tbl(db, 'data')
rq_data <- rquery::db_td(db, 'data')
# cut_vector based off number of cuts we want and range of random data
cut_vector <- seq(from=min(d$column_to_cut), to=max(d$column_to_cut), length.out=ncuts+2)
cut_vector <- cut_vector[-c(1, length(cut_vector))]
# write temporary data to database as well
tmpdat <- data.frame(cut = cut_vector, k_dummy=1)
DBI::dbWriteTable(db,
'tmp', tmpdat,
temporary=TRUE, overwrite=TRUE)
dplyr_cuts <- dplyr::tbl(db, 'tmp')
rq_cuts <- db_td(db, 'tmp')
# create binning functions for case_when and rquery
# the dplyr join method doesn't require any fnction setup--it's a simple expressions
db_case_when_case_fn <- .make_case_when_fn(column_name = 'column_to_cut', cut_vector=cut_vector)
db_rquery_join_fn <- .make_db_rquery_join_fn(data=rq_data, tbl_cuts=rq_cuts,
db=db, column_to_cut = 'column_to_cut')
mb <- microbenchmark(
case_when=db_case_when_case_fn(dplyr_data),
dplyr_join=db_dplyr_join_fn(data = dplyr_data, tbl_cuts=dplyr_cuts),
rquery_join=db_rquery_join_fn(),
times=times
)
if (add_index){
# if temp table, drop it
DBI::dbExecute(db, 'DROP TABLE data')
}
return(mb)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment