Skip to content

Instantly share code, notes, and snippets.

@nathan-russell
Created March 4, 2017 22:08
Show Gist options
  • Save nathan-russell/51457eed50205d9def776f0dbd1f5e51 to your computer and use it in GitHub Desktop.
Save nathan-russell/51457eed50205d9def776f0dbd1f5e51 to your computer and use it in GitHub Desktop.
library(tibble)
library(data.table)
library(dplyr)
library(hashmap)
library(microbenchmark)
# Benchmark fixture: random fixed-length string keys paired with standard-normal
# values, plus one key drawn at random to serve as the lookup target.
# The seed is fixed so the generated data is reproducible.
set.seed(123)
nkeys <- 1e7
keylen <- 8
# unique() removes any repeated strings, so the number of keys actually kept
# can be smaller than nkeys; values is sized to match the surviving keys.
keys <- unique(stringi::stri_rand_strings(n = nkeys, length = keylen))
values <- rnorm(n = length(keys))
rand_key <- sample(x = keys, size = 1)
# Construction cost of each container, all built from the same keys/values
# vectors. The commented figures after each call are the recorded output.

# tibble
system.time({
  data_tbl <- tibble(keys = keys, values = values)
})
# user system elapsed
# 0.024 0.000 0.023

# data.table (no key set)
system.time({
  data_table <- data.table(keys = keys, values = values)
})
# user system elapsed
# 0.048 0.036 0.083

# Keyed data.table: the key is set on a copy so data_table itself stays unkeyed.
data_table2 <- copy(data_table)
system.time({
  setkeyv(data_table2, "keys")
})
# user system elapsed
# 7.864 0.000 7.368

# hashmap: slow to construct (SEXPs copied to C++ object)
system.time({
  hash <- hashmap(keys, values)
})
# user system elapsed
# 3.840 0.308 4.149
# Sanity check: look up rand_key with each approach and print the result so
# the returned values can be compared by eye before benchmarking.

# tibble + dplyr: filter on the keys column
dplyr::filter(data_tbl, keys == rand_key)
# # A tibble: 1 × 2
# keys values
# <chr> <dbl>
# 1 dimWYkVf 0.3093082

# unkeyed data.table: logical subset in i
data_table[keys == rand_key]
# keys values
# 1: dimWYkVf 0.3093082

# keyed data.table: join on the key, dropping non-matching rows
data_table2[list(rand_key), nomatch = 0L]
# keys values
# 1: dimWYkVf 0.3093082

# hashmap: direct key lookup
hash[[rand_key]]
# [1] 0.3093082
# Time a single-key lookup of rand_key with each of the four approaches,
# 200 evaluations per expression. The table below is the recorded output.
# NOTE(review): the unkeyed `data_table[keys == rand_key]` form was already
# evaluated once in the sanity check; data.table may have built an automatic
# secondary index on `keys` at that point, which would explain why the
# "data.table" timings match "indexed DT" rather than a full vector scan.
# TODO confirm (e.g. inspect indices(data_table) or disable
# options(datatable.auto.index) and re-run).
microbenchmark::microbenchmark(
"dplyr" = data_tbl %>% filter(keys == rand_key),
"data.table" = data_table[keys == rand_key],
"indexed DT" = data_table2[.(rand_key), nomatch = 0L],
"hashmap" = hash[[rand_key]],
times = 200L
)
# Unit: microseconds
# expr min lq mean median uq max neval
# dplyr 129863.170 135231.3850 271200.45996 147699.5085 221820.6025 1905643.772 200
# data.table 479.377 555.9585 850.10595 725.1985 903.3740 3861.167 200
# indexed DT 493.498 589.2130 868.77630 743.1165 914.4945 3624.291 200
# hashmap 16.820 37.5895 70.70894 75.1950 86.8735 331.077 200
# Print the R version, platform, locale and package versions in effect for
# this session; the recorded output follows.
sessionInfo()
# R Under development (unstable) (2017-02-21 r72242)
# Platform: x86_64-pc-linux-gnu (64-bit)
# Running under: Debian GNU/Linux 8 (jessie)
#
# Matrix products: default
# BLAS: /usr/local/lib64/R/lib/libRblas.so
# LAPACK: /usr/local/lib64/R/lib/libRlapack.so
#
# locale:
# [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
# [4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
# [7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
# [10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] microbenchmark_1.4-2.1 hashmap_0.2.0 dplyr_0.5.0
# [4] data.table_1.10.0 tibble_1.2
#
# loaded via a namespace (and not attached):
# [1] Rcpp_0.12.9.4 codetools_0.2-15 assertthat_0.1 plyr_1.8.4 grid_3.4.0
# [6] R6_2.2.0 gtable_0.2.0 DBI_0.5-1 magrittr_1.5 scales_0.4.1
# [11] ggplot2_2.2.1 stringi_1.1.2 lazyeval_0.2.0 tools_3.4.0 munsell_0.4.3
# [16] compiler_3.4.0 colorspace_1.2-7
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment