Skip to content

Instantly share code, notes, and snippets.

@magic-lantern
Last active November 13, 2017 20:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save magic-lantern/648afc6963b1bc89dd1a6efc74eac2c0 to your computer and use it in GitHub Desktop.
Save magic-lantern/648afc6963b1bc89dd1a6efc74eac2c0 to your computer and use it in GitHub Desktop.
Comments and performance testing for ccc.R
# using a data.frame for output results
# base time 26 seconds with 60k rows
# this block takes 27 seconds with 100k rows
# if (sum(df[i,]))
# df$ccc_flag[i] <- 1L
# this one line takes abotut 10 seconds with 60k rows
# df$ccc_flag[i] <- ccc_flag
# this block takes 10 seconds with 60k rows
if(ccc_flag)
df$ccc_flag'[i] <- 1L
# 20 seconds with 60k rows
# df[i, 13] <- ccc_flag
# 20 seconds with 60k rows
# df[i, 13L] <- ccc_flag
## data frames are very slow compared to an integer matrix - but still there are various ways to do
## the same thing - here's some testing with 2 million rows and a matrix.
## from my testing, it appears that indexing based on a number rather than column name is faster
tic("testing sum")
for(i in 1:nrow(dx)) {
dx_row <- dx[i,]
pc_row <- pc[i,]
ccc_flag <- 1L
# want at least 1 ccc to be set
out[i, 'transplant'] <- 1L
if (sum(out[i,]))
out[i, 'ccc_flag'] <- 1L
}
toc()
tic("testing based on ccc_flag")
for(i in 1:nrow(dx)) {
dx_row <- dx[i,]
pc_row <- pc[i,]
ccc_flag <- 1L
# want at least 1 ccc to be set
out[i, 'transplant'] <- 1L
out[i, 'ccc_flag'] <- ccc_flag
}
toc()
tic("testing based on ccc_flag, set with integer")
for(i in 1:nrow(dx)) {
dx_row <- dx[i,]
pc_row <- pc[i,]
ccc_flag <- 1L
# want at least 1 ccc to be set
out[i, 'transplant'] <- 1L
out[i, 'ccc_flag'] <- 1L
}
toc()
tic("testing based on integer index and set")
for(i in 1:nrow(dx)) {
dx_row <- dx[i,]
pc_row <- pc[i,]
ccc_flag <- 1L
# want at least 1 ccc to be set
out[i, 'transplant'] <- 1L
out[i, 13L] <- 1L
}
toc()
tic("testing based on number index and set")
for(i in 1:nrow(dx)) {
dx_row <- dx[i,]
pc_row <- pc[i,]
ccc_flag <- 1L
# want at least 1 ccc to be set
out[i, 'transplant'] <- 1L
out[i, 13] <- 1
}
toc()
tic("testing based on integer index and number set")
for(i in 1:nrow(dx)) {
dx_row <- dx[i,]
pc_row <- pc[i,]
ccc_flag <- 1L
# want at least 1 ccc to be set
out[i, 'transplant'] <- 1L
out[i, 13L] <- 1
}
toc()
tic("testing based on number index and integer set")
for(i in 1:nrow(dx)) {
dx_row <- dx[i,]
pc_row <- pc[i,]
ccc_flag <- 1L
# want at least 1 ccc to be set
out[i, 'transplant'] <- 1L
out[i, 13] <- 1L
}
toc()
#### benchmark of different methods for using a hash in R - lists vs environments
#### from my tests at least, an env with no parent environment is about 50x faster
#### than one with a parent env, and a standard environment is about 2x faster than
#### using a list
#### got some of this from
#### http://jeffreyhorner.tumblr.com/post/116325104028/hash-table-performance-in-r-part-ii-in-part-i
#### https://stackoverflow.com/questions/1105659/how-to-add-variable-key-value-pair-to-list-object
library(microbenchmark)
LOOKUP_list <- function(){
players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert")
rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357)
league <- setNames(as.list(rankings), players)
league$tom
}
LOOKUP_env <- function(){
players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert")
rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357)
ht <- new.env()
ht[["bob"]] <- 0.2027
ht[["tom"]] <- 0.2187
ht[["tim"]] <- 0.0378
ht[["tony"]] <- 0.3334
ht[["tiny"]] <- 0.0161
ht[["hubert"]] <- 0.0555
ht[["herbert"]] <- 0.1357
ht[["tom"]]
}
LOOKUP_empty_env_get <- function(){
players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert")
rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357)
ht <- new.env(parent = emptyenv())
ht[["bob"]] <- 0.2027
ht[["tom"]] <- 0.2187
ht[["tim"]] <- 0.0378
ht[["tony"]] <- 0.3334
ht[["tiny"]] <- 0.0161
ht[["hubert"]] <- 0.0555
ht[["herbert"]] <- 0.1357
get("tom",envir=ht,inherits=FALSE)
}
LOOKUP_empty_env_subset <- function(){
players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert")
rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357)
ht <- new.env(parent = emptyenv())
ht[["bob"]] <- 0.2027
ht[["tom"]] <- 0.2187
ht[["tim"]] <- 0.0378
ht[["tony"]] <- 0.3334
ht[["tiny"]] <- 0.0161
ht[["hubert"]] <- 0.0555
ht[["herbert"]] <- 0.1357
ht[["tom"]]
}
microbenchmark(LOOKUP_list(), LOOKUP_env(), LOOKUP_empty_env_get, LOOKUP_empty_env_subset, times=10L^5)
#Unit: nanoseconds
# expr min lq mean median uq max neval
# LOOKUP_list() 3425 4103 5316.99711 4334 4648 39971707 1e+05
# LOOKUP_env() 2194 2717 3337.51395 2916 3209 4621637 1e+05
# LOOKUP_empty_env_get 16 33 56.08413 55 71 15311 1e+05
# LOOKUP_empty_env_subset 17 33 51.67454 46 61 33205 1e+05
###############################################################################
# Looking at a few options for storing the codes to compare against in a more
# efficient manner than can be evaluated faster.
# From my tests, the list of envs is about 3x faster to build than env of envs
# From my tests, finding elements in the list of envs is faster than envs of envs
# Most of that difference is due to envs only support character indicies, so
# but even if hard code in index, list of envs is still faster by about 10 - 20%
###############################################################################
###############################################################################
# ENV of ENVs
###############################################################################
rm(list=ls())
gc()
codes <- pccc::get_codes(icdv = 9L)
print(codes[2, 2])
code_env <- new.env(parent = emptyenv())
for(i in 2:5) {
code_env[[toString(i)]] <- new.env(parent = emptyenv())
}
for(r in 1:nrow(codes)) {
ccc <- rownames(codes)[r]
for (c in 1:ncol(codes)) {
for (n in seq_along(codes[[r, c]])) {
code_env[[toString(stri_length(codes[[r, c]][n]))]][[codes[[r, c]][n]]] <- ccc
}
}
}
checklist <- c("14", "5280", "1", "234987", "asdf")
lapply(checklist, function(c){
code_env[[toString(stri_length(c))]][[c]]
})
# check like
require(stringi)
stri_length("5280")
!is.null(code_env[["5280"]])
###############################################################################
###############################################################################
# list of ENVs
# this is subject to subscript out of bounds errors
###############################################################################
rm(list=ls())
gc()
codes <- pccc::get_codes(icdv = 9L)
code_list <- sapply(c(1:10), function(x) new.env(parent = emptyenv()))
for(r in 1:nrow(codes)) {
ccc <- rownames(codes)[r]
for (c in 1:ncol(codes)) {
for (n in seq_along(codes[[r, c]])) {
code_list[[stri_length(codes[[r, c]][n])]][[codes[[r, c]][n]]] <- ccc
}
}
}
checklist <- c("14", "5280", "1", "234987", "asdf")
find_in_list <- function() {
lapply(checklist, function(c){
code_list[[4]][[c]]
})
}
find_in_env <- function() {
lapply(checklist, function(c){
code_env[["4"]][[c]]
})
}
microbenchmark(find_in_env(), find_in_list(), times = 10000)
################################################################
# looking at a few different options for the hash structure
# seems that a flatter structure is more efficient.
pc <- pccc::get_primary_codes()
cc <- pccc::get_collapsed_codes()
c <- '2532'
f1 <- function() {
match <- pc[[4]][['dx']][[c]]
!is.null(match)
match <- pc[[4]][['dx']][['abcd']]
!is.null(match)
}
f2 <- function() {
match <- cc[[4]][[c]]
!is.null(match)
match <- cc[[4]][['abcd']]
!is.null(match)
}
f2b <- function() {
match <- cc[[4]][[c]]
!is.null(cc[[4]][[c]])
match <- cc[[4]][['abcd']]
!is.null(cc[[4]][['abcd']])
}
f3 <- function() {
secondary <- pc[[4]][['dx']]
match <- secondary[[c]]
!is.null(match)
match <- secondary[['abcd']]
!is.null(match)
}
microbenchmark(
f1(),
f2(),
f2b(),
f3(),
times = 10000
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment