Last active
November 13, 2017 20:07
-
-
Save magic-lantern/648afc6963b1bc89dd1a6efc74eac2c0 to your computer and use it in GitHub Desktop.
Comments and performance testing for ccc.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# using a data.frame for output results | |
# base time 26 seconds with 60k rows | |
# this block takes 27 seconds with 100k rows | |
# if (sum(df[i,])) | |
# df$ccc_flag[i] <- 1L | |
# this one line takes abotut 10 seconds with 60k rows | |
# df$ccc_flag[i] <- ccc_flag | |
# this block takes 10 seconds with 60k rows | |
if(ccc_flag) | |
df$ccc_flag'[i] <- 1L | |
# 20 seconds with 60k rows | |
# df[i, 13] <- ccc_flag | |
# 20 seconds with 60k rows | |
# df[i, 13L] <- ccc_flag | |
## data frames are very slow compared to an integer matrix - but still there are various ways to do | |
## the same thing - here's some testing with 2 million rows and a matrix. | |
## from my testing, it appears that indexing based on a number rather than column name is faster | |
tic("testing sum") | |
for(i in 1:nrow(dx)) { | |
dx_row <- dx[i,] | |
pc_row <- pc[i,] | |
ccc_flag <- 1L | |
# want at least 1 ccc to be set | |
out[i, 'transplant'] <- 1L | |
if (sum(out[i,])) | |
out[i, 'ccc_flag'] <- 1L | |
} | |
toc() | |
tic("testing based on ccc_flag") | |
for(i in 1:nrow(dx)) { | |
dx_row <- dx[i,] | |
pc_row <- pc[i,] | |
ccc_flag <- 1L | |
# want at least 1 ccc to be set | |
out[i, 'transplant'] <- 1L | |
out[i, 'ccc_flag'] <- ccc_flag | |
} | |
toc() | |
tic("testing based on ccc_flag, set with integer") | |
for(i in 1:nrow(dx)) { | |
dx_row <- dx[i,] | |
pc_row <- pc[i,] | |
ccc_flag <- 1L | |
# want at least 1 ccc to be set | |
out[i, 'transplant'] <- 1L | |
out[i, 'ccc_flag'] <- 1L | |
} | |
toc() | |
tic("testing based on integer index and set") | |
for(i in 1:nrow(dx)) { | |
dx_row <- dx[i,] | |
pc_row <- pc[i,] | |
ccc_flag <- 1L | |
# want at least 1 ccc to be set | |
out[i, 'transplant'] <- 1L | |
out[i, 13L] <- 1L | |
} | |
toc() | |
tic("testing based on number index and set") | |
for(i in 1:nrow(dx)) { | |
dx_row <- dx[i,] | |
pc_row <- pc[i,] | |
ccc_flag <- 1L | |
# want at least 1 ccc to be set | |
out[i, 'transplant'] <- 1L | |
out[i, 13] <- 1 | |
} | |
toc() | |
tic("testing based on integer index and number set") | |
for(i in 1:nrow(dx)) { | |
dx_row <- dx[i,] | |
pc_row <- pc[i,] | |
ccc_flag <- 1L | |
# want at least 1 ccc to be set | |
out[i, 'transplant'] <- 1L | |
out[i, 13L] <- 1 | |
} | |
toc() | |
tic("testing based on number index and integer set") | |
for(i in 1:nrow(dx)) { | |
dx_row <- dx[i,] | |
pc_row <- pc[i,] | |
ccc_flag <- 1L | |
# want at least 1 ccc to be set | |
out[i, 'transplant'] <- 1L | |
out[i, 13] <- 1L | |
} | |
toc() | |
#### benchmark of different methods for using a hash in R - lists vs environments | |
#### from my tests at least, an env with no parent environment is about 50x faster | |
#### than one with a parent env, and a standard environment is about 2x faster than | |
#### using a list | |
#### got some of this from | |
#### http://jeffreyhorner.tumblr.com/post/116325104028/hash-table-performance-in-r-part-ii-in-part-i | |
#### https://stackoverflow.com/questions/1105659/how-to-add-variable-key-value-pair-to-list-object | |
library(microbenchmark) | |
LOOKUP_list <- function(){ | |
players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert") | |
rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357) | |
league <- setNames(as.list(rankings), players) | |
league$tom | |
} | |
LOOKUP_env <- function(){ | |
players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert") | |
rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357) | |
ht <- new.env() | |
ht[["bob"]] <- 0.2027 | |
ht[["tom"]] <- 0.2187 | |
ht[["tim"]] <- 0.0378 | |
ht[["tony"]] <- 0.3334 | |
ht[["tiny"]] <- 0.0161 | |
ht[["hubert"]] <- 0.0555 | |
ht[["herbert"]] <- 0.1357 | |
ht[["tom"]] | |
} | |
LOOKUP_empty_env_get <- function(){ | |
players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert") | |
rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357) | |
ht <- new.env(parent = emptyenv()) | |
ht[["bob"]] <- 0.2027 | |
ht[["tom"]] <- 0.2187 | |
ht[["tim"]] <- 0.0378 | |
ht[["tony"]] <- 0.3334 | |
ht[["tiny"]] <- 0.0161 | |
ht[["hubert"]] <- 0.0555 | |
ht[["herbert"]] <- 0.1357 | |
get("tom",envir=ht,inherits=FALSE) | |
} | |
LOOKUP_empty_env_subset <- function(){ | |
players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert") | |
rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357) | |
ht <- new.env(parent = emptyenv()) | |
ht[["bob"]] <- 0.2027 | |
ht[["tom"]] <- 0.2187 | |
ht[["tim"]] <- 0.0378 | |
ht[["tony"]] <- 0.3334 | |
ht[["tiny"]] <- 0.0161 | |
ht[["hubert"]] <- 0.0555 | |
ht[["herbert"]] <- 0.1357 | |
ht[["tom"]] | |
} | |
microbenchmark(LOOKUP_list(), LOOKUP_env(), LOOKUP_empty_env_get, LOOKUP_empty_env_subset, times=10L^5) | |
#Unit: nanoseconds | |
# expr min lq mean median uq max neval | |
# LOOKUP_list() 3425 4103 5316.99711 4334 4648 39971707 1e+05 | |
# LOOKUP_env() 2194 2717 3337.51395 2916 3209 4621637 1e+05 | |
# LOOKUP_empty_env_get 16 33 56.08413 55 71 15311 1e+05 | |
# LOOKUP_empty_env_subset 17 33 51.67454 46 61 33205 1e+05 | |
############################################################################### | |
# Looking at a few options for storing the codes to compare against in a more | |
# efficient manner than can be evaluated faster. | |
# From my tests, the list of envs is about 3x faster to build than env of envs | |
# From my tests, finding elements in the list of envs is faster than envs of envs | |
# Most of that difference is due to envs only support character indicies, so | |
# but even if hard code in index, list of envs is still faster by about 10 - 20% | |
############################################################################### | |
############################################################################### | |
# ENV of ENVs | |
############################################################################### | |
rm(list=ls()) | |
gc() | |
codes <- pccc::get_codes(icdv = 9L) | |
print(codes[2, 2]) | |
code_env <- new.env(parent = emptyenv()) | |
for(i in 2:5) { | |
code_env[[toString(i)]] <- new.env(parent = emptyenv()) | |
} | |
for(r in 1:nrow(codes)) { | |
ccc <- rownames(codes)[r] | |
for (c in 1:ncol(codes)) { | |
for (n in seq_along(codes[[r, c]])) { | |
code_env[[toString(stri_length(codes[[r, c]][n]))]][[codes[[r, c]][n]]] <- ccc | |
} | |
} | |
} | |
checklist <- c("14", "5280", "1", "234987", "asdf") | |
lapply(checklist, function(c){ | |
code_env[[toString(stri_length(c))]][[c]] | |
}) | |
# check like | |
require(stringi) | |
stri_length("5280") | |
!is.null(code_env[["5280"]]) | |
############################################################################### | |
############################################################################### | |
# list of ENVs | |
# this is subject to subscript out of bounds errors | |
############################################################################### | |
rm(list=ls()) | |
gc() | |
codes <- pccc::get_codes(icdv = 9L) | |
code_list <- sapply(c(1:10), function(x) new.env(parent = emptyenv())) | |
for(r in 1:nrow(codes)) { | |
ccc <- rownames(codes)[r] | |
for (c in 1:ncol(codes)) { | |
for (n in seq_along(codes[[r, c]])) { | |
code_list[[stri_length(codes[[r, c]][n])]][[codes[[r, c]][n]]] <- ccc | |
} | |
} | |
} | |
checklist <- c("14", "5280", "1", "234987", "asdf") | |
find_in_list <- function() { | |
lapply(checklist, function(c){ | |
code_list[[4]][[c]] | |
}) | |
} | |
find_in_env <- function() { | |
lapply(checklist, function(c){ | |
code_env[["4"]][[c]] | |
}) | |
} | |
microbenchmark(find_in_env(), find_in_list(), times = 10000) | |
################################################################ | |
# looking at a few different options for the hash structure | |
# seems that a flatter structure is more efficient. | |
pc <- pccc::get_primary_codes() | |
cc <- pccc::get_collapsed_codes() | |
c <- '2532' | |
f1 <- function() { | |
match <- pc[[4]][['dx']][[c]] | |
!is.null(match) | |
match <- pc[[4]][['dx']][['abcd']] | |
!is.null(match) | |
} | |
f2 <- function() { | |
match <- cc[[4]][[c]] | |
!is.null(match) | |
match <- cc[[4]][['abcd']] | |
!is.null(match) | |
} | |
f2b <- function() { | |
match <- cc[[4]][[c]] | |
!is.null(cc[[4]][[c]]) | |
match <- cc[[4]][['abcd']] | |
!is.null(cc[[4]][['abcd']]) | |
} | |
f3 <- function() { | |
secondary <- pc[[4]][['dx']] | |
match <- secondary[[c]] | |
!is.null(match) | |
match <- secondary[['abcd']] | |
!is.null(match) | |
} | |
microbenchmark( | |
f1(), | |
f2(), | |
f2b(), | |
f3(), | |
times = 10000 | |
) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment