magic-lantern/ccc.R

## ccc.R
    # using a data.frame for output results
    # base time 26 seconds with 60k rows

    # this block takes 27 seconds with 100k rows
    # if (sum(df[i,]))
    #   df$ccc_flag[i] <- 1L

    # this one line takes abotut 10 seconds with 60k rows
    # df$ccc_flag[i] <- ccc_flag

    # this block takes 10 seconds with 60k rows
    if(ccc_flag)
      df$ccc_flag'[i] <- 1L

    # 20 seconds with 60k rows
    # df[i, 13] <- ccc_flag

    # 20 seconds with 60k rows
    # df[i, 13L] <- ccc_flag


## data frames are very slow compared to an integer matrix - but still there are various ways to do
## the same thing - here's some testing with 2 million rows and a matrix.
## from my testing, it appears that indexing based on a number rather than column name is faster
  tic("testing sum")
  for(i in 1:nrow(dx)) {
    dx_row <- dx[i,]
    pc_row <- pc[i,]
    ccc_flag <- 1L

    # want at least 1 ccc to be set
    out[i, 'transplant'] <- 1L

    if (sum(out[i,]))
      out[i, 'ccc_flag'] <- 1L
  }
  toc()

  tic("testing based on ccc_flag")
  for(i in 1:nrow(dx)) {
    dx_row <- dx[i,]
    pc_row <- pc[i,]
    ccc_flag <- 1L

    # want at least 1 ccc to be set
    out[i, 'transplant'] <- 1L

    out[i, 'ccc_flag'] <- ccc_flag
  }
  toc()

  tic("testing based on ccc_flag, set with integer")
  for(i in 1:nrow(dx)) {
    dx_row <- dx[i,]
    pc_row <- pc[i,]
    ccc_flag <- 1L

    # want at least 1 ccc to be set
    out[i, 'transplant'] <- 1L

    out[i, 'ccc_flag'] <- 1L
  }
  toc()

  tic("testing based on integer index and set")
  for(i in 1:nrow(dx)) {
    dx_row <- dx[i,]
    pc_row <- pc[i,]
    ccc_flag <- 1L

    # want at least 1 ccc to be set
    out[i, 'transplant'] <- 1L

    out[i, 13L] <- 1L
  }
  toc()

  tic("testing based on number index and set")
  for(i in 1:nrow(dx)) {
    dx_row <- dx[i,]
    pc_row <- pc[i,]
    ccc_flag <- 1L

    # want at least 1 ccc to be set
    out[i, 'transplant'] <- 1L

    out[i, 13] <- 1
  }
  toc()

  tic("testing based on integer index and number set")
  for(i in 1:nrow(dx)) {
    dx_row <- dx[i,]
    pc_row <- pc[i,]
    ccc_flag <- 1L

    # want at least 1 ccc to be set
    out[i, 'transplant'] <- 1L

    out[i, 13L] <- 1
  }
  toc()

  tic("testing based on number index and integer set")
  for(i in 1:nrow(dx)) {
    dx_row <- dx[i,]
    pc_row <- pc[i,]
    ccc_flag <- 1L

    # want at least 1 ccc to be set
    out[i, 'transplant'] <- 1L

    out[i, 13] <- 1L
  }
  toc()

#### benchmark of different methods for using a hash in R - lists vs environments
#### from my tests at least, an env with no parent environment is about 50x faster
#### than one with a parent env, and a standard environment is about 2x faster than
#### using a list
#### got some of this from
####   http://jeffreyhorner.tumblr.com/post/116325104028/hash-table-performance-in-r-part-ii-in-part-i
####   https://stackoverflow.com/questions/1105659/how-to-add-variable-key-value-pair-to-list-object
library(microbenchmark)
LOOKUP_list <- function(){
  players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert")
  rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357)
  league <- setNames(as.list(rankings), players)
  league$tom
}

LOOKUP_env <- function(){
  players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert")
  rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357)
  ht <- new.env()
  ht[["bob"]] <- 0.2027
  ht[["tom"]] <- 0.2187
  ht[["tim"]] <- 0.0378
  ht[["tony"]] <- 0.3334
  ht[["tiny"]] <- 0.0161
  ht[["hubert"]] <- 0.0555
  ht[["herbert"]] <- 0.1357

  ht[["tom"]]
}

LOOKUP_empty_env_get <- function(){
  players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert")
  rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357)
  ht <- new.env(parent = emptyenv())
  ht[["bob"]] <- 0.2027
  ht[["tom"]] <- 0.2187
  ht[["tim"]] <- 0.0378
  ht[["tony"]] <- 0.3334
  ht[["tiny"]] <- 0.0161
  ht[["hubert"]] <- 0.0555
  ht[["herbert"]] <- 0.1357

  get("tom",envir=ht,inherits=FALSE)
}

LOOKUP_empty_env_subset <- function(){
  players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert")
  rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357)
  ht <- new.env(parent = emptyenv())
  ht[["bob"]] <- 0.2027
  ht[["tom"]] <- 0.2187
  ht[["tim"]] <- 0.0378
  ht[["tony"]] <- 0.3334
  ht[["tiny"]] <- 0.0161
  ht[["hubert"]] <- 0.0555
  ht[["herbert"]] <- 0.1357

  ht[["tom"]]
}

microbenchmark(LOOKUP_list(), LOOKUP_env(), LOOKUP_empty_env_get, LOOKUP_empty_env_subset, times=10L^5)
#Unit: nanoseconds
#                    expr  min   lq       mean median   uq      max neval
#           LOOKUP_list() 3425 4103 5316.99711   4334 4648 39971707 1e+05
#            LOOKUP_env() 2194 2717 3337.51395   2916 3209  4621637 1e+05
#    LOOKUP_empty_env_get   16   33   56.08413     55   71    15311 1e+05
# LOOKUP_empty_env_subset   17   33   51.67454     46   61    33205 1e+05


###############################################################################
# Looking at a few options for storing the codes to compare against in a more
# efficient manner than can be evaluated faster.
# From my tests, the list of envs is about 3x faster to build than env of envs
# From my tests, finding elements in the list of envs is faster than envs of envs
# Most of that difference is due to envs only support character indicies, so
# but even if hard code in index, list of envs is still faster by about 10 - 20%
###############################################################################

###############################################################################
# ENV of ENVs
###############################################################################
rm(list=ls())
gc()
codes <- pccc::get_codes(icdv = 9L)
print(codes[2, 2])
code_env <- new.env(parent = emptyenv())
for(i in 2:5) {
  code_env[[toString(i)]] <- new.env(parent = emptyenv())
}

for(r in 1:nrow(codes)) {
  ccc <- rownames(codes)[r]
  for (c in 1:ncol(codes)) {
    for (n in seq_along(codes[[r, c]])) {
      code_env[[toString(stri_length(codes[[r, c]][n]))]][[codes[[r, c]][n]]] <- ccc
    }
  }
}

checklist <- c("14", "5280", "1", "234987", "asdf")

lapply(checklist, function(c){
  code_env[[toString(stri_length(c))]][[c]]
})

# check like
require(stringi)
stri_length("5280")
!is.null(code_env[["5280"]])
###############################################################################


###############################################################################
# list of ENVs
# this is subject to subscript out of bounds errors
###############################################################################
rm(list=ls())
gc()
codes <- pccc::get_codes(icdv = 9L)
code_list <- sapply(c(1:10), function(x) new.env(parent = emptyenv()))

for(r in 1:nrow(codes)) {
  ccc <- rownames(codes)[r]
  for (c in 1:ncol(codes)) {
    for (n in seq_along(codes[[r, c]])) {
      code_list[[stri_length(codes[[r, c]][n])]][[codes[[r, c]][n]]] <- ccc
    }
  }
}

checklist <- c("14", "5280", "1", "234987", "asdf")

find_in_list <- function() {
  lapply(checklist, function(c){
    code_list[[4]][[c]]
    })
}

find_in_env <- function() {
  lapply(checklist, function(c){
    code_env[["4"]][[c]]
  })
}

microbenchmark(find_in_env(), find_in_list(), times = 10000)


################################################################
# looking at a few different options for the hash structure
# seems that a flatter structure is more efficient.
pc <- pccc::get_primary_codes()
cc <- pccc::get_collapsed_codes()

c <- '2532'

f1 <- function() {
  match <- pc[[4]][['dx']][[c]]
  !is.null(match)
  match <- pc[[4]][['dx']][['abcd']]
  !is.null(match)
}
f2 <- function() {
  match <- cc[[4]][[c]]
  !is.null(match)
  match <- cc[[4]][['abcd']]
  !is.null(match)
}
f2b <- function() {
  match <- cc[[4]][[c]]
  !is.null(cc[[4]][[c]])
  match <- cc[[4]][['abcd']]
  !is.null(cc[[4]][['abcd']])
}
f3 <- function() {
  secondary <- pc[[4]][['dx']]
  match <- secondary[[c]]
  !is.null(match)
  match <- secondary[['abcd']]
  !is.null(match)
}

microbenchmark(
  f1(),
  f2(),
  f2b(),
  f3(),
  times = 10000
)
	# using a data.frame for output results
	# base time 26 seconds with 60k rows

	# this block takes 27 seconds with 100k rows
	# if (sum(df[i,]))
	# df$ccc_flag[i] <- 1L

	# this one line takes abotut 10 seconds with 60k rows
	# df$ccc_flag[i] <- ccc_flag

	# this block takes 10 seconds with 60k rows
	if(ccc_flag)
	df$ccc_flag'[i] <- 1L

	# 20 seconds with 60k rows
	# df[i, 13] <- ccc_flag

	# 20 seconds with 60k rows
	# df[i, 13L] <- ccc_flag


	## data frames are very slow compared to an integer matrix - but still there are various ways to do
	## the same thing - here's some testing with 2 million rows and a matrix.
	## from my testing, it appears that indexing based on a number rather than column name is faster
	tic("testing sum")
	for(i in 1:nrow(dx)) {
	dx_row <- dx[i,]
	pc_row <- pc[i,]
	ccc_flag <- 1L

	# want at least 1 ccc to be set
	out[i, 'transplant'] <- 1L

	if (sum(out[i,]))
	out[i, 'ccc_flag'] <- 1L
	}
	toc()

	tic("testing based on ccc_flag")
	for(i in 1:nrow(dx)) {
	dx_row <- dx[i,]
	pc_row <- pc[i,]
	ccc_flag <- 1L

	# want at least 1 ccc to be set
	out[i, 'transplant'] <- 1L

	out[i, 'ccc_flag'] <- ccc_flag
	}
	toc()

	tic("testing based on ccc_flag, set with integer")
	for(i in 1:nrow(dx)) {
	dx_row <- dx[i,]
	pc_row <- pc[i,]
	ccc_flag <- 1L

	# want at least 1 ccc to be set
	out[i, 'transplant'] <- 1L

	out[i, 'ccc_flag'] <- 1L
	}
	toc()

	tic("testing based on integer index and set")
	for(i in 1:nrow(dx)) {
	dx_row <- dx[i,]
	pc_row <- pc[i,]
	ccc_flag <- 1L

	# want at least 1 ccc to be set
	out[i, 'transplant'] <- 1L

	out[i, 13L] <- 1L
	}
	toc()

	tic("testing based on number index and set")
	for(i in 1:nrow(dx)) {
	dx_row <- dx[i,]
	pc_row <- pc[i,]
	ccc_flag <- 1L

	# want at least 1 ccc to be set
	out[i, 'transplant'] <- 1L

	out[i, 13] <- 1
	}
	toc()

	tic("testing based on integer index and number set")
	for(i in 1:nrow(dx)) {
	dx_row <- dx[i,]
	pc_row <- pc[i,]
	ccc_flag <- 1L

	# want at least 1 ccc to be set
	out[i, 'transplant'] <- 1L

	out[i, 13L] <- 1
	}
	toc()

	tic("testing based on number index and integer set")
	for(i in 1:nrow(dx)) {
	dx_row <- dx[i,]
	pc_row <- pc[i,]
	ccc_flag <- 1L

	# want at least 1 ccc to be set
	out[i, 'transplant'] <- 1L

	out[i, 13] <- 1L
	}
	toc()

	#### benchmark of different methods for using a hash in R - lists vs environments
	#### from my tests at least, an env with no parent environment is about 50x faster
	#### than one with a parent env, and a standard environment is about 2x faster than
	#### using a list
	#### got some of this from
	#### http://jeffreyhorner.tumblr.com/post/116325104028/hash-table-performance-in-r-part-ii-in-part-i
	#### https://stackoverflow.com/questions/1105659/how-to-add-variable-key-value-pair-to-list-object
	library(microbenchmark)
	LOOKUP_list <- function(){
	players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert")
	rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357)
	league <- setNames(as.list(rankings), players)
	league$tom
	}

	LOOKUP_env <- function(){
	players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert")
	rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357)
	ht <- new.env()
	ht[["bob"]] <- 0.2027
	ht[["tom"]] <- 0.2187
	ht[["tim"]] <- 0.0378
	ht[["tony"]] <- 0.3334
	ht[["tiny"]] <- 0.0161
	ht[["hubert"]] <- 0.0555
	ht[["herbert"]] <- 0.1357

	ht[["tom"]]
	}

	LOOKUP_empty_env_get <- function(){
	players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert")
	rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357)
	ht <- new.env(parent = emptyenv())
	ht[["bob"]] <- 0.2027
	ht[["tom"]] <- 0.2187
	ht[["tim"]] <- 0.0378
	ht[["tony"]] <- 0.3334
	ht[["tiny"]] <- 0.0161
	ht[["hubert"]] <- 0.0555
	ht[["herbert"]] <- 0.1357

	get("tom",envir=ht,inherits=FALSE)
	}

	LOOKUP_empty_env_subset <- function(){
	players <- c("bob", "tom", "tim", "tony", "tiny", "hubert", "herbert")
	rankings <- c(0.2027, 0.2187, 0.0378, 0.3334, 0.0161, 0.0555, 0.1357)
	ht <- new.env(parent = emptyenv())
	ht[["bob"]] <- 0.2027
	ht[["tom"]] <- 0.2187
	ht[["tim"]] <- 0.0378
	ht[["tony"]] <- 0.3334
	ht[["tiny"]] <- 0.0161
	ht[["hubert"]] <- 0.0555
	ht[["herbert"]] <- 0.1357

	ht[["tom"]]
	}

	microbenchmark(LOOKUP_list(), LOOKUP_env(), LOOKUP_empty_env_get, LOOKUP_empty_env_subset, times=10L^5)
	#Unit: nanoseconds
	# expr min lq mean median uq max neval
	# LOOKUP_list() 3425 4103 5316.99711 4334 4648 39971707 1e+05
	# LOOKUP_env() 2194 2717 3337.51395 2916 3209 4621637 1e+05
	# LOOKUP_empty_env_get 16 33 56.08413 55 71 15311 1e+05
	# LOOKUP_empty_env_subset 17 33 51.67454 46 61 33205 1e+05


	###############################################################################
	# Looking at a few options for storing the codes to compare against in a more
	# efficient manner than can be evaluated faster.
	# From my tests, the list of envs is about 3x faster to build than env of envs
	# From my tests, finding elements in the list of envs is faster than envs of envs
	# Most of that difference is due to envs only support character indicies, so
	# but even if hard code in index, list of envs is still faster by about 10 - 20%
	###############################################################################

	###############################################################################
	# ENV of ENVs
	###############################################################################
	rm(list=ls())
	gc()
	codes <- pccc::get_codes(icdv = 9L)
	print(codes[2, 2])
	code_env <- new.env(parent = emptyenv())
	for(i in 2:5) {
	code_env[[toString(i)]] <- new.env(parent = emptyenv())
	}

	for(r in 1:nrow(codes)) {
	ccc <- rownames(codes)[r]
	for (c in 1:ncol(codes)) {
	for (n in seq_along(codes[[r, c]])) {
	code_env[[toString(stri_length(codes[[r, c]][n]))]][[codes[[r, c]][n]]] <- ccc
	}
	}
	}

	checklist <- c("14", "5280", "1", "234987", "asdf")

	lapply(checklist, function(c){
	code_env[[toString(stri_length(c))]][[c]]
	})

	# check like
	require(stringi)
	stri_length("5280")
	!is.null(code_env[["5280"]])
	###############################################################################


	###############################################################################
	# list of ENVs
	# this is subject to subscript out of bounds errors
	###############################################################################
	rm(list=ls())
	gc()
	codes <- pccc::get_codes(icdv = 9L)
	code_list <- sapply(c(1:10), function(x) new.env(parent = emptyenv()))

	for(r in 1:nrow(codes)) {
	ccc <- rownames(codes)[r]
	for (c in 1:ncol(codes)) {
	for (n in seq_along(codes[[r, c]])) {
	code_list[[stri_length(codes[[r, c]][n])]][[codes[[r, c]][n]]] <- ccc
	}
	}
	}

	checklist <- c("14", "5280", "1", "234987", "asdf")

	find_in_list <- function() {
	lapply(checklist, function(c){
	code_list[[4]][[c]]
	})
	}

	find_in_env <- function() {
	lapply(checklist, function(c){
	code_env[["4"]][[c]]
	})
	}

	microbenchmark(find_in_env(), find_in_list(), times = 10000)


	################################################################
	# looking at a few different options for the hash structure
	# seems that a flatter structure is more efficient.
	pc <- pccc::get_primary_codes()
	cc <- pccc::get_collapsed_codes()

	c <- '2532'

	f1 <- function() {
	match <- pc[[4]][['dx']][[c]]
	!is.null(match)
	match <- pc[[4]][['dx']][['abcd']]
	!is.null(match)
	}
	f2 <- function() {
	match <- cc[[4]][[c]]
	!is.null(match)
	match <- cc[[4]][['abcd']]
	!is.null(match)
	}
	f2b <- function() {
	match <- cc[[4]][[c]]
	!is.null(cc[[4]][[c]])
	match <- cc[[4]][['abcd']]
	!is.null(cc[[4]][['abcd']])
	}
	f3 <- function() {
	secondary <- pc[[4]][['dx']]
	match <- secondary[[c]]
	!is.null(match)
	match <- secondary[['abcd']]
	!is.null(match)
	}

	microbenchmark(
	f1(),
	f2(),
	f2b(),
	f3(),
	times = 10000
	)