Skip to content

Instantly share code, notes, and snippets.

@matthewwolak
Last active August 29, 2015 14:24
Show Gist options
  • Save matthewwolak/95850e3a92a576585497 to your computer and use it in GitHub Desktop.
Save matthewwolak/95850e3a92a576585497 to your computer and use it in GitHub Desktop.
ICCbare benchmarks: v2.2.1 versus 2.3.0

ICCbare Speed Benchmarks

Introduction

Below compares the performance of the R function ICC::ICCbare between the current and the previous package versions. The main change between v2.2.1 and 2.3.0 was to greatly improve the non-standard evaluation of all functions in the ICC package. I try and get at how this might affect the time it takes to run ICC::ICCbare in the code below. First, I load the two versions of the function (cut and pasted from the package tarballs) as well as the two versions of ICC::ICCest (which is the main ICC calculation function) for comparison. Then I use the R package microbenchmark to time the various functions. I then strip the ICCbare functions down to what was changed between versions (the interpretation of the function arguments and set up) to hone in on any changes to the speed that may have arisen from the changes.

Create functions

First, create a function to make a data.frame with variables and data for which an Intraclass Correlation Coefficient (ICC) can be computed.

ICCdf <- function(ICCin, n = 1, nid = 1, kin = 2, muin = 10, seedin = 101){
   set.seed(seedin)

   tdf <- data.frame(id = rep(c(t(sapply(letters, FUN = function(x){paste0(x, seq(nid))}))), each = kin), matrix(NA, nrow = 26*nid*kin, ncol = n))
names(tdf) <- c("id", paste0("t", seq(n)))

   for(c in 2:ncol(tdf)){
     tdf[, c] <- rep(rnorm(26*nid, muin, sqrt(ICCin)), each = kin) + rnorm(26*nid*kin, 0, sqrt(1-ICCin))
   }
 tdf
}
  • ICCin Expected value of ICC
  • n Number of traits/times to calcluate ICC
  • nid Number of identities in units of 26 (the alphabet)
  • kin Number of observations per id
  • muin Measurement/trait mean

Now load the functions from nadiv v2.2.1 and 2.3.0. Load both the ICCbare and ICCest functions to show how much faster the ICCbare is over the more involved ICCest.

ICCbare221<-function(x, y, data){
  ICCcall <- Call <-match.call()
  xc<-as.character(ICCcall[[2L]])
  yc<-as.character(ICCcall[[3L]])
  inds<-unique(data[xc])[[1]]
  tdata<-data.frame(data[yc], data[xc])
  if(!is.factor(tdata[,2])){
    warning("x has been coerced to a factor")
    tdata[,2]<-as.factor(tdata[,2])
    }  

  fmla<-formula(tdata)
  if (!is.list(replications(fmla, tdata))){
   tmp1<-aggregate(tdata[,1], list(tdata[,2]),FUN=mean)
   tmp2<-aggregate(tdata[,1], list(tdata[,2]), FUN=length)
   ord.data<-tdata[order(tdata[,2]),]
   Treat.m<-rep(tmp1$x,tmp2$x)
   Among<-(Treat.m-rep(mean(tdata[,1]),nrow(tdata)))^2
   Within<-(ord.data[,1]-Treat.m)^2
   MS<-(c(sum(Among),sum(Within)))/(c(length(tmp2$x)-1, length(tmp2$x)*(tmp2$x[1]-1)))
   var.a<-(MS[1]-MS[2])/tmp2$x[1]
   r<-var.a/(var.a+MS[2])
      }
  else{
     tmpbb<-anova(aov(fmla, data=tdata))
     MSa<-tmpbb[3][1,1]
     tmp.outj<-data.frame(lapply(unstack(na.omit(tdata)), FUN=length))
     var.a<-(MSa-tmpbb[3][2,1])/((1/(length(inds)-1))*(sum(tmp.outj)-(sum((tmp.outj^2))/(sum(tmp.outj)))))
     r<-var.a/(tmpbb[3][2,1] + var.a) 
      }
  list(ICC=r)
}

##

ICCbare230 <- function(x, y, data = NULL){
  icall <- list(y = substitute(y), x = substitute(x))

  if(is.character(icall$y)){
    warning("passing a character string to 'y' is deprecated since ICC vesion 2.3.0 and will not be supported in future versions. The argument to 'y' should either be an unquoted column name of 'data' or an object")
    if(missing(data)) stop("Supply either the unquoted name of the object containing 'y' or supply both 'data' and then 'y' as an unquoted column name to 'data'")
    icall$y <- eval(as.name(y), data, parent.frame())
  } 
  if(is.name(icall$y)) icall$y <- eval(icall$y, data, parent.frame())
  if(is.call(icall$y)) icall$y <- eval(icall$y, data, parent.frame())
  if(is.character(icall$y)) icall$y <- eval(as.name(icall$y), data, parent.frame())


  if(is.character(icall$x)){
    warning("passing a character string to 'x' is deprecated since ICC vesion 2.3.0 and will not be supported in future versions. The argument to 'x' should either be an unquoted column name of 'data' or an object")
    if(missing(data)) stop("Supply either the unquoted name of the object containing 'x' or supply both 'data' and then 'x' as an unquoted column name to 'data'")
    icall$x <- eval(as.name(x), data, parent.frame())
  } 
  if(is.name(icall$x)) icall$x <- eval(icall$x, data, parent.frame())
  if(is.call(icall$x)) icall$x <- eval(icall$x, data, parent.frame())
  if(is.character(icall$x) && length(icall$x) == 1) icall$x <- eval(as.name(icall$x), data, parent.frame())


  tdata <- data.frame(icall)
  tdata <- na.omit(tdata)
  a <- length(unique(tdata$x))

  if(!is.null(attributes(tdata)$na.action)){
     warning(cat("NAs removed from rows:\n", unclass(attributes(tdata)$na.action), "\n"))
  } 
  if(!is.factor(tdata$x)){
     warning("'x' has been coerced to a factor")
     tdata$x <- as.factor(tdata$x)
  } else{
       if(length(levels(tdata$x)) > a){
          tdata$x <- factor(as.character(tdata$x), levels = unique(tdata$x))
          warning("Missing levels of 'x' have been removed")
       } 
    } 

  fmla <- formula(tdata)
  if (!is.list(replications(fmla, tdata))){
   tmp1 <- aggregate(tdata[, 1], list(tdata[, 2]), FUN = mean)
   tmp2 <- aggregate(tdata[, 1], list(tdata[, 2]), FUN = length)
   ord.data <- tdata[order(tdata[, 2]),]
   Treat.m <- rep(tmp1$x, tmp2$x)
   Among <- (Treat.m - rep(mean(tdata[, 1]), nrow(tdata)))^2
   Within <- (ord.data[, 1] - Treat.m)^2
   MS <- c(sum(Among), sum(Within)) / c(length(tmp2$x) - 1, length(tmp2$x) * (tmp2$x[1]-1))
   var.a <- (MS[1] - MS[2]) / tmp2$x[1]
  return(var.a / (var.a + MS[2]))
  } else{
     tmpbb <- anova(aov(fmla, data = tdata))
     MSa <- tmpbb[3][1, 1]
     tmp.outj <- aggregate(y ~ x, data = tdata, FUN = length)$y
     var.a <- (MSa - tmpbb[3][2, 1]) /((1 / (a - 1)) * (sum(tmp.outj) - (sum(tmp.outj^2) / sum(tmp.outj))))
    return(var.a / (tmpbb[3][2,1] + var.a))
    }
}

##

ICCest221 <- function(x, y, data=data, alpha=0.05, CI.type=c("THD", "Smith"))
{
  if(!is.data.frame(data))
       stop("object dataframe is of the type '", class(data), "' and must be of type 'data.frame'")
  square<-function(z){z^2}
  ICCcall <- Call <-match.call()
  xc<-as.character(ICCcall[[2L]])
  yc<-as.character(ICCcall[[3L]])
  inds<-unique(data[xc])[[1]]
  a<-length(inds)
  tdata<-data.frame(data[yc], data[xc])
  if(!is.factor(tdata[,2])){
    warning("x has been coerced to a factor")
    tdata[,2]<-as.factor(tdata[,2])
    }
  if(length(levels(tdata[,2])) > length(unique(tdata[,2])))
    stop("levels assigned to 'x' are greater than the actual levels of 'x'")
  
  nacheck <- function(x){
     result <- unlist(lapply(x, FUN = is.na))
     if(all(result)){
     return(FALSE)
       } else{return(TRUE)}
  } 
  
  unstackedt <- unstack(tdata)
  TF <- lapply(unstackedt, FUN = nacheck)
  if(any(unlist(TF) == FALSE)){
    warning("one or more groups in 'x' do not contain any records and have been removed")
    tdata <- tdata[!tdata[,2] == which(unlist(TF) == FALSE), ]
  }

  tmpbb<-anova(aov(tdata[,1]~tdata[,2], data=tdata))
  num.df<-tmpbb[1][1,1]
  denom.df<-tmpbb[1][2,1];
  MSa<-tmpbb[3][1,1]
  MSw<-tmpbb[3][2,1]
  
  tmp.outj<-data.frame(lapply(unstack(na.omit(tdata)), FUN=length))
  k<-(1/(a-1))*(sum(tmp.outj)-(sum(square(tmp.outj))/(sum(tmp.outj))))
  
  var.w<-MSw
  var.a<-(MSa-MSw)/(k)
  r<-var.a/(var.w + var.a)

  low.F<-qf(alpha/2, num.df, denom.df, lower.tail=FALSE)
  N<-dim(na.omit(tdata))[1]
  n.bar<-N/a
  n.not<-n.bar-sum(square(tmp.outj-n.bar)/((a-1)*N))    
    type<-match.arg(CI.type)
      if(type=="THD"){
    up.F<-qf(alpha/2, denom.df, num.df, lower.tail=FALSE)   
    FL<-(MSa/MSw)/low.F
    FU<-(MSa/MSw)*up.F
    low.CI<-(FL-1)/(FL+n.not-1)
    up.CI<-(FU-1)/(FU+n.not-1)
       }
      if(type=="Smith"){
    z.not<-qnorm(alpha/2)
    Vr<-(2*square(1-r)/square(n.not))*((square((1+r*(n.not-1)))/(N-a))+((a-1)*(1-r)*(1+r*(2*n.not-1))+square(r)*(sum(square(tmp.outj))-2*(1/N)*sum((tmp.outj^3))+(1/square(N))*square(sum(square(tmp.outj)))))/square(a-1))
    low.CI<-r+z.not*sqrt(Vr)
    up.CI<-r-z.not*sqrt(Vr) 
      }

list(ICC=r, LowerCI=low.CI, UpperCI=up.CI, N=a, k=k, varw=var.w, vara=var.a)
}

##

ICCest230 <- function(x, y, data = NULL, alpha = 0.05, CI.type = c("THD", "Smith")){
  square <- function(z){z^2}
  icall <- list(y = substitute(y), x = substitute(x))

  if(is.character(icall$y)){
    warning("passing a character string to 'y' is deprecated since ICC vesion 2.3.0 and will not be supported in future versions. The argument to 'y' should either be an unquoted column name of 'data' or an object")
    if(missing(data)) stop("Supply either the unquoted name of the object containing 'y' or supply both 'data' and then 'y' as an unquoted column name to 'data'")
    icall$y <- eval(as.name(y), data, parent.frame())
  } 
  if(is.name(icall$y)) icall$y <- eval(icall$y, data, parent.frame())
  if(is.call(icall$y)) icall$y <- eval(icall$y, data, parent.frame())
  if(is.character(icall$y)) icall$y <- eval(as.name(icall$y), data, parent.frame())


  if(is.character(icall$x)){
    warning("passing a character string to 'x' is deprecated since ICC vesion 2.3.0 and will not be supported in future versions. The argument to 'x' should either be an unquoted column name of 'data' or an object")
    if(missing(data)) stop("Supply either the unquoted name of the object containing 'x' or supply both 'data' and then 'x' as an unquoted column name to 'data'")
    icall$x <- eval(as.name(x), data, parent.frame())
  } 
  if(is.name(icall$x)) icall$x <- eval(icall$x, data, parent.frame())
  if(is.call(icall$x)) icall$x <- eval(icall$x, data, parent.frame())
  if(is.character(icall$x) && length(icall$x) == 1) icall$x <- eval(as.name(icall$x), data, parent.frame())


  tdata <- data.frame(icall)
  tdata <- na.omit(tdata)
  a <- length(unique(tdata$x))

  if(!is.null(attributes(tdata)$na.action)){
     warning(cat("NAs removed from rows:\n", unclass(attributes(tdata)$na.action), "\n"))
  } 
  if(!is.factor(tdata$x)){
     warning("'x' has been coerced to a factor")
     tdata$x <- as.factor(tdata$x)
  } else{
       if(length(levels(tdata$x)) > a){
          tdata$x <- factor(as.character(tdata$x), levels = unique(tdata$x))
          warning("Missing levels of 'x' have been removed")
       } 
    } 

  tmpbb <- anova(aov(y ~ x, data = tdata))
  num.df <- tmpbb$Df[1]
  denom.df <- tmpbb$Df[2]
  MSa <- tmpbb$'Mean Sq'[1]
  MSw <- var.w <- tmpbb$'Mean Sq'[2]
  tmp.outj <- aggregate(y ~ x, data = tdata, FUN = length)$y
  k <- (1/(a-1)) * (sum(tmp.outj) - (sum(square(tmp.outj)) / sum(tmp.outj)))
  var.a <- (MSa - MSw) / k
  r <- var.a / (var.w + var.a)

  low.F <- qf(alpha/2, num.df, denom.df, lower.tail = FALSE)
  N <- nrow(tdata)
  n.bar <- N/a
  n.not <- n.bar - sum(square(tmp.outj - n.bar) / ((a - 1) * N))    
    type <- match.arg(CI.type)
      if(type == "THD"){
    up.F <- qf(alpha/2, denom.df, num.df, lower.tail = FALSE)   
    FL <- (MSa/MSw) / low.F
    FU <- (MSa/MSw) * up.F
    low.CI <- (FL - 1) / (FL + n.not - 1)
    up.CI <- (FU - 1) / (FU + n.not - 1)
       }
      if(type == "Smith"){
    z.not <- qnorm(alpha/2)
    Vr <- (2*square(1-r) / square(n.not)) * ((square((1+r*(n.not-1))) / (N-a)) + ((a-1)*(1-r)*(1+r*(2*n.not-1))+square(r)*(sum(square(tmp.outj))-2*(1/N)*sum((tmp.outj^3))+(1/square(N))*square(sum(square(tmp.outj)))))/ square(a-1))
    low.CI <- r + z.not * sqrt(Vr)
    up.CI <- r - z.not * sqrt(Vr) 
      }

list(ICC = r, LowerCI = low.CI, UpperCI = up.CI, N = a, k = k, varw = var.w, vara = var.a)
}

Benchmarks

Simulate to have ICC=0.3, n=1, nid=10, and kin=5

tdf <- ICCdf(ICCin = 0.3, n = 1, nid = 10, kin = 5)

library(microbenchmark)
mbmrk <- microbenchmark(ICCbare221(x = id, y = t1, data = tdf),
    ICCbare230(x = id, y = t1, data = tdf), 
    ICCest221(x = id, y = t1, data = tdf), 
    ICCest230(x = id, y = t1, data = tdf), times = 100)

library(ggplot2)
autoplot(mbmrk)

mbmrk

## Unit: milliseconds
##                                    expr        min         lq       mean
##  ICCbare221(x = id, y = t1, data = tdf)   6.413895   6.498676   7.639075
##  ICCbare230(x = id, y = t1, data = tdf)   6.745108   6.849444   7.989532
##   ICCest221(x = id, y = t1, data = tdf) 132.079083 133.457976 138.546939
##   ICCest230(x = id, y = t1, data = tdf)  73.696330  74.732362  77.466744
##      median         uq       max neval
##    6.638810   6.880297  44.30365   100
##    6.989106   7.174829  46.37274   100
##  134.682576 135.949623 180.47606   100
##   75.133194  76.301067 113.75256   100

Testing the function initialization

Create a simplified version of the functions. Since I only changed the beginning call between nadiv 2.2.1 and 2.3.0, then only run through the functions until the temporary data frame has been created:

ICCbare221start <-function(x, y, data){
  ICCcall <- Call <-match.call()
  xc<-as.character(ICCcall[[2L]])
  yc<-as.character(ICCcall[[3L]])
  inds<-unique(data[xc])[[1]]
  tdata<-data.frame(data[yc], data[xc])
  if(!is.factor(tdata[,2])){
    warning("x has been coerced to a factor")
    tdata[,2]<-as.factor(tdata[,2])
    }  

tdata
}

####

ICCbare230start <- function(x, y, data = NULL){
  icall <- list(y = substitute(y), x = substitute(x))

  if(is.character(icall$y)){
    warning("passing a character string to 'y' is deprecated since ICC vesion 2.3.0 and will not be supported in future versions. The argument to 'y' should either be an unquoted column name of 'data' or an object")
    if(missing(data)) stop("Supply either the unquoted name of the object containing 'y' or supply both 'data' and then 'y' as an unquoted column name to 'data'")
    icall$y <- eval(as.name(y), data, parent.frame())
  } 
  if(is.name(icall$y)) icall$y <- eval(icall$y, data, parent.frame())
  if(is.call(icall$y)) icall$y <- eval(icall$y, data, parent.frame())
  if(is.character(icall$y)) icall$y <- eval(as.name(icall$y), data, parent.frame())


  if(is.character(icall$x)){
    warning("passing a character string to 'x' is deprecated since ICC vesion 2.3.0 and will not be supported in future versions. The argument to 'x' should either be an unquoted column name of 'data' or an object")
    if(missing(data)) stop("Supply either the unquoted name of the object containing 'x' or supply both 'data' and then 'x' as an unquoted column name to 'data'")
    icall$x <- eval(as.name(x), data, parent.frame())
  } 
  if(is.name(icall$x)) icall$x <- eval(icall$x, data, parent.frame())
  if(is.call(icall$x)) icall$x <- eval(icall$x, data, parent.frame())
  if(is.character(icall$x) && length(icall$x) == 1) icall$x <- eval(as.name(icall$x), data, parent.frame())


  tdata <- data.frame(icall)
  tdata <- na.omit(tdata)
  a <- length(unique(tdata$x))

  if(!is.null(attributes(tdata)$na.action)){
     warning(cat("NAs removed from rows:\n", unclass(attributes(tdata)$na.action), "\n"))
  } 
  if(!is.factor(tdata$x)){
     warning("'x' has been coerced to a factor")
     tdata$x <- as.factor(tdata$x)
  } else{
       if(length(levels(tdata$x)) > a){
          tdata$x <- factor(as.character(tdata$x), levels = unique(tdata$x))
          warning("Missing levels of 'x' have been removed")
       } 
    } 

tdata
}

Simplified function benchmarks

tdf <- ICCdf(ICCin = 0.3, n = 1, nid = 10, kin = 5)

library(microbenchmark)
mbmrk <- microbenchmark(ICCbare221start(x = id, y = t1, data = tdf), ICCbare230start(x = id, y = t1, data = tdf), times = 1000)

library(ggplot2)
autoplot(mbmrk)

mbmrk

## Unit: microseconds
##                                         expr     min       lq     mean
##  ICCbare221start(x = id, y = t1, data = tdf) 214.947 226.4460 240.5051
##  ICCbare230start(x = id, y = t1, data = tdf) 551.429 565.8955 593.0327
##    median      uq      max neval
##  231.0165 236.483 1909.913  1000
##  571.2210 579.655 2463.979  1000
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment