Below compares the performance of the R function ICC::ICCbare
between the current and the previous package versions. The main change between v2.2.1 and 2.3.0 was to greatly improve the non-standard evaluation of all functions in the ICC
package. I try and get at how this might affect the time it takes to run ICC::ICCbare
in the code below. First, I load the two versions of the function (cut and pasted from the package tarballs) as well as the two versions of ICC::ICCest
(which is the main ICC calculation function) for comparison. Then I use the R package microbenchmark
to time the various functions. I then strip the ICCbare
functions down to what was changed between versions (the interpretation of the function arguments and set up) to hone in on any changes to the speed that may have arisen from the changes.
First, create a function to make a data.frame
with variables and data
for which an Intraclass Correlation Coefficient (ICC) can be computed.
ICCdf <- function(ICCin, n = 1, nid = 1, kin = 2, muin = 10, seedin = 101){
set.seed(seedin)
tdf <- data.frame(id = rep(c(t(sapply(letters, FUN = function(x){paste0(x, seq(nid))}))), each = kin), matrix(NA, nrow = 26*nid*kin, ncol = n))
names(tdf) <- c("id", paste0("t", seq(n)))
for(c in 2:ncol(tdf)){
tdf[, c] <- rep(rnorm(26*nid, muin, sqrt(ICCin)), each = kin) + rnorm(26*nid*kin, 0, sqrt(1-ICCin))
}
tdf
}
ICCin
Expected value of ICCn
Number of traits/times to calcluate ICCnid
Number of identities in units of 26 (the alphabet)kin
Number of observations per idmuin
Measurement/trait mean
Now load the functions from nadiv v2.2.1 and 2.3.0. Load both the
ICCbare
and ICCest
functions to show how much faster the ICCbare
is over the more involved ICCest
.
ICCbare221<-function(x, y, data){
ICCcall <- Call <-match.call()
xc<-as.character(ICCcall[[2L]])
yc<-as.character(ICCcall[[3L]])
inds<-unique(data[xc])[[1]]
tdata<-data.frame(data[yc], data[xc])
if(!is.factor(tdata[,2])){
warning("x has been coerced to a factor")
tdata[,2]<-as.factor(tdata[,2])
}
fmla<-formula(tdata)
if (!is.list(replications(fmla, tdata))){
tmp1<-aggregate(tdata[,1], list(tdata[,2]),FUN=mean)
tmp2<-aggregate(tdata[,1], list(tdata[,2]), FUN=length)
ord.data<-tdata[order(tdata[,2]),]
Treat.m<-rep(tmp1$x,tmp2$x)
Among<-(Treat.m-rep(mean(tdata[,1]),nrow(tdata)))^2
Within<-(ord.data[,1]-Treat.m)^2
MS<-(c(sum(Among),sum(Within)))/(c(length(tmp2$x)-1, length(tmp2$x)*(tmp2$x[1]-1)))
var.a<-(MS[1]-MS[2])/tmp2$x[1]
r<-var.a/(var.a+MS[2])
}
else{
tmpbb<-anova(aov(fmla, data=tdata))
MSa<-tmpbb[3][1,1]
tmp.outj<-data.frame(lapply(unstack(na.omit(tdata)), FUN=length))
var.a<-(MSa-tmpbb[3][2,1])/((1/(length(inds)-1))*(sum(tmp.outj)-(sum((tmp.outj^2))/(sum(tmp.outj)))))
r<-var.a/(tmpbb[3][2,1] + var.a)
}
list(ICC=r)
}
##
ICCbare230 <- function(x, y, data = NULL){
icall <- list(y = substitute(y), x = substitute(x))
if(is.character(icall$y)){
warning("passing a character string to 'y' is deprecated since ICC vesion 2.3.0 and will not be supported in future versions. The argument to 'y' should either be an unquoted column name of 'data' or an object")
if(missing(data)) stop("Supply either the unquoted name of the object containing 'y' or supply both 'data' and then 'y' as an unquoted column name to 'data'")
icall$y <- eval(as.name(y), data, parent.frame())
}
if(is.name(icall$y)) icall$y <- eval(icall$y, data, parent.frame())
if(is.call(icall$y)) icall$y <- eval(icall$y, data, parent.frame())
if(is.character(icall$y)) icall$y <- eval(as.name(icall$y), data, parent.frame())
if(is.character(icall$x)){
warning("passing a character string to 'x' is deprecated since ICC vesion 2.3.0 and will not be supported in future versions. The argument to 'x' should either be an unquoted column name of 'data' or an object")
if(missing(data)) stop("Supply either the unquoted name of the object containing 'x' or supply both 'data' and then 'x' as an unquoted column name to 'data'")
icall$x <- eval(as.name(x), data, parent.frame())
}
if(is.name(icall$x)) icall$x <- eval(icall$x, data, parent.frame())
if(is.call(icall$x)) icall$x <- eval(icall$x, data, parent.frame())
if(is.character(icall$x) && length(icall$x) == 1) icall$x <- eval(as.name(icall$x), data, parent.frame())
tdata <- data.frame(icall)
tdata <- na.omit(tdata)
a <- length(unique(tdata$x))
if(!is.null(attributes(tdata)$na.action)){
warning(cat("NAs removed from rows:\n", unclass(attributes(tdata)$na.action), "\n"))
}
if(!is.factor(tdata$x)){
warning("'x' has been coerced to a factor")
tdata$x <- as.factor(tdata$x)
} else{
if(length(levels(tdata$x)) > a){
tdata$x <- factor(as.character(tdata$x), levels = unique(tdata$x))
warning("Missing levels of 'x' have been removed")
}
}
fmla <- formula(tdata)
if (!is.list(replications(fmla, tdata))){
tmp1 <- aggregate(tdata[, 1], list(tdata[, 2]), FUN = mean)
tmp2 <- aggregate(tdata[, 1], list(tdata[, 2]), FUN = length)
ord.data <- tdata[order(tdata[, 2]),]
Treat.m <- rep(tmp1$x, tmp2$x)
Among <- (Treat.m - rep(mean(tdata[, 1]), nrow(tdata)))^2
Within <- (ord.data[, 1] - Treat.m)^2
MS <- c(sum(Among), sum(Within)) / c(length(tmp2$x) - 1, length(tmp2$x) * (tmp2$x[1]-1))
var.a <- (MS[1] - MS[2]) / tmp2$x[1]
return(var.a / (var.a + MS[2]))
} else{
tmpbb <- anova(aov(fmla, data = tdata))
MSa <- tmpbb[3][1, 1]
tmp.outj <- aggregate(y ~ x, data = tdata, FUN = length)$y
var.a <- (MSa - tmpbb[3][2, 1]) /((1 / (a - 1)) * (sum(tmp.outj) - (sum(tmp.outj^2) / sum(tmp.outj))))
return(var.a / (tmpbb[3][2,1] + var.a))
}
}
##
ICCest221 <- function(x, y, data=data, alpha=0.05, CI.type=c("THD", "Smith"))
{
if(!is.data.frame(data))
stop("object dataframe is of the type '", class(data), "' and must be of type 'data.frame'")
square<-function(z){z^2}
ICCcall <- Call <-match.call()
xc<-as.character(ICCcall[[2L]])
yc<-as.character(ICCcall[[3L]])
inds<-unique(data[xc])[[1]]
a<-length(inds)
tdata<-data.frame(data[yc], data[xc])
if(!is.factor(tdata[,2])){
warning("x has been coerced to a factor")
tdata[,2]<-as.factor(tdata[,2])
}
if(length(levels(tdata[,2])) > length(unique(tdata[,2])))
stop("levels assigned to 'x' are greater than the actual levels of 'x'")
nacheck <- function(x){
result <- unlist(lapply(x, FUN = is.na))
if(all(result)){
return(FALSE)
} else{return(TRUE)}
}
unstackedt <- unstack(tdata)
TF <- lapply(unstackedt, FUN = nacheck)
if(any(unlist(TF) == FALSE)){
warning("one or more groups in 'x' do not contain any records and have been removed")
tdata <- tdata[!tdata[,2] == which(unlist(TF) == FALSE), ]
}
tmpbb<-anova(aov(tdata[,1]~tdata[,2], data=tdata))
num.df<-tmpbb[1][1,1]
denom.df<-tmpbb[1][2,1];
MSa<-tmpbb[3][1,1]
MSw<-tmpbb[3][2,1]
tmp.outj<-data.frame(lapply(unstack(na.omit(tdata)), FUN=length))
k<-(1/(a-1))*(sum(tmp.outj)-(sum(square(tmp.outj))/(sum(tmp.outj))))
var.w<-MSw
var.a<-(MSa-MSw)/(k)
r<-var.a/(var.w + var.a)
low.F<-qf(alpha/2, num.df, denom.df, lower.tail=FALSE)
N<-dim(na.omit(tdata))[1]
n.bar<-N/a
n.not<-n.bar-sum(square(tmp.outj-n.bar)/((a-1)*N))
type<-match.arg(CI.type)
if(type=="THD"){
up.F<-qf(alpha/2, denom.df, num.df, lower.tail=FALSE)
FL<-(MSa/MSw)/low.F
FU<-(MSa/MSw)*up.F
low.CI<-(FL-1)/(FL+n.not-1)
up.CI<-(FU-1)/(FU+n.not-1)
}
if(type=="Smith"){
z.not<-qnorm(alpha/2)
Vr<-(2*square(1-r)/square(n.not))*((square((1+r*(n.not-1)))/(N-a))+((a-1)*(1-r)*(1+r*(2*n.not-1))+square(r)*(sum(square(tmp.outj))-2*(1/N)*sum((tmp.outj^3))+(1/square(N))*square(sum(square(tmp.outj)))))/square(a-1))
low.CI<-r+z.not*sqrt(Vr)
up.CI<-r-z.not*sqrt(Vr)
}
list(ICC=r, LowerCI=low.CI, UpperCI=up.CI, N=a, k=k, varw=var.w, vara=var.a)
}
##
ICCest230 <- function(x, y, data = NULL, alpha = 0.05, CI.type = c("THD", "Smith")){
square <- function(z){z^2}
icall <- list(y = substitute(y), x = substitute(x))
if(is.character(icall$y)){
warning("passing a character string to 'y' is deprecated since ICC vesion 2.3.0 and will not be supported in future versions. The argument to 'y' should either be an unquoted column name of 'data' or an object")
if(missing(data)) stop("Supply either the unquoted name of the object containing 'y' or supply both 'data' and then 'y' as an unquoted column name to 'data'")
icall$y <- eval(as.name(y), data, parent.frame())
}
if(is.name(icall$y)) icall$y <- eval(icall$y, data, parent.frame())
if(is.call(icall$y)) icall$y <- eval(icall$y, data, parent.frame())
if(is.character(icall$y)) icall$y <- eval(as.name(icall$y), data, parent.frame())
if(is.character(icall$x)){
warning("passing a character string to 'x' is deprecated since ICC vesion 2.3.0 and will not be supported in future versions. The argument to 'x' should either be an unquoted column name of 'data' or an object")
if(missing(data)) stop("Supply either the unquoted name of the object containing 'x' or supply both 'data' and then 'x' as an unquoted column name to 'data'")
icall$x <- eval(as.name(x), data, parent.frame())
}
if(is.name(icall$x)) icall$x <- eval(icall$x, data, parent.frame())
if(is.call(icall$x)) icall$x <- eval(icall$x, data, parent.frame())
if(is.character(icall$x) && length(icall$x) == 1) icall$x <- eval(as.name(icall$x), data, parent.frame())
tdata <- data.frame(icall)
tdata <- na.omit(tdata)
a <- length(unique(tdata$x))
if(!is.null(attributes(tdata)$na.action)){
warning(cat("NAs removed from rows:\n", unclass(attributes(tdata)$na.action), "\n"))
}
if(!is.factor(tdata$x)){
warning("'x' has been coerced to a factor")
tdata$x <- as.factor(tdata$x)
} else{
if(length(levels(tdata$x)) > a){
tdata$x <- factor(as.character(tdata$x), levels = unique(tdata$x))
warning("Missing levels of 'x' have been removed")
}
}
tmpbb <- anova(aov(y ~ x, data = tdata))
num.df <- tmpbb$Df[1]
denom.df <- tmpbb$Df[2]
MSa <- tmpbb$'Mean Sq'[1]
MSw <- var.w <- tmpbb$'Mean Sq'[2]
tmp.outj <- aggregate(y ~ x, data = tdata, FUN = length)$y
k <- (1/(a-1)) * (sum(tmp.outj) - (sum(square(tmp.outj)) / sum(tmp.outj)))
var.a <- (MSa - MSw) / k
r <- var.a / (var.w + var.a)
low.F <- qf(alpha/2, num.df, denom.df, lower.tail = FALSE)
N <- nrow(tdata)
n.bar <- N/a
n.not <- n.bar - sum(square(tmp.outj - n.bar) / ((a - 1) * N))
type <- match.arg(CI.type)
if(type == "THD"){
up.F <- qf(alpha/2, denom.df, num.df, lower.tail = FALSE)
FL <- (MSa/MSw) / low.F
FU <- (MSa/MSw) * up.F
low.CI <- (FL - 1) / (FL + n.not - 1)
up.CI <- (FU - 1) / (FU + n.not - 1)
}
if(type == "Smith"){
z.not <- qnorm(alpha/2)
Vr <- (2*square(1-r) / square(n.not)) * ((square((1+r*(n.not-1))) / (N-a)) + ((a-1)*(1-r)*(1+r*(2*n.not-1))+square(r)*(sum(square(tmp.outj))-2*(1/N)*sum((tmp.outj^3))+(1/square(N))*square(sum(square(tmp.outj)))))/ square(a-1))
low.CI <- r + z.not * sqrt(Vr)
up.CI <- r - z.not * sqrt(Vr)
}
list(ICC = r, LowerCI = low.CI, UpperCI = up.CI, N = a, k = k, varw = var.w, vara = var.a)
}
Simulate to have ICC=0.3
, n=1
, nid=10
, and kin=5
tdf <- ICCdf(ICCin = 0.3, n = 1, nid = 10, kin = 5)
library(microbenchmark)
mbmrk <- microbenchmark(ICCbare221(x = id, y = t1, data = tdf),
ICCbare230(x = id, y = t1, data = tdf),
ICCest221(x = id, y = t1, data = tdf),
ICCest230(x = id, y = t1, data = tdf), times = 100)
library(ggplot2)
autoplot(mbmrk)
mbmrk
## Unit: milliseconds
## expr min lq mean
## ICCbare221(x = id, y = t1, data = tdf) 6.413895 6.498676 7.639075
## ICCbare230(x = id, y = t1, data = tdf) 6.745108 6.849444 7.989532
## ICCest221(x = id, y = t1, data = tdf) 132.079083 133.457976 138.546939
## ICCest230(x = id, y = t1, data = tdf) 73.696330 74.732362 77.466744
## median uq max neval
## 6.638810 6.880297 44.30365 100
## 6.989106 7.174829 46.37274 100
## 134.682576 135.949623 180.47606 100
## 75.133194 76.301067 113.75256 100
Create a simplified version of the functions. Since I only changed the beginning call between nadiv 2.2.1 and 2.3.0, then only run through the functions until the temporary data frame has been created:
ICCbare221start <-function(x, y, data){
ICCcall <- Call <-match.call()
xc<-as.character(ICCcall[[2L]])
yc<-as.character(ICCcall[[3L]])
inds<-unique(data[xc])[[1]]
tdata<-data.frame(data[yc], data[xc])
if(!is.factor(tdata[,2])){
warning("x has been coerced to a factor")
tdata[,2]<-as.factor(tdata[,2])
}
tdata
}
####
ICCbare230start <- function(x, y, data = NULL){
icall <- list(y = substitute(y), x = substitute(x))
if(is.character(icall$y)){
warning("passing a character string to 'y' is deprecated since ICC vesion 2.3.0 and will not be supported in future versions. The argument to 'y' should either be an unquoted column name of 'data' or an object")
if(missing(data)) stop("Supply either the unquoted name of the object containing 'y' or supply both 'data' and then 'y' as an unquoted column name to 'data'")
icall$y <- eval(as.name(y), data, parent.frame())
}
if(is.name(icall$y)) icall$y <- eval(icall$y, data, parent.frame())
if(is.call(icall$y)) icall$y <- eval(icall$y, data, parent.frame())
if(is.character(icall$y)) icall$y <- eval(as.name(icall$y), data, parent.frame())
if(is.character(icall$x)){
warning("passing a character string to 'x' is deprecated since ICC vesion 2.3.0 and will not be supported in future versions. The argument to 'x' should either be an unquoted column name of 'data' or an object")
if(missing(data)) stop("Supply either the unquoted name of the object containing 'x' or supply both 'data' and then 'x' as an unquoted column name to 'data'")
icall$x <- eval(as.name(x), data, parent.frame())
}
if(is.name(icall$x)) icall$x <- eval(icall$x, data, parent.frame())
if(is.call(icall$x)) icall$x <- eval(icall$x, data, parent.frame())
if(is.character(icall$x) && length(icall$x) == 1) icall$x <- eval(as.name(icall$x), data, parent.frame())
tdata <- data.frame(icall)
tdata <- na.omit(tdata)
a <- length(unique(tdata$x))
if(!is.null(attributes(tdata)$na.action)){
warning(cat("NAs removed from rows:\n", unclass(attributes(tdata)$na.action), "\n"))
}
if(!is.factor(tdata$x)){
warning("'x' has been coerced to a factor")
tdata$x <- as.factor(tdata$x)
} else{
if(length(levels(tdata$x)) > a){
tdata$x <- factor(as.character(tdata$x), levels = unique(tdata$x))
warning("Missing levels of 'x' have been removed")
}
}
tdata
}
tdf <- ICCdf(ICCin = 0.3, n = 1, nid = 10, kin = 5)
library(microbenchmark)
mbmrk <- microbenchmark(ICCbare221start(x = id, y = t1, data = tdf), ICCbare230start(x = id, y = t1, data = tdf), times = 1000)
library(ggplot2)
autoplot(mbmrk)
mbmrk
## Unit: microseconds
## expr min lq mean
## ICCbare221start(x = id, y = t1, data = tdf) 214.947 226.4460 240.5051
## ICCbare230start(x = id, y = t1, data = tdf) 551.429 565.8955 593.0327
## median uq max neval
## 231.0165 236.483 1909.913 1000
## 571.2210 579.655 2463.979 1000