nathanvan/june-2013-stat-bytes-demo.R

## june-2013-stat-bytes-demo.R
## Stat Bytes supplement 6/21/2013
## Nathan VanHoudnos

## Quick demo to test Rprof
Rprof("profile1.out", line.profiling=TRUE)
source('simple.data.R')
Rprof(NULL)

summaryRprof("profile1.out", lines = "show")$by.line


## And to play with Rcpp
## install.packages(c('inline','Rcpp','RcppArmadillo'))
require(Rcpp)
require(RcppArmadillo)
require(inline)

## Here is a pure R version of a function to calculate
## a log likelihood in a 2PL IRT model
log.prob.apply.r <- function(U.data, th, a, b, applyBy) {
  P.persons  <- nrow(U.data)
  term.1     <- outer(th, a)
  term.2     <- matrix(rep(a*b, P.persons), nrow=P.persons,
                byrow=TRUE)
  P.prob <- 1/(1 + exp(term.2 - term.1))

  ## Use the inverse CDF trick
  log.bernoulli <- U.data*log(P.prob) + (1-U.data)*log(1-P.prob)

  return(apply(log.bernoulli,applyBy,sum))
}

## Write the C++ function in a string
## N.B. You can also write it in a separate
##      file, then load it into a string if you
##      prefer. See:
## fileName <- 'log.prob.apply.cpp.src.cpp'
## log.prob.apply.cpp.src <- readChar(fileName, file.info(fileName)$size)
log.prob.apply.cpp.src <- '
// Import the R objects passed to the function
arma::mat U = Rcpp::as<arma::mat>(UdataR);
arma::colvec theta = Rcpp::as<arma::colvec>(thetaR);
arma::rowvec a = Rcpp::as<arma::rowvec>(aR);
arma::rowvec b = Rcpp::as<arma::rowvec>(bR);
int appBy = as<int>(appByS);
//
// Find out the dimensions of the U matrix
int Ppersons = U.n_rows;
int Iitems   = U.n_cols;
//
// Declare variables to use
arma::mat term1(Ppersons,Iitems);
arma::mat term2(Ppersons,Iitems);
arma::mat Pprob(Ppersons,Iitems);
arma::mat logBernoulli(Ppersons,Iitems);;
//
// Calculate the terms for the logit
term1 = theta*a;  //Outer product
//
// Fill term2 with elementwise multiplication
// (every row is the same)
for (int row=0; row<Ppersons; row++) {
  term2.row(row) = a%b; //Schur product (element wise)
}
//
// Calculate the logit
Pprob = 1/(1+exp(term2-term1));
//
// Use the inverse CDF trick
logBernoulli = U%log(Pprob) + (1-U)%log(1-Pprob);
//
// Return the likelihood
if ( appBy == 1 ) {
   logBernoulli = cumsum(logBernoulli, 1);
   theta = logBernoulli.col(Iitems-1);
   return(Rcpp::wrap( theta ));
} else {
   logBernoulli = cumsum(logBernoulli, 0);
   a = logBernoulli.row(Ppersons-1);
   return(Rcpp::wrap(  a    ));
}
'

## C++ version
log.prob.apply.cpp <- cxxfunction(
                 signature(
                   UdataR="numeric",
                   thetaR="numeric", aR="numeric", bR="numeric",
                   appByS="int"),
                 body=log.prob.apply.cpp.src, plugin="RcppArmadillo")

## Use the 100 datasets from the profiling example (see 'sample.data.R')

system.time(
a.disc.lik.r <- lapply( tmp, function(holder){
  log.prob.apply.r(U.data=holder$U,
                 th=holder$theta.abl,
                 a=holder$a.disc,
                 b=holder$b.diff,
                 applyBy=2)
} ) ) ## 1.1413 seconds

system.time(
a.disc.lik.cpp <- lapply( tmp, function(holder){
  as.numeric(log.prob.apply.cpp(UdataR=holder$U,
                 thetaR=holder$theta.abl,
                 aR=holder$a.disc,
                 bR=holder$b.diff,
                 appByS=2))
} ) ) ## 0.888 seconds

stopifnot(all.equal(a.disc.lik.r, a.disc.lik.cpp))

## So a 22% percent improvement by compiling the code
## Not bad, but not an order of magnitude either.

## simple.data.R
## Stat Bytes supplement 6/21/2013 -- Nathan VanHoudnos

## Set the random-number generator seed,
## to make the results reproducible
set.seed(314159)

make.data <- function(I.items=30, P.persons=2000) {
# set the number of items I and persons P
# I.items    <- 30
# P.persons  <- 2000

# set the fixed item and population parameters
a.disc     <- 1 + runif(I.items,-0.5,0.5)
b.diff     <- seq(-3,3,length=I.items)

mean.theta <- 0
sig2.theta <- (1.25)^2

# generate thetas and the I x P matrix of response probabilities
theta.abl  <- rnorm(P.persons, mean=mean.theta, sd=sqrt(sig2.theta))
term.1     <- outer(theta.abl, a.disc)
term.2     <- matrix(rep(a.disc*b.diff, P.persons), nrow=P.persons,
                byrow=TRUE)

## Using lower level functions improves speed, but costs clarity
P.prob.1     <- plogis(term.1-term.2)
P.prob.2     <- 1/(1 + exp(term.2 - term.1))
stopifnot(all.equal(P.prob.1, P.prob.2))

## ifelse is linear time, casting is constant time
U.flip       <- runif(I.items*P.persons)
U.1          <- ifelse(U.flip < P.prob.1,1,0)
U.2          <- as.numeric(U.flip < P.prob.2)
dim(U.2)     <- c(P.persons, I.items)
stopifnot(all.equal(U.1, U.2))
return(list(U=U.1, theta.abl=theta.abl, a.disc=a.disc, b.diff=b.diff))
}

tmp <- lapply(1:100,function(ii){make.data()})
	## Stat Bytes supplement 6/21/2013
	## Nathan VanHoudnos

	## Quick demo to test Rprof
	Rprof("profile1.out", line.profiling=TRUE)
	source('simple.data.R')
	Rprof(NULL)

	summaryRprof("profile1.out", lines = "show")$by.line


	## And to play with Rcpp
	## install.packages(c('inline','Rcpp','RcppArmadillo'))
	require(Rcpp)
	require(RcppArmadillo)
	require(inline)

	## Here is a pure R version of a function to calculate
	## a log likelihood in a 2PL IRT model
	log.prob.apply.r <- function(U.data, th, a, b, applyBy) {
	P.persons <- nrow(U.data)
	term.1 <- outer(th, a)
	term.2 <- matrix(rep(a*b, P.persons), nrow=P.persons,
	byrow=TRUE)
	P.prob <- 1/(1 + exp(term.2 - term.1))

	## Use the inverse CDF trick
	log.bernoulli <- U.datalog(P.prob) + (1-U.data)log(1-P.prob)

	return(apply(log.bernoulli,applyBy,sum))
	}

	## Write the C++ function in a string
	## N.B. You can also write it in a separate
	## file, then load it into a string if you
	## prefer. See:
	## fileName <- 'log.prob.apply.cpp.src.cpp'
	## log.prob.apply.cpp.src <- readChar(fileName, file.info(fileName)$size)
	log.prob.apply.cpp.src <- '
	// Import the R objects passed to the function
	arma::mat U = Rcpp::as<arma::mat>(UdataR);
	arma::colvec theta = Rcpp::as<arma::colvec>(thetaR);
	arma::rowvec a = Rcpp::as<arma::rowvec>(aR);
	arma::rowvec b = Rcpp::as<arma::rowvec>(bR);
	int appBy = as<int>(appByS);
	//
	// Find out the dimensions of the U matrix
	int Ppersons = U.n_rows;
	int Iitems = U.n_cols;
	//
	// Declare variables to use
	arma::mat term1(Ppersons,Iitems);
	arma::mat term2(Ppersons,Iitems);
	arma::mat Pprob(Ppersons,Iitems);
	arma::mat logBernoulli(Ppersons,Iitems);;
	//
	// Calculate the terms for the logit
	term1 = theta*a; //Outer product
	//
	// Fill term2 with elementwise multiplication
	// (every row is the same)
	for (int row=0; row<Ppersons; row++) {
	term2.row(row) = a%b; //Schur product (element wise)
	}
	//
	// Calculate the logit
	Pprob = 1/(1+exp(term2-term1));
	//
	// Use the inverse CDF trick
	logBernoulli = U%log(Pprob) + (1-U)%log(1-Pprob);
	//
	// Return the likelihood
	if ( appBy == 1 ) {
	logBernoulli = cumsum(logBernoulli, 1);
	theta = logBernoulli.col(Iitems-1);
	return(Rcpp::wrap( theta ));
	} else {
	logBernoulli = cumsum(logBernoulli, 0);
	a = logBernoulli.row(Ppersons-1);
	return(Rcpp::wrap( a ));
	}
	'

	## C++ version
	log.prob.apply.cpp <- cxxfunction(
	signature(
	UdataR="numeric",
	thetaR="numeric", aR="numeric", bR="numeric",
	appByS="int"),
	body=log.prob.apply.cpp.src, plugin="RcppArmadillo")

	## Use the 100 datasets from the profiling example (see 'sample.data.R')

	system.time(
	a.disc.lik.r <- lapply( tmp, function(holder){
	log.prob.apply.r(U.data=holder$U,
	th=holder$theta.abl,
	a=holder$a.disc,
	b=holder$b.diff,
	applyBy=2)
	} ) ) ## 1.1413 seconds

	system.time(
	a.disc.lik.cpp <- lapply( tmp, function(holder){
	as.numeric(log.prob.apply.cpp(UdataR=holder$U,
	thetaR=holder$theta.abl,
	aR=holder$a.disc,
	bR=holder$b.diff,
	appByS=2))
	} ) ) ## 0.888 seconds

	stopifnot(all.equal(a.disc.lik.r, a.disc.lik.cpp))

	## So a 22% percent improvement by compiling the code
	## Not bad, but not an order of magnitude either.
	## Stat Bytes supplement 6/21/2013 -- Nathan VanHoudnos

	## Set the random-number generator seed,
	## to make the results reproducible
	set.seed(314159)

	make.data <- function(I.items=30, P.persons=2000) {
	# set the number of items I and persons P
	# I.items <- 30
	# P.persons <- 2000

	# set the fixed item and population parameters
	a.disc <- 1 + runif(I.items,-0.5,0.5)
	b.diff <- seq(-3,3,length=I.items)

	mean.theta <- 0
	sig2.theta <- (1.25)^2

	# generate thetas and the I x P matrix of response probabilities
	theta.abl <- rnorm(P.persons, mean=mean.theta, sd=sqrt(sig2.theta))
	term.1 <- outer(theta.abl, a.disc)
	term.2 <- matrix(rep(a.disc*b.diff, P.persons), nrow=P.persons,
	byrow=TRUE)

	## Using lower level functions improves speed, but costs clarity
	P.prob.1 <- plogis(term.1-term.2)
	P.prob.2 <- 1/(1 + exp(term.2 - term.1))
	stopifnot(all.equal(P.prob.1, P.prob.2))

	## ifelse is linear time, casting is constant time
	U.flip <- runif(I.items*P.persons)
	U.1 <- ifelse(U.flip < P.prob.1,1,0)
	U.2 <- as.numeric(U.flip < P.prob.2)
	dim(U.2) <- c(P.persons, I.items)
	stopifnot(all.equal(U.1, U.2))
	return(list(U=U.1, theta.abl=theta.abl, a.disc=a.disc, b.diff=b.diff))
	}

	tmp <- lapply(1:100,function(ii){make.data()})