public
Created

Sample R code from the June 2013 Stat Bytes talk

  • Download Gist
june-2013-stat-bytes-demo.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
## Stat Bytes supplement 6/21/2013
## Nathan VanHoudnos
 
## Quick demo to test Rprof
Rprof("profile1.out", line.profiling=TRUE)
source('simple.data.R')
Rprof(NULL)
 
summaryRprof("profile1.out", lines = "show")$by.line
 
 
## And to play with Rcpp
## install.packages(c('inline','Rcpp','RcppArmadillo'))
require(Rcpp)
require(RcppArmadillo)
require(inline)
 
## Here is a pure R version of a function to calculate
## a log likelihood in a 2PL IRT model
log.prob.apply.r <- function(U.data, th, a, b, applyBy) {
P.persons <- nrow(U.data)
term.1 <- outer(th, a)
term.2 <- matrix(rep(a*b, P.persons), nrow=P.persons,
byrow=TRUE)
P.prob <- 1/(1 + exp(term.2 - term.1))
 
## Use the inverse CDF trick
log.bernoulli <- U.data*log(P.prob) + (1-U.data)*log(1-P.prob)
return(apply(log.bernoulli,applyBy,sum))
}
 
## Write the C++ function in a string
## N.B. You can also write it in a separate
## file, then load it into a string if you
## prefer. See:
## fileName <- 'log.prob.apply.cpp.src.cpp'
## log.prob.apply.cpp.src <- readChar(fileName, file.info(fileName)$size)
log.prob.apply.cpp.src <- '
// Import the R objects passed to the function
arma::mat U = Rcpp::as<arma::mat>(UdataR);
arma::colvec theta = Rcpp::as<arma::colvec>(thetaR);
arma::rowvec a = Rcpp::as<arma::rowvec>(aR);
arma::rowvec b = Rcpp::as<arma::rowvec>(bR);
int appBy = as<int>(appByS);
//
// Find out the dimensions of the U matrix
int Ppersons = U.n_rows;
int Iitems = U.n_cols;
//
// Declare variables to use
arma::mat term1(Ppersons,Iitems);
arma::mat term2(Ppersons,Iitems);
arma::mat Pprob(Ppersons,Iitems);
arma::mat logBernoulli(Ppersons,Iitems);;
//
// Calculate the terms for the logit
term1 = theta*a; //Outer product
//
// Fill term2 with elementwise multiplication
// (every row is the same)
for (int row=0; row<Ppersons; row++) {
term2.row(row) = a%b; //Schur product (element wise)
}
//
// Calculate the logit
Pprob = 1/(1+exp(term2-term1));
//
// Use the inverse CDF trick
logBernoulli = U%log(Pprob) + (1-U)%log(1-Pprob);
//
// Return the likelihood
if ( appBy == 1 ) {
logBernoulli = cumsum(logBernoulli, 1);
theta = logBernoulli.col(Iitems-1);
return(Rcpp::wrap( theta ));
} else {
logBernoulli = cumsum(logBernoulli, 0);
a = logBernoulli.row(Ppersons-1);
return(Rcpp::wrap( a ));
}
'
 
## C++ version
log.prob.apply.cpp <- cxxfunction(
signature(
UdataR="numeric",
thetaR="numeric", aR="numeric", bR="numeric",
appByS="int"),
body=log.prob.apply.cpp.src, plugin="RcppArmadillo")
 
## Use the 100 datasets from the profiling example (see 'sample.data.R')
 
system.time(
a.disc.lik.r <- lapply( tmp, function(holder){
log.prob.apply.r(U.data=holder$U,
th=holder$theta.abl,
a=holder$a.disc,
b=holder$b.diff,
applyBy=2)
} ) ) ## 1.1413 seconds
 
system.time(
a.disc.lik.cpp <- lapply( tmp, function(holder){
as.numeric(log.prob.apply.cpp(UdataR=holder$U,
thetaR=holder$theta.abl,
aR=holder$a.disc,
bR=holder$b.diff,
appByS=2))
} ) ) ## 0.888 seconds
 
stopifnot(all.equal(a.disc.lik.r, a.disc.lik.cpp))
 
## So a 22% percent improvement by compiling the code
## Not bad, but not an order of magnitude either.
simple.data.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## Stat Bytes supplement 6/21/2013 -- Nathan VanHoudnos
 
## Set the random-number generator seed,
## to make the results reproducible
set.seed(314159)
 
make.data <- function(I.items=30, P.persons=2000) {
# set the number of items I and persons P
# I.items <- 30
# P.persons <- 2000
# set the fixed item and population parameters
a.disc <- 1 + runif(I.items,-0.5,0.5)
b.diff <- seq(-3,3,length=I.items)
mean.theta <- 0
sig2.theta <- (1.25)^2
# generate thetas and the I x P matrix of response probabilities
theta.abl <- rnorm(P.persons, mean=mean.theta, sd=sqrt(sig2.theta))
term.1 <- outer(theta.abl, a.disc)
term.2 <- matrix(rep(a.disc*b.diff, P.persons), nrow=P.persons,
byrow=TRUE)
 
## Using lower level functions improves speed, but costs clarity
P.prob.1 <- plogis(term.1-term.2)
P.prob.2 <- 1/(1 + exp(term.2 - term.1))
stopifnot(all.equal(P.prob.1, P.prob.2))
 
## ifelse is linear time, casting is constant time
U.flip <- runif(I.items*P.persons)
U.1 <- ifelse(U.flip < P.prob.1,1,0)
U.2 <- as.numeric(U.flip < P.prob.2)
dim(U.2) <- c(P.persons, I.items)
stopifnot(all.equal(U.1, U.2))
return(list(U=U.1, theta.abl=theta.abl, a.disc=a.disc, b.diff=b.diff))
}
 
tmp <- lapply(1:100,function(ii){make.data()})

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.