Skip to content

Instantly share code, notes, and snippets.

@shabbychef
Created August 21, 2013 16:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shabbychef/6296955 to your computer and use it in GitHub Desktop.
Save shabbychef/6296955 to your computer and use it in GitHub Desktop.
# see
# http://climateecology.wordpress.com/2013/08/19/r-vs-python-speed-comparison-for-bootstrapping/
# first version#FOLDUP
set.seed(101)
# generate data
x <- 0:100
y <- 2*x + rnorm(101, 0, 10)
# plot data
plot(x, y)
# run the regression
mod1 <- lm(y ~ x)
yHat <- fitted(mod1)
# get the residuals
errors <- resid(mod1)
# make a bootstrapping function
boot1 <- function(n = 10000){
b1 <- numeric(n)
b1[1] <- coef(mod1)[2]
for(i in 2:n){
residBoot <- sample(errors, replace=F)
yBoot <- yHat + residBoot
modBoot <- lm(yBoot ~ x)
b1[i] <- coef(modBoot)[2]
}
return(b1)
}
# Run the bootstrapping function
system.time( bootB1 <- boot1() )
mean(bootB1)
#UNFOLD
# improved verion;#FOLDUP
set.seed(101)
# generate data
x <- 0:100
y <- 2*x + rnorm(101, 0, 10)
# run the regression
mod1 <- lm(y ~ x)
yHat <- fitted(mod1)
# get the residuals
errors <- resid(mod1)
## create a matrix to feed lm.fit (just as in python)
X <- cbind(1, x)
### simulation using lm.fit
boot2 <- function(n = 10000){
b1 <- numeric(n)
b1[1] <- coef(mod1)[2]
for(i in 2:n) {
residBoot <- sample(errors, replace = TRUE)
yBoot <- yHat + residBoot
modBoot <- lm.fit(X, yBoot)
b1[i] <- modBoot$coefficients[2]
}
return(b1)
}
system.time( bootB2 <- boot2() )
mean(bootB2)
#UNFOLD
# nigel's version#FOLDUP
set.seed(101)
# generate data
x <- 0:100
y <- 2*x + rnorm(101, 0, 10)
X<-cbind(1,x)
# run the regression
mod1 <- lm(y ~ x)
yHat <- fitted(mod1)
errors <- resid(mod1)
v <-function()
{
residBoot <- sample(errors, replace = TRUE)
yBoot <- yHat + residBoot
tol = 1e-07
z <- .Call(stats:::C_Cdqrls, X, yBoot, tol)
return(z$coefficients[2])
}
boot3 <- function(n=10000) {
return(replicate(n,v()))
}
system.time( bootB3<-boot3() )
mean(bootB3)
#UNFOLD
# vectorized?#FOLDUP
set.seed(101)
# generate data
x <- 0:100
y <- 2*x + rnorm(101, 0, 10)
X<-cbind(1,x)
# run the regression
mod1 <- lm(y ~ x)
yHat <- fitted(mod1)
errors <- resid(mod1)
boot4 <-function(n=10000)
{
residBoot <- matrix(sample(errors, size=n*length(errors), replace = TRUE),
ncol=n)
yBoot <- yHat + residBoot
modBoot <- lm.fit(X, yBoot)
return(modBoot$coefficients[2,])
}
system.time( bootB4 <- boot4() )
mean(bootB4)
#UNFOLD
# run them all.#FOLDUP
require(rbenchmark)
benchmark(boot1(),
boot2(),
boot3(),
boot4(),
columns = c("test", "replications", "elapsed", "relative"),
order = "elapsed",
replications = 10)
#UNFOLD
# I get:
#
# test replications elapsed relative
# 4 boot4() 10 1.314 1.000
# 3 boot3() 10 2.880 2.192
# 2 boot2() 10 7.905 6.016
# 1 boot1() 10 91.391 69.552
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment