Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
A Speed Comparison Between Flexible Linear Regression Alternatives in R.
library(microbenchmark)
library(arm)
library(rstan)
library(bbmle)
log_post <- function(par, y, x) {
sigma <- exp(par[1])
intercept <- par[2]
beta <- as.matrix(par[-c(1,2)])
mu <- intercept + x %*% beta
sum(dnorm(y, mu, sigma, log=TRUE))
}
model_string <- "
data {
int n;
int n_pred;
vector[n] y;
matrix[n, n_pred] x;
}
parameters {
real intercept;
vector[n_pred] beta;
real<lower=0> sigma;
}
model {
vector[n] mu;
mu <- intercept + x * beta;
y ~ normal(mu, sigma);
}
"
model <- stan_model(model_code = model_string)
lin_reg_timing <- ddply(expand.grid(n = c(100, 1000, 10000), n_pred = c(1, 5, 10, 50)), c("n", "n_pred"), function(data_params) {
n <- data_params$n
n_pred <- data_params$n_pred
beta <- rnorm(n_pred, 0, 10)
x <- matrix(rnorm(n * n_pred),nrow = n, ncol = n_pred)
y <- x %*% beta + rnorm(n, 0, 1)
data_mat <- cbind(y, x)
colnames(data_mat) <- c("y", paste0("x", seq_len(n_pred)))
data_df <- as.data.frame(data_mat)
y_depends_on_xs <- as.formula(paste0("y ~ 1 +", paste0("x", seq_len(n_pred), collapse = " + ")))
y_depends_on_xs_with_betas <-
as.formula(paste0("y ~ intercept +", paste0("x", seq_len(n_pred), "* beta", seq_len(n_pred),collapse = " + ")))
y_depends_on_xs_with_dist <-
as.formula(paste0("y ~ dnorm(mean = intercept +", paste0("x", seq_len(n_pred), "* beta", seq_len(n_pred),collapse = " + "), ", sd = exp(log_sigma))" ))
inits <- rnorm(2 + n_pred)
named_inits <- as.list(inits)
names(named_inits) <- c("log_sigma", "intercept", paste0("beta", seq_len(n_pred)))
data_list <- list(n = length(y), n_pred = n_pred, y = as.vector(y), x = x)
mb_result <- microbenchmark(unit = "ms", times = 20,
lm.fit(cbind(1, x), y),
lm(y_depends_on_xs, data = data_df),
glm(y_depends_on_xs, data = data_df),
nls(y_depends_on_xs_with_betas, data = data_df),
optimizing(model, data_list),
bayesglm(y_depends_on_xs, data = data_df, prior.scale=Inf, prior.df=Inf),
optim(par = inits, fn = log_post, control = list(fnscale = -1), y=y, x=x),
mle2(y_depends_on_xs_with_dist, start = named_inits, data = data_df)
)
data.frame(method = c("lm.fit", "lm", "glm", "nls", "stan", "bayesglm", "optim", "mle2"), ms = summary(mb_result)$median)
})
lin_reg_timing$method <- factor(lin_reg_timing$method, levels = c("lm.fit", "lm", "nls","glm","bayesglm", "stan","optim", "mle2") ,ordered = TRUE)
facet_labeler <- function(variable, value) {
if(variable == "n_pred") {
paste(value, "predictors")
} else {
paste(value, "data points")
}
}
qplot(method, ms, main = "Time to run a linear regression (1000 data points, 5 predictors)",
data=lin_reg_timing[lin_reg_timing$n == 1000 & lin_reg_timing$n_pred == 5, ], log = "y") +
theme(axis.text.x=element_text(angle=45, hjust=1))
qplot(method, ms,data=lin_reg_timing, log = "y", main="Time to run a linear regression") +
facet_grid(n ~ n_pred, scales = "free", labeller = facet_labeler) +
theme(axis.text.x=element_text(angle=45, hjust=1))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.