Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Bayesian Log Reg: What is the difference between the expected pr(y==1| model) and pr(pr==1| model)[calc w/ expectation of beta] ?
library(rstan)
library(dplyr)
options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)
seed_sim = 1234
set.seed(seed_sim)
# set up data
N = 4000
K = 15
X = matrix(rnorm(N * K), nrow = N, ncol = K)
beta_forward = rnorm(n = K, mean = 0, sd = 1)
alpha_forward = 1
z = (X %*% beta_forward) %>% as.numeric() + alpha_forward
inv_logit = function (s){exp(s)/(1 + exp(s))}
pr = inv_logit(z)
y = rbinom(N, 1, prob = pr)
# quick sanity check on params using glm w/ binomial link
df = data.frame(y = y, x1 = X[,1], x2 = X[,2])
glm( y ~ x1 + x2, data = df, family = "binomial")
# stan code: student_t for coefficients, centered
stan_code = "
data {
int<lower=0> N;
int<lower=0> K;
matrix[N,K] X;
int<lower=0,upper=1> y[N];
}
parameters {
real a;
real b_shape;
real b_location;
vector[K] b;
}
transformed parameters {
vector[N] pr_y;
pr_y = inv_logit(X * b + a);
}
model {
a ~ normal(1,1);
b_location ~ normal(0,1);
b_shape ~ gamma(0.1, 0.1);
to_vector(b) ~ normal(b_location, b_shape);
y ~ binomial(1,pr_y);
}
"
# run STAN w/ a little larger adaptive delta
nchains = 1
stanfit = stan(model_code = stan_code,
data = list(X = X, y = y, K = K, N = N),
control = list(adapt_delta = 0.95,
max_treedepth = 15),
chains = nchains,
seed = seed_sim )
# get model summary
mod_summary <- as.data.frame(summary(stanfit)[[1]])
# extract simulated variables
b_sim = extract(stanfit, "b")$b
a_sim = extract(stanfit, "a")$a
pr_y_sim = extract(stanfit, "pr_y")$pr_y
# demonstrate that simulated values are equal to mean
n_subsample = 241
pr_y_model_mean = mod_summary[grep("pr_y", rownames(mod_summary)), c("mean")]
pry = sapply(seq_len(n_subsample), function(n)
sapply(seq_len(nchains * 1000),
function(i)inv_logit(sum((b_sim[i,] * X[n,])) + a_sim[i] )) %>% mean(., na.rm = TRUE))
(pry - pr_y_model_mean[seq_len(n_subsample)]) %>% abs %>% sum()
# AUC sanity check that model is predictive
library(ROCR);AUC = performance(prediction(pr_y_model_mean, y),"auc")@y.values[[1]]
print(paste("AUC: ", AUC))
## Now for our experiment
# grab the mean values, as summarised
b_mean = mod_summary[grep("^b\\[", rownames(mod_summary)), c("mean")]
a_mean = mod_summary[grep("^a", rownames(mod_summary)), c('mean')]
# run a vectorized calculation
pr_y_from_b_mean = inv_logit(as.numeric(X %*% b_mean + a_mean))
## some formatting....
pr_y_model_mean_fmt= format(pr_y_model_mean, digits = 4,scientific = FALSE)
pr_y_from_b_mean_fmt = format(pr_y_from_b_mean, digits = 4, scientific = FALSE)
out_df = data.frame(y = y,
prob_stan = pr_y_model_mean_fmt,
prob_mean = pr_y_from_b_mean_fmt)
if ("1" == Sys.getenv("RSTUDIO")){
View(out_df)
} else {
out_df %>% head()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.