Skip to content

Instantly share code, notes, and snippets.

@ribsy
Created April 19, 2022 02:01
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ribsy/2249f28c8ae638a50cb108a8fb6181d4 to your computer and use it in GitHub Desktop.
Save ribsy/2249f28c8ae638a50cb108a8fb6181d4 to your computer and use it in GitHub Desktop.
source("manifesto_functions.R")
#########################
### Beta-Binomial Example
# Make grid of 10K sequential possibilities between 0 to 1
plausible_rates <- data.frame(
possible_grid = seq(from = 0, to = 1, length = 10000))
# Get Expert Beliefs (30% and 60%) About Burndown Rates
p_vals <- GetBeliefsEvents(.3,.6)
# Get Prior and Likelihood
plausible_rates <- plausible_rates %>%
mutate(prior = dbeta(possible_grid, p_vals[1], p_vals[2]),
likelihood = dbinom(9, 30, possible_grid))
# Get Posterior
plausible_rates <- plausible_rates %>%
mutate(unnormalized = likelihood * prior,
posterior = unnormalized / sum(unnormalized))
# Simulate 10K samples weighted on posterior
post_sample <- sample_n(plausible_rates, size = 10000,
weight = posterior, replace = TRUE)
# Get HDIs for graph
ci_hdi <- ci(post_sample$possible_grid, method = "HDI")
ci_low <- round(ci_hdi$CI_low,4) * 100
ci_high <- round(ci_hdi$CI_high,4) * 100
# Plot
ggplot(post_sample, aes(x = possible_grid)) +
# Histogram and Curve
geom_histogram(aes(y = ..density..), color = "white", binwidth = 0.05) +
geom_density(alpha=.1, fill="white")+
# HDI Low
geom_vline(xintercept=ci_hdi$CI_low, color="black", size=1,
linetype="dotted") +
# HDI High
geom_vline(xintercept=ci_hdi$CI_high, color="black", size=1,
linetype="dotted") +
# Make x-axis stretch full range
lims(x = c(0, 1)) +
# Labels for graph
labs(x = "Plausible Rates",
y = "Strength",
title = "Beta-Binomial Example",
subtitle = paste0("89% CI Low: ", ci_low , " CI High: ", ci_high)) +
# Clean black and white theme
theme_bw()
# Look at post_sample
head(post_sample %>% select(possible_grid, posterior) %>% arrange(desc(posterior)),5)
####################################
### Gamma-Poisson Grid Approximation
# Subjective Priors, equals 1.5 per week i.e. 3/2
prior_events <- 3
prior_weeks <-2
# Data, which equals 2.5 per week 25/10
total_events <- 25
total_weeks <- 10
# Make grid of 10K sequential values between 0 to 5
plausible_rates <- data.frame(possible_grid =
seq(from = 0, to = 5, length = 10000))
# Get Prior and Likelihood
plausible_rates <- plausible_rates %>%
mutate(prior = dgamma(possible_grid, prior_events, prior_weeks),
likelihood = dpois(total_events, possible_grid * total_weeks))
# Get Posterior
plausible_rates <- plausible_rates %>%
mutate(unnormalized = likelihood * prior,
posterior = unnormalized / sum(unnormalized))
# Simulate 10K samples weighted on posterior
post_sample <- sample_n(plausible_rates, size = 10000,
weight = posterior, replace = TRUE)
# 89% Highest Density Intervals
ci_hdi <- ci(post_sample$possible_grid,method = "HDI")
ci_low <- round(ci_hdi$CI_low,2)
ci_high <- round(ci_hdi$CI_high,2)
# Histogram of posterior sample
ggplot(post_sample, aes(x = possible_grid)) +
geom_histogram(aes(y = ..density..), color = "white", bins = 100) +
geom_density(alpha=.1, fill="white")+
# HDI Low
geom_vline(xintercept=ci_hdi$CI_low, color="black", size=1,
linetype="dotted") +
# HDI High
geom_vline(xintercept=ci_hdi$CI_high, color="black", size=1,
linetype="dotted") +
# Graph Labels
labs(x = "Plausible Rates",
y = "Strength",
title = "Gamma-Poisson Example",
subtitle = paste0("89% CI Low: ", ci_low , " CI High: ", ci_high)) +
lims(x = c(0, 5))
# See post_sample
head(post_sample %>% arrange(desc(posterior)),10)
########################################################
### Steps Toward Bayesian Regression Modeling using MCMC
# Build for product groups with vulns
dtGroupOne <- GetAdvSecResults(c(.03,.05,.15),1,4)
dtGroupOne$group <- 1
dtGroupTwo <- GetAdvSecResults(c(.05,.08,.18),1,3)
dtGroupTwo$group <- 2
dtGroupThree <- GetAdvSecResults(c(.07,.10,.20),1,2)
dtGroupThree$group <- 3
dtGroupFour <- GetAdvSecResults(c(.10,.15,.25),1,1)
dtGroupFour$group <- 4
dt <- bind_rows(dtGroupOne, dtGroupTwo, dtGroupThree, dtGroupFour)
# Get basic subset of columns
basic <- dt %>%
select(group, dev_maturity, audit, exposure, severity, fix_time) %>%
# Check if SLAs were met based on risk
mutate(fixed_in_sla = mapply(sla_check,fix_time,severity,exposure,audit))
# Re-scale fix time from days to weeks
basic$fix_time_7 <- basic$fix_time/7
# Plot policy histogram to compare fixed from not
ggplot(basic, aes(x = fix_time_7, y = ..density..,
fill = fixed_in_sla == 1)) +
geom_histogram() +
scale_fill_manual(values = c("gray30", "skyblue"))
##############################
### Logistic Regression Priors
# Bayesian MCMC Logistic Regression Model
fit_prior_1 <- stan_glm(
# How does meeting SLA relate to time?
fixed_in_sla ~ fix_time_7,
# What is the average prior probability of meeting SLA?
prior_intercept = normal(.62, .52),
# How much influence does each week have on reducing SLA achievement
prior = normal(-.08, .03),
# We are using a binomial model with a log scale
family=binomial(link="logit"),
# Reference our data frame
data=basic,
# Return prior predictive results – as opposed to posterior
prior_PD = TRUE)
# Custom helper function
getLogOddsInt(prob = .38, scale = 1)
# Pipe in our data frame
basic %>% select(fix_time_7, fixed_in_sla) %>%
# TidyBayes function for drawing values from prior (or posterior)
add_epred_draws(fit_prior_1, ndraws = 100) %>%
# Structure the x and y axis
ggplot(aes(x = fix_time_7, y = fixed_in_sla)) +
# Create lines based on prior definitions and data
geom_line(aes(y = .epred, group = .draw), size = 0.1)+
theme_bw()
# Pull data for graph into a data frame
res <- basic %>% select(fix_time_7, fixed_in_sla) %>%
add_epred_draws(fit_prior_1, ndraws = 100)
# Show how much data add_epred_draws() simulates
nrow(res)
# Get mean values based on fix_time_7 for days 1, 5, 10
mean(res$.epred[(res$fix_time_7 <= 1 & res$fixed_in_sla == 1)])
mean(res$.epred[(res$fix_time_7 %in% 5:5.99 & res$fixed_in_sla == 1)])
mean(res$.epred[(res$fix_time_7 %in% 10:10.99 & res$fixed_in_sla == 1)])
mean(res$.epred[res$fixed_in_sla == 1])
# Data frame as input
basic %>%
# This first function creates millions of records.
#add_predicted_draws(fit_prior_1, ndraws = 1000) %>%
add_predicted_draws(sla_model_1, ndraws = 1000) %>%
group_by(.draw) %>%
# Get proportion of fixed records
summarize(proportion_fixed = mean(.prediction == 1),.groups = 'drop') %>%
# Make a simple histogram
ggplot(aes(x = proportion_fixed)) +
geom_histogram(color = "white",bins=50)+
theme_bw()
##########################
### MCMC Posterior Quality
# Build Posterior Model
sla_model_1 <- update(fit_prior_1, prior_PD = FALSE)
# Inspect MCMC Chains - look for chain stability
mcmc_trace(sla_model_1)
# Inspect MCMC Posterior Distributions by Chain - look for overlap
mcmc_dens_overlay(sla_model_1)
# Auto-correlation check - look for quick drop to 0
mcmc_acf(sla_model_1)
# R-hat check - variance in chains over variance across - should be ~1
rhat(sla_model_1)
# Auto-correlation check – Above .1 desirable
neff_ratio(sla_model_1)
# Check posterior values for ROPE...should be 0% out of ROPE
describe_posterior(sla_model_1)
###############################################
### MCMC Posterior Visualization and Prediction
# Basic Curve
basic %>%
# Simulates hundreds of thousands of draws
add_epred_draws(sla_model_1, ndraws = 100) %>%
# Graph Curve
ggplot(aes(x = fix_time_7, y = fixed_in_sla)) +
geom_line(aes(y = .epred, group = .draw), alpha = 0.15) +
labs(x = "Remediation Time by Weeks",
y = "Probability of Fix",
title = "Vulnerability Management Policy Curve",
subtitle = paste0("Fixed Within SLA By Week For: ",
nrow(basic), " Vulnerabilities"))+
theme_bw()
# Advanced Colored Curve
# Linear function – predicts sla rate based on week and MCMC params
pr_switch <- function(x, ests) plogis(ests[1] + ests[2] * x )
# Makes blue and black vulnerability points
jitt <- function(...) {
geom_point(aes_string(...),
position = position_jitter(height = 0.05, width = 0.1),
size = 2, shape = 21, stroke = 0.4)
}
# Plot Curve
ggplot(basic, aes(x = fix_time_7, y = fixed_in_sla,
color = fixed_in_sla)) +
# Y axis
scale_y_continuous(breaks = c(0, 0.5, 1)) +
# Add colored vulns to chart
jitt(x="fix_time_7") +
# Add curve to chart
stat_function(fun = pr_switch, args = list(ests = coef(sla_model_1)),
size = 2, color = "maroon")+
labs(x = "Remediation Time by Weeks",
y = "Fixed Within SLA",
title = "Vulnerability Management Policy Curve",
subtitle = paste0("Fixed Within SLA By Week For: ",
nrow(basic), " Vulnerabilities"))+
theme_bw()+
theme(legend.position = "none")
######################
### Forecasting Policy
# vuln severity and asset exposure
policy_1 <- stan_glm(fixed_in_sla ~ severity + exposure,
family=binomial(link="logit"),data=basic)
# vuln severity and audit
policy_2 <- stan_glm(fixed_in_sla ~ severity + audit,
family=binomial(link="logit"),data=basic)
# asset exposure and audit
policy_3 <- stan_glm(fixed_in_sla ~ exposure + audit,
family=binomial(link="logit"),data=basic)
# vuln severity, audit and exposure
policy_4 <- stan_glm(fixed_in_sla ~ severity + audit + exposure,
family=binomial(link="logit"),data=basic)
# severity, dev maturity and exposure
policy_5 <- stan_glm(fixed_in_sla ~ severity + dev_maturity + exposure,
family=binomial(link="logit"),data=basic)
# severity, dev maturity, exposure and audit
policy_6 <- stan_glm(fixed_in_sla ~ severity + dev_maturity + exposure +
audit, family=binomial(link="logit"),data=basic)
# Use leave-one-out function to get elpd scores
(loo1 <- loo(policy_1))
(loo2 <- loo(policy_2))
(loo3 <- loo(policy_3))
(loo4 <- loo(policy_4))
(loo5 <- loo(policy_5))
(loo6 <- loo(policy_6))
# Compare and rank order scores
loo_compare(loo1, loo2, loo3, loo4, loo5, loo6)
# MCMC Model Checks
describe_posterior(policy_6)
mcmc_trace(policy_6)
mcmc_dens_overlay(policy_6)
mcmc_acf(policy_6)
rhat(policy_6)
neff_ratio(policy_6)
# Mean value
mean(basic$fixed_in_sla)
# Mean chart
# Function to get mean
proportion_fix <- function(x){mean(x == 1)}
# Run numerous simulations
pp_check(policy_6, nreps = 100,
plotfun = "stat", stat = "proportion_fix") +
xlab("probability of fix")
###############################
### Posterior Policy Prediction
policy_prediction <-
posterior_epred( policy_6, newdata = data.frame(
severity = "extreme",
exposure = "external",
audit = 0,
dev_maturity = 4))
mean(policy_prediction)
# Extreme, External, Dev Mature: Excelling, Audited
policy_pred(policy_6, "extreme", "external", 4, 0)
# Extreme, External, Dev Mature: None, Audited
policy_pred(policy_6, "extreme", "external", 1, 0)
# Extreme, Partner, Dev Mature: None, Audited
policy_pred(policy_6, "extreme", "partner", 1, 0)
# Extreme, Partner, Dev Mature: Excelling, Audited
policy_pred(policy_6, "extreme", "partner", 4, 0)
# Critical, External, Dev Mature: Excelling, Audited
policy_pred(policy_6, "critical", "external", 4, 0)
# Critical, External, Dev Mature: None, Audited
policy_pred(policy_6, "critical", "external", 1, 0)
# High, Internal, Dev Mature: None, Not-Audited
policy_pred(policy_6, "high", "internal", 1, 1)
# High, External, Dev Mature: Excelling, Not-Audited
policy_pred(policy_6, "high", "internal", 4, 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment