Created
April 19, 2022 02:01
-
-
Save ribsy/2249f28c8ae638a50cb108a8fb6181d4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
source("manifesto_functions.R") | |
######################### | |
### Beta-Binomial Example | |
# Make grid of 10K sequential possibilities between 0 to 1 | |
plausible_rates <- data.frame( | |
possible_grid = seq(from = 0, to = 1, length = 10000)) | |
# Get Expert Beliefs (30% and 60%) About Burndown Rates | |
p_vals <- GetBeliefsEvents(.3,.6) | |
# Get Prior and Likelihood | |
plausible_rates <- plausible_rates %>% | |
mutate(prior = dbeta(possible_grid, p_vals[1], p_vals[2]), | |
likelihood = dbinom(9, 30, possible_grid)) | |
# Get Posterior | |
plausible_rates <- plausible_rates %>% | |
mutate(unnormalized = likelihood * prior, | |
posterior = unnormalized / sum(unnormalized)) | |
# Simulate 10K samples weighted on posterior | |
post_sample <- sample_n(plausible_rates, size = 10000, | |
weight = posterior, replace = TRUE) | |
# Get HDIs for graph | |
ci_hdi <- ci(post_sample$possible_grid, method = "HDI") | |
ci_low <- round(ci_hdi$CI_low,4) * 100 | |
ci_high <- round(ci_hdi$CI_high,4) * 100 | |
# Plot | |
ggplot(post_sample, aes(x = possible_grid)) + | |
# Histogram and Curve | |
geom_histogram(aes(y = ..density..), color = "white", binwidth = 0.05) + | |
geom_density(alpha=.1, fill="white")+ | |
# HDI Low | |
geom_vline(xintercept=ci_hdi$CI_low, color="black", size=1, | |
linetype="dotted") + | |
# HDI High | |
geom_vline(xintercept=ci_hdi$CI_high, color="black", size=1, | |
linetype="dotted") + | |
# Make x-axis stretch full range | |
lims(x = c(0, 1)) + | |
# Labels for graph | |
labs(x = "Plausible Rates", | |
y = "Strength", | |
title = "Beta-Binomial Example", | |
subtitle = paste0("89% CI Low: ", ci_low , " CI High: ", ci_high)) + | |
# Clean black and white theme | |
theme_bw() | |
# Look at post_sample | |
head(post_sample %>% select(possible_grid, posterior) %>% arrange(desc(posterior)),5) | |
#################################### | |
### Gamma-Poisson Grid Approximation | |
# Subjective Priors, equals 1.5 per week i.e. 3/2 | |
prior_events <- 3 | |
prior_weeks <-2 | |
# Data, which equals 2.5 per week 25/10 | |
total_events <- 25 | |
total_weeks <- 10 | |
# Make grid of 10K sequential values between 0 to 5 | |
plausible_rates <- data.frame(possible_grid = | |
seq(from = 0, to = 5, length = 10000)) | |
# Get Prior and Likelihood | |
plausible_rates <- plausible_rates %>% | |
mutate(prior = dgamma(possible_grid, prior_events, prior_weeks), | |
likelihood = dpois(total_events, possible_grid * total_weeks)) | |
# Get Posterior | |
plausible_rates <- plausible_rates %>% | |
mutate(unnormalized = likelihood * prior, | |
posterior = unnormalized / sum(unnormalized)) | |
# Simulate 10K samples weighted on posterior | |
post_sample <- sample_n(plausible_rates, size = 10000, | |
weight = posterior, replace = TRUE) | |
# 89% Highest Density Intervals | |
ci_hdi <- ci(post_sample$possible_grid,method = "HDI") | |
ci_low <- round(ci_hdi$CI_low,2) | |
ci_high <- round(ci_hdi$CI_high,2) | |
# Histogram of posterior sample | |
ggplot(post_sample, aes(x = possible_grid)) + | |
geom_histogram(aes(y = ..density..), color = "white", bins = 100) + | |
geom_density(alpha=.1, fill="white")+ | |
# HDI Low | |
geom_vline(xintercept=ci_hdi$CI_low, color="black", size=1, | |
linetype="dotted") + | |
# HDI High | |
geom_vline(xintercept=ci_hdi$CI_high, color="black", size=1, | |
linetype="dotted") + | |
# Graph Labels | |
labs(x = "Plausible Rates", | |
y = "Strength", | |
title = "Gamma-Poisson Example", | |
subtitle = paste0("89% CI Low: ", ci_low , " CI High: ", ci_high)) + | |
lims(x = c(0, 5)) | |
# See post_sample | |
head(post_sample %>% arrange(desc(posterior)),10) | |
######################################################## | |
### Steps Toward Bayesian Regression Modeling using MCMC | |
# Build for product groups with vulns | |
dtGroupOne <- GetAdvSecResults(c(.03,.05,.15),1,4) | |
dtGroupOne$group <- 1 | |
dtGroupTwo <- GetAdvSecResults(c(.05,.08,.18),1,3) | |
dtGroupTwo$group <- 2 | |
dtGroupThree <- GetAdvSecResults(c(.07,.10,.20),1,2) | |
dtGroupThree$group <- 3 | |
dtGroupFour <- GetAdvSecResults(c(.10,.15,.25),1,1) | |
dtGroupFour$group <- 4 | |
dt <- bind_rows(dtGroupOne, dtGroupTwo, dtGroupThree, dtGroupFour) | |
# Get basic subset of columns | |
basic <- dt %>% | |
select(group, dev_maturity, audit, exposure, severity, fix_time) %>% | |
# Check if SLAs were met based on risk | |
mutate(fixed_in_sla = mapply(sla_check,fix_time,severity,exposure,audit)) | |
# Re-scale fix time from days to weeks | |
basic$fix_time_7 <- basic$fix_time/7 | |
# Plot policy histogram to compare fixed from not | |
ggplot(basic, aes(x = fix_time_7, y = ..density.., | |
fill = fixed_in_sla == 1)) + | |
geom_histogram() + | |
scale_fill_manual(values = c("gray30", "skyblue")) | |
############################## | |
### Logistic Regression Priors | |
# Bayesian MCMC Logistic Regression Model | |
fit_prior_1 <- stan_glm( | |
# How does meeting SLA relate to time? | |
fixed_in_sla ~ fix_time_7, | |
# What is the average prior probability of meeting SLA? | |
prior_intercept = normal(.62, .52), | |
# How much influence does each week have on reducing SLA achievement | |
prior = normal(-.08, .03), | |
# We are using a binomial model with a log scale | |
family=binomial(link="logit"), | |
# Reference our data frame | |
data=basic, | |
# Return prior predictive results – as opposed to posterior | |
prior_PD = TRUE) | |
# Custom helper function | |
getLogOddsInt(prob = .38, scale = 1) | |
# Pipe in our data frame | |
basic %>% select(fix_time_7, fixed_in_sla) %>% | |
# TidyBayes function for drawing values from prior (or posterior) | |
add_epred_draws(fit_prior_1, ndraws = 100) %>% | |
# Structure the x and y axis | |
ggplot(aes(x = fix_time_7, y = fixed_in_sla)) + | |
# Create lines based on prior definitions and data | |
geom_line(aes(y = .epred, group = .draw), size = 0.1)+ | |
theme_bw() | |
# Pull data for graph into a data frame | |
res <- basic %>% select(fix_time_7, fixed_in_sla) %>% | |
add_epred_draws(fit_prior_1, ndraws = 100) | |
# Show how much data add_epred_draws() simulates | |
nrow(res) | |
# Get mean values based on fix_time_7 for days 1, 5, 10 | |
mean(res$.epred[(res$fix_time_7 <= 1 & res$fixed_in_sla == 1)]) | |
mean(res$.epred[(res$fix_time_7 %in% 5:5.99 & res$fixed_in_sla == 1)]) | |
mean(res$.epred[(res$fix_time_7 %in% 10:10.99 & res$fixed_in_sla == 1)]) | |
mean(res$.epred[res$fixed_in_sla == 1]) | |
# Data frame as input | |
basic %>% | |
# This first function creates millions of records. | |
#add_predicted_draws(fit_prior_1, ndraws = 1000) %>% | |
add_predicted_draws(sla_model_1, ndraws = 1000) %>% | |
group_by(.draw) %>% | |
# Get proportion of fixed records | |
summarize(proportion_fixed = mean(.prediction == 1),.groups = 'drop') %>% | |
# Make a simple histogram | |
ggplot(aes(x = proportion_fixed)) + | |
geom_histogram(color = "white",bins=50)+ | |
theme_bw() | |
########################## | |
### MCMC Posterior Quality | |
# Build Posterior Model | |
sla_model_1 <- update(fit_prior_1, prior_PD = FALSE) | |
# Inspect MCMC Chains - look for chain stability | |
mcmc_trace(sla_model_1) | |
# Inspect MCMC Posterior Distributions by Chain - look for overlap | |
mcmc_dens_overlay(sla_model_1) | |
# Auto-correlation check - look for quick drop to 0 | |
mcmc_acf(sla_model_1) | |
# R-hat check - variance in chains over variance across - should be ~1 | |
rhat(sla_model_1) | |
# Auto-correlation check – Above .1 desirable | |
neff_ratio(sla_model_1) | |
# Check posterior values for ROPE...should be 0% out of ROPE | |
describe_posterior(sla_model_1) | |
############################################### | |
### MCMC Posterior Visualization and Prediction | |
# Basic Curve | |
basic %>% | |
# Simulates hundreds of thousands of draws | |
add_epred_draws(sla_model_1, ndraws = 100) %>% | |
# Graph Curve | |
ggplot(aes(x = fix_time_7, y = fixed_in_sla)) + | |
geom_line(aes(y = .epred, group = .draw), alpha = 0.15) + | |
labs(x = "Remediation Time by Weeks", | |
y = "Probability of Fix", | |
title = "Vulnerability Management Policy Curve", | |
subtitle = paste0("Fixed Within SLA By Week For: ", | |
nrow(basic), " Vulnerabilities"))+ | |
theme_bw() | |
# Advanced Colored Curve | |
# Linear function – predicts sla rate based on week and MCMC params | |
pr_switch <- function(x, ests) plogis(ests[1] + ests[2] * x ) | |
# Makes blue and black vulnerability points | |
jitt <- function(...) { | |
geom_point(aes_string(...), | |
position = position_jitter(height = 0.05, width = 0.1), | |
size = 2, shape = 21, stroke = 0.4) | |
} | |
# Plot Curve | |
ggplot(basic, aes(x = fix_time_7, y = fixed_in_sla, | |
color = fixed_in_sla)) + | |
# Y axis | |
scale_y_continuous(breaks = c(0, 0.5, 1)) + | |
# Add colored vulns to chart | |
jitt(x="fix_time_7") + | |
# Add curve to chart | |
stat_function(fun = pr_switch, args = list(ests = coef(sla_model_1)), | |
size = 2, color = "maroon")+ | |
labs(x = "Remediation Time by Weeks", | |
y = "Fixed Within SLA", | |
title = "Vulnerability Management Policy Curve", | |
subtitle = paste0("Fixed Within SLA By Week For: ", | |
nrow(basic), " Vulnerabilities"))+ | |
theme_bw()+ | |
theme(legend.position = "none") | |
###################### | |
### Forecasting Policy | |
# vuln severity and asset exposure | |
policy_1 <- stan_glm(fixed_in_sla ~ severity + exposure, | |
family=binomial(link="logit"),data=basic) | |
# vuln severity and audit | |
policy_2 <- stan_glm(fixed_in_sla ~ severity + audit, | |
family=binomial(link="logit"),data=basic) | |
# asset exposure and audit | |
policy_3 <- stan_glm(fixed_in_sla ~ exposure + audit, | |
family=binomial(link="logit"),data=basic) | |
# vuln severity, audit and exposure | |
policy_4 <- stan_glm(fixed_in_sla ~ severity + audit + exposure, | |
family=binomial(link="logit"),data=basic) | |
# severity, dev maturity and exposure | |
policy_5 <- stan_glm(fixed_in_sla ~ severity + dev_maturity + exposure, | |
family=binomial(link="logit"),data=basic) | |
# severity, dev maturity, exposure and audit | |
policy_6 <- stan_glm(fixed_in_sla ~ severity + dev_maturity + exposure + | |
audit, family=binomial(link="logit"),data=basic) | |
# Use leave-one-out function to get elpd scores | |
(loo1 <- loo(policy_1)) | |
(loo2 <- loo(policy_2)) | |
(loo3 <- loo(policy_3)) | |
(loo4 <- loo(policy_4)) | |
(loo5 <- loo(policy_5)) | |
(loo6 <- loo(policy_6)) | |
# Compare and rank order scores | |
loo_compare(loo1, loo2, loo3, loo4, loo5, loo6) | |
# MCMC Model Checks | |
describe_posterior(policy_6) | |
mcmc_trace(policy_6) | |
mcmc_dens_overlay(policy_6) | |
mcmc_acf(policy_6) | |
rhat(policy_6) | |
neff_ratio(policy_6) | |
# Mean value | |
mean(basic$fixed_in_sla) | |
# Mean chart | |
# Function to get mean | |
proportion_fix <- function(x){mean(x == 1)} | |
# Run numerous simulations | |
pp_check(policy_6, nreps = 100, | |
plotfun = "stat", stat = "proportion_fix") + | |
xlab("probability of fix") | |
############################### | |
### Posterior Policy Prediction | |
policy_prediction <- | |
posterior_epred( policy_6, newdata = data.frame( | |
severity = "extreme", | |
exposure = "external", | |
audit = 0, | |
dev_maturity = 4)) | |
mean(policy_prediction) | |
# Extreme, External, Dev Mature: Excelling, Audited | |
policy_pred(policy_6, "extreme", "external", 4, 0) | |
# Extreme, External, Dev Mature: None, Audited | |
policy_pred(policy_6, "extreme", "external", 1, 0) | |
# Extreme, Partner, Dev Mature: None, Audited | |
policy_pred(policy_6, "extreme", "partner", 1, 0) | |
# Extreme, Partner, Dev Mature: Excelling, Audited | |
policy_pred(policy_6, "extreme", "partner", 4, 0) | |
# Critical, External, Dev Mature: Excelling, Audited | |
policy_pred(policy_6, "critical", "external", 4, 0) | |
# Critical, External, Dev Mature: None, Audited | |
policy_pred(policy_6, "critical", "external", 1, 0) | |
# High, Internal, Dev Mature: None, Not-Audited | |
policy_pred(policy_6, "high", "internal", 1, 1) | |
# High, External, Dev Mature: Excelling, Not-Audited | |
policy_pred(policy_6, "high", "internal", 4, 1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment