This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
source("manifesto_functions.R") | |
########################### | |
### Non-Bayesian Wait Times | |
# Number of product teams | |
group_count = 10 | |
# Empty Data Frame to hold results | |
dt <- tibble() | |
# Loop through count of groups | |
for(x in 1:group_count){ | |
# Get Ext,Sev,High random rates. See “from” and “to” ranges. | |
extreme <- sample(seq(from = .005, to = .03, by= .001), size = 1) | |
severe <- sample(seq(from = .031, to = .07, by= .001), size = 1) | |
high <- sample(seq(from = .071, to = .15, by= .001), size = 1) | |
# Set Random Maturity Values - get one value between 1-4 randomly | |
patch_fail <- sample(1:4,size=1) | |
dev_maturity <- sample(1:4,size=1) | |
# Get Data Frame | |
res <- GetAdvSecResults(c(extreme, severe, high), | |
patch_fail, dev_maturity) | |
# Set Group Number | |
res$group <- x | |
# Append to master data frame – makes one large data frame | |
dt <- bind_rows(dt,res) | |
} | |
# Get Critical Rows And All Columns using subsetting | |
dte <- dt[(dt$audit == 0 & dt$exposure_id == 1 & dt$severity_id == 1 & | |
dt$week.fseen <= 52), ] | |
# Get interarrivals. I.e. the difference between weeks numbers | |
interarrivals <- diff(sort(dte$week.fseen)) | |
# Print interarrivals | |
interarrivals | |
#make a histogram | |
hist(interarrivals, breaks=max(interarrivals)) | |
#################### | |
### Wait-Time By Day | |
# Same data frame for weeks | |
dte <- dt[(dt$audit == 0 & dt$exposure_id == 1 & | |
dt$severity_id == 1 & dt$week.fseen <= 52),] | |
# Get interarrivals by asking for the day number | |
interarrivals <- diff(sort(yday(dte$first.seen))) | |
# Build a histogram of the arrivals. | |
hist(interarrivals,breaks=max(interarrivals)) | |
############################# | |
### The Basic Wait-Time Model | |
# Sum of individual interarrivals. | |
wait_time <- sum(interarrivals) | |
# Get count of vuln events. I.e ~60 vulns | |
n = nrow(dte) | |
# The ratio of vulnerabilities per day. I.e. 60/356 ~ .17 | |
vulns_per_wait_time <- n/wait_time | |
# Average Time Between Event Days i.e. 356/60 ~6 | |
avg_days = wait_time/n #Avg wait time in days between extreme vulns | |
# Exponential Dist to Simulate data. It spits out 60 simulated results | |
# And each value will wiggle around 1/60 ~ .0167 | |
simulated <- rexp(n, rate=n) | |
# Multiply simulated rates by 365 (in my case) to get 60 wait-times | |
sim_interarrivals <- simulated * wait_time | |
# Create a histogram (one of many since this is a random process) | |
hist(sim_interarrivals, breaks = max(interarrivals)) | |
# Avg next arrival | |
avg <- mean(simulated) * wait_time | |
avg | |
# Chance of a vulnerability showing up in 0 to 7 days. | |
pexp(q = 7, rate = vulns_per_wait_time) | |
# Simulated for 7 days | |
mean(rexp(n = 10000, rate = vulns_per_wait_time) <= 7) | |
# Simulate 10 boolean results, note how all are TRUE but one..its random | |
res <- rexp(10,.17) <= 7 | |
res | |
# Get ratio of true to false results | |
mean(res) | |
# Create Dataframe | |
tibble(x = 0:1000 / 100, | |
prob = pexp(q = 0:1000 / 100, | |
rate = vulns_per_wait_time, | |
lower.tail = TRUE)) %>% | |
# Add a column to the data frame | |
mutate(Interval = ifelse(x >= 0 & x <= 7, "0 to 7", "other")) %>% | |
# Send the data frame to the graphing functions | |
ggplot(aes(x = x, y = prob, fill = Interval)) + | |
geom_area(alpha = 0.3) + | |
labs(title = "Probability Of Vulnerabilities Arriving", | |
subtitle = "From 0 to 7 Days", | |
x = "Days", | |
y = "Cumulative Probability") | |
pexp(.01,vulns_per_wait_time) | |
####################### | |
### Bayesian Wait-Times | |
### See GetVulnsGroups() in manifesto_functions.R | |
# Get 10 product teams | |
dt <- GetVulnGroups() | |
# Retrieve first half years worth of data | |
total_weeks <- 26 | |
dte <- dt[(dt$audit == 0 & dt$exposure_id == 1 & dt$severity_id == 1 & | |
dt$week.fseen <= total_weeks),] | |
# Get a count of vulns – key variable | |
n = nrow(dte) | |
# Get Interarrivals | |
interarrivals <- diff(sort(yday(dte$first.seen))) | |
# Get Total Wait Time – key variable | |
wait_time <- sum(interarrivals) | |
# Get avg time per vuln | |
avg_wait_time = wait_time/n | |
# Get avg vulns per wait time | |
vulns_per_wait_time <- n/wait_time | |
######################### | |
### Empirical Bayes Prior | |
# Get new data as if it came from the previous year | |
dtPrev <- GetVulnGroups() | |
# Pull only the last quarter’s data. Week 40-52 | |
dtePrev <- dtPrev[(dtPrev$audit == 0 & dtPrev$exposure_id == 1 & | |
dtPrev$severity_id == 1 & | |
(dtPrev$week.fseen > 39 & dtPrev$week.fseen <= 52)),] | |
# Get a list of all the day numbers with events..see example data | |
day_vals <- sort(yday(dtePrev$first.seen)) | |
# Reformat the data for prior analysis | |
prior_int <- GetFirstSeenHits(day_vals, min = 275, max = 365) | |
# Get optimal gamma priors - MLE | |
gvals <- GetGammaPriors(prior_int) | |
alpha = gvals[1] | |
lambda = gvals[2] | |
# Create a list of the possible rates (for graphing only) | |
theta = seq(0, .5, 0.001) | |
# Score possibilities using priors (for graphing only) | |
prior = dgamma(theta, shape = alpha, rate = lambda) | |
##################### | |
### Bayesian Updating | |
# likelihood distribution - for graphing purposes only | |
likelihood = dgamma(wait_time, shape = n, rate = theta) | |
# Posterior - Conjugate prior update. | |
posterior = dgamma(theta, alpha + n, lambda + wait_time) | |
# Bayesian update graph | |
plot_posterior(theta, prior, likelihood, posterior) | |
abline(v = qgamma(c(0.025, 0.975), alpha + n, lambda + wait_time), | |
col = "seagreen", lty = 2) | |
qgamma(c(0.025, 0.975), alpha + n, lambda + wait_time) | |
################################################ | |
### Making Data and Forecasts from the Posterior | |
# Number Of Simulations | |
Nrep = 10000 | |
# prior vuln count + num vulns from data | |
shape = alpha + n | |
# prior time + wait time from data | |
rate = lambda + wait_time | |
# Simulate 10K vuln day rates: .21, .17, .15, .22 etc | |
theta_sim = rgamma(Nrep, shape, rate) | |
# 1/theta_sim is the mean time between events 1/.21 = 4.76 days etc | |
scaled_sim <- 1/theta_sim | |
# Get 89% Highest Density Intervals for graph | |
ci <- ci(scaled_sim, method="HDI") | |
# Get Mean for graph | |
mean_sim <- round(mean(scaled_sim),2) | |
# Put scaled_sim into DF for graph | |
df <- tibble(sim=scaled_sim) | |
# Plot Mean interArrivals | |
ggplot(df, aes(x=sim)) + | |
# Declare a histogram chart | |
geom_histogram(aes(y=..density..), colour="black", | |
fill="white",bins=100)+ | |
# Add a density curve | |
geom_density(alpha=.2, fill="white")+ | |
# Add a vertical line for the lower CI value | |
geom_vline(aes(xintercept=ci$CI_low), | |
color="black", linetype="dashed", size=.5)+ | |
# Add a vertical line for the higher CI value | |
geom_vline(aes(xintercept=ci$CI_high), | |
color="black", linetype="dashed", size=.5)+ | |
# Add a mean line | |
geom_vline(aes(xintercept=mean_sim), | |
color="black", linetype="dashed", size=.5)+ | |
# Decorate the chart with labels | |
labs(title = "Posterior Distribution of Mean interArrivals", | |
subtitle = paste0("Mean Rate: ", mean_sim," | 89% HDI: ", | |
round(ci$CI_low,2), " To: ", round(ci$CI_high,2))) + | |
xlab("Mean interArrival Rates") + | |
ylab("Strength") | |
### Posterior Predictive Distribution | |
# Get interarrival values from the exponential distribution | |
y_sim = rexp(Nrep, rate = theta_sim) | |
# Get 95% and 50% quantiles - used for making vertical lines | |
val95 <- round(quantile(y_sim, 0.95),2) | |
val50 <- round(quantile(y_sim, 0.50),2) | |
# Pass interarrival data into DF | |
df <- tibble(sim=y_sim) | |
# Pass DF into ggplot | |
ggplot(df, aes(x=sim)) + | |
# Declare histogram shape | |
geom_histogram(aes(y=..density..), colour="black", | |
fill="white",bins=100)+ | |
# Curve for chart | |
geom_density(alpha=.2, fill="white")+ | |
# 95% or less vertical line | |
geom_vline(aes(xintercept=mean(val95)), | |
color="black", linetype="dashed", size=.5)+ | |
# 50% or less vertical line | |
geom_vline(aes(xintercept=mean(val50)), | |
color="black", linetype="dashed", size=.5)+ | |
# Labels for chart | |
labs(title = "Posterior Predictive Distribution", | |
subtitle = paste0("95% Chance of Vulnerability In: ", | |
val95," Days Or Less | 50% In: ", | |
val50, " Or Less")) + | |
xlab("Days") + ylab("Strength") + xlim(c(0, 50)) | |
######################## | |
### Mitigatable Surprise | |
GetNVDAnalysis(c(2018:2020), new = TRUE, save = TRUE, from_file = TRUE) | |
GetNVDAnalysis(c(2018:2020)) | |
#### | |
# Note that the supporting functions are all detailed in | |
# manifesto_functions.R |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment