my_packages <- c("tidyverse", "lubridate", "survival", "survminer",
"ProbBayes", "Bolstad", "ggplot2", "devtools",
"mc2d", "bayestestR", "scales", "knitr", "ggridges",
"simstudy", "betareg", "data.table", "boot", "broom",
"stats4","jsonlite","tidybayes","bayesplot", "BayesFactor",
"rstanarm")
not_installed <-
my_packages[!(my_packages %in% installed.packages()[ , "Package"])]
if (length(not_installed)){
msgVal = paste("Install packages: ",toString(not_installed))
msgRes <- askYesNo(msgVal)
if( msgRes != TRUE){
stopMsg <- paste("Quitting because you don't want to install:", not_installed)
stop(stopMsg)
}
}
if(length(not_installed)) install.packages(not_installed)
# Deal with ggdistribute - which is in github
my_packages <- c("ggdistribute")
not_installed <- my_packages[!(my_packages %in% installed.packages()[ , "Package"])]
if (length(not_installed)){
msgVal = paste("Install ggdistribute?")
msgRes <- askYesNo(msgVal)
if( msgRes != TRUE){
stopMsg <- paste("Quitting because you don't want to install ggdistribute")
stop(stopMsg)
}
}
if(length(not_installed)) devtools::install_github("iamamutt/ggdistribute")
library(devtools)
library(tidyverse)
library(lubridate)
library(survival)
library(survminer)
library(bayestestR)
library(ProbBayes)
library(Bolstad)
library(ggplot2)
library(mc2d)
library(scales)
library(knitr)
library(ggdistribute)
library(ggridges)
library(simstudy)
library(betareg)
library(data.table)
library(boot)
library(broom)
library(jsonlite)
library(stats4)
library(tidybayes)
library(bayesplot)
library(BayesFactor)
library(rstanarm)
select <- dplyr::select
`%!in%` <- Negate(`%in%`)
ScoreBetaBeliefs <- function(bias = c(0.5, 0.5), first_belief = c(2,2), second_belief = c(3,3), hits = 1, misses = 5){
probs <- bias
beta_par <- rbind(first_belief, second_belief) #stack the two beliefs into a matrix
output <- binomial.beta.mix(bias, beta_par, c(hits, misses)) #score both priors against the observed hits and misses
posterior_odds <- output$probs[1]/ output$probs[2] #Bayes Factor
return(posterior_odds)
}
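#' Illustrative example (not run): compare two beta beliefs about a hit rate
#' against 12 hits and 8 misses. The belief and count values here are
#' arbitrary placeholders; an odds ratio above 1 favors the first belief.
# ScoreBetaBeliefs(bias = c(0.5, 0.5), first_belief = c(2, 2),
#                  second_belief = c(8, 2), hits = 12, misses = 8)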
ScoreGammaBeliefs <- function(bias = c(0.5, 0.5), first_belief = c(2,2), second_belief = c(3,3), data = c(13,52)){
probs=bias
beliefs=rbind(first_belief, second_belief)
data = list(y=data[1], t = data[2])
mix <- poisson.gamma.mix(probs,beliefs,data)
scores <- mix$probs[1]/mix$probs[2]
return(scores)
}
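#' Illustrative example (not run): score two gamma beliefs about an event rate
#' against 13 events observed over 52 time units (arbitrary values).
# ScoreGammaBeliefs(bias = c(0.5, 0.5), first_belief = c(2, 2),
#                   second_belief = c(5, 1), data = c(13, 52))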
MakeBetaGraph <- function(hit, miss, xlab, ylab, tlab, slab , xadj = .01 ){
beta_avg = (hit/(miss + hit))
posterior <- distribution_beta(1000, hit, miss)
ci_hdi <- ci(posterior, method = "HDI")
posterior <- posterior %>%
#estimate_density(extend=TRUE)
estimate_density()
ytop <- max(posterior$y) * .8
posterior %>%
ggplot(aes(x=x, y=y)) +
geom_area(fill="darkslateblue", alpha = 0.6) +
theme_classic() +
geom_vline(xintercept=beta_avg, color="black", size=1) +
geom_vline(xintercept=ci_hdi$CI_low, color="black", size=1, linetype="dotted") +
geom_vline(xintercept=ci_hdi$CI_high, color="black", size=1, linetype="dotted") +
annotate(geom = "text", x = ci_hdi$CI_low + xadj , y = ytop,
label = round(ci_hdi$CI_low,3), fontface = 2, hjust = 0, vjust = -5) +
annotate(geom = "text", x = ci_hdi$CI_high + xadj , y = ytop,
label = round(ci_hdi$CI_high,3), fontface = 2, hjust = 0, vjust = -5) +
annotate(geom = "text", x = beta_avg + xadj , y = ytop,
label = round(beta_avg,3), fontface = 2, hjust = 0, vjust = -5) +
labs(x = xlab, y = ylab, title = tlab, subtitle = slab)
}
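#' Illustrative example (not run): plot a beta distribution for 9 hits and
#' 21 misses with its HDI marked; the labels are arbitrary.
# MakeBetaGraph(hit = 9, miss = 21, xlab = "Rate", ylab = "Density",
#               tlab = "Posterior", slab = "9 hits / 21 misses")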
MakeRidgeChart2 <- function(eng_risk, title_val, sub_val, xtitle, ytitle, legend_val){
ggplot(eng_risk, aes(x = x_vals, y = factor(yaxis,levels=unique(yaxis)),height = prob,
scale = 1.2, color = legend, fill = factor(legend,levels=rev(unique(legend))))) +
#scale_fill_manual(values = cbp1) +
geom_density_ridges(stat="identity", color = "white", alpha = 0.8,
panel_scaling = TRUE, size = 1) +
guides(fill=guide_legend(title= legend_val)) +
labs(title = title_val,
subtitle = sub_val,
x = xtitle,
y = ytitle)
}
MakeRidgeChart <- function(eng_risk, title_val, sub_val, xtitle, ytitle, legend_val){
ggplot(eng_risk, aes(x = x_vals, y = factor(yaxis,levels=unique(yaxis)),height = prob,
scale = 1.2, color = legend, fill = factor(rev(legend),levels=rev(unique(legend))))) +
#scale_fill_manual(values = cbp1) +
geom_density_ridges(stat="identity", color = "white", alpha = 0.8,
panel_scaling = TRUE, size = 1) +
guides(fill=guide_legend(title= legend_val)) +
labs(title = title_val,
subtitle = sub_val,
x = xtitle,
y = ytitle)
}
GetRope <- function( eng_risk, month_one, month_two){
mone <- eng_risk %>% filter(yaxis == month_one) %>% select(closed, misses) %>% distinct()
mone_dist <- rbeta(1000, mone$closed, mone$misses)
mone_hdi <- ci(mone_dist, method = "HDI")
mtwo <- eng_risk %>% filter(yaxis == month_two) %>% select(closed, misses) %>% distinct()
mtwo_dist <- rbeta(1000, mtwo$closed, mtwo$misses)
mtwo_hdi <- ci(mtwo_dist, method = "HDI")
diff_dist <- mtwo_dist - mone_dist
rope(diff_dist)
}
MakeVulnData <- function(vulnData, prior){
hasGroup <- length(vulnData) #number of columns; 4 means a legend column is present
for( i in 1:nrow(vulnData)){
if( i == 1){
hits <- vulnData$hits[i] + prior[1]
misses <- vulnData$misses[i] + prior[2]
yaxis <- vulnData$yaxis[i]
if(hasGroup == 4){
legend <- vulnData$legend[i]
}else{
legend <- yaxis
}
res <- MakeBurnData(hits, misses, yaxis, legend)
eng_risk <- res
}else{
hits <- vulnData$hits[i] + hits
misses <- vulnData$misses[i] + misses
yaxis <- vulnData$yaxis[i]
if(hasGroup == 4){
legend <- vulnData$legend[i]
}else{
legend <- yaxis
}
res <- MakeBurnData(hits, misses, yaxis, legend)
eng_risk <- bind_rows(res, eng_risk)
}
}
return(eng_risk)
}
MakeBurnData <- function(closed, misses, yaxis, legend){
x_vals = seq(from=0,to=1,length = 1000)
dens = dbeta(x_vals,closed, misses)
prob = dens / max(dens)
res <- tibble(x_vals, yaxis , prob, legend, closed, misses, dens)
return(res[is.finite(res$dens), ]) #drop rows where the beta density is infinite
}
GetBeliefsEvents <- function(belief, max_belief ){
alpha_ratio <- as.numeric(belief)
beta_ratio <- as.numeric(max_belief)
q1 = list(p=.5, x= alpha_ratio)
q2 = list(p=.9, x= beta_ratio)
return( beta.select(q1, q2) )
}
GetBeliefsHits <- function(hits, max_hits, events ){
alpha_ratio <- hits / events
beta_ratio <- max_hits / events
q1 = list(p=.5, x= alpha_ratio)
q2 = list(p=.9, x= beta_ratio)
return( beta.select(q1, q2) )
}
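#' Illustrative examples (not run): turn SME forecasts into beta parameters.
#' The first call says "median belief 10%, 90% sure it's below 30%"; the
#' second expresses the same idea as hit counts out of 100 events. Values
#' are arbitrary placeholders.
# GetBeliefsEvents(belief = 0.10, max_belief = 0.30)
# GetBeliefsHits(hits = 10, max_hits = 30, events = 100)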
GetBetResults <- function(guess, tries, w.ball){
guess_cost <- tries * 5
diff_cost <- abs(w.ball - guess) * 5
total_cost <- guess_cost + diff_cost
profit <- 100 - total_cost
print(paste0("Your Winnings: ", profit))
}
GetRolls <- function(tries){
w.ball <- sample(44,1)
val <- ""
for(x in 1:tries){
val <- paste0(rollball(w.ball)," ",val)
}
print(val)
return(w.ball)
}
rollball <- function(w.ball){
if( w.ball == 1){
r.ball <- sample(2:44,1)
}else if( w.ball == 44){
r.ball <- sample(1:43,1)
}else{
r.ball <- sample(c(1:(w.ball - 1), (w.ball + 1):44),1)
}
if(r.ball < w.ball){
return("Left")
}else{
return("Right")
}
}
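#' Illustrative example (not run): play the ball game with 5 rolls, then make
#' a guess and score the bet (the guess of 22 is arbitrary).
# w.ball <- GetRolls(tries = 5)
# GetBetResults(guess = 22, tries = 5, w.ball = w.ball)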
ErrorCheck <- function(base_configs, event_counts){
a_total <- as.numeric(base_configs["a_n_events"])
a_errors <- as.numeric(event_counts["product_a_errors"])
b_total <- as.numeric(base_configs["b_n_events"])
b_errors <- as.numeric(event_counts["product_b_errors"])
#Remove errors to get true events
a_true <- a_total - a_errors
b_true <- b_total - b_errors
cat(paste("A Total:",a_total,"A Errors:", a_errors,"A True:", a_true, "\n"))
cat(paste("B Total:",b_total,"A Errors:", b_errors,"B True:", b_true, "\n"))
# Obvious misconfiguration
if( a_total < a_errors){
stop(paste("Exiting: Your errors for product A of: ", a_errors," exceed your total events for product A of: ", a_total))
}
if( b_total < b_errors){
stop(paste("Exiting: Your errors for product B of: ", b_errors," exceed your total events for product B of: ", b_total))
}
# If A's True Vulns (i.e. 30) are greater than B's Total (i.e. 25) -
if( b_total < a_true ){
diff <- a_true - b_total
cat(paste("Product B is missing", diff, "false positive error(s). Add ", diff ," each to B's EVENTS and ERRORS\n"))
stop("Exiting")
}
if( a_total < b_true ){
diff <- b_true - a_total
cat(paste("Product A is missing", diff, "false positive error(s). Add ", diff ," each to A's EVENTS and ERRORS\n"))
stop("Exiting")
}
#Is Prod A/B Missing errors - false negatives specifically?
if( a_true < b_true ){
diff <- b_true - a_true
cat(paste("Adding ", diff ," FALSE NEGATIVE errors to product A for a new total of: ",diff + a_errors,"\n"))
return(c("product_a_errors", diff))
}
if( b_true < a_true ){
diff <- a_true - b_true
cat(paste("Adding ", diff ," FALSE NEGATIVE errors to product B for a new total of: ",diff + b_errors,"\n"))
return(c("product_b_errors", diff))
}
return(c("No Change", 0))
}
CatTotalCost <- function(total_costs, product_costs){
#Get Cost Ratios
prod_a_total_costs <- total_costs[1]
prod_b_total_costs <- total_costs[2]
prod_ratio <- round( prod_a_total_costs / prod_b_total_costs, 3) * 100
prod_ratio <- paste(prod_ratio, "%", sep = "")
#Format as dollars
prod_a_orginal_cost <- dollar_format()(as.numeric(product_costs["prod_a_cost"]))
prod_a_total_final <- dollar_format()(round(as.numeric(prod_a_total_costs),2))
prod_b_orginal_cost <- dollar_format()(as.numeric(product_costs["prod_b_cost"]))
prod_b_total_final <- dollar_format()(round(as.numeric(prod_b_total_costs),2))
#Print Message to stdout
cat("--------Which Product Cost Most To Operate Given Forecasted Errors-------\n")
cat(paste(" Product A is", prod_ratio , "the total cost of product B\n"))
cat(paste(" Product A (with base price of ", prod_a_orginal_cost ,") is expected to cost to operate with errors: ", prod_a_total_final,"\n"))
cat(paste(" Product B (with base price of ", prod_b_orginal_cost,") is expected to cost to operate with errors: ", prod_b_total_final,"\n"))
cat("\n Still uncertain? Get more data by running more tests!")
return(c(prod_ratio, prod_a_orginal_cost, prod_a_total_final, prod_b_orginal_cost, prod_b_total_final))
#print(knitr::kable(out))
}
GraphErrorImpact <- function(posterior, impact){
newdf_a_impact <- data.frame(posterior = posterior$prod_a_impact, product = "Product A")
newdf_b_impact <- data.frame(posterior = posterior$prod_b_impact, product = "Product B")
newdf_ab_impact <- rbind(newdf_a_impact, newdf_b_impact)
impact_text <- paste(" Product A is", impact[1] , "the total cost of product B\n Product A (with base price of ", impact[2] ,") is expected to cost to operate with errors: ", impact[3],"\n Product B (with base price of ", impact[4],") is expected to cost to operate with errors: ", impact[5],"\n")
print(ggplot(newdf_ab_impact, aes(x = posterior, y = ..density.., fill = product)) +
stat_density_ci(alpha = 0.5, n = 1024, geom = "density",position = "identity") +
scale_fill_manual(values=c("#333399", "#CCCCFF")) +
labs(title = "FORECASTING APPSEC ERROR IMPACT\nUsing Bayesian A/B Testing\n", x="Using Small Samples: Predicted Difference In Product Error Impacts",
subtitle = impact_text,
y = "Product Error Comparisons"))
}
CatErrorDiffProp <- function(posterior){
error_rate_diff <- sum(posterior$prop_diff_events > 0 )/length(posterior$prop_diff_events)
cat(paste("--------Error Rate Difference For This Simmulation--------\n Probability that product A makes more errors that Product B is ",round(error_rate_diff * 100) ,"%\n\n", sep = ""))
}
GraphErrorDiff <- function(posterior){
xval <- posterior$prop_diff_events
breaks <- pretty(range(xval), n = nclass.FD(xval), min.n = 1)
bwidth <- breaks[2]-breaks[1]
print(ggplot(posterior, aes(x=prop_diff_events)) +
geom_histogram(aes(y=..density..), colour="black", fill="white", binwidth = bwidth)+
geom_density(alpha=.2, fill="#333399") + labs(title="Event Amount Difference: Product A - Product B",x="Event Rate", y = "Amount")+
geom_vline(xintercept=0, color="red", size=1, linetype = "dotted")
)
}
GraphErrorRates <- function(posterior){
newdf_a <- data.frame(posterior = posterior$prod_a, product = "Product A")
newdf_b <- data.frame(posterior = posterior$prod_b, product = "Product B")
newdf_ab <- rbind(newdf_a, newdf_b)
print(ggplot(newdf_ab, aes(x = posterior, y = ..density.., fill = product)) +
stat_density_ci(alpha = 0.5, n = 1024, geom = "density",position = "identity") +
scale_fill_manual(values=c("#333399", "#CCCCFF")) +
labs(title = "Product A/B Testing", x="Using Small Samples: Predicted Difference In Product Error Rates",
y = "Product Comparisons"))
}
CatUpdatedBeliefRates <- function(posterior, event_counts, base_configs){
out <- "--------Updated Forecasts Given Small Data, Beliefs & Lots of Simulations--------------\n"
# 89% Highest Density Intervals
prod_a_points <- point_estimate(posterior$prod_a)
ci_hdi_prod_a <- ci(posterior$prod_a, method = "HDI")
out <- paste(out, " The previous product A forecast combined with ", base_configs["a_n_events"] ," events and ", event_counts["product_a_errors"] ," errors is between:",
round(ci_hdi_prod_a$CI_low,2) * 100, "% and ",
round(ci_hdi_prod_a$CI_high,2) * 100, "% \n And centered near:",
round(prod_a_points[[1]],2) * 100, "%\n\n", sep="")
# 89% Highest Density Interval
prod_b_points <- point_estimate(posterior$prod_b)
ci_hdi_prod_b <- ci(posterior$prod_b, method = "HDI")
out <- paste(out, " The previous product B foreacst combined with ", base_configs["b_n_events"] ," events and ", event_counts["product_b_errors"]," errors is between:",
round(ci_hdi_prod_b$CI_low,2) * 100, "% and ",
round(ci_hdi_prod_b$CI_high,2) * 100, "% \n And centered near:",
round(prod_b_points[[1]],2) * 100, "%\n\n", sep="")
out <- paste(out, "See the markdown report or plot tab graphs in rstudio to see forecasts combined with data.\n")
cat(out)
}
CatPriorBeliefRates <- function(event_priors){
#Print Beta Curve
#print(MakeCurves(event_priors[["median_errors"]], event_priors[["edge_errors"]],color="Purple", CI=TRUE, CIT=TRUE))
xval = "Event Probability"
yval = "Event Strength"
tval = "Credible Beliefs About Events"
sval = "With 95% Credible Belief Interval"
beta_vals <- GetBeliefsEvents(event_priors[["median_errors"]], event_priors[["edge_errors"]] )
print(ggplotBeta(beta_vals[1], beta_vals[2], xlab = xval, ylab = yval, tlab = tval, sval = sval))
#print(MakeBetaGraph(beta_vals[1], beta_vals[2], xlab = xval, ylab = yval, tlab = tval,slab = sval))
#Get Beta Prio CIs
priorCI <- GetBetaCI(event_priors["median_errors"], event_priors["edge_errors"])
cat("------Prior Beliefs About Error Rates--------\n")
cat(paste(" You forecasted an ERROR RATE equally ABOVE/BELOW ", round(event_priors[["median_errors"]]*100), "%.\n", sep=""))
cat(paste(" You're also 90% sure the error rate is LESS THAN ", round(event_priors[["edge_errors"]]*100), "%.\n", sep=""))
cat(paste(" Given that forecast and the model we are using you you believe the true rate likely sits between ", priorCI[1],"% and ", priorCI[3], "%\n\n", sep=""))
}
GetTotalCosts <- function(impacts, costs, time_multiple, event_counts, prior_events, base_configs){
product_a_counts <- event_counts[[1]]
product_b_counts <- event_counts[[2]]
# Get Gamma Prior for product a which will get fed into poisson dist in GetEventFrequency()
a_number_events <- as.numeric(base_configs$a_n_events)
a_gamma_median_prior <- prior_events[[1]] * a_number_events
a_gamma_sd <- prior_events[[2]] * a_number_events
# Get Gamma Prior for product b which will get fed into poisson dist in GetEventFrequency()
b_number_events <- as.numeric(base_configs$b_n_events)
b_gamma_median_prior <- prior_events[[1]] * b_number_events
b_gamma_sd <- prior_events[[2]] * b_number_events
prod_a_event_results <- GetEventFrequency(product_a_counts, a_gamma_median_prior, a_gamma_sd)
prod_a_mean_events <- prod_a_event_results$mean
prod_a_event_totals <- prod_a_mean_events * as.numeric(time_multiple)
prod_b_event_results <- GetEventFrequency(product_b_counts, b_gamma_median_prior, b_gamma_sd)
prod_b_mean_events <- prod_b_event_results$mean
prod_b_event_totals <- prod_b_mean_events * as.numeric(time_multiple)
prod_a_total_cost <- prod_a_event_totals * impacts$prod_a_impact + costs$prod_a_cost + costs$prod_a_run
prod_b_total_cost <- prod_b_event_totals * impacts$prod_b_impact + costs$prod_b_cost + costs$prod_b_run
return(c(prod_a_total_cost, prod_b_total_cost) )
}
GetEventFrequency <- function(event_counts, avg_event_count, event_variation, graph = FALSE){
#rate vals can be a vector of results, perhaps events per day, week , month etc
# mode and sd will be converted into rate and scale.
event_count_priors <- GetRatePriors(avg_event_count, event_variation, graph)
result <- poisgamp(event_counts, event_count_priors[1], event_count_priors[2], plot = FALSE, suppressOutput = TRUE)
return(result)
}
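#' Illustrative example (not run): posterior event-rate summary given a small
#' vector of made-up event counts, a prior mode of 3 events and an sd of 2.
# GetEventFrequency(event_counts = c(2, 4, 3, 5), avg_event_count = 3,
#                   event_variation = 2)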
GetRatePriors <- function(mode, sd, graph = FALSE){
# This function allows you to think in terms of mode and SD.
# Note that everything is positive, so while the mode might be a small number close to 0,
# the SD can be large, skewing far to the right without going past zero on the left.
# Here are the corresponding rate and shape parameter values:
ra = ( mode + sqrt( mode^2 + 4*sd^2 ) ) / ( 2 * sd^2 )
sh = 1 + mode * ra
if(graph){
x = seq(0,mode+5*sd,len=1001)
plot( x , dgamma( x , shape=sh , rate=ra ) , type="l" ,
main=paste("dgamma, mode=",mode,", sd=",sd,sep="") ,
ylab=paste("dgamma( shape=",signif(sh,3)," , rate=",signif(ra,3)," )",
sep="") )
abline( v=mode , lty="dotted" )
}
return(c(sh, ra))
}
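#' Illustrative example (not run): convert a mode of 2 events and an sd of 4
#' into gamma shape and rate parameters, and plot the resulting prior.
# GetRatePriors(mode = 2, sd = 4, graph = TRUE)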
GetEngCost <- function(size, eng_cost_pert, eng_hour_priors){
mode <- as.numeric(eng_cost_pert[["mode"]])
low <- as.numeric(eng_cost_pert[["low"]])
high <- as.numeric(eng_cost_pert[["high"]])
mode_hrs <- as.numeric(eng_hour_priors[["mode"]])
low_hrs <- as.numeric(eng_hour_priors[["low"]])
high_hrs <- as.numeric(eng_hour_priors[["high"]])
eng_rate <- rpert(size, min = low, mode = mode, max = high, shape = 4)
eng_hours <- rpert(size, min = low_hrs, mode = mode_hrs, max = high_hrs, shape = 4)
eng_cost <- eng_hours * eng_rate
return(eng_cost)
}
GetErrors <- function(n_draws, n_events, proportion_events){
n_errors <- rbinom(n = n_draws, size = n_events, prob = proportion_events)
return(n_errors)
}
GetPropotionOfEvents <- function(n_draws,prior_events){
error_beliefs <- GetBeliefsEvents(prior_events[["median_errors"]], prior_events[["edge_errors"]])
#Make an n_draws length list of error beliefs
proportion_events <- rbeta(as.numeric(n_draws), as.numeric(error_beliefs[[1]]) , as.numeric(error_beliefs[[2]]) )
return(proportion_events)
}
SimulateHits <- function(freq_hits, num_events){
freq_miss <- 1 - freq_hits
res <- sample(c("hit", "miss"), size=num_events, replace=TRUE,
prob=c(freq_hits, freq_miss))
hit <- sum(res == "hit")
miss <- sum(res == "miss")
return( c(hit,miss))
}
GetBetaCI <- function(hit,edge){
betas <- GetBeliefsEvents(hit, edge)
intervals <- qbeta(c(.025, .5, .975), betas[1], betas[2])
intervals <- round(intervals * 100)
return(intervals)
}
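#' Illustrative example (not run): 95% belief interval (in percent) implied by
#' a median forecast of 5% with a 90% upper bound of 10%.
# GetBetaCI(hit = 0.05, edge = 0.10)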
MakeCurves <- function(alpha = 0, beta = 0 , color = "black", ytop = 20, CI = FALSE, CIT = FALSE, add_val = FALSE){
alpha <- alpha + 1
beta <- beta + 1
curve(dbeta(x, alpha , beta ), from=0, to=1,
xlab="SME Forecast For Proportion of Events", ylab="Density",
col= color, lwd=4, las=1, add = add_val , ylim = c(0,ytop))
if( CI ){
intervals <- qbeta(c(.025, .975), alpha, beta)
abline(v = intervals[1], lty=2)
abline(v = intervals[2], lty=2)
if( CIT ){
adj <- 0
if( intervals[2] - intervals[1] < 0.11 ){
adj <- 1
}
text(intervals[1] + 0.03,ytop, round(intervals[1], 2))
text(intervals[2] + 0.03 ,ytop - adj, round(intervals[2], 2))
}
}
}
ggplotBeta <- function(alpha, beta, xlab = xval, ylab = yval, tlab = "Beliefs", sval = "Test"){
x = seq(0, 1, 0.01)
y = dbeta(x, shape1=alpha, shape2=beta)
density = data.frame(x=x, y=y)
mean=alpha/(alpha+beta)
mode=(alpha-1)/(alpha+beta-2)
intervals <- qbeta(c(.025, .975), alpha, beta)
#print(paste0("sval: ", sval, " intervals: ", round(intervals[1],2), " to ",round(intervals[2],2) ))
sval <- paste0(sval, " of ", round(intervals[1],2), " to ", round(intervals[2],2), " and mean: ", round(mean,2))
res <- ggplot(density, aes(x, y)) +
geom_area(fill= "darkslateblue", alpha = 0.6) +
theme_minimal()+
theme(axis.text.x = element_text(angle = 90, hjust = 1),
axis.text.y = element_blank()) +
geom_vline(xintercept = intervals[1]) +
geom_vline(xintercept = intervals[2])+
geom_vline(xintercept = mean)+
labs(x = xlab, y = ylab, title = tlab, subtitle = sval)
return(res)
}
GetSize <- function(size, first, second){
res <- ifelse(length(first) >= length(second), length(second), length(first) )
if( size > res){
return(res)
}
return(size)
}
GetEngHours <- function(size, med = 10, ninety = 60, total_hours = 100){
beta_vals <- GetBeliefsHits(med, ninety, total_hours)
prop_hours <- rbeta(size, beta_vals[1], beta_vals[2])
hours <- rbinom(size, total_hours, prop_hours)
return(hours)
}
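#' Illustrative example (not run): draw 1000 simulated engineering-hour values
#' from a belief of 10 median hours, 60 hours at the 90th percentile, out of
#' 100 total hours.
# GetEngHours(size = 1000, med = 10, ninety = 60, total_hours = 100)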
#' ##Wrapper Function
#' This just makes it easy for users to call one function without much fuss.
runProductCostComparison <- function(base_configs, event_priors, event_counts, eng_hour_range, eng_cost_range, product_costs){
# A Quick Check for missed false negatives between products and other sundry.
update_count <- ErrorCheck(base_configs, event_counts)
if( update_count[2] > 0){
event_counts[update_count[[1]]] <- as.numeric(update_count[[2]]) + event_counts[[update_count[1]]]
if( update_count[1] == "product_a_errors"){
base_configs$a_n_events <- base_configs$a_n_events + as.numeric(update_count[[2]])
print(paste("Updating Product A Events: ", base_configs$a_n_events))
}
if( update_count[1] == "product_b_errors"){
base_configs$b_n_events <- base_configs$b_n_events + as.numeric(update_count[[2]])
print(paste("Updating Product B Events: ", base_configs$a_n_events))
}
}
#' ##Prior Beliefs About Error Rates
#' In the *event_priors* config element above you specified what you think is the likely error rate given N events. This graph shows you
#' the spread of possible rates given our beliefs. It also gives some indication of where the most likely rates may cluster.
CatPriorBeliefRates(event_priors)
#Number of simulations (draws), tests (events), and default array index sizes
n_draws <- as.numeric(base_configs[["n_draws"]])
a_n_events <- as.numeric(base_configs[["a_n_events"]])
b_n_events <- as.numeric(base_configs[["b_n_events"]])
size <- 10000
#Create a list of simulated events and error RATES based on priors
proportion_events <- GetPropotionOfEvents(n_draws, event_priors)
a_n_errors <- GetErrors(n_draws, a_n_events, proportion_events)
b_n_errors <- GetErrors(n_draws, b_n_events, proportion_events)
#Put simulations into dataframe
a_prior <- data.frame(proportion_events, a_n_errors)
b_prior <- data.frame(proportion_events, b_n_errors)
#print(head(b_prior,10))
# Select the simulated prior data that corresponds to the real product errors
product_a <- a_prior[a_prior$a_n_errors == event_counts[["product_a_errors"]], ]
product_b <- b_prior[b_prior$b_n_errors == event_counts[["product_b_errors"]], ]
#print(head(product_a))
#print(head(product_b))
#Sample to create indices for posterior and load into data frame
sample_a_indices <- sample( nrow(product_a), size = size, replace = TRUE, prob = product_a$proportion_events)
sample_b_indices <- sample( nrow(product_b), size = size, replace = TRUE, prob = product_b$proportion_events)
posterior <- data.frame(
prod_a = product_a$proportion_events[sample_a_indices],
prod_b = product_b$proportion_events[sample_b_indices])
#print(head(posterior))
#' Updated beliefs given real error rates
CatUpdatedBeliefRates(posterior, event_counts, base_configs)
#' Create and Print out Error Rate Graphs
GraphErrorRates(posterior)
#' Calculate the rate differences, graph and print out
posterior$prop_diff_events <- posterior$prod_a - posterior$prod_b
print(head(posterior,10))
GraphErrorDiff(posterior)
CatErrorDiffProp(posterior)
#This gets our impact per event with lots of uncertainty packed in
eng_cost <- GetEngCost(size, eng_cost_range, eng_hour_range)
# Add forecasted expected cost column to posterior for product a and b and store in list
posterior$prod_a_impact <- posterior$prod_a * eng_cost
posterior$prod_b_impact <- posterior$prod_b * eng_cost
#Load median values into a list for use later
impacts <- list()
impacts["prod_a_impact"] <- median(posterior$prod_a_impact)
impacts["prod_b_impact"] <- median(posterior$prod_b_impact)
# Add the column that holds the difference between the two costs
posterior$prod_diff_impact <- posterior$prod_a_impact - posterior$prod_b_impact
print("Here is a glimpse into some of the data.")
#print(head(posterior,10))
total_costs <- GetTotalCosts(impacts, product_costs, base_configs["time_multiple"], event_counts, event_priors, base_configs)
#' #Total Costs Differences
impact_res <- CatTotalCost(total_costs, product_costs)
#' Graph Impact Difference
GraphErrorImpact(posterior, impact_res)
}
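#' Illustrative example (not run): one way to assemble the configuration lists
#' the wrapper expects. Every value below is an arbitrary placeholder, not a
#' recommendation; the element names match what the functions above read.
# base_configs   <- list(n_draws = 100000, a_n_events = 20, b_n_events = 20,
#                        time_multiple = 12)
# event_priors   <- list(median_errors = 0.05, edge_errors = 0.20)
# event_counts   <- list(product_a_errors = 3, product_b_errors = 5)
# eng_hour_range <- list(low = 2, mode = 8, high = 40)
# eng_cost_range <- list(low = 75, mode = 125, high = 250)
# product_costs  <- list(prod_a_cost = 50000, prod_a_run = 10000,
#                        prod_b_cost = 40000, prod_b_run = 12000)
# runProductCostComparison(base_configs, event_priors, event_counts,
#                          eng_hour_range, eng_cost_range, product_costs)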
GetVulnPredictions <- function(beta_closed, beta_opened, future_counts){
ab = c(beta_closed, beta_opened)
future_count = future_counts;
future_list = 0:future_count
plf = pbetap(ab, future_count, future_list)
return(round(cbind(future_list, plf), digits=3))
}
GetVulnsOpenedByWks <- function(data_table = dt, week_list = 1:52, group_list = 1:10, audit_list = 0:1, exposure_list = 1:3){
vulns <- data_table %>%
filter(week.fseen %in% week_list & group %in% group_list & audit %in% audit_list & exposure_id %in% exposure_list) %>%
select(week.fseen, severity, group, hits) %>%
group_by(group,severity, week.fseen) %>%
summarise(ext_count = sum(hits),.groups = 'drop')
#52 weeks of zeros, repeated once per group
empty_week <- tibble(week.fseen = rep(1:52,length(group_list)),ext_count = 0)
# Include Weeks With Zero Counts
vuln_wks <- right_join(vulns,empty_week, by = "week.fseen") %>%
mutate( new_count = if_else(is.na(ext_count.x),0, ext_count.x)) %>%
select(week.fseen, severity,group, new_count) %>% distinct() %>%
group_by(week.fseen, severity) %>% summarize(vuln_count = sum(new_count),.groups = 'drop') %>%
arrange(week.fseen) %>% distinct()
vuln_wks <- vuln_wks %>% pivot_wider(id_cols = c(week.fseen),
names_from = severity, values_from = c(vuln_count),values_fill=0)
vuln_wks <- vuln_wks %>% select(week.fseen, extreme, critical, high)
col_len <- length(vuln_wks)
col_start <- col_len - 2
vuln_wks <- vuln_wks %>% replace(is.na(.), 0) %>% mutate(all = rowSums(across(col_start:col_len)))
vuln_wks <- vuln_wks %>% select(week.fseen, extreme, critical, high, all)
return(vuln_wks)
}
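#' Illustrative example (not run): weekly opened-vuln counts for five simulated
#' groups, audited assets only, external and partner exposure. Parameters are
#' arbitrary; GetVulnGroups() can take a while to run.
# dt <- GetVulnGroups(group_count = 5)
# GetVulnsOpenedByWks(dt, week_list = 1:52, group_list = 1:5,
#                     audit_list = 1, exposure_list = 1:2)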
GetVulnGroups <- function(group_count = 10, ext = c(.01,.03), sev = c(.031, .07), hgh = c(.071, .15)){
dt <- tibble()
for(x in 1:group_count){
extreme <- sample(seq(from = ext[1], to = ext[2], by= .001), size = 1)
severe <- sample(seq(from = sev[1], to = sev[2], by= .001), size = 1)
high <- sample(seq(from = hgh[1], to = hgh[2], by= .001), size = 1)
patch_fail <- sample(1:4,size=1)
dev_maturity <- sample(1:4,size=1)
#print(paste0("Ext: ",extreme," Sev: ", severe, " High: ", high, " Patch Fail: ", patch_fail, " Dev Mat: ", dev_maturity))
res <- GetAdvSecResults(c(extreme, severe, high), patch_fail, dev_maturity)
res$group <- x
dt <- bind_rows(dt,res)
}
return(dt)
}
betaFormula <- function(audit, severity_id, exposure_id, time_adj, nDays, dev_maturity){
adj_val <- ((nDays*2)/365) * (dev_maturity * 3)
#print(adj_val)
alpha <- (((audit*3 + severity_id * 3 + exposure_id * 3) * (severity_id*1.43)) * (1 + time_adj))
#print(alpha)
mult <- if_else(alpha < 10, 2.5, if_else(alpha < 30,2,1.5))
beta <- alpha * mult
avg <- (alpha - adj_val)/(alpha + beta)
avg[avg <= 0] <- .005
#print(avg)
return(avg)
}
betaVariance <- function(audit, severity_id, exposure_id, time_adj){
alpha <- (((audit*3 + severity_id * 3 + exposure_id * 3) * (severity_id*1.43)) * (1 + time_adj))
mult <- if_else(alpha < 10, 2, if_else(alpha < 30,2,1.5))
beta <- alpha * mult
ret <- ((alpha * beta)/(alpha + beta)^2 * (alpha + beta + 1))
return(round(ret,4))
}
binomVariance <- function(audit, severity_id, exposure_id, time_adj){
avg <- (((audit*3 + severity_id * 3 + exposure_id * 3) * (severity_id*1.43)) * (1 + time_adj))
mult <- if_else(avg < 10, 4, if_else(avg < 30,3,2))
var <- round(avg * mult)
#print(paste0("Binom Var: ", var))
return(var)
}
GetSecResults <- function(events = c(.05,.10,.2), event_corr = 0.3, patch_fail = 1){
#Simplify, 6 team combinations, so groups of 6
team_count <- 6
days <- 365
total <- days * team_count
eventSum <- sum(events)
# Create 6 Groups of 365 Each.
team <- defData(varname = "team", dist = "nonrandom",formula = 0, id = "idTeam")
team <- defData(team,varname = "nDays",dist = "nonrandom", formula = days)
team <- genData(team_count, team)
team <- genCluster(team, cLevelVar = "idTeam", numIndsVar = "nDays", level1ID = "id")
#Put days into the nDays columns
team$nDays <- (team$id - (365*(team$idTeam - 1)))
# Probability each day will have one or more extreme, critical or high risk events.
# Mild correlation between the variables
dt <- genCorGen(n = total, nvars = 3, params1 = c(events[1], events[2], events[3]),
dist = "binary", rho = event_corr, corstr = "cs", wide = TRUE,
method = "ep",cnames = "extreme_event,critical_event,high_event", id = "id")
# For each day with an Ext,Crit,High events randomly draw a small number of hits for each
# These are slightly more correlated.
dt <- addCorGen(dtOld = dt, idvar = "id", nvars = 1, rho = .4, corstr = "cs",
dist = "poisson", param1 = "extreme_event", cnames = "extreme")
dt <- addCorGen(dtOld = dt, idvar = "id", nvars = 1, rho = .4, corstr = "cs",
dist = "poisson", param1 = "critical_event", cnames = "critical")
dt <- addCorGen(dtOld = dt, idvar = "id", nvars = 1, rho = .4, corstr = "cs",
dist = "poisson", param1 = "high_event", cnames = "high")
#Event showed 1 but poisson returned 0 - Make it at least 1
dt$extreme[dt$extreme_event == 1 & dt$extreme == 0] <- 1
dt$critical[dt$critical_event == 1 & dt$critical == 0] <- 1
dt$high[dt$high_event == 1 & dt$high == 0] <- 1
#Merge the Team/Days data with the hits/events data
dt <- mergeData(team, dt, "id")
#Add Exposure Categories for Audit and Exposure Type
dt$audit <- if_else(dt$id > total/2,1,0)
#Set Exposure Type
dt$exposure[dt$idTeam %in% c(1, 4)] <- "external"
dt$exposure[dt$idTeam %in% c(2, 5)] <- "partner"
dt$exposure[dt$idTeam %in% c(3, 6)] <- "internal"
#Add Exposure ID Field
dt$exposure_id[dt$exposure == "external"] <- 1
dt$exposure_id[dt$exposure == "partner"] <- 2
dt$exposure_id[dt$exposure == "internal"] <- 3
#Add Names based on audit and exposure type - note that this is just text, idTeam is most important.
dt <- genFactor(dt, "idTeam",
labels = c("external audit", "partner audit", "internal audit",
"external no audit", "partner no audit", "internal no audit"))
#Copy team name over and get a clean dataframe.
dt$team <- dt$fidTeam
## I think this may be the error
dt <- dt %>% select(idTeam, team, nDays, id, extreme, critical, high, audit, exposure, exposure_id)
#Need To move melt to later as it messes with defCondition
dt <- melt(dt, id.vars = c("idTeam","team","nDays","id","audit","exposure","exposure_id" ),
measure.vars = c("extreme", "critical", "high"),
variable.name = "severity", value.name = "hits")
#Cache records with Zeros for Poisson
dtZeros <- dt %>% filter(hits == 0)
#Get One Vuln Per Row i.e. it drops zeros...
dt <- dt[rep(1:.N,hits)]
dt$hits <- 1
#Reset ID field
dt$id <- 1:length(dt$idTeam)
setkey(dt,id)
#####Get Remediation Times
# Make numeric for formula input
dt$severity_id[dt$severity == "extreme"] <- 1
dt$severity_id[dt$severity == "critical"] <- 2
dt$severity_id[dt$severity == "high"] <- 3
# Set Fix Time Based On Vuln, Asset and Audit
# Note small adjustment upward based on event rate divided by 2
defFix <- defDataAdd(varname = "fix_time",
formula = "((audit*3 + severity_id * 3 + exposure_id * 3) * (severity_id*1.43))",
dist = "noZeroPoisson")
dt <- addColumns(defFix, dt)
#Fix Date
defFixDate <- defDataAdd(varname = "fix_date", formula = "fix_time + nDays", dist = "nonrandom")
dt <- addColumns(defFixDate, dt)
#Chance of Something NOT getting fixed goes up with more time.
defStatus <- defDataAdd(varname = "status", formula = "(fix_time/..patch_fail)/365", dist = "binary")
dt <- addColumns(defStatus, dt)
#Add date stamps for first seen and last seen
dt$first.seen <- if_else(dt$nDays <= 365, as.Date(dt$nDays, origin = "2020-01-01"),
as.Date(dt$nDays, origin = "2021-01-01"))
dt$last.seen <- if_else(dt$nDays <= 365, as.Date(dt$fix_date, origin = "2020-01-01"),
as.Date(dt$fix_date, origin = "2021-01-01"))
#Add week number for simplicity
dt$week.fseen <- if_else(dt$nDays <= 365, week(dt$first.seen), as.integer((week(dt$first.seen) + 52)))
dt$week.lseen <- if_else(year(dt$last.seen) == 2020, week(dt$last.seen), as.integer(week(dt$last.seen) + 52))
return(dt)
}
# Advanced Option
GetAdvSecResults <- function(events = c(.02,.10,.2), patch_fail = 1, dev_maturity = 4){
###### Parameters ##########
# EVENTS = chance of having an event for extreme, crit, high per day
# PATCH_FAIL = the higher the number, the less likely a vuln's status changes to "still open"
# DEV_MATURITY = 1-4 maturity level. Higher increases the likelihood of an event NOT happening.
#   It interacts with where you are in the year (in days) and other factors to
#   adjust the likelihood of events. It also affects the rate of closure.
###################
## Define Constants
#Simplify, 6 team combinations, so groups of 6
team_count <- 6
days <- 365
total <- days * team_count
eventSum <- sum(events)
###############
## TEAM SECTION
# Create 6 Groups of 365 Each.
team <- defData(varname = "team", dist = "nonrandom",formula = 0, id = "idTeam")
team <- defData(team,varname = "nDays",dist = "nonrandom", formula = days)
team <- defData(team, varname = "dev_maturity", dist = "nonrandom", formula = "..dev_maturity")
team <- genData(team_count, team)
team <- genCluster(team, cLevelVar = "idTeam", numIndsVar = "nDays", level1ID = "id")
#Put days into the nDays columns
team$nDays <- (team$id - (365*(team$idTeam - 1)))
########################
## Vulnerability Section
# Probability each day will have one or more extreme, critical or high risk events.
# Mild correlation between the variables
dt <- genData(n=team_count)
dt <- addPeriods(dt, days,perName = "nDays")
dt$nDays <- dt$nDays + 1
#ifelval <- if_else((events[1] - ((round(dt$nDays/(90 * (4/dev_maturity)),1)/100) * 1/2)) < 0, .005,events[1] - ((round(dt$nDays/(90 * (4/dev_maturity)),1)/100) * 1/2))
#print(paste0("ifelval res: ", ifelval))
defExtEvent <- defDataAdd(varname = "extreme_event", formula = "if_else((..events[1] - ((round(nDays/(90 * (4/..dev_maturity)),1)/100) * 1/2)) < 0, .005,..events[1] - ((round(nDays/(90 * (4/..dev_maturity)),1)/100) * 1/2))", dist = "binary")
dt <- addColumns(defExtEvent, dt)
defCritEvent <- defDataAdd(varname = "critical_event", formula = "if_else((..events[2] - ((round(nDays/(90 * (4/..dev_maturity)),1)/100) * 2/2)) < 0, .005,..events[2] - ((round(nDays/(90 * (4/..dev_maturity)),1)/100) * 2/2))", dist = "binary")
dt <- addColumns(defCritEvent, dt)
defHighEvent <- defDataAdd(varname = "high_event", formula = "if_else((..events[3] - ((round(nDays/(90 * (4/..dev_maturity)),1)/100) * 3/2)) < 0, .005,..events[3] - ((round(nDays/(90 * (4/..dev_maturity)),1)/100) * 3/2))", dist = "binary")
dt <- addColumns(defHighEvent, dt)
#view(dt)
#Use dplyr to get fields, and must reset id for data.table
#print(dt)
dt <- dt %>% mutate(id = timeID) %>% select(id, extreme_event, critical_event, high_event, dev_maturity)
#dt <- dt %>% mutate(id = timeID) %>% select(id, extreme_event, critical_event, high_event)
setkey(dt,id)
# For each day with an Ext,Crit,High events randomly draw a small number of hits for each
# These are slightly more correlated.
dt <- addCorGen(dtOld = dt, idvar = "id", nvars = 1, rho = .4, corstr = "cs",
dist = "poisson", param1 = "extreme_event", cnames = "extreme")
dt <- addCorGen(dtOld = dt, idvar = "id", nvars = 1, rho = .4, corstr = "cs",
dist = "poisson", param1 = "critical_event", cnames = "critical")
dt <- addCorGen(dtOld = dt, idvar = "id", nvars = 1, rho = .4, corstr = "cs",
dist = "poisson", param1 = "high_event", cnames = "high")
#view(dt)
#Event showed 1 but poisson returned 0 - Make it at least 1
dt$extreme[dt$extreme_event == 1 & dt$extreme == 0] <- 1
dt$critical[dt$critical_event == 1 & dt$critical == 0] <- 1
dt$high[dt$high_event == 1 & dt$high == 0] <- 1
################
## Asset Section
#Merge the Team/Days data with the hits/events data
dt <- mergeData(team, dt, "id")
#Add Exposure Categories for Audit and Exposure Type
dt$audit <- if_else(dt$id > total/2,1,0)
#Set Exposure Type
dt$exposure[dt$idTeam %in% c(1, 4)] <- "external"
dt$exposure[dt$idTeam %in% c(2, 5)] <- "partner"
dt$exposure[dt$idTeam %in% c(3, 6)] <- "internal"
#Add Exposure ID Field
dt$exposure_id[dt$exposure == "external"] <- 1
dt$exposure_id[dt$exposure == "partner"] <- 2
dt$exposure_id[dt$exposure == "internal"] <- 3
#Add Names based on audit and exposure type - note that this is just text, idTeam is most important.
dt <- genFactor(dt, "idTeam",
labels = c("external audit", "partner audit", "internal audit",
"external no audit", "partner no audit", "internal no audit"))
#Copy team name over and get a clean dataframe.
dt$team <- dt$fidTeam
###################
## Reformat Section
## Select important columns
dt <- dt %>% select(idTeam, team, nDays, id, dev_maturity, extreme, critical, high, audit, exposure, exposure_id)
#Need To move melt to later as it messes with defCondition
dt <- melt(dt, id.vars = c("idTeam","team","nDays","id","dev_maturity","audit","exposure","exposure_id" ),
measure.vars = c("extreme", "critical", "high"),
variable.name = "severity", value.name = "hits")
#Cache records with Zeros for Poisson
#dtZeros <- dt %>% filter(hits == 0)
dt$hits <- if_else(is.na(dt$hits),0,dt$hits)
#Get One Vuln Per Row i.e. it drops zeros...
dt <- dt[rep(1:.N,hits)]
#Reset ID field
dt$id <- 1:length(dt$idTeam)
setkey(dt,id)
######
## Remediation Time Section
# Make numeric for formula input
dt$severity_id[dt$severity == "extreme"] <- 1
dt$severity_id[dt$severity == "critical"] <- 2
dt$severity_id[dt$severity == "high"] <- 3
#Random time adjustment roughly based on event volume. Used for the fix-time generator
defTimeAdj <- defDataAdd(varname = "time_adj", formula = eventSum, variance = .1, dist = "normal")
dt <- addColumns(defTimeAdj, dt)
dt$time_adj <- if_else(dt$time_adj <= 0 , .01, dt$time_adj)
#print(dt)
#Get Probability Associated With Time from Beta Dist
#print(paste0("BetaFormula: ", betaFormula(dt$audit,dt$severity_id, dt$exposure_id,dt$time_adj, dt$nDays, dt$dev_maturity)))
#print(paste0("BetaVariance: ", betaVariance(dt$audit,dt$severity_id, dt$exposure_id,dt$time_adj)))
betaDef <- defDataAdd(varname = "beta_prob",
formula = "betaFormula(audit,severity_id, exposure_id,time_adj, nDays, dev_maturity)",
variance = "betaVariance(audit,severity_id, exposure_id,time_adj)",
dist = "beta")
#print(betaDef)
#Add column with list of probabilities
dt <- addColumns(betaDef, dt)
#print(paste0("Beta Prob: ",dt$beta_prob))
#print(paste0("Variance: ", binomVariance(dt$audit,dt$severity_id, dt$exposure_id,dt$time_adj)))
# Use beta probabilities as input into the binomial for time to fix
binDef <- defDataAdd(varname = "fix_time",
formula = "beta_prob",
variance = "binomVariance(audit,severity_id, exposure_id,time_adj)",
dist = "binomial"
)
#print(binDef)
#Create column for time to fix.
dt <- addColumns(binDef, dt)
#Update any time to fix zero values with a 1
dt$fix_time[dt$fix_time == 0] <- 1
#Fix Date
defFixDate <- defDataAdd(varname = "fix_date", formula = "fix_time + nDays", dist = "nonrandom")
dt <- addColumns(defFixDate, dt)
#Chance of Something NOT getting fixed goes up with more time.
defStatus <- defDataAdd(varname = "status", formula = "if_else((fix_time/..patch_fail)/365 > 1,1,(fix_time/..patch_fail)/365)", dist = "binary")
dt <- addColumns(defStatus, dt)
#Add date stamps for first seen and last seen
dt$first.seen <- if_else(dt$nDays <= 365, as.Date(dt$nDays, origin = "2020-01-01"),
as.Date(dt$nDays, origin = "2021-01-01"))
dt$last.seen <- if_else(dt$nDays <= 365, as.Date(dt$fix_date, origin = "2020-01-01"),
as.Date(dt$fix_date, origin = "2021-01-01"))
#Add week number for simplicity
dt$week.fseen <- if_else(dt$nDays <= 365, week(dt$first.seen), as.integer((week(dt$first.seen) + 52)))
dt$week.lseen <- if_else(year(dt$last.seen) == 2020, week(dt$last.seen), as.integer(week(dt$last.seen) + 52))
return(dt)
}
GetArrivalPrior <- function(v_open){
# Functions used to get optimized gamma values
boot_mean = function(original_vector, resample_vector) {
mean(original_vector[resample_vector])
}
obj = function(x){
sum(abs(pgamma(q=qbs, shape=x[1], rate=x[2]) - c(.05, .95)))
}
mean_results <- boot(v_open, boot_mean, R = 2000)
qbs = quantile(mean_results$t, prob=c(.05, .95))
theta = optim(par = c(1, 1), obj)$par
return(theta)
}
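#' Illustrative example (not run): bootstrap gamma prior parameters from a
#' made-up vector of arrival counts.
# GetArrivalPrior(c(3, 5, 2, 4, 6, 3, 1, 4))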
plotGammaAdv <- function(theta, prior, ytop = 1,xval = "Rate", yval = "Density",
tlab = "Rate Prior", slab = ""){
pdf <- tibble(gamma_val = prior, theta_val = theta)
ggplot(pdf,aes(x=theta, y=gamma_val)) +
geom_density(aes(x=theta,y=gamma_val),stat="identity",fill = "darkslateblue",
alpha = 0.6) +
xlab(xval) +
ylab(yval) +
labs(title = tlab, subtitle = slab, xlab = xval, ylab = yval)
}
plot_posterior <- function(theta, prior, likelihood, posterior) {
ymax = max(c(prior, posterior))
scaled_likelihood = likelihood * ymax / max(likelihood)
plot(theta, prior, type='l', col='orange', xlim= range(theta), ylim=c(0, ymax), ylab='',xlab='Rate', yaxt='n')
par(new=T)
plot(theta, scaled_likelihood, type='l', col='skyblue', xlim=range(theta), ylim=c(0, ymax), ylab='' ,xlab='Rate', yaxt='n')
par(new=T)
plot(theta, posterior, type='l', col='seagreen', xlim=range(theta), ylim=c(0, ymax), ylab='',xlab='Rate', yaxt='n')
legend("topright", c("prior", "likelihood", "posterior"), lty=1, col=c("orange", "skyblue", "seagreen"))
}
posteriorGraph <- function(samples){
ci_hdi <- ci(samples, method = "HDI", verbose = FALSE)
ci_hdi_small <- ci(samples, method = "HDI", ci = .50, verbose = FALSE)
pdf <- tibble(samples = samples)
ggplot(pdf, aes(x = samples)) +
#geom_area(fill="darkslateblue", alpha = 0.6)
geom_density(fill = "darkslateblue", alpha = 0.6, adjust = 4.5) +
geom_vline(xintercept = ci_hdi$CI_low, color = "black", size = .5) +
geom_vline(xintercept = ci_hdi$CI_high, color = "black", size = .5) +
geom_vline(xintercept = ci_hdi_small$CI_low, color = "red", size = .5) +
geom_vline(xintercept = ci_hdi_small$CI_high, color = "red", size = .5)
}
GetNVDAnalysis <- function(years, sev_val = 9, prior_mult = 1, end = 365, theta_range = c(.5,2.0),
new = FALSE, save = FALSE, fill_curve = "white", match_string = "default", from_file = FALSE){
Nrep = 10000
count = 1
if( match_string == "default"){
match_string= "Windows|Microsoft|Office|Mac|Apple|Browser|cisco|java|Oracle|adobe"
}
for(year in years){
prev_year <- year - 1
# use cached files
if( count > 1){
prior <-GetNVDPriors(prev_year,sev_val, match_string = match_string, from_file = from_file) * prior_mult
}else{
# Cache file
prior <-GetNVDPriors(prev_year,sev_val, new_file = new, save_file = save, match_string = match_string, from_file = from_file) * prior_mult
count = count + 1
}
rates <- GetNVDRatesByYear(year, sev_val, day_end = end, get_new = new, cache = save, match_string = match_string, from_file = from_file)
scale <- prior$alpha + rates$num_vulns
rate <- prior$lambda + rates$num_vulns * rates$avg_days
ppd_graph <- GetPostPredDist(Nrep, scale, rate, year, fill_curve)
pm_graph <- GetPostPredMeanGraph(Nrep, scale, rate, year, fill_curve)
pgg <- GetGammaPoisChart(prior, rates)
print(ppd_graph)
print(pm_graph)
print(pgg)
}
}
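#' Illustrative example (not run): download and cache the NVD feeds, then run
#' the analysis for 2021 with a CVSS threshold of 9 and the default product
#' match string. The year choice is arbitrary.
# GetNVDAnalysis(years = c(2021), sev_val = 9, new = TRUE, save = TRUE)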
GetNVDPriors <- function(year, sev_val = 9, optim_vals = c(1,1), new_file = FALSE, save_file = FALSE,
match_string = "default", from_file = FALSE){
nvd_prev <- GetNVDData(year, new = new_file, save = save_file, from_file = from_file)
# Get critical findings
prior_open <- GetCritDF(nvd_prev,sev_val, match_string)
# Sort findings by day of year
prior_open <- sort(prior_open$pub_day)
# Add zeros in the list of year days where there were no findings
prior_open <- GetFirstSeenHits(prior_open, min = 1, max = 365)
g_priors <- GetGammaPriors(prior_open)
# Create readable variable names for prior
ptibble <- tibble(alpha = g_priors[1], lambda = g_priors[2])
return(ptibble)
}
GetNVDRatesByYear <- function(year, sev_val = 9, day_end = 365, get_new = FALSE, cache = FALSE, match_string = "default", from_file = FALSE){
nvd_df <- GetNVDData(year, new = get_new, save = cache, from_file = from_file)
nvd_df <- nvd_df[yday(nvd_df$pub_date) <= day_end,]
#print(nvd_df)
crit_df <- GetCritDF(nvd_df,sev_val, match_string)
wait_time <- sum(diff(sort(crit_df$pub_day)))
num_vulns = nrow(crit_df)
vulns_per_wait_time <- num_vulns/wait_time
avg_days = wait_time/num_vulns
retTib <- tibble(wait_time, num_vulns, vulns_per_wait_time, avg_days)
return(retTib)
}
GetPostPredDist <- function(Nrep, prior, rates, year = "", fill_curve = "white"){
#print(rates)
#shape_val <- prior$alpha + rates$num_vulns
shape_val <- prior
#rate_val <- prior$lambda + rates$num_vulns * rates$avg_days
rate_val <- rates
if( year != ""){
year = paste0(year, " ")
}
theta_sim = rgamma(Nrep, shape_val, rate_val)
y_sim = rexp(Nrep, rate = theta_sim)
val95 <- round(quantile(y_sim, 0.95),2)
val50 <- round(quantile(y_sim, 0.50),2)
df <- tibble(sim=y_sim)
res <- ggplot(df, aes(x=sim)) +
geom_histogram(aes(y=..density..), colour="black", fill="white",bins=100)+
geom_density(alpha=.2, fill=fill_curve)+
geom_vline(aes(xintercept=mean(val95)),
color="black", linetype="dashed", size=.5)+
geom_vline(aes(xintercept=mean(val50)),
color="black", linetype="dashed", size=.5)+
labs(title = paste0(year, "Posterior Predictive Distribution"),
subtitle = paste0("95% Chance Vuln In: ", val95," Days Or Less | 50% In: ",
val50, " Or Less")) +
xlab("Days") + ylab("Strength") + xlim(c(0, max(df$sim)))
return(res)
}
GetPostPredMeanGraph <- function(Nrep, prior, rates, year = "", fill_curve = "white"){
#shape_val <- prior$alpha + rates$num_vulns
#rate_val <- prior$lambda + rates$num_vulns * rates$avg_days
shape_val <- prior
rate_val <- rates
if( year != ""){
year = paste0(year, " ")
}
theta_sim = rgamma(Nrep, shape_val, rate_val)
tratio <- 1 / theta_sim
ci <- ci(tratio, method="HDI")
mean_sim <- round(mean(tratio),2)
df <- tibble(sim=tratio)
res <- ggplot(df, aes(x=sim)) +
geom_histogram(aes(y=..density..), colour="black", fill="white",bins=100)+
# Comment
geom_density(alpha=.2, fill= fill_curve)+
geom_vline(aes(xintercept=ci$CI_low),
color="black", linetype="dashed", size=.5)+
geom_vline(aes(xintercept=ci$CI_high),
color="black", linetype="dashed", size=.5)+
geom_vline(aes(xintercept=mean_sim),
color="black", linetype="dashed", size=.75)+
labs(title = paste0(year,"Posterior Distribution of Mean Inter-Arrivals"),
subtitle = paste0("Mean Rate: ", mean_sim," | 89% HDI: ",
round(ci$CI_low,2), " To: ", round(ci$CI_high,2))) +
xlab("Mean Waiting Time - Days") + ylab("Strength")
return(res)
}
GetGammaPoisChart <- function(prior,data){
p_scale <- prior$alpha + data$num_vulns
p_rate <- prior$lambda + data$num_vulns * data$avg_days
rng <- GetRanges(prior, data, p_scale, p_rate)
sim_g <- rgamma(1000,p_scale,p_rate)
ci <- ci(sim_g,method = "HDI")
ggraph <- ggplot(data.frame(x = c(rng$min_r, rng$max_r)), aes(x = x)) +
stat_function(fun = dgamma, args = list(prior$alpha, prior$lambda),
aes(colour = "Prior"), size = 1.5, geom = "area", fill="goldenrod", alpha=0.2) +
stat_function(fun = dgamma, args = list(data$num_vulns, data$wait_time),
aes(colour = "Likelihood"), size = 1.5, geom = "area", fill="red", alpha=0.2) +
stat_function(fun = dgamma, args = list(p_scale, p_rate),
aes(colour = "Posterior"), size = 1.5, geom = "area", fill="blue", alpha=0.2) +
scale_x_continuous(name = "Vuln Rate")+
scale_y_continuous(name = "Strength") +
ggtitle("Normal function curves of probabilities") +
scale_colour_manual("Bayes Curves", values = c("red", "blue", "goldenrod")) +
geom_vline(aes(xintercept=ci$CI_low),color="black", linetype="dashed", size=.5)+
geom_vline(aes(xintercept=ci$CI_high),color="black", linetype="dashed", size=.5)+
theme_bw()
return(ggraph)
}
GetFirstSeenHits <- function(vals,min = 1, max = 365){
# Holder for Result
res <- c()
# Loop through day range
for(x in min:max){
# If x is a day with a hit in vals...process
if(x %in% vals){
# Get count of x's in vals update return vector
res <- c(res, sum(vals == x))
}else{
# x is not in vals, return a 0 to vector
res <- c(res,0)
}
}
return(res)
}
GetGammaPriors <- function(prior_int){
boot_mean = function(original_vector, resample_vector) {
mean(original_vector[resample_vector])
}
mean_results <- boot(prior_int, boot_mean, R = 2000)
qbs = quantile(mean_results$t, prob=c(.05, .95))
obj = function(x){
sum(abs(pgamma(q=qbs, shape=x[1], rate=x[2]) - c(.05, .95)))
}
gvals = optim(par = c(1, 1), obj)$par
return(gvals)
}
GetNVDData <- function(year, new = FALSE, save = FALSE, from_file = FALSE){
print(paste0("Year:", year))
if(new == FALSE){
nvd_df <- readRDS(paste0("nvd_",year,".RDS"))
return(nvd_df)
}
if( from_file == FALSE){
temp <- tempfile()
download.file(paste0("https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-",year,".json.zip"),temp)
nvd <- fromJSON((unz(temp, paste0("nvdcve-1.1-",year,".json"))))
unlink(temp)
}else{
nvd <- fromJSON((unz(paste0("nvdcve-1.1-",year,".json.zip"),paste0("nvdcve-1.1-",year,".json"))))
}
cves <- nvd$CVE_Items$cve$CVE_data_meta$ID
pub_date <- nvd$CVE_Items$publishedDate
mod_date <- nvd$CVE_Items$lastModifiedDate
b_score <- nvd$CVE_Items$impact$baseMetricV2$cvssV2$baseScore
sev <- nvd$CVE_Items$impact$baseMetricV2$severity
# int_act <- nvd$CVE_Items$impact$baseMetricV2$userInteractionRequired
# all_priv <- nvd$CVE_Items$impact$baseMetricV2$obtainAllPrivilege
# acc_vec <- nvd$CVE_Items$impact$baseMetricV2$cvssV2$accessVector
desc <- nvd$CVE_Items$cve$description$description_data
desc_vals = NULL
for(x in 1:length(desc)){
if( x == 1){
desc_vals <- desc[[x]]$value
}else{
desc_vals <- c(desc_vals, desc[[x]]$value)
}
}
#nvd_df <- tibble(cves,pub_date,mod_date,b_score,sev, desc_vals, int_act, all_priv, acc_vec)
nvd_df <- tibble(cves,pub_date,mod_date,b_score,sev, desc_vals)
if( save == TRUE){
saveRDS(nvd_df, file = paste0("nvd_",year,".RDS"))
}
return(nvd_df)
}
GetLargeVulnGroups <- function(new = FALSE, group_count = 50, ext = c(.005,.03), sev = c(.031, .07), hgh = c(.071, .15)){
# If new then build simulation
if( new == TRUE){
# Can take ~30 seconds to build
dt <- GetVulnGroups(group_count, ext, sev, hgh)
# Save for future use
saveRDS(dt, file = "audit_df.RDS")
}else{
# Load the previously saved file
dt <- readRDS("audit_df.RDS")
}
return(dt)
}
GetEscapeGroups <- function(dt, group_count){
#dt <- year_one_group
dt$escaped <- GetEscapeVal(dt)
# Set a quarter of the groups to start in December
three_quarters <- group_count - round(group_count/4)
dt_one_month <- dt %>% filter(group %in% three_quarters:group_count & (week.fseen >= 48 & week.fseen <= 52))
# Set another quarter of the groups to start in October
half_count <- round(group_count/2)
dt_one_quarter <- dt %>% filter(group %in% half_count:(three_quarters-1) & (week.fseen >= 39 & week.fseen <= 52))
# Have the remaining half start at the beginning of the year
dt_full <- dt %>% filter(group < half_count & week.fseen <= 52)
# Integrate all of the data frames back into one
dt <- bind_rows(dt_one_month, dt_one_quarter, dt_full)
# Remove unwanted data frames.
dt_one_month <- dt_one_quarter <- dt_full <- NULL
# Create group aggregates
group_vals <- dt %>%
group_by(group) %>%
summarize(total_weeks = (max(week.fseen) - min(week.fseen)) + 1,
prod_vuln = sum(escaped==1),
dev_vuln = sum(escaped == 0),
total_vulns = prod_vuln + dev_vuln,.groups = 'drop')
return(group_vals)
}
GetEscapeVal <- function(dt){
rand_vals <- runif(nrow(dt))
#maturity_levels <- mapply(MaturityFlip,dt$dev_maturity)
prob_escape <- dt$beta_prob/dt$dev_maturity
dt$escaped <- if_else(rand_vals < prob_escape,1,0)
return(dt$escaped)
}
AddEscapeDF <- function(df1, df2){
count <- max(nrow(df1),nrow(df2))
res <- tibble()
for(x in 1:count){
if(is.na(df1[x,1]) & is.na(df2[x,1])){
#do nothing
}else if(is.na(df1[x,1]) & !is.na(df2[x,1])){
tib <- df2[x,]
}else if(is.na(df2[x,1]) & !is.na(df1[x,1])){
tib <- df1[x,]
}else{
tib <- df1[x,] + df2[x,]
}
res <- bind_rows(res, tib)
}
return(res)
}
GetBurnMLE <- function(hits,misses){
ll <- function(alpha, beta) {
x <- hits
total <- hits + misses
#print(total)
-sum(VGAM::dbetabinom.ab(x, total, alpha, beta, log = TRUE))
}
m <- mle(ll, start = list(alpha = 1, beta = 10), method = "L-BFGS-B",lower = c(0.0001, .1))
return(coef(m))
}
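#' Illustrative example (not run): maximum-likelihood beta-binomial fit to
#' made-up per-group hit/miss counts. Note this relies on the VGAM package,
#' which is not in the install list above.
# GetBurnMLE(hits = c(3, 5, 2, 8), misses = c(20, 18, 25, 15))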
MakeProportionChart <- function(df_val, xlab="Escape Rates w/95% Credible Intervals", ylab="Product Development Groups",
tval="Escape Rates By Product Group - With KPIs",
sval="Predictive Rates (black) vs Actuals (red)",
kpis = c(.05,.1,.2)){
df_val %>%
ggplot(aes(emp_bayes_avg, rank(emp_bayes_avg))) +
geom_point() +
geom_point(aes(x = escape_avg), color = "red") +
geom_errorbarh(aes(xmin = low_ci, xmax = high_ci)) +
geom_vline(xintercept=kpis[1], color="maroon", size=1, linetype="dotted") +
geom_vline(xintercept=kpis[2], color="orangered2", size=1, linetype="dotted") +
geom_vline(xintercept=kpis[3], color="goldenrod", size=1, linetype="dotted") +
labs(x = xlab,
y = ylab,
title = tval,
subtitle = sval) +
theme_bw()
#return(res)
}
GetCritDF <- function(nvd_val, sev_val, match_string = "default"){
if( match_string == "deafult"){
match_string <- "Windows|Microsoft|Office|Mac|Apple|Browser|cisco|java|Oracle|adobe"
}
crit_df <- nvd_val %>% filter(b_score >= sev_val & grepl(match_string, desc_vals,ignore.case = TRUE))
crit_df$pub_day <- yday(crit_df$pub_date)
crit_df$mod_day <- yday(crit_df$mod_date)
return(crit_df)
}
GetRanges <- function(prior, data, p_scale, p_rate){
p_range <- qgamma(c(.001,.999),prior$alpha,prior$lambda)
d_range <- qgamma(c(.001,.999),data$num_vulns,data$wait_time)
r_range <- qgamma(c(.001,.999),p_scale,p_rate)
ranges <- c(p_range, d_range, r_range)
min_r <- min(ranges) - .01
max_r <- max(ranges) + .01
return(tibble(min_r, max_r))
}
sla_check <- function(fix_time,severity,exposure,audit){
# Extreme
if(severity == "extreme" & exposure == "external" & audit == 0){
if( fix_time <= 7){
return(1)
}else{
return(0)
}
}
if((severity == "extreme" & exposure == "external") | (severity == "critical" & exposure == "external" & audit == 0)){
if( fix_time <= 14){
return(1)
}else{
return(0)
}
}
if((severity == "critical" & exposure == "external") | (severity == "high" & exposure == "external" & audit == 0)){
if(fix_time <= 21){
return(1)
}else{
return(0)
}
}
if((severity == "high" & exposure == "external") | (severity == "extreme" & exposure == "partner" & audit == 0)){
if(fix_time <= 28){
return(1)
}else{
return(0)
}
}
if((severity == "extreme" & exposure == "partner") | (severity == "critical" & exposure == "partner" & audit == 0) ){
if(fix_time <= 35){
return(1)
}else{
return(0)
}
}
if((severity == "critical" & exposure == "partner") | (severity == "high" & exposure == "partner" & audit == 0) ){
if(fix_time <= 42){
return(1)
}else{
return(0)
}
}
if((severity == "high" & exposure == "partner") | (severity == "extreme" & exposure == "internal" & audit == 0) ){
if(fix_time <= 50){
return(1)
}else{
return(0)
}
}
if((severity == "extreme" & exposure == "internal") | (severity == "critical" & exposure == "internal" & audit == 0) ){
if(fix_time <= 60){
return(1)
}else{
return(0)
}
}
if((severity == "critical" & exposure == "internal") | (severity == "high" & exposure == "internal" & audit == 0) ){
if(fix_time <= 75){
return(1)
}else{
return(0)
}
}
if(severity == "high" & exposure == "internal"){
if(fix_time <= 90){
return(1)
}else{
return(0)
}
}
return(0)
}
getLogOddsInt <- function(prob, scale){
# Turn probability into odds
odds <- prob/(1-prob)
# Turn odds into log odds
logOdds <- log(odds)
# Get a standard deviation value
sdev <- abs(round(logOdds/scale,2))
# Return
return(c(logOdds,sdev))
}
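#' Illustrative example (not run): a 25% probability expressed as log odds,
#' with a standard deviation of one third of that magnitude.
# getLogOddsInt(prob = 0.25, scale = 3)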
policy_pred <- function(mcmc, sev, exp, dev_val, audit_val){
# simulate from posterior
res <- posterior_epred( mcmc, newdata = data.frame(
severity = sev,
exposure = exp,
dev_maturity = dev_val,
audit = audit_val))
# Highest Density Interval
ci_val <- ci(res, method = "HDI")
ret_tib <- tibble(mean_est = mean(res),
ci_low = ci_val$CI_low,
ci_high = ci_val$CI_high)
return(ret_tib)
}
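#' Illustrative example (not run): posterior mean and HDI for a prediction at
#' one policy setting, assuming `fit` is a hypothetical rstanarm model
#' (e.g. stan_glm) with severity, exposure, dev_maturity and audit predictors.
# policy_pred(fit, sev = "extreme", exp = "external", dev_val = 3, audit_val = 1)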