ribsy/chapter_six_scripts.R

## chapter_six_scripts.R
source("manifesto_functions.R")

###########################
### Non-Bayesian Wait Times

# Number of product teams
group_count = 10

# Empty Data Frame to hold results
dt <- tibble()

# Loop through count of groups
for(x in 1:group_count){

  # Get Ext,Sev,High random rates. See “from” and “to” ranges.
  extreme <- sample(seq(from = .005, to = .03, by= .001), size = 1)
  severe <- sample(seq(from = .031, to = .07, by= .001), size = 1)
  high <- sample(seq(from = .071, to = .15, by= .001), size = 1)

  # Set Random Maturity Values - get one value between 1-4 randomly
  patch_fail <- sample(1:4,size=1)
  dev_maturity <- sample(1:4,size=1)

  # Get Data Frame
  res <- GetAdvSecResults(c(extreme, severe, high),
                          patch_fail, dev_maturity)

  # Set Group Number
  res$group <- x

  # Append to master data frame – makes one large data frame
  dt <- bind_rows(dt,res)
}

# Get Critical Rows And All Columns using subsetting
dte <- dt[(dt$audit == 0 & dt$exposure_id == 1 & dt$severity_id == 1 &
             dt$week.fseen <= 52), ]

# Get interarrivals. I.e. the difference between weeks numbers
interarrivals <- diff(sort(dte$week.fseen))

# Print interarrivals
interarrivals

#make a histogram
hist(interarrivals, breaks=max(interarrivals))

####################
### Wait-Time By Day

# Same data frame for weeks
dte <- dt[(dt$audit == 0 & dt$exposure_id == 1 &
             dt$severity_id == 1 & dt$week.fseen <= 52),]

# Get interarrivals by asking for the day number
interarrivals <- diff(sort(yday(dte$first.seen)))


# Build a histogram of the arrivals.
hist(interarrivals,breaks=max(interarrivals))

#############################
### The Basic Wait-Time Model
# Sum of individual interarrivals.
wait_time <- sum(interarrivals)

# Get count of vuln events. I.e ~60 vulns
n = nrow(dte)

# The ratio of vulnerabilities per day. I.e. 60/356 ~ .17
vulns_per_wait_time <- n/wait_time

# Average Time Between Event Days i.e. 356/60 ~6
avg_days = wait_time/n   #Avg wait time in days between extreme vulns

# Exponential Dist to Simulate data. It spits out 60 simulated results
# And each value will wiggle around 1/60 ~ .0167
simulated <- rexp(n, rate=n)

# Multiply simulated rates by 365 (in my case) to get 60 wait-times
sim_interarrivals <- simulated * wait_time

# Create a histogram (one of many since this is a random process)
hist(sim_interarrivals, breaks = max(interarrivals))

# Avg next arrival
avg <- mean(simulated) * wait_time
avg

# Chance of a vulnerability showing up in 0 to 7 days.
pexp(q = 7, rate = vulns_per_wait_time)

# Simulated for 7 days
mean(rexp(n = 10000, rate = vulns_per_wait_time) <= 7)

# Simulate 10 boolean results, note how all are TRUE but one..its random
res <- rexp(10,.17) <= 7
res

# Get ratio of true to false results
mean(res)

# Create Dataframe
tibble(x = 0:1000 / 100,
       prob = pexp(q = 0:1000 / 100,
                   rate = vulns_per_wait_time,
                   lower.tail = TRUE)) %>%

# Add a column to the data frame
mutate(Interval = ifelse(x >= 0 & x <= 7, "0 to 7", "other")) %>%

# Send the data frame to the graphing functions
ggplot(aes(x = x, y = prob, fill = Interval)) +
  geom_area(alpha = 0.3) +
  labs(title = "Probability Of Vulnerabilities Arriving",
       subtitle = "From 0 to 7 Days",
       x = "Days",
       y = "Cumulative Probability")

pexp(.01,vulns_per_wait_time)

#######################
### Bayesian Wait-Times

### See GetVulnsGroups() in manifesto_functions.R
# Get 10 product teams
dt <- GetVulnGroups()

# Retrieve first half years worth of data
total_weeks <- 26
dte <- dt[(dt$audit == 0 & dt$exposure_id == 1 & dt$severity_id == 1 &
             dt$week.fseen <= total_weeks),]

# Get a count of vulns – key variable
n = nrow(dte)

# Get Interarrivals
interarrivals <- diff(sort(yday(dte$first.seen)))

# Get Total Wait Time – key variable
wait_time <- sum(interarrivals)

# Get avg time per vuln
avg_wait_time = wait_time/n

# Get avg vulns per wait time
vulns_per_wait_time <- n/wait_time

#########################
### Empirical Bayes Prior
# Get new data as if it came from the previous year
dtPrev <- GetVulnGroups()

# Pull only the last quarter’s data. Week 40-52
dtePrev <- dtPrev[(dtPrev$audit == 0 & dtPrev$exposure_id == 1 &
                   dtPrev$severity_id == 1 &
                  (dtPrev$week.fseen > 39 & dtPrev$week.fseen <= 52)),]

# Get a list of all the day numbers with events..see example data
day_vals <- sort(yday(dtePrev$first.seen))

# Reformat the data for prior analysis
prior_int <- GetFirstSeenHits(day_vals, min = 275, max = 365)

# Get optimal gamma priors - MLE
gvals <- GetGammaPriors(prior_int)
alpha = gvals[1]
lambda = gvals[2]

# Create a list of the possible rates (for graphing only)
theta = seq(0, .5, 0.001)

# Score possibilities using priors (for graphing only)
prior = dgamma(theta, shape = alpha, rate = lambda)

#####################
### Bayesian Updating

# likelihood distribution - for graphing purposes only
likelihood = dgamma(wait_time, shape = n, rate = theta)


# Posterior - Conjugate prior update.
posterior = dgamma(theta, alpha + n, lambda + wait_time)

# Bayesian update graph
plot_posterior(theta, prior, likelihood, posterior)
abline(v = qgamma(c(0.025, 0.975), alpha + n, lambda + wait_time),
       col = "seagreen", lty = 2)

qgamma(c(0.025, 0.975), alpha + n, lambda + wait_time)

################################################
### Making Data and Forecasts from the Posterior
# Number Of Simulations
Nrep = 10000

# prior vuln count + num vulns from data
shape = alpha + n


# prior time + wait time from data
rate = lambda + wait_time

# Simulate 10K vuln day rates: .21, .17, .15, .22 etc
theta_sim = rgamma(Nrep, shape, rate)

# 1/theta_sim is the mean time between events 1/.21 = 4.76 days etc
scaled_sim <- 1/theta_sim


# Get 89% Highest Density Intervals for graph
ci <- ci(scaled_sim, method="HDI")

# Get Mean for graph
mean_sim <- round(mean(scaled_sim),2)

# Put scaled_sim into DF for graph
df <- tibble(sim=scaled_sim)

# Plot Mean interArrivals
ggplot(df, aes(x=sim)) +

  # Declare a histogram chart
  geom_histogram(aes(y=..density..), colour="black",
                 fill="white",bins=100)+

  # Add a density curve
  geom_density(alpha=.2, fill="white")+

  # Add a vertical line for the lower CI value
  geom_vline(aes(xintercept=ci$CI_low),
             color="black", linetype="dashed", size=.5)+

  # Add a vertical line for the higher CI value
  geom_vline(aes(xintercept=ci$CI_high),
             color="black", linetype="dashed", size=.5)+

  # Add a mean line
  geom_vline(aes(xintercept=mean_sim),
             color="black", linetype="dashed", size=.5)+

  # Decorate the chart with labels
  labs(title = "Posterior Distribution of Mean interArrivals",
       subtitle = paste0("Mean Rate: ", mean_sim," | 89% HDI: ",
                         round(ci$CI_low,2), " To: ", round(ci$CI_high,2))) +
  xlab("Mean interArrival Rates") +
  ylab("Strength")

### Posterior Predictive Distribution
# Get interarrival values from the exponential distribution
y_sim = rexp(Nrep, rate = theta_sim)

# Get 95% and 50% quantiles - used for making vertical lines
val95 <- round(quantile(y_sim, 0.95),2)
val50 <- round(quantile(y_sim, 0.50),2)

# Pass interarrival data into DF
df <- tibble(sim=y_sim)

# Pass DF into ggplot
ggplot(df, aes(x=sim)) +

  # Declare histogram shape
  geom_histogram(aes(y=..density..), colour="black",
                 fill="white",bins=100)+

  # Curve for chart
  geom_density(alpha=.2, fill="white")+

  # 95% or less vertical line
  geom_vline(aes(xintercept=mean(val95)),
             color="black", linetype="dashed", size=.5)+

  # 50% or less vertical line
  geom_vline(aes(xintercept=mean(val50)),
             color="black", linetype="dashed", size=.5)+

  # Labels for chart
  labs(title = "Posterior Predictive Distribution",
       subtitle = paste0("95% Chance of Vulnerability In: ",
                         val95," Days Or Less | 50% In: ",
                         val50, " Or Less")) +
  xlab("Days") + ylab("Strength") + xlim(c(0, 50))


########################
### Mitigatable Surprise
GetNVDAnalysis(c(2018:2020), new = TRUE, save = TRUE, from_file = TRUE)

GetNVDAnalysis(c(2018:2020))

####
# Note that the supporting functions are all detailed in
# manifesto_functions.R
	source("manifesto_functions.R")

	###########################
	### Non-Bayesian Wait Times

	# Number of product teams
	group_count = 10

	# Empty Data Frame to hold results
	dt <- tibble()

	# Loop through count of groups
	for(x in 1:group_count){

	# Get Ext,Sev,High random rates. See “from” and “to” ranges.
	extreme <- sample(seq(from = .005, to = .03, by= .001), size = 1)
	severe <- sample(seq(from = .031, to = .07, by= .001), size = 1)
	high <- sample(seq(from = .071, to = .15, by= .001), size = 1)

	# Set Random Maturity Values - get one value between 1-4 randomly
	patch_fail <- sample(1:4,size=1)
	dev_maturity <- sample(1:4,size=1)

	# Get Data Frame
	res <- GetAdvSecResults(c(extreme, severe, high),
	patch_fail, dev_maturity)

	# Set Group Number
	res$group <- x

	# Append to master data frame – makes one large data frame
	dt <- bind_rows(dt,res)
	}

	# Get Critical Rows And All Columns using subsetting
	dte <- dt[(dt$audit == 0 & dt$exposure_id == 1 & dt$severity_id == 1 &
	dt$week.fseen <= 52), ]

	# Get interarrivals. I.e. the difference between weeks numbers
	interarrivals <- diff(sort(dte$week.fseen))

	# Print interarrivals
	interarrivals

	#make a histogram
	hist(interarrivals, breaks=max(interarrivals))

	####################
	### Wait-Time By Day

	# Same data frame for weeks
	dte <- dt[(dt$audit == 0 & dt$exposure_id == 1 &
	dt$severity_id == 1 & dt$week.fseen <= 52),]

	# Get interarrivals by asking for the day number
	interarrivals <- diff(sort(yday(dte$first.seen)))


	# Build a histogram of the arrivals.
	hist(interarrivals,breaks=max(interarrivals))

	#############################
	### The Basic Wait-Time Model
	# Sum of individual interarrivals.
	wait_time <- sum(interarrivals)

	# Get count of vuln events. I.e ~60 vulns
	n = nrow(dte)

	# The ratio of vulnerabilities per day. I.e. 60/356 ~ .17
	vulns_per_wait_time <- n/wait_time

	# Average Time Between Event Days i.e. 356/60 ~6
	avg_days = wait_time/n #Avg wait time in days between extreme vulns

	# Exponential Dist to Simulate data. It spits out 60 simulated results
	# And each value will wiggle around 1/60 ~ .0167
	simulated <- rexp(n, rate=n)

	# Multiply simulated rates by 365 (in my case) to get 60 wait-times
	sim_interarrivals <- simulated * wait_time

	# Create a histogram (one of many since this is a random process)
	hist(sim_interarrivals, breaks = max(interarrivals))

	# Avg next arrival
	avg <- mean(simulated) * wait_time
	avg

	# Chance of a vulnerability showing up in 0 to 7 days.
	pexp(q = 7, rate = vulns_per_wait_time)

	# Simulated for 7 days
	mean(rexp(n = 10000, rate = vulns_per_wait_time) <= 7)

	# Simulate 10 boolean results, note how all are TRUE but one..its random
	res <- rexp(10,.17) <= 7
	res

	# Get ratio of true to false results
	mean(res)

	# Create Dataframe
	tibble(x = 0:1000 / 100,
	prob = pexp(q = 0:1000 / 100,
	rate = vulns_per_wait_time,
	lower.tail = TRUE)) %>%

	# Add a column to the data frame
	mutate(Interval = ifelse(x >= 0 & x <= 7, "0 to 7", "other")) %>%

	# Send the data frame to the graphing functions
	ggplot(aes(x = x, y = prob, fill = Interval)) +
	geom_area(alpha = 0.3) +
	labs(title = "Probability Of Vulnerabilities Arriving",
	subtitle = "From 0 to 7 Days",
	x = "Days",
	y = "Cumulative Probability")

	pexp(.01,vulns_per_wait_time)

	#######################
	### Bayesian Wait-Times

	### See GetVulnsGroups() in manifesto_functions.R
	# Get 10 product teams
	dt <- GetVulnGroups()

	# Retrieve first half years worth of data
	total_weeks <- 26
	dte <- dt[(dt$audit == 0 & dt$exposure_id == 1 & dt$severity_id == 1 &
	dt$week.fseen <= total_weeks),]

	# Get a count of vulns – key variable
	n = nrow(dte)

	# Get Interarrivals
	interarrivals <- diff(sort(yday(dte$first.seen)))

	# Get Total Wait Time – key variable
	wait_time <- sum(interarrivals)

	# Get avg time per vuln
	avg_wait_time = wait_time/n

	# Get avg vulns per wait time
	vulns_per_wait_time <- n/wait_time

	#########################
	### Empirical Bayes Prior
	# Get new data as if it came from the previous year
	dtPrev <- GetVulnGroups()

	# Pull only the last quarter’s data. Week 40-52
	dtePrev <- dtPrev[(dtPrev$audit == 0 & dtPrev$exposure_id == 1 &
	dtPrev$severity_id == 1 &
	(dtPrev$week.fseen > 39 & dtPrev$week.fseen <= 52)),]

	# Get a list of all the day numbers with events..see example data
	day_vals <- sort(yday(dtePrev$first.seen))

	# Reformat the data for prior analysis
	prior_int <- GetFirstSeenHits(day_vals, min = 275, max = 365)

	# Get optimal gamma priors - MLE
	gvals <- GetGammaPriors(prior_int)
	alpha = gvals[1]
	lambda = gvals[2]

	# Create a list of the possible rates (for graphing only)
	theta = seq(0, .5, 0.001)

	# Score possibilities using priors (for graphing only)
	prior = dgamma(theta, shape = alpha, rate = lambda)

	#####################
	### Bayesian Updating

	# likelihood distribution - for graphing purposes only
	likelihood = dgamma(wait_time, shape = n, rate = theta)


	# Posterior - Conjugate prior update.
	posterior = dgamma(theta, alpha + n, lambda + wait_time)

	# Bayesian update graph
	plot_posterior(theta, prior, likelihood, posterior)
	abline(v = qgamma(c(0.025, 0.975), alpha + n, lambda + wait_time),
	col = "seagreen", lty = 2)

	qgamma(c(0.025, 0.975), alpha + n, lambda + wait_time)

	################################################
	### Making Data and Forecasts from the Posterior
	# Number Of Simulations
	Nrep = 10000

	# prior vuln count + num vulns from data
	shape = alpha + n


	# prior time + wait time from data
	rate = lambda + wait_time

	# Simulate 10K vuln day rates: .21, .17, .15, .22 etc
	theta_sim = rgamma(Nrep, shape, rate)

	# 1/theta_sim is the mean time between events 1/.21 = 4.76 days etc
	scaled_sim <- 1/theta_sim


	# Get 89% Highest Density Intervals for graph
	ci <- ci(scaled_sim, method="HDI")

	# Get Mean for graph
	mean_sim <- round(mean(scaled_sim),2)

	# Put scaled_sim into DF for graph
	df <- tibble(sim=scaled_sim)

	# Plot Mean interArrivals
	ggplot(df, aes(x=sim)) +

	# Declare a histogram chart
	geom_histogram(aes(y=..density..), colour="black",
	fill="white",bins=100)+

	# Add a density curve
	geom_density(alpha=.2, fill="white")+

	# Add a vertical line for the lower CI value
	geom_vline(aes(xintercept=ci$CI_low),
	color="black", linetype="dashed", size=.5)+

	# Add a vertical line for the higher CI value
	geom_vline(aes(xintercept=ci$CI_high),
	color="black", linetype="dashed", size=.5)+

	# Add a mean line
	geom_vline(aes(xintercept=mean_sim),
	color="black", linetype="dashed", size=.5)+

	# Decorate the chart with labels
	labs(title = "Posterior Distribution of Mean interArrivals",
	subtitle = paste0("Mean Rate: ", mean_sim," \| 89% HDI: ",
	round(ci$CI_low,2), " To: ", round(ci$CI_high,2))) +
	xlab("Mean interArrival Rates") +
	ylab("Strength")

	### Posterior Predictive Distribution
	# Get interarrival values from the exponential distribution
	y_sim = rexp(Nrep, rate = theta_sim)

	# Get 95% and 50% quantiles - used for making vertical lines
	val95 <- round(quantile(y_sim, 0.95),2)
	val50 <- round(quantile(y_sim, 0.50),2)

	# Pass interarrival data into DF
	df <- tibble(sim=y_sim)

	# Pass DF into ggplot
	ggplot(df, aes(x=sim)) +

	# Declare histogram shape
	geom_histogram(aes(y=..density..), colour="black",
	fill="white",bins=100)+

	# Curve for chart
	geom_density(alpha=.2, fill="white")+

	# 95% or less vertical line
	geom_vline(aes(xintercept=mean(val95)),
	color="black", linetype="dashed", size=.5)+

	# 50% or less vertical line
	geom_vline(aes(xintercept=mean(val50)),
	color="black", linetype="dashed", size=.5)+

	# Labels for chart
	labs(title = "Posterior Predictive Distribution",
	subtitle = paste0("95% Chance of Vulnerability In: ",
	val95," Days Or Less \| 50% In: ",
	val50, " Or Less")) +
	xlab("Days") + ylab("Strength") + xlim(c(0, 50))


	########################
	### Mitigatable Surprise
	GetNVDAnalysis(c(2018:2020), new = TRUE, save = TRUE, from_file = TRUE)

	GetNVDAnalysis(c(2018:2020))

	####
	# Note that the supporting functions are all detailed in
	# manifesto_functions.R