ribsy/htma_cyberfunctions

## htma_cyberfunctions
library(tidyverse)
library(bayestestR)
library(bayesAB)

# See Manifesto Functions at www.themetricsmanifesto.com
# Put in your same working director
source("manifesto_functions.R")


bayesPhishArrive <- function(prior_vals = c(5,1), total_vals = c(87,12)){

  # Subjective Priors, equals 1.5 per week i.e. 3/2
  prior_events <- prior_vals[1]
  prior_weeks <-prior_vals[2]
  prior_rate <- prior_events / prior_weeks

  # Data, which equals 2.5 per week 25/10
  total_events <- total_vals[1]
  total_weeks <- total_vals[2]
  total_rate <- total_events / total_weeks

  range_events <- if_else(total_rate <= prior_rate, prior_rate, total_rate)
  range_events <- range_events * 5


  # Make grid of 10K sequential values between 0 to 5
  plausible_rates <- data.frame(possible_grid =
                      seq(from = 0, to = range_events, length = 10000))

  # Get Prior and Likelihood
  plausible_rates <- plausible_rates %>%
    mutate(prior = dgamma(possible_grid, prior_events, prior_weeks),
           likelihood = dpois(total_events, possible_grid * total_weeks))

  # Get Posterior
  plausible_rates <- plausible_rates %>%
    mutate(unnormalized = likelihood * prior,
           posterior = unnormalized / sum(unnormalized))

  # Simulate 10K samples weighted on posterior
  post_sample <- sample_n(plausible_rates, size = 10000,
                weight = posterior, replace = TRUE)

  return(post_sample)
}

get_phish_rate <- function(post_sample){
  phish_res <- post_sample %>% filter(posterior == max(posterior)) %>%
                select(possible_grid) %>% unique()

  phish_rate <- phish_res[[1]]
  return(phish_rate)
}

bayes_waits_analysis <- function(prior,waits){
  # Number Of Simulations
  Nrep = 10000

  n <- length(waits)
  wait_time <- sum(waits)
  # Get avg vulns per wait time
  vulns_per_wait_time <- n/wait_time

  gvals <- GetGammaPriors(prior)
  scale_val <- prior_scale
  alpha <- gvals[1] * scale_val
  lambda <- gvals[2] * scale_val

  # prior vuln count + num vulns from data
  shape = alpha + n

  # prior time + wait time from data
  rate = lambda + wait_time

  # Simulate 10K vuln day rates: .21, .17, .15, .22 etc
  theta_sim = rgamma(Nrep, shape, rate)

  # Get interarrival values from the exponential distribution
  y_sim = rexp(Nrep, rate = theta_sim)

  # Get 95% and 50% quantiles - used for making vertical lines
  val95 <- round(quantile(y_sim, 0.95),2)
  val50 <- round(quantile(y_sim, 0.50),2)

  # Pass interarrival data into DF
  df <- tibble(sim=y_sim)

  # Pass DF into ggplot
  ggplot(df, aes(x=sim)) +

    # Declare histogram shape
    geom_histogram(aes(y=..density..), colour="black",
                   fill="white",bins=100)+

    # Curve for chart
    geom_density(alpha=.2, fill="white")+

    # 95% or less vertical line
    geom_vline(aes(xintercept=mean(val95)),
               color="black", linetype="dashed", size=.5)+

    # 50% or less vertical line
    geom_vline(aes(xintercept=mean(val50)),
               color="black", linetype="dashed", size=.5)+

    # Labels for chart
    labs(title = "Posterior Predictive Distribution",
         subtitle = paste0("95% Chance of Event In: ",
                           val95," Days Or Less | 50% In: ",
                           val50, " Or Less")) +
    xlab("Days") + ylab("Strength") + xlim(c(0, val95*2))

}

bayes_waits_basic <- function(prior, waits, prior_scale = 1,x_update = 2){

  p_avg <- sum(prior)/length(prior)
  w_avg <- sum(waits)/length(waits)

  xrange <- ifelse(w_avg > p_avg, w_avg, p_avg) * x_update

  gvals <- GetGammaPriors(prior)
  scale_val <- prior_scale
  alpha <- gvals[1] * scale_val
  lambda <- gvals[2] * scale_val

  # Create a list of the possible rates (for graphing only)
  theta = seq(0, xrange, 0.001)

  # Score possibilities using priors (for graphing only)
  prior = dgamma(theta, shape = alpha, rate = lambda)

  # likelihood distribution - for graphing purposes only
  likelihood = dgamma(sum(waits), shape = length(waits), rate = theta)

  # Posterior - Conjugate prior update.
  posterior = dgamma(theta, alpha + length(waits), lambda + sum(waits))

  # Bayesian update graph
  plot_posterior(theta, prior, likelihood, posterior)
  abline(v = qgamma(c(0.025, 0.975), alpha + length(waits), lambda + sum(waits)),
         col = "seagreen", lty = 2)
}

non_bayes_waits <- function(waits,range = 30, to_val = ""){

  # Determine if event rate measurement should be in days or hours.
  ratio_val <- sum(waits)/length(waits)
  time_vals <- pexp(1:range, ratio_val,lower.tail = TRUE)
  index_val <- max(which(time_vals < .99))
  rate_val <- 1000/index_val
  if(to_val == ""){
     to_val <- round(index_val/2)
  }
  time_type <- "time"

  tibble(x = 0:1000 / rate_val,
       prob = pexp(q = 0:1000 / rate_val,
                   rate = sum(waits)/length(waits),
                   lower.tail = TRUE)) %>%
  # Add a column to the data frame
  mutate(Interval = ifelse(x>= 0 & x <= to_val, paste0("0 to ", to_val), "other")) %>%


  # Send the data frame to the graphing functions
  ggplot(aes(x = x, y = prob, fill = Interval)) +
  geom_area(alpha = 0.3) +
  labs(title = "Probability Of Event Arriving",
       subtitle = paste0(round(time_vals[to_val],2)*100,"% chance of event arriving in ",to_val," day or less."),
       x = str_to_title(time_type),
       y = "Cumulative Probability")
}
	library(tidyverse)
	library(bayestestR)
	library(bayesAB)

	# See Manifesto Functions at www.themetricsmanifesto.com
	# Put in your same working director
	source("manifesto_functions.R")


	bayesPhishArrive <- function(prior_vals = c(5,1), total_vals = c(87,12)){

	# Subjective Priors, equals 1.5 per week i.e. 3/2
	prior_events <- prior_vals[1]
	prior_weeks <-prior_vals[2]
	prior_rate <- prior_events / prior_weeks

	# Data, which equals 2.5 per week 25/10
	total_events <- total_vals[1]
	total_weeks <- total_vals[2]
	total_rate <- total_events / total_weeks

	range_events <- if_else(total_rate <= prior_rate, prior_rate, total_rate)
	range_events <- range_events * 5


	# Make grid of 10K sequential values between 0 to 5
	plausible_rates <- data.frame(possible_grid =
	seq(from = 0, to = range_events, length = 10000))

	# Get Prior and Likelihood
	plausible_rates <- plausible_rates %>%
	mutate(prior = dgamma(possible_grid, prior_events, prior_weeks),
	likelihood = dpois(total_events, possible_grid * total_weeks))

	# Get Posterior
	plausible_rates <- plausible_rates %>%
	mutate(unnormalized = likelihood * prior,
	posterior = unnormalized / sum(unnormalized))

	# Simulate 10K samples weighted on posterior
	post_sample <- sample_n(plausible_rates, size = 10000,
	weight = posterior, replace = TRUE)

	return(post_sample)
	}

	get_phish_rate <- function(post_sample){
	phish_res <- post_sample %>% filter(posterior == max(posterior)) %>%
	select(possible_grid) %>% unique()

	phish_rate <- phish_res[[1]]
	return(phish_rate)
	}

	bayes_waits_analysis <- function(prior,waits){
	# Number Of Simulations
	Nrep = 10000

	n <- length(waits)
	wait_time <- sum(waits)
	# Get avg vulns per wait time
	vulns_per_wait_time <- n/wait_time

	gvals <- GetGammaPriors(prior)
	scale_val <- prior_scale
	alpha <- gvals[1] * scale_val
	lambda <- gvals[2] * scale_val

	# prior vuln count + num vulns from data
	shape = alpha + n

	# prior time + wait time from data
	rate = lambda + wait_time

	# Simulate 10K vuln day rates: .21, .17, .15, .22 etc
	theta_sim = rgamma(Nrep, shape, rate)

	# Get interarrival values from the exponential distribution
	y_sim = rexp(Nrep, rate = theta_sim)

	# Get 95% and 50% quantiles - used for making vertical lines
	val95 <- round(quantile(y_sim, 0.95),2)
	val50 <- round(quantile(y_sim, 0.50),2)

	# Pass interarrival data into DF
	df <- tibble(sim=y_sim)

	# Pass DF into ggplot
	ggplot(df, aes(x=sim)) +

	# Declare histogram shape
	geom_histogram(aes(y=..density..), colour="black",
	fill="white",bins=100)+

	# Curve for chart
	geom_density(alpha=.2, fill="white")+

	# 95% or less vertical line
	geom_vline(aes(xintercept=mean(val95)),
	color="black", linetype="dashed", size=.5)+

	# 50% or less vertical line
	geom_vline(aes(xintercept=mean(val50)),
	color="black", linetype="dashed", size=.5)+

	# Labels for chart
	labs(title = "Posterior Predictive Distribution",
	subtitle = paste0("95% Chance of Event In: ",
	val95," Days Or Less \| 50% In: ",
	val50, " Or Less")) +
	xlab("Days") + ylab("Strength") + xlim(c(0, val95*2))

	}

	bayes_waits_basic <- function(prior, waits, prior_scale = 1,x_update = 2){

	p_avg <- sum(prior)/length(prior)
	w_avg <- sum(waits)/length(waits)

	xrange <- ifelse(w_avg > p_avg, w_avg, p_avg) * x_update

	gvals <- GetGammaPriors(prior)
	scale_val <- prior_scale
	alpha <- gvals[1] * scale_val
	lambda <- gvals[2] * scale_val

	# Create a list of the possible rates (for graphing only)
	theta = seq(0, xrange, 0.001)

	# Score possibilities using priors (for graphing only)
	prior = dgamma(theta, shape = alpha, rate = lambda)

	# likelihood distribution - for graphing purposes only
	likelihood = dgamma(sum(waits), shape = length(waits), rate = theta)

	# Posterior - Conjugate prior update.
	posterior = dgamma(theta, alpha + length(waits), lambda + sum(waits))

	# Bayesian update graph
	plot_posterior(theta, prior, likelihood, posterior)
	abline(v = qgamma(c(0.025, 0.975), alpha + length(waits), lambda + sum(waits)),
	col = "seagreen", lty = 2)
	}

	non_bayes_waits <- function(waits,range = 30, to_val = ""){

	# Determine if event rate measurement should be in days or hours.
	ratio_val <- sum(waits)/length(waits)
	time_vals <- pexp(1:range, ratio_val,lower.tail = TRUE)
	index_val <- max(which(time_vals < .99))
	rate_val <- 1000/index_val
	if(to_val == ""){
	to_val <- round(index_val/2)
	}
	time_type <- "time"

	tibble(x = 0:1000 / rate_val,
	prob = pexp(q = 0:1000 / rate_val,
	rate = sum(waits)/length(waits),
	lower.tail = TRUE)) %>%
	# Add a column to the data frame
	mutate(Interval = ifelse(x>= 0 & x <= to_val, paste0("0 to ", to_val), "other")) %>%


	# Send the data frame to the graphing functions
	ggplot(aes(x = x, y = prob, fill = Interval)) +
	geom_area(alpha = 0.3) +
	labs(title = "Probability Of Event Arriving",
	subtitle = paste0(round(time_vals[to_val],2)*100,"% chance of event arriving in ",to_val," day or less."),
	x = str_to_title(time_type),
	y = "Cumulative Probability")
	}