smartinsightsfromdata/openFairSim.R

## openFairSim.R
# for *pert functons
library('mc2d')

# given the number of loss events, calculate the sum of losses for the number of events
sample_LM <- function(N, L, ML, H, CONF){
	# Calculate the aggregate loss
	#
	# ARGS:
	#		N - number of threat events to evaluate
	#		L - low boundary
	#		ML - most likely
	#		H - high boundary
	#		CONF - confidence
	#
	# RETURNS:
	# 	sum of loss events

	result <- sum(rpert(N, L, ML, H, shape = CONF) )

	return(result)
}

select_events <- function (N, TSestimate, RSestimate) {
	# Calculate the threat strength and whether the asset resists the attack
	#
	# ARGS:
	#		N - number of threat events to evaluate
	#		TSestimate - threat strength estimate
	#		RSestimate - threat strength estimate
	#
	# RETURNS:
	# 	Number of succesfull attacks (i.e. loss events)


	# sample threat and resistive strength
	TSsamples <- rpert(N, TSestimate$L, TSestimate$ML, TSestimate$H, shape = TSestimate$CONF)
	RSsamples <- rpert(N, RSestimate$L, RSestimate$ML, RSestimate$H, shape = RSestimate$CONF)

	# loss events occur whenever TS > RS
	VULNsamples <- TSsamples > RSsamples

	return(length(VULNsamples[VULNsamples == TRUE]))
}

# calculate the ALE for a given scenario
calculate_ALE  <- function(scenario, N = 10^4, title = 'Untitled', verbose = F) {
	# run an OpenFAIR simulation
	#
	# ARGS:
	#		scenario - list of l/ml/h/conf pairs for tef, rs, and lm
	#		N - number of simulations to run
	#		title - optional name of scenario
	#		verbose - whether to print progress indicators
	#
	# RETURNS:
	# 	list of scenario name and ALE samples

	# make samples repeatable (and l33t)
	set.seed(31337)

  if (verbose) cat(paste0('Working on scenario ', title, '\n'))

  # calibrated estimates
  # extending to UI framework of your choice is left as an exercise for the reader
  TEFestimate <- data.frame(L = scenario[['tef_l']], ML = scenario[['tef_ml']], H = scenario[['tef_h']], CONF = scenario[['tef_conf']])
  TSestimate  <- data.frame(L = 20,  ML = 30,  H = 70, CONF = 1)
  RSestimate  <- data.frame(L = scenario[['rs_l']], ML = scenario[['rs_ml']], H = scenario[['rs_h']], CONF = scenario[['rs_conf']])
  LMestimate  <- data.frame(L = scenario[['lm_l']], ML = scenario[['lm_ml']], H = scenario[['lm_h']], CONF = scenario[['lm_conf']])

  # TEF - how many contacts do we have in each year
  TEFsamples <- rpert(N, TEFestimate$L, TEFestimate$ML, TEFestimate$H, shape = TEFestimate$CONF)
  # fractional events per year is nonsensical, so round to the nearest integer
  TEFsamples <- round(TEFsamples)

  # for each number of contacts (TEF), calculate how many of those succeed (become LEF counts)
  LEF <- sapply(TEFsamples, function(x) select_events(x, TSestimate, RSestimate))

  # for the range of loss events, calculate the annual sum of losses across the range of possible LMs
  ALEsamples <- sapply(LEF, function(x) sample_LM(x, L = LMestimate$L, ML = LMestimate$ML, H = LMestimate$H, CONF = LMestimate$CONF))

  # summary stats for ALE
  if (verbose) {
  	print(summary(ALEsamples))
  	VaR <- quantile(ALEsamples, probs=(0.95))
  	cat(paste0("Losses at 95th percentile are $", format(VaR, nsmall = 2, big.mark = ","), "\n"))
  }

  simulation_results <- list(title = as.character(title), ALEsamples = ALEsamples)

  return (simulation_results)
}

## qual_to_quant.R
# plotting, commas, and binning libraries
library('ggplot2')
library('scales')
library('hexbin')


convert_qual_to_quant <- function(app) {
	# Convert qualitative ratings to quant estimate ranges
	#
	# ARGS:
	#		app - list of scenarios, including TEF, RS, and LM qualitative labels
	#
	# RETURNS:
	# 	list of tef_estimate, rs_estimate, lm_estimate, and scenario name


  if (app['TEF'] == "Critical") {
    tef_estimate  <- data.frame(tef_l=1, tef_ml=24, tef_h=365, tef_conf=1)}
  if (app['TEF'] == "High") {
    tef_estimate  <- data.frame(tef_l=1, tef_ml=12, tef_h=50, tef_conf=1)}
  if (app['TEF'] == "Medium") {
    tef_estimate  <- data.frame(tef_l=0, tef_ml=6, tef_h=50, tef_conf=1)}
  if (app['TEF'] == "Low") {
    tef_estimate  <- data.frame(tef_l=0, tef_ml=6, tef_h=24, tef_conf=1)}
  if (app['TEF'] == "Very Low") {
    tef_estimate  <- data.frame(tef_l=0, tef_ml=2, tef_h=12, tef_conf=1)}

  if (app['RS'] == "Critical") {
    rs_estimate  <- data.frame(rs_l=1, rs_ml=24, rs_h=365, rs_conf=1)}
  if (app['RS'] == "High") {
    rs_estimate  <- data.frame(rs_l=1, rs_ml=12, rs_h=50, rs_conf=1)}
  if (app['RS'] == "Medium") {
    rs_estimate  <- data.frame(rs_l=40, rs_ml=50, rs_h=60, rs_conf=1)}
  if (app['RS'] == "Low") {
    rs_estimate  <- data.frame(rs_l=20, rs_ml=40, rs_h=60, rs_conf=1)}
  if (app['RS'] == "Very Low") {
    rs_estimate  <- data.frame(rs_l=10, rs_ml=30, rs_h=50, rs_conf=1)}

  if (app['PLM'] == "Critical") {
    lm_estimate  <- data.frame(lm_l=1, lm_ml=24, lm_h=365, lm_conf=1)}
  if (app['PLM'] == "High") {
    lm_estimate  <- data.frame(lm_l=10^4, lm_ml=5*10^4, lm_h=10^6, lm_conf=1)}
  if (app['PLM'] == "Medium") {
    lm_estimate  <- data.frame(lm_l=5*10^3, lm_ml=10^4, lm_h=5*10^4, lm_conf=1)}
  if (app['PLM'] == "Low") {
    lm_estimate  <- data.frame(lm_l=10^3, lm_ml=2*10^3, lm_h=10^4, lm_conf=1)}
  if (app['PLM'] == "Very Low") {
    lm_estimate  <- data.frame(lm_l=0, lm_ml=2, h=12, conf=1)}

  estimate <- cbind(tef_estimate, rs_estimate, lm_estimate, title = as.character(app[['Application.Name']]))

  #print(paste("TEF is ", TEF))

  return(estimate)
}

scenario_data <- read.csv('app_data.tsv', sep='\t', stringsAsFactors = F)

scenario_data <- apply(scenario_data[,1:4], 1, convert_qual_to_quant)

results <- lapply(scenario_data, function(x) calculate_ALE(scenario = x[1:12], title = x$title, verbose = TRUE))

summary_data <- do.call(rbind, lapply(results, function(x) {
												data.frame(title = x[['title']],
													ALEmedian = median(x[['ALEsamples']]),
													ALEmax = max(x[['ALEsamples']])
												)
											}
))

# scatter plot of median vs. max ALEs
gg <- ggplot(summary_data, aes(x = ALEmedian, y = ALEmax))
gg <- gg + geom_point(postion = "jitter", alpha = 0.1)
#gg <- gg + stat_bin2d(bins = 50) + scale_fill_gradient(low = 'lightblue', high = 'red')
gg <- gg + stat_binhex(bins = 50) + scale_fill_gradient(low = 'lightblue', high = 'red')
#gg <- gg + geom_text(aes(label = title), size = 4)
gg <- gg + scale_x_log10(labels = comma) + scale_y_log10(labels = comma)
gg <- gg + theme_bw()
print(gg)

# histogram of median ALEs
gg <- ggplot(summary_data, aes(x = ALEmedian))
gg <- gg + geom_histogram(binwidth = diff(range(summary_data$ALEmedian) / 50), aes(y = ..density..), color = "black", fill = "white")
gg <- gg + geom_density(fill = "steelblue", alpha = 1/3)
gg <- gg + scale_x_continuous(labels = comma) + scale_y_continuous(labels = comma)
gg <- gg + theme_bw()
print(gg)

# create histogram
#gg <- ggplot(ALEsamples, aes(x = ALEsamples))
#gg <- gg + geom_histogram(binwidth = diff(range(ALEsamples$ALEsamples)/50), aes(y = ..density..), color = "black", fill = "white")
#gg <- gg + geom_density(fill = "steelblue", alpha = 1/3)
#gg <- gg + scale_x_continuous(labels = comma)
#gg <- gg + theme_bw()
#gg <- gg + geom_point(aes(y=cumloss)) + geom_path(aes(y=cumloss, group=1))
#print(gg)
	# for *pert functons
	library('mc2d')

	# given the number of loss events, calculate the sum of losses for the number of events
	sample_LM <- function(N, L, ML, H, CONF){
	# Calculate the aggregate loss
	#
	# ARGS:
	# N - number of threat events to evaluate
	# L - low boundary
	# ML - most likely
	# H - high boundary
	# CONF - confidence
	#
	# RETURNS:
	# sum of loss events

	result <- sum(rpert(N, L, ML, H, shape = CONF) )

	return(result)
	}

	select_events <- function (N, TSestimate, RSestimate) {
	# Calculate the threat strength and whether the asset resists the attack
	#
	# ARGS:
	# N - number of threat events to evaluate
	# TSestimate - threat strength estimate
	# RSestimate - threat strength estimate
	#
	# RETURNS:
	# Number of succesfull attacks (i.e. loss events)


	# sample threat and resistive strength
	TSsamples <- rpert(N, TSestimate$L, TSestimate$ML, TSestimate$H, shape = TSestimate$CONF)
	RSsamples <- rpert(N, RSestimate$L, RSestimate$ML, RSestimate$H, shape = RSestimate$CONF)

	# loss events occur whenever TS > RS
	VULNsamples <- TSsamples > RSsamples

	return(length(VULNsamples[VULNsamples == TRUE]))
	}

	# calculate the ALE for a given scenario
	calculate_ALE <- function(scenario, N = 10^4, title = 'Untitled', verbose = F) {
	# run an OpenFAIR simulation
	#
	# ARGS:
	# scenario - list of l/ml/h/conf pairs for tef, rs, and lm
	# N - number of simulations to run
	# title - optional name of scenario
	# verbose - whether to print progress indicators
	#
	# RETURNS:
	# list of scenario name and ALE samples

	# make samples repeatable (and l33t)
	set.seed(31337)

	if (verbose) cat(paste0('Working on scenario ', title, '\n'))

	# calibrated estimates
	# extending to UI framework of your choice is left as an exercise for the reader
	TEFestimate <- data.frame(L = scenario[['tef_l']], ML = scenario[['tef_ml']], H = scenario[['tef_h']], CONF = scenario[['tef_conf']])
	TSestimate <- data.frame(L = 20, ML = 30, H = 70, CONF = 1)
	RSestimate <- data.frame(L = scenario[['rs_l']], ML = scenario[['rs_ml']], H = scenario[['rs_h']], CONF = scenario[['rs_conf']])
	LMestimate <- data.frame(L = scenario[['lm_l']], ML = scenario[['lm_ml']], H = scenario[['lm_h']], CONF = scenario[['lm_conf']])

	# TEF - how many contacts do we have in each year
	TEFsamples <- rpert(N, TEFestimate$L, TEFestimate$ML, TEFestimate$H, shape = TEFestimate$CONF)
	# fractional events per year is nonsensical, so round to the nearest integer
	TEFsamples <- round(TEFsamples)

	# for each number of contacts (TEF), calculate how many of those succeed (become LEF counts)
	LEF <- sapply(TEFsamples, function(x) select_events(x, TSestimate, RSestimate))

	# for the range of loss events, calculate the annual sum of losses across the range of possible LMs
	ALEsamples <- sapply(LEF, function(x) sample_LM(x, L = LMestimate$L, ML = LMestimate$ML, H = LMestimate$H, CONF = LMestimate$CONF))

	# summary stats for ALE
	if (verbose) {
	print(summary(ALEsamples))
	VaR <- quantile(ALEsamples, probs=(0.95))
	cat(paste0("Losses at 95th percentile are $", format(VaR, nsmall = 2, big.mark = ","), "\n"))
	}

	simulation_results <- list(title = as.character(title), ALEsamples = ALEsamples)

	return (simulation_results)
	}
	# plotting, commas, and binning libraries
	library('ggplot2')
	library('scales')
	library('hexbin')


	convert_qual_to_quant <- function(app) {
	# Convert qualitative ratings to quant estimate ranges
	#
	# ARGS:
	# app - list of scenarios, including TEF, RS, and LM qualitative labels
	#
	# RETURNS:
	# list of tef_estimate, rs_estimate, lm_estimate, and scenario name


	if (app['TEF'] == "Critical") {
	tef_estimate <- data.frame(tef_l=1, tef_ml=24, tef_h=365, tef_conf=1)}
	if (app['TEF'] == "High") {
	tef_estimate <- data.frame(tef_l=1, tef_ml=12, tef_h=50, tef_conf=1)}
	if (app['TEF'] == "Medium") {
	tef_estimate <- data.frame(tef_l=0, tef_ml=6, tef_h=50, tef_conf=1)}
	if (app['TEF'] == "Low") {
	tef_estimate <- data.frame(tef_l=0, tef_ml=6, tef_h=24, tef_conf=1)}
	if (app['TEF'] == "Very Low") {
	tef_estimate <- data.frame(tef_l=0, tef_ml=2, tef_h=12, tef_conf=1)}

	if (app['RS'] == "Critical") {
	rs_estimate <- data.frame(rs_l=1, rs_ml=24, rs_h=365, rs_conf=1)}
	if (app['RS'] == "High") {
	rs_estimate <- data.frame(rs_l=1, rs_ml=12, rs_h=50, rs_conf=1)}
	if (app['RS'] == "Medium") {
	rs_estimate <- data.frame(rs_l=40, rs_ml=50, rs_h=60, rs_conf=1)}
	if (app['RS'] == "Low") {
	rs_estimate <- data.frame(rs_l=20, rs_ml=40, rs_h=60, rs_conf=1)}
	if (app['RS'] == "Very Low") {
	rs_estimate <- data.frame(rs_l=10, rs_ml=30, rs_h=50, rs_conf=1)}

	if (app['PLM'] == "Critical") {
	lm_estimate <- data.frame(lm_l=1, lm_ml=24, lm_h=365, lm_conf=1)}
	if (app['PLM'] == "High") {
	lm_estimate <- data.frame(lm_l=10^4, lm_ml=5*10^4, lm_h=10^6, lm_conf=1)}
	if (app['PLM'] == "Medium") {
	lm_estimate <- data.frame(lm_l=510^3, lm_ml=10^4, lm_h=510^4, lm_conf=1)}
	if (app['PLM'] == "Low") {
	lm_estimate <- data.frame(lm_l=10^3, lm_ml=2*10^3, lm_h=10^4, lm_conf=1)}
	if (app['PLM'] == "Very Low") {
	lm_estimate <- data.frame(lm_l=0, lm_ml=2, h=12, conf=1)}

	estimate <- cbind(tef_estimate, rs_estimate, lm_estimate, title = as.character(app[['Application.Name']]))

	#print(paste("TEF is ", TEF))

	return(estimate)
	}

	scenario_data <- read.csv('app_data.tsv', sep='\t', stringsAsFactors = F)

	scenario_data <- apply(scenario_data[,1:4], 1, convert_qual_to_quant)

	results <- lapply(scenario_data, function(x) calculate_ALE(scenario = x[1:12], title = x$title, verbose = TRUE))

	summary_data <- do.call(rbind, lapply(results, function(x) {
	data.frame(title = x[['title']],
	ALEmedian = median(x[['ALEsamples']]),
	ALEmax = max(x[['ALEsamples']])
	)
	}
	))

	# scatter plot of median vs. max ALEs
	gg <- ggplot(summary_data, aes(x = ALEmedian, y = ALEmax))
	gg <- gg + geom_point(postion = "jitter", alpha = 0.1)
	#gg <- gg + stat_bin2d(bins = 50) + scale_fill_gradient(low = 'lightblue', high = 'red')
	gg <- gg + stat_binhex(bins = 50) + scale_fill_gradient(low = 'lightblue', high = 'red')
	#gg <- gg + geom_text(aes(label = title), size = 4)
	gg <- gg + scale_x_log10(labels = comma) + scale_y_log10(labels = comma)
	gg <- gg + theme_bw()
	print(gg)

	# histogram of median ALEs
	gg <- ggplot(summary_data, aes(x = ALEmedian))
	gg <- gg + geom_histogram(binwidth = diff(range(summary_data$ALEmedian) / 50), aes(y = ..density..), color = "black", fill = "white")
	gg <- gg + geom_density(fill = "steelblue", alpha = 1/3)
	gg <- gg + scale_x_continuous(labels = comma) + scale_y_continuous(labels = comma)
	gg <- gg + theme_bw()
	print(gg)

	# create histogram
	#gg <- ggplot(ALEsamples, aes(x = ALEsamples))
	#gg <- gg + geom_histogram(binwidth = diff(range(ALEsamples$ALEsamples)/50), aes(y = ..density..), color = "black", fill = "white")
	#gg <- gg + geom_density(fill = "steelblue", alpha = 1/3)
	#gg <- gg + scale_x_continuous(labels = comma)
	#gg <- gg + theme_bw()
	#gg <- gg + geom_point(aes(y=cumloss)) + geom_path(aes(y=cumloss, group=1))
	#print(gg)