@davidski
Last active October 3, 2022 07:02
Sample OpenFAIR model
# for *pert functions
library('mc2d')
# given a number of loss events, calculate the total loss across those events
sample_LM <- function(N, L, ML, H, CONF) {
  # Calculate the aggregate loss
  #
  # ARGS:
  #   N    - number of loss events to evaluate
  #   L    - low boundary
  #   ML   - most likely
  #   H    - high boundary
  #   CONF - confidence
  #
  # RETURNS:
  #   sum of the sampled loss events
  result <- sum(rpert(N, L, ML, H, shape = CONF))
  return(result)
}
select_events <- function(N, TSestimate, RSestimate) {
  # Sample threat and resistive strengths and determine whether the asset resists each attack
  #
  # ARGS:
  #   N          - number of threat events to evaluate
  #   TSestimate - threat strength estimate
  #   RSestimate - resistive strength estimate
  #
  # RETURNS:
  #   number of successful attacks (i.e. loss events)
  # sample threat and resistive strength
  TSsamples <- rpert(N, TSestimate$L, TSestimate$ML, TSestimate$H, shape = TSestimate$CONF)
  RSsamples <- rpert(N, RSestimate$L, RSestimate$ML, RSestimate$H, shape = RSestimate$CONF)
  # loss events occur whenever TS > RS
  VULNsamples <- TSsamples > RSsamples
  return(sum(VULNsamples))
}
# calculate the ALE for a given scenario
calculate_ALE <- function(scenario, N = 10^4, title = 'Untitled', verbose = FALSE) {
  # run an OpenFAIR simulation
  #
  # ARGS:
  #   scenario - list of l/ml/h/conf pairs for tef, rs, and lm
  #   N        - number of simulations to run
  #   title    - optional name of scenario
  #   verbose  - whether to print progress indicators
  #
  # RETURNS:
  #   list of scenario name and ALE samples
  # make samples repeatable (and l33t)
  set.seed(31337)
  if (verbose) cat(paste0('Working on scenario ', title, '\n'))
  # calibrated estimates
  # extending to the UI framework of your choice is left as an exercise for the reader
  TEFestimate <- data.frame(L = scenario[['tef_l']], ML = scenario[['tef_ml']], H = scenario[['tef_h']], CONF = scenario[['tef_conf']])
  TSestimate  <- data.frame(L = 20, ML = 30, H = 70, CONF = 1)
  RSestimate  <- data.frame(L = scenario[['rs_l']], ML = scenario[['rs_ml']], H = scenario[['rs_h']], CONF = scenario[['rs_conf']])
  LMestimate  <- data.frame(L = scenario[['lm_l']], ML = scenario[['lm_ml']], H = scenario[['lm_h']], CONF = scenario[['lm_conf']])
  # TEF - how many contacts do we have in each year
  TEFsamples <- rpert(N, TEFestimate$L, TEFestimate$ML, TEFestimate$H, shape = TEFestimate$CONF)
  # fractional events per year are nonsensical, so round to the nearest integer
  TEFsamples <- round(TEFsamples)
  # for each number of contacts (TEF), calculate how many of those succeed (become LEF counts)
  LEF <- sapply(TEFsamples, function(x) select_events(x, TSestimate, RSestimate))
  # for each count of loss events, calculate the annual sum of losses across the range of possible LMs
  ALEsamples <- sapply(LEF, function(x) sample_LM(x, L = LMestimate$L, ML = LMestimate$ML, H = LMestimate$H, CONF = LMestimate$CONF))
  # summary stats for ALE
  if (verbose) {
    print(summary(ALEsamples))
    VaR <- quantile(ALEsamples, probs = 0.95)
    cat(paste0("Losses at 95th percentile are $", format(VaR, nsmall = 2, big.mark = ","), "\n"))
  }
  simulation_results <- list(title = as.character(title), ALEsamples = ALEsamples)
  return(simulation_results)
}
# plotting, commas, and binning libraries
library('ggplot2')
library('scales')
library('hexbin')
convert_qual_to_quant <- function(app) {
  # Convert qualitative ratings to quantitative estimate ranges
  #
  # ARGS:
  #   app - a single scenario (row) with Application.Name and qualitative TEF, RS, and PLM labels
  #
  # RETURNS:
  #   one-row data frame of tef, rs, and lm estimate ranges plus the scenario title
  if (app['TEF'] == "Critical") {
    tef_estimate <- data.frame(tef_l = 1, tef_ml = 24, tef_h = 365, tef_conf = 1)
  }
  if (app['TEF'] == "High") {
    tef_estimate <- data.frame(tef_l = 1, tef_ml = 12, tef_h = 50, tef_conf = 1)
  }
  if (app['TEF'] == "Medium") {
    tef_estimate <- data.frame(tef_l = 0, tef_ml = 6, tef_h = 50, tef_conf = 1)
  }
  if (app['TEF'] == "Low") {
    tef_estimate <- data.frame(tef_l = 0, tef_ml = 6, tef_h = 24, tef_conf = 1)
  }
  if (app['TEF'] == "Very Low") {
    tef_estimate <- data.frame(tef_l = 0, tef_ml = 2, tef_h = 12, tef_conf = 1)
  }
  if (app['RS'] == "Critical") {
    rs_estimate <- data.frame(rs_l = 1, rs_ml = 24, rs_h = 365, rs_conf = 1)
  }
  if (app['RS'] == "High") {
    rs_estimate <- data.frame(rs_l = 1, rs_ml = 12, rs_h = 50, rs_conf = 1)
  }
  if (app['RS'] == "Medium") {
    rs_estimate <- data.frame(rs_l = 40, rs_ml = 50, rs_h = 60, rs_conf = 1)
  }
  if (app['RS'] == "Low") {
    rs_estimate <- data.frame(rs_l = 20, rs_ml = 40, rs_h = 60, rs_conf = 1)
  }
  if (app['RS'] == "Very Low") {
    rs_estimate <- data.frame(rs_l = 10, rs_ml = 30, rs_h = 50, rs_conf = 1)
  }
  if (app['PLM'] == "Critical") {
    lm_estimate <- data.frame(lm_l = 1, lm_ml = 24, lm_h = 365, lm_conf = 1)
  }
  if (app['PLM'] == "High") {
    lm_estimate <- data.frame(lm_l = 10^4, lm_ml = 5*10^4, lm_h = 10^6, lm_conf = 1)
  }
  if (app['PLM'] == "Medium") {
    lm_estimate <- data.frame(lm_l = 5*10^3, lm_ml = 10^4, lm_h = 5*10^4, lm_conf = 1)
  }
  if (app['PLM'] == "Low") {
    lm_estimate <- data.frame(lm_l = 10^3, lm_ml = 2*10^3, lm_h = 10^4, lm_conf = 1)
  }
  if (app['PLM'] == "Very Low") {
    lm_estimate <- data.frame(lm_l = 0, lm_ml = 2, lm_h = 12, lm_conf = 1)
  }
  estimate <- cbind(tef_estimate, rs_estimate, lm_estimate, title = as.character(app[['Application.Name']]))
  #print(paste("TEF is ", TEF))
  return(estimate)
}
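# read the scenario inventory; per convert_qual_to_quant, each row is expected to carry an
# Application.Name plus qualitative TEF, RS, and PLM ratings (Critical/High/Medium/Low/Very Low)
# in the first four columns (exact column order assumed)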
scenario_data <- read.csv('app_data.tsv', sep='\t', stringsAsFactors = F)
scenario_data <- apply(scenario_data[,1:4], 1, convert_qual_to_quant)
results <- lapply(scenario_data, function(x) calculate_ALE(scenario = x[1:12], title = x$title, verbose = TRUE))
summary_data <- do.call(rbind, lapply(results, function(x) {
  data.frame(title = x[['title']],
             ALEmedian = median(x[['ALEsamples']]),
             ALEmax = max(x[['ALEsamples']]))
}))
# scatter plot of median vs. max ALEs
gg <- ggplot(summary_data, aes(x = ALEmedian, y = ALEmax))
gg <- gg + geom_point(position = "jitter", alpha = 0.1)
#gg <- gg + stat_bin2d(bins = 50) + scale_fill_gradient(low = 'lightblue', high = 'red')
gg <- gg + stat_binhex(bins = 50) + scale_fill_gradient(low = 'lightblue', high = 'red')
#gg <- gg + geom_text(aes(label = title), size = 4)
gg <- gg + scale_x_log10(labels = comma) + scale_y_log10(labels = comma)
gg <- gg + theme_bw()
print(gg)
# histogram of median ALEs
gg <- ggplot(summary_data, aes(x = ALEmedian))
gg <- gg + geom_histogram(binwidth = diff(range(summary_data$ALEmedian)) / 50, aes(y = ..density..), color = "black", fill = "white")
gg <- gg + geom_density(fill = "steelblue", alpha = 1/3)
gg <- gg + scale_x_continuous(labels = comma) + scale_y_continuous(labels = comma)
gg <- gg + theme_bw()
print(gg)
# create histogram
#gg <- ggplot(ALEsamples, aes(x = ALEsamples))
#gg <- gg + geom_histogram(binwidth = diff(range(ALEsamples$ALEsamples)/50), aes(y = ..density..), color = "black", fill = "white")
#gg <- gg + geom_density(fill = "steelblue", alpha = 1/3)
#gg <- gg + scale_x_continuous(labels = comma)
#gg <- gg + theme_bw()
#gg <- gg + geom_point(aes(y=cumloss)) + geom_path(aes(y=cumloss, group=1))
#print(gg)
@davidski (Author)

This would probably be better if rewritten to use the native model capabilities of mc2d (already loaded for use of the rpert function). When using mc2d, its built-in sensitivity analysis tools would also be available. I'd like to be able to generate nice tornado diagrams to show what's most important in a given scenario.
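
A rough, untested sketch of what that mc2d-native rewrite might look like, reusing the fixed TS estimate from calculate_ALE plus the "High" TEF, "Medium" RS, and "High" PLM ranges from convert_qual_to_quant, and simplifying so that a single TS vs. RS check and a single loss magnitude apply to all contacts in a simulated year:

library(mc2d)
ndvar(10^4)  # number of simulations in the variability dimension
TEF <- mcstoc(rpert, type = "V", min = 1,    mode = 12,     max = 50,   shape = 1)
TS  <- mcstoc(rpert, type = "V", min = 20,   mode = 30,     max = 70,   shape = 1)
RS  <- mcstoc(rpert, type = "V", min = 40,   mode = 50,     max = 60,   shape = 1)
LM  <- mcstoc(rpert, type = "V", min = 10^4, mode = 5*10^4, max = 10^6, shape = 1)
# losses of size LM occur on each contact where threat strength exceeds resistive strength
ALE <- TEF * (TS > RS) * LM
model <- mc(TEF, TS, RS, LM, ALE)
summary(model)
tornado(model)        # rank correlations of each input node against the ALE node
plot(tornado(model))  # the tornado diagram mentioned above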

@davidski (Author)

For a shiny version of this code, see http://shiny.dds.ec/solvomediocris/

@OSUso commented Mar 19, 2018

Hi David,
I'm an absolute beginner in R and am trying to understand the code.
Why do you use sapply(), for example when calculating the ALE, instead of multiplying the two distributions as in the example Jay gave (http://datadrivensecurity.info/blog/posts/2014/Jan/severski/)?

Thanks,
Osama S.

@davidski (Author)

Hi, @OSUso! Had to go back and reacquaint myself with this old code. This is a quick 5-minute review. 😄

Jay's version multiplies the number of losses in a given year by the size of those losses. The vectorization technique Jay used is fantastic and super easy to use, but it assumes that every loss in a given period is exactly the same size (i.e. 5 losses in a given year would all be $1,000, for a total of 5 * 1000 = $5,000 in losses). The code above, and sample_LM in particular, assumes that losses can vary within a given year (5 losses in a year may have sizes of 900, 1000, 1100, 1050, and 950).
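
As a toy numeric illustration of that difference, using the hypothetical ~$1,000 loss range from the example above (low 900, most likely 1000, high 1100) and a single simulated year with five loss events:

library(mc2d)
set.seed(1)
n_events <- 5
# vectorized shortcut: one sampled magnitude applied to every event in the year
one_size  <- rpert(1, min = 900, mode = 1000, max = 1100)
total_vec <- n_events * one_size
# per-event sampling (what sample_LM does): each event gets its own magnitude
sizes     <- rpert(n_events, min = 900, mode = 1000, max = 1100)
total_per <- sum(sizes)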

I strongly encourage you to take a look at Evaluator (https://evaluator.severski.net) which is the descendant of this original code. Much more robust modelling, data input, reporting and more in there!
