-
-
Save davidski/8490758 to your computer and use it in GitHub Desktop.
# for *pert functons | |
library('mc2d') | |
# given the number of loss events, calculate the sum of losses for the number of events | |
sample_LM <- function(N, L, ML, H, CONF){ | |
# Calculate the aggregate loss | |
# | |
# ARGS: | |
# N - number of threat events to evaluate | |
# L - low boundary | |
# ML - most likely | |
# H - high boundary | |
# CONF - confidence | |
# | |
# RETURNS: | |
# sum of loss events | |
result <- sum(rpert(N, L, ML, H, shape = CONF) ) | |
return(result) | |
} | |
select_events <- function (N, TSestimate, RSestimate) { | |
# Calculate the threat strength and whether the asset resists the attack | |
# | |
# ARGS: | |
# N - number of threat events to evaluate | |
# TSestimate - threat strength estimate | |
# RSestimate - threat strength estimate | |
# | |
# RETURNS: | |
# Number of succesfull attacks (i.e. loss events) | |
# sample threat and resistive strength | |
TSsamples <- rpert(N, TSestimate$L, TSestimate$ML, TSestimate$H, shape = TSestimate$CONF) | |
RSsamples <- rpert(N, RSestimate$L, RSestimate$ML, RSestimate$H, shape = RSestimate$CONF) | |
# loss events occur whenever TS > RS | |
VULNsamples <- TSsamples > RSsamples | |
return(length(VULNsamples[VULNsamples == TRUE])) | |
} | |
# calculate the ALE for a given scenario | |
calculate_ALE <- function(scenario, N = 10^4, title = 'Untitled', verbose = F) { | |
# run an OpenFAIR simulation | |
# | |
# ARGS: | |
# scenario - list of l/ml/h/conf pairs for tef, rs, and lm | |
# N - number of simulations to run | |
# title - optional name of scenario | |
# verbose - whether to print progress indicators | |
# | |
# RETURNS: | |
# list of scenario name and ALE samples | |
# make samples repeatable (and l33t) | |
set.seed(31337) | |
if (verbose) cat(paste0('Working on scenario ', title, '\n')) | |
# calibrated estimates | |
# extending to UI framework of your choice is left as an exercise for the reader | |
TEFestimate <- data.frame(L = scenario[['tef_l']], ML = scenario[['tef_ml']], H = scenario[['tef_h']], CONF = scenario[['tef_conf']]) | |
TSestimate <- data.frame(L = 20, ML = 30, H = 70, CONF = 1) | |
RSestimate <- data.frame(L = scenario[['rs_l']], ML = scenario[['rs_ml']], H = scenario[['rs_h']], CONF = scenario[['rs_conf']]) | |
LMestimate <- data.frame(L = scenario[['lm_l']], ML = scenario[['lm_ml']], H = scenario[['lm_h']], CONF = scenario[['lm_conf']]) | |
# TEF - how many contacts do we have in each year | |
TEFsamples <- rpert(N, TEFestimate$L, TEFestimate$ML, TEFestimate$H, shape = TEFestimate$CONF) | |
# fractional events per year is nonsensical, so round to the nearest integer | |
TEFsamples <- round(TEFsamples) | |
# for each number of contacts (TEF), calculate how many of those succeed (become LEF counts) | |
LEF <- sapply(TEFsamples, function(x) select_events(x, TSestimate, RSestimate)) | |
# for the range of loss events, calculate the annual sum of losses across the range of possible LMs | |
ALEsamples <- sapply(LEF, function(x) sample_LM(x, L = LMestimate$L, ML = LMestimate$ML, H = LMestimate$H, CONF = LMestimate$CONF)) | |
# summary stats for ALE | |
if (verbose) { | |
print(summary(ALEsamples)) | |
VaR <- quantile(ALEsamples, probs=(0.95)) | |
cat(paste0("Losses at 95th percentile are $", format(VaR, nsmall = 2, big.mark = ","), "\n")) | |
} | |
simulation_results <- list(title = as.character(title), ALEsamples = ALEsamples) | |
return (simulation_results) | |
} |
# plotting, commas, and binning libraries | |
library('ggplot2') | |
library('scales') | |
library('hexbin') | |
convert_qual_to_quant <- function(app) { | |
# Convert qualitative ratings to quant estimate ranges | |
# | |
# ARGS: | |
# app - list of scenarios, including TEF, RS, and LM qualitative labels | |
# | |
# RETURNS: | |
# list of tef_estimate, rs_estimate, lm_estimate, and scenario name | |
if (app['TEF'] == "Critical") { | |
tef_estimate <- data.frame(tef_l=1, tef_ml=24, tef_h=365, tef_conf=1)} | |
if (app['TEF'] == "High") { | |
tef_estimate <- data.frame(tef_l=1, tef_ml=12, tef_h=50, tef_conf=1)} | |
if (app['TEF'] == "Medium") { | |
tef_estimate <- data.frame(tef_l=0, tef_ml=6, tef_h=50, tef_conf=1)} | |
if (app['TEF'] == "Low") { | |
tef_estimate <- data.frame(tef_l=0, tef_ml=6, tef_h=24, tef_conf=1)} | |
if (app['TEF'] == "Very Low") { | |
tef_estimate <- data.frame(tef_l=0, tef_ml=2, tef_h=12, tef_conf=1)} | |
if (app['RS'] == "Critical") { | |
rs_estimate <- data.frame(rs_l=1, rs_ml=24, rs_h=365, rs_conf=1)} | |
if (app['RS'] == "High") { | |
rs_estimate <- data.frame(rs_l=1, rs_ml=12, rs_h=50, rs_conf=1)} | |
if (app['RS'] == "Medium") { | |
rs_estimate <- data.frame(rs_l=40, rs_ml=50, rs_h=60, rs_conf=1)} | |
if (app['RS'] == "Low") { | |
rs_estimate <- data.frame(rs_l=20, rs_ml=40, rs_h=60, rs_conf=1)} | |
if (app['RS'] == "Very Low") { | |
rs_estimate <- data.frame(rs_l=10, rs_ml=30, rs_h=50, rs_conf=1)} | |
if (app['PLM'] == "Critical") { | |
lm_estimate <- data.frame(lm_l=1, lm_ml=24, lm_h=365, lm_conf=1)} | |
if (app['PLM'] == "High") { | |
lm_estimate <- data.frame(lm_l=10^4, lm_ml=5*10^4, lm_h=10^6, lm_conf=1)} | |
if (app['PLM'] == "Medium") { | |
lm_estimate <- data.frame(lm_l=5*10^3, lm_ml=10^4, lm_h=5*10^4, lm_conf=1)} | |
if (app['PLM'] == "Low") { | |
lm_estimate <- data.frame(lm_l=10^3, lm_ml=2*10^3, lm_h=10^4, lm_conf=1)} | |
if (app['PLM'] == "Very Low") { | |
lm_estimate <- data.frame(lm_l=0, lm_ml=2, h=12, conf=1)} | |
estimate <- cbind(tef_estimate, rs_estimate, lm_estimate, title = as.character(app[['Application.Name']])) | |
#print(paste("TEF is ", TEF)) | |
return(estimate) | |
} | |
scenario_data <- read.csv('app_data.tsv', sep='\t', stringsAsFactors = F) | |
scenario_data <- apply(scenario_data[,1:4], 1, convert_qual_to_quant) | |
results <- lapply(scenario_data, function(x) calculate_ALE(scenario = x[1:12], title = x$title, verbose = TRUE)) | |
summary_data <- do.call(rbind, lapply(results, function(x) { | |
data.frame(title = x[['title']], | |
ALEmedian = median(x[['ALEsamples']]), | |
ALEmax = max(x[['ALEsamples']]) | |
) | |
} | |
)) | |
# scatter plot of median vs. max ALEs | |
gg <- ggplot(summary_data, aes(x = ALEmedian, y = ALEmax)) | |
gg <- gg + geom_point(postion = "jitter", alpha = 0.1) | |
#gg <- gg + stat_bin2d(bins = 50) + scale_fill_gradient(low = 'lightblue', high = 'red') | |
gg <- gg + stat_binhex(bins = 50) + scale_fill_gradient(low = 'lightblue', high = 'red') | |
#gg <- gg + geom_text(aes(label = title), size = 4) | |
gg <- gg + scale_x_log10(labels = comma) + scale_y_log10(labels = comma) | |
gg <- gg + theme_bw() | |
print(gg) | |
# histogram of median ALEs | |
gg <- ggplot(summary_data, aes(x = ALEmedian)) | |
gg <- gg + geom_histogram(binwidth = diff(range(summary_data$ALEmedian) / 50), aes(y = ..density..), color = "black", fill = "white") | |
gg <- gg + geom_density(fill = "steelblue", alpha = 1/3) | |
gg <- gg + scale_x_continuous(labels = comma) + scale_y_continuous(labels = comma) | |
gg <- gg + theme_bw() | |
print(gg) | |
# create histogram | |
#gg <- ggplot(ALEsamples, aes(x = ALEsamples)) | |
#gg <- gg + geom_histogram(binwidth = diff(range(ALEsamples$ALEsamples)/50), aes(y = ..density..), color = "black", fill = "white") | |
#gg <- gg + geom_density(fill = "steelblue", alpha = 1/3) | |
#gg <- gg + scale_x_continuous(labels = comma) | |
#gg <- gg + theme_bw() | |
#gg <- gg + geom_point(aes(y=cumloss)) + geom_path(aes(y=cumloss, group=1)) | |
#print(gg) |
For a shiny version of this code, see http://shiny.dds.ec/solvomediocris/
Hi David,
I'm an absolute beginner in R and trying to understand the code.
I'm trying to understand why you use sapply() for example in calculating ALE instead of multiplying the two distributions like in the example Jay was giving (http://datadrivensecurity.info/blog/posts/2014/Jan/severski/).
thanks
Osama S.
Hi, @OSUso! Had to go back and reacquaint myself with this old code. This is a quick 5 minute review. 😄
Jay's version multiples the #er of losses in a given year by the size of those losses. The vectorization technique Jay used is fantastic and super easy to use, but assumes that each loss in a given period is the exact same size (i.e. 5 losses in a given year would all have $1,000 sizes , resulting in a total of 5 * 1000 = $5,000 losses). The above code, and the sampleL assumes that losses can vary in a given year (5 losses in a year may have loss sizes of 900, 1000, 1.1K, 1050, and 950).
I strongly encourage you to take a look at Evaluator (https://evaluator.severski.net) which is the descendant of this original code. Much more robust modelling, data input, reporting and more in there!
This would probably be better if re-written to use the native model capabilities of mc2d (already loaded for use of the rpert function). When using mc2d, the provided sensitivity analysis items would be available for use. I'd like to be generate nice tornado diagrams to show what's most important in a given scenario.