{{ message }}

Instantly share code, notes, and snippets.

# davidski/openFairSim.R

Last active Aug 28, 2021
Sample OpenFAIR model
 # for *pert functons library('mc2d') # given the number of loss events, calculate the sum of losses for the number of events sample_LM <- function(N, L, ML, H, CONF){ # Calculate the aggregate loss # # ARGS: # N - number of threat events to evaluate # L - low boundary # ML - most likely # H - high boundary # CONF - confidence # # RETURNS: # sum of loss events result <- sum(rpert(N, L, ML, H, shape = CONF) ) return(result) } select_events <- function (N, TSestimate, RSestimate) { # Calculate the threat strength and whether the asset resists the attack # # ARGS: # N - number of threat events to evaluate # TSestimate - threat strength estimate # RSestimate - threat strength estimate # # RETURNS: # Number of succesfull attacks (i.e. loss events) # sample threat and resistive strength TSsamples <- rpert(N, TSestimate\$L, TSestimate\$ML, TSestimate\$H, shape = TSestimate\$CONF) RSsamples <- rpert(N, RSestimate\$L, RSestimate\$ML, RSestimate\$H, shape = RSestimate\$CONF) # loss events occur whenever TS > RS VULNsamples <- TSsamples > RSsamples return(length(VULNsamples[VULNsamples == TRUE])) } # calculate the ALE for a given scenario calculate_ALE <- function(scenario, N = 10^4, title = 'Untitled', verbose = F) { # run an OpenFAIR simulation # # ARGS: # scenario - list of l/ml/h/conf pairs for tef, rs, and lm # N - number of simulations to run # title - optional name of scenario # verbose - whether to print progress indicators # # RETURNS: # list of scenario name and ALE samples # make samples repeatable (and l33t) set.seed(31337) if (verbose) cat(paste0('Working on scenario ', title, '\n')) # calibrated estimates # extending to UI framework of your choice is left as an exercise for the reader TEFestimate <- data.frame(L = scenario[['tef_l']], ML = scenario[['tef_ml']], H = scenario[['tef_h']], CONF = scenario[['tef_conf']]) TSestimate <- data.frame(L = 20, ML = 30, H = 70, CONF = 1) RSestimate <- data.frame(L = scenario[['rs_l']], ML = scenario[['rs_ml']], H = scenario[['rs_h']], CONF = scenario[['rs_conf']]) LMestimate <- data.frame(L = scenario[['lm_l']], ML = scenario[['lm_ml']], H = scenario[['lm_h']], CONF = scenario[['lm_conf']]) # TEF - how many contacts do we have in each year TEFsamples <- rpert(N, TEFestimate\$L, TEFestimate\$ML, TEFestimate\$H, shape = TEFestimate\$CONF) # fractional events per year is nonsensical, so round to the nearest integer TEFsamples <- round(TEFsamples) # for each number of contacts (TEF), calculate how many of those succeed (become LEF counts) LEF <- sapply(TEFsamples, function(x) select_events(x, TSestimate, RSestimate)) # for the range of loss events, calculate the annual sum of losses across the range of possible LMs ALEsamples <- sapply(LEF, function(x) sample_LM(x, L = LMestimate\$L, ML = LMestimate\$ML, H = LMestimate\$H, CONF = LMestimate\$CONF)) # summary stats for ALE if (verbose) { print(summary(ALEsamples)) VaR <- quantile(ALEsamples, probs=(0.95)) cat(paste0("Losses at 95th percentile are \$", format(VaR, nsmall = 2, big.mark = ","), "\n")) } simulation_results <- list(title = as.character(title), ALEsamples = ALEsamples) return (simulation_results) }
 # plotting, commas, and binning libraries library('ggplot2') library('scales') library('hexbin') convert_qual_to_quant <- function(app) { # Convert qualitative ratings to quant estimate ranges # # ARGS: # app - list of scenarios, including TEF, RS, and LM qualitative labels # # RETURNS: # list of tef_estimate, rs_estimate, lm_estimate, and scenario name if (app['TEF'] == "Critical") { tef_estimate <- data.frame(tef_l=1, tef_ml=24, tef_h=365, tef_conf=1)} if (app['TEF'] == "High") { tef_estimate <- data.frame(tef_l=1, tef_ml=12, tef_h=50, tef_conf=1)} if (app['TEF'] == "Medium") { tef_estimate <- data.frame(tef_l=0, tef_ml=6, tef_h=50, tef_conf=1)} if (app['TEF'] == "Low") { tef_estimate <- data.frame(tef_l=0, tef_ml=6, tef_h=24, tef_conf=1)} if (app['TEF'] == "Very Low") { tef_estimate <- data.frame(tef_l=0, tef_ml=2, tef_h=12, tef_conf=1)} if (app['RS'] == "Critical") { rs_estimate <- data.frame(rs_l=1, rs_ml=24, rs_h=365, rs_conf=1)} if (app['RS'] == "High") { rs_estimate <- data.frame(rs_l=1, rs_ml=12, rs_h=50, rs_conf=1)} if (app['RS'] == "Medium") { rs_estimate <- data.frame(rs_l=40, rs_ml=50, rs_h=60, rs_conf=1)} if (app['RS'] == "Low") { rs_estimate <- data.frame(rs_l=20, rs_ml=40, rs_h=60, rs_conf=1)} if (app['RS'] == "Very Low") { rs_estimate <- data.frame(rs_l=10, rs_ml=30, rs_h=50, rs_conf=1)} if (app['PLM'] == "Critical") { lm_estimate <- data.frame(lm_l=1, lm_ml=24, lm_h=365, lm_conf=1)} if (app['PLM'] == "High") { lm_estimate <- data.frame(lm_l=10^4, lm_ml=5*10^4, lm_h=10^6, lm_conf=1)} if (app['PLM'] == "Medium") { lm_estimate <- data.frame(lm_l=5*10^3, lm_ml=10^4, lm_h=5*10^4, lm_conf=1)} if (app['PLM'] == "Low") { lm_estimate <- data.frame(lm_l=10^3, lm_ml=2*10^3, lm_h=10^4, lm_conf=1)} if (app['PLM'] == "Very Low") { lm_estimate <- data.frame(lm_l=0, lm_ml=2, h=12, conf=1)} estimate <- cbind(tef_estimate, rs_estimate, lm_estimate, title = as.character(app[['Application.Name']])) #print(paste("TEF is ", TEF)) return(estimate) } scenario_data <- read.csv('app_data.tsv', sep='\t', stringsAsFactors = F) scenario_data <- apply(scenario_data[,1:4], 1, convert_qual_to_quant) results <- lapply(scenario_data, function(x) calculate_ALE(scenario = x[1:12], title = x\$title, verbose = TRUE)) summary_data <- do.call(rbind, lapply(results, function(x) { data.frame(title = x[['title']], ALEmedian = median(x[['ALEsamples']]), ALEmax = max(x[['ALEsamples']]) ) } )) # scatter plot of median vs. max ALEs gg <- ggplot(summary_data, aes(x = ALEmedian, y = ALEmax)) gg <- gg + geom_point(postion = "jitter", alpha = 0.1) #gg <- gg + stat_bin2d(bins = 50) + scale_fill_gradient(low = 'lightblue', high = 'red') gg <- gg + stat_binhex(bins = 50) + scale_fill_gradient(low = 'lightblue', high = 'red') #gg <- gg + geom_text(aes(label = title), size = 4) gg <- gg + scale_x_log10(labels = comma) + scale_y_log10(labels = comma) gg <- gg + theme_bw() print(gg) # histogram of median ALEs gg <- ggplot(summary_data, aes(x = ALEmedian)) gg <- gg + geom_histogram(binwidth = diff(range(summary_data\$ALEmedian) / 50), aes(y = ..density..), color = "black", fill = "white") gg <- gg + geom_density(fill = "steelblue", alpha = 1/3) gg <- gg + scale_x_continuous(labels = comma) + scale_y_continuous(labels = comma) gg <- gg + theme_bw() print(gg) # create histogram #gg <- ggplot(ALEsamples, aes(x = ALEsamples)) #gg <- gg + geom_histogram(binwidth = diff(range(ALEsamples\$ALEsamples)/50), aes(y = ..density..), color = "black", fill = "white") #gg <- gg + geom_density(fill = "steelblue", alpha = 1/3) #gg <- gg + scale_x_continuous(labels = comma) #gg <- gg + theme_bw() #gg <- gg + geom_point(aes(y=cumloss)) + geom_path(aes(y=cumloss, group=1)) #print(gg)

### davidski commented Jan 20, 2014

 This would probably be better if re-written to use the native model capabilities of mc2d (already loaded for use of the rpert function). When using mc2d, the provided sensitivity analysis items would be available for use. I'd like to be generate nice tornado diagrams to show what's most important in a given scenario.

### davidski commented Feb 13, 2015

 For a shiny version of this code, see http://shiny.dds.ec/solvomediocris/

### OSUso commented Mar 19, 2018

 Hi David, I'm an absolute beginner in R and trying to understand the code. I'm trying to understand why you use sapply() for example in calculating ALE instead of multiplying the two distributions like in the example Jay was giving (http://datadrivensecurity.info/blog/posts/2014/Jan/severski/). thanks Osama S.

### davidski commented Mar 21, 2018

 Hi, @OSUso! Had to go back and reacquaint myself with this old code. This is a quick 5 minute review. 😄 Jay's version multiples the #er of losses in a given year by the size of those losses. The vectorization technique Jay used is fantastic and super easy to use, but assumes that each loss in a given period is the exact same size (i.e. 5 losses in a given year would all have \$1,000 sizes , resulting in a total of 5 * 1000 = \$5,000 losses). The above code, and the sampleL assumes that losses can vary in a given year (5 losses in a year may have loss sizes of 900, 1000, 1.1K, 1050, and 950). I strongly encourage you to take a look at Evaluator (https://evaluator.severski.net) which is the descendant of this original code. Much more robust modelling, data input, reporting and more in there!