Transaction Fraud Detection Performance Evaluation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Utility function to use within AzureML for calculating useful account-level performance metrics | |
# for transaction fraud detection models. Each metric is calculated as a function of the score-cutoff | |
# used. These include: | |
# | |
# Account Detection Rate (ADR): what fraction of the fraud accounts received a score above the | |
# threshold at some point after the fraud started | |
# Value Detection Rate (VDR): From the point in time that the fist transaction scored above | |
# the threshold, what fraction of the losses could have been prevented. | |
# | |
# This code specifically calculates the effect of delayed action: it will calculate VDR for various scenarios, | |
# each assuming a specified delay in taking action after the transaction took place. | |
# | |
# Expected data example: | |
# accountID,Timestamp,Amount,Label,Score | |
# Acct01,8/11/2016 12:44:46 PM,0,0,0.009517137 | |
# Acct01,8/11/2016 2:56:51 PM,10,1,0.0080635994 | |
# Acct01,8/11/2016 4:02:01 PM,12.5,1,0.00806359 | |
# Acct02,8/11/2016 4:05:11 PM,2090,0,0.00806359 | |
# Acct02,9/18/2016 8:47:18 AM,100,0,0.009517137 | |
# ... | |
# | |
# Typical use in AzureML Execute R module: | |
# | |
# source('src/fraudEval/fraudEval.R') | |
# trx <- maml.mapInputPort(1) | |
# results <- evaluatePerf(trx) | |
# maml.mapOutputPort("results") | |
# | |
# | |
# Jacob Spoelstra | |
library(tidyr) | |
library(dplyr) | |
library(ggplot2) | |
library(lubridate) | |
evaluatePerf <- function(trx, sample.nonfraud = 0.1, scorebins = 500){ | |
# Parameters | |
# sample.nonfraud = 0.1 # Sample rate for non-fraud accounts. Assuming that Frauds are not sampled | |
# scorebins = 500 # Granularity of perfromance tables | |
delay.range = c(0, 1, 5, 10, 15) # Delays (in minutes) between event and action to simulate. Fixed for now | |
trx <- trx %>% | |
rename(ts = Timestamp) | |
# By account, enforce monotonically rising scores | |
# Select only fraud transactions | |
message('Extracting fraud transactions') | |
# All trx on fraud accounts | |
frd_a <- trx %>% | |
group_by(accountID) %>% | |
filter(max(Label) > 0.5) %>% | |
arrange(accountID, ts) %>% | |
mutate(isfirst = rank(ts)==1) | |
# Only fraud trx, enforce monitonically rising score | |
frd_f <- frd_a %>% | |
group_by(accountID) %>% | |
filter(Label > 0.5) %>% | |
arrange(accountID, ts) %>% | |
mutate(hiScr = floor(cummax(Score) * scorebins)) | |
# For non-fraud trx, set amount to 0 to not influence VDR calc. | |
frd_n <- frd_a %>% | |
group_by(accountID) %>% | |
filter(Label < 0.5) %>% | |
mutate(hiScr = floor(Score * scorebins), | |
Amount = 0) | |
# combine and sort | |
frd <- frd_f %>% | |
rbind(frd_n) %>% | |
arrange(accountID, ts) | |
# Need this to cover scores that don't occur in data | |
scrref <- data_frame(dscore = (scorebins-1):0) | |
message("Calculating performance based on different delays") | |
results1 <- scrref | |
for(delay.minutes in delay.range){ | |
message("Working on delay of ", delay.minutes, " minutes") | |
frd$dscore <- 0 | |
delay <- make_difftime(minute = delay.minutes) | |
# by account, Update scores based on delay | |
for(r in 1:dim(frd)[1]){ | |
if(frd$isfirst[r]){ | |
last.ts <- frd$ts[r] | |
last.score <- frd$hiScr[r] | |
next | |
} | |
# find most recent entry older than delay | |
score.effective <- last.score[last.ts <= frd$ts[r] - delay] | |
if(length(score.effective)){ | |
frd$dscore[r] <- tail(score.effective, 1) | |
} | |
last.ts <- append(last.ts, frd$ts[r]) | |
last.score <- append(last.score, frd$hiScr[r]) | |
} | |
# Calculate $-weighted recall | |
perf <- frd %>% | |
group_by(dscore) %>% | |
summarise(v = sum(Amount)) %>% | |
right_join(scrref, by = 'dscore') %>% | |
mutate(v = ifelse(is.na(v), 0, v)) %>% | |
arrange(desc(dscore)) %>% | |
mutate(vdr = cumsum(v)/sum(v)) | |
# Save in results frame | |
cname.vdr <- paste0("vdr", delay.minutes) | |
results1[cname.vdr] <- perf$vdr | |
} | |
# calculate ADR | |
perf1 <- frd %>% | |
group_by(accountID) %>% | |
summarize(dscore = floor(max(Score)*scorebins)) %>% | |
group_by(dscore) %>% | |
summarise(fa = n()) %>% | |
right_join(scrref, by = 'dscore') %>% | |
mutate(fa = ifelse(is.na(fa), 0, fa)) %>% | |
arrange(desc(dscore)) %>% | |
mutate(nf = cumsum(fa)) | |
# Calculate approximate nr false-positives by day | |
# Filter out all non-fraud accounts | |
message("Extracting non-fraud transactions") | |
nfrd <- trx %>% | |
group_by(accountID) %>% | |
filter(max(Label) < 0.5) %>% | |
arrange(accountID, ts) %>% | |
mutate(day = yday(ts)) %>% | |
group_by(accountID, day) %>% | |
summarize(dscore = floor(max(Score)*scorebins)) | |
message("Calculating AFPR") | |
perf2 <- nfrd %>% | |
group_by(dscore) %>% | |
summarise(nfa = n()/sample.nonfraud) %>% | |
right_join(perf1, by = 'dscore') %>% | |
mutate(nfa = ifelse(is.na(nfa), 0, nfa)) %>% | |
arrange(desc(dscore)) %>% | |
mutate(nnf = cumsum(nfa), | |
AFPR = nnf/nf, | |
ADR = nf/sum(fa), | |
volume = nnf/sum(nfa)) %>% | |
select(-fa, -nfa) | |
message("Calculating Cases per day") | |
nr.of.days = as.integer(max(trx$ts) - min(trx$ts)) | |
results <- results1 %>% | |
left_join(perf2, by = "dscore") %>% | |
mutate(dscore = dscore/scorebins, | |
volperday= (nf+nnf)/nr.of.days) %>% | |
rename(threshold = dscore) | |
message("Done! See 'results'") | |
# plots | |
plt.theme <- theme(axis.text=element_text(size=14), | |
axis.title=element_text(size=14,face="bold"), | |
plot.title=element_text(size=16, face="bold"), | |
legend.title=element_text(size=14, face="bold"), | |
legend.text=element_text(size=12)) | |
afpr.max = 20 | |
score.min <- (results %>% | |
filter(AFPR<=afpr.max) %>% | |
arrange(threshold))[['threshold']][1] | |
volperday.max <- (results %>% | |
filter(AFPR<=afpr.max) %>% | |
arrange(threshold))[['volperday']][1] | |
# ADR-AFPR | |
print(ggplot(results) + | |
geom_line(aes(AFPR, ADR), size=2, colour='darkblue') + | |
plt.theme + | |
ggtitle("Account-Level Fraud Detection") + | |
xlab("Account False-Positive Ratio") + | |
ylab("Account Detection Rate") + | |
xlim(0,afpr.max)) | |
# VDR-AFPR | |
print(ggplot(results %>% | |
gather(vdr0, vdr1, vdr5, vdr10, vdr15, key='delay', value='VDR')) + | |
geom_line(aes(x=AFPR, y=VDR, color=delay), size=2) + | |
plt.theme + | |
ggtitle("Potential Loss Prevention") + | |
xlab("Account False-Positive Ratio") + | |
ylab("Value Detection Rate") + | |
xlim(0,afpr.max) + | |
scale_colour_discrete(breaks=c('vdr0', 'vdr1', 'vdr5', 'vdr10', 'vdr15'), | |
labels=paste(c('0', '1', '5', '10', '15'), "min")) | |
) | |
# Cases per day | |
print(ggplot(results) + | |
geom_line(aes(threshold, volperday), size=2, colour='darkblue') + | |
plt.theme + | |
xlab("Score Threshold") + | |
ylab("Case Volume Per Day") + | |
ggtitle("Score Distribution") + | |
xlim(1, score.min) + | |
ylim(0, volperday.max * 1.2) | |
) | |
# Score-AFPR | |
print(ggplot(results) + | |
geom_line(aes(threshold, AFPR), size=2, colour='darkblue') + | |
plt.theme + | |
xlab("Score Threshold") + | |
ylab("Account False-Positive Ratio") + | |
ggtitle("Score Calibration") + | |
xlim(1, score.min) + | |
ylim(0, afpr.max * 1.2) | |
) | |
# Score-VDR | |
print(ggplot(results %>% | |
gather(vdr0, vdr1, vdr5, vdr10, vdr15, key='delay', value='VDR')) + | |
geom_line(aes(x=threshold, y=VDR, color=delay), size=2) + | |
plt.theme + | |
ggtitle("Potential Loss Prevention") + | |
xlab("Score Threshold") + | |
ylab("Value Detection Rate") + | |
xlim(1, score.min) + | |
scale_colour_discrete(breaks=c('vdr0', 'vdr1', 'vdr5', 'vdr10', 'vdr15'), | |
labels=paste(c('0', '1', '5', '10', '15'), "min")) | |
) | |
return(results) | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Error in seq.default(from = best$lmin, to = best$lmax, by = best$lstep) :
'from' must be of length 1
In addition: Warning messages:
1: Removed 498 rows containing missing values (geom_path).
2: Removed 2490 rows containing missing values (geom_path).
Can you please share data that goes into this code?