Skip to content

Instantly share code, notes, and snippets.

@statcompute
Last active November 25, 2018 18:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save statcompute/79c9c62938d488b86dee284ecdf07b5f to your computer and use it in GitHub Desktop.
Save statcompute/79c9c62938d488b86dee284ecdf07b5f to your computer and use it in GitHub Desktop.
use bumping to improve monotonic binning
bump_bin <- function(data, y, x, n) {
n1 <- 50
n2 <- 10
set.seed(2019)
seeds <- c(0, round(runif(n) * as.numeric(paste('1e', ceiling(log10(n)) + 2, sep = '')), 0))
yname <- deparse(substitute(y))
xname <- deparse(substitute(x))
df1 <- data[, c(yname, xname)]
df2 <- df1[!is.na(df1[, xname]), c(xname, yname)]
cor <- cor(df2[, 2], df2[, 1], method = "spearman", use = "complete.obs")
### MONOTONIC BINNING WITH BOOTSTRAP SAMPLES ###
bin <- function(seed) {
if (seed == 0) {
smp <- df2
}
else {
set.seed(seed)
smp <- df2[sample(seq(nrow(df2)), nrow(df2), replace = T), ]
}
reg <- isoreg(smp[, 1], cor / abs(cor) * smp[, 2])
cut <- knots(as.stepfun(reg))
df2$cut <- cut(df2[[xname]], breaks = unique(cut), include.lowest = T)
df3 <- Reduce(rbind,
lapply(split(df2, df2$cut),
function(x) data.frame(n = nrow(x), b = sum(x[[yname]]), g = sum(1 - x[[yname]]),
maxx = max(x[[xname]]), minx = min(x[[xname]]))))
df4 <- df3[which(df3[["n"]] > n1 & df3[["b"]] > n2 & df3[["g"]] > n2), ]
df2$good <- 1 - df2[[yname]]
out <- smbinning::smbinning.custom(df2, "good", xname, cuts = df4$maxx[-nrow(df4)])$ivtable
return(data.frame(iv = out$IV[length(out$IV)], nbin = nrow(out) - 2,
cuts = I(list(df4$maxx[-nrow(df4)])),
abs_cor = abs(cor(as.numeric(row.names(out)[1:(nrow(out) - 2)]),
out$WoE[1:(nrow(out) - 2)], method = "spearman"))))
}
bump_out <- Reduce(rbind, parallel::mclapply(seeds, mc.cores = parallel::detectCores(), bin))
### FIND THE CUT MAXIMIZING THE INFORMATION VALUE ###
cut2 <- bump_out[order(-bump_out["abs_cor"], -bump_out["iv"], bump_out["nbin"]), ]$cuts[[1]]
df1$good <- 1 - df1[[yname]]
return(smbinning::smbinning.custom(df1, "good", xname, cuts = cut2)$ivtable)
}
df <- sas7bdat::read.sas7bdat("Downloads/accepts.sas7bdat")
bump_bin(df, bad, bureau_score, n = 200)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment