Skip to content

Instantly share code, notes, and snippets.

@statcompute
Created July 9, 2019 04:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save statcompute/fb7f5637238aeae04e3448791449de04 to your computer and use it in GitHub Desktop.
Save statcompute/fb7f5637238aeae04e3448791449de04 to your computer and use it in GitHub Desktop.
arb_bin <- function(data, y, x) {
yname <- deparse(substitute(y))
xname <- deparse(substitute(x))
df1 <- subset(data, !is.na(data[[xname]]) & data[[yname]] %in% c(0, 1), select = c(xname, yname))
df2 <- data.frame(y = df1[[yname]], x = df1[[xname]])
spc <- cor(df2[, 2], df2[, 1], method = "spearman", use = "complete.obs")
mdl <- Rborist::Rborist(as.matrix(df2$x), df2$y, noValidate = T, nTree = 1, regMono = spc / abs(spc),
ctgCensus = "prob", minInfo = exp(-100), nSamp = nrow(df2) , withRepl = F)
df3 <- data.frame(y = df2$y, x = df2$x, yhat = predict(mdl, newdata = as.matrix(df2$x), ctgCensus = "prob")$yPred)
df4 <- Reduce(rbind,
lapply(split(df3, df3$yhat),
function(x) data.frame(maxx = max(x$x), yavg = mean(x$y), yhat = round(mean(x$yhat), 8))))
df5 <- df4[order(df4$maxx), ]
h <- ifelse(df5[["yavg"]][1] %in% c(0, 1), 2, 1)
t <- ifelse(df5[["yavg"]][nrow(df5)] %in% c(0, 1), 2, 1)
cuts <- df5$maxx[h:max(h, (nrow(df5) - t))]
return(list(df = manual_bin(data, yname, xname, cuts = cuts),
cuts = cuts))
}
arb_bin(df, bad, rev_util)
# bin rule freq dist mv_cnt bad_freq bad_rate woe iv ks
# 01 $X <= 24 2653 0.4545 0 414 0.1560 -0.3320 0.0452 13.6285
# 02 $X > 24 & $X <= 36 597 0.1023 0 96 0.1608 -0.2963 0.0082 16.3969
# 03 $X > 36 & $X <= 40 182 0.0312 0 32 0.1758 -0.1890 0.0011 16.9533
# 04 $X > 40 & $X <= 58 669 0.1146 0 137 0.2048 -0.0007 0.0000 16.9615
# 05 $X > 58 & $X <= 60 77 0.0132 0 16 0.2078 0.0177 0.0000 16.9381
# 06 $X > 60 & $X <= 72 408 0.0699 0 95 0.2328 0.1636 0.0020 15.7392
# 07 $X > 72 & $X <= 73 34 0.0058 0 8 0.2353 0.1773 0.0002 15.6305
# 08 $X > 73 & $X <= 75 62 0.0106 0 16 0.2581 0.2999 0.0010 15.2839
# 09 $X > 75 & $X <= 83 246 0.0421 0 70 0.2846 0.4340 0.0089 13.2233
# 10 $X > 83 & $X <= 96 376 0.0644 0 116 0.3085 0.5489 0.0225 9.1266
# 11 $X > 96 & $X <= 98 50 0.0086 0 17 0.3400 0.6927 0.0049 8.4162
# 12 $X > 98 483 0.0827 0 179 0.3706 0.8263 0.0695 0.0000
arb_bin(df, bad, tot_derog)
# bin rule freq dist mv_cnt bad_freq bad_rate woe iv ks
# 00 is.na($X) 213 0.0365 213 70 0.3286 0.6416 0.0178 2.7716
# 01 $X <= 0 2850 0.4883 0 367 0.1288 -0.5559 0.1268 20.0442
# 02 $X > 0 & $X <= 1 891 0.1526 0 193 0.2166 0.0704 0.0008 18.9469
# 03 $X > 1 & $X <= 2 478 0.0819 0 121 0.2531 0.2740 0.0066 16.5222
# 04 $X > 2 & $X <= 3 332 0.0569 0 86 0.2590 0.3050 0.0058 14.6321
# 05 $X > 3 & $X <= 23 1064 0.1823 0 353 0.3318 0.6557 0.0931 0.4370
# 06 $X > 23 9 0.0015 0 6 0.6667 2.0491 0.0090 0.0000
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment