Created
July 9, 2019 04:17
-
-
Save statcompute/fb7f5637238aeae04e3448791449de04 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
arb_bin <- function(data, y, x) { | |
yname <- deparse(substitute(y)) | |
xname <- deparse(substitute(x)) | |
df1 <- subset(data, !is.na(data[[xname]]) & data[[yname]] %in% c(0, 1), select = c(xname, yname)) | |
df2 <- data.frame(y = df1[[yname]], x = df1[[xname]]) | |
spc <- cor(df2[, 2], df2[, 1], method = "spearman", use = "complete.obs") | |
mdl <- Rborist::Rborist(as.matrix(df2$x), df2$y, noValidate = T, nTree = 1, regMono = spc / abs(spc), | |
ctgCensus = "prob", minInfo = exp(-100), nSamp = nrow(df2) , withRepl = F) | |
df3 <- data.frame(y = df2$y, x = df2$x, yhat = predict(mdl, newdata = as.matrix(df2$x), ctgCensus = "prob")$yPred) | |
df4 <- Reduce(rbind, | |
lapply(split(df3, df3$yhat), | |
function(x) data.frame(maxx = max(x$x), yavg = mean(x$y), yhat = round(mean(x$yhat), 8)))) | |
df5 <- df4[order(df4$maxx), ] | |
h <- ifelse(df5[["yavg"]][1] %in% c(0, 1), 2, 1) | |
t <- ifelse(df5[["yavg"]][nrow(df5)] %in% c(0, 1), 2, 1) | |
cuts <- df5$maxx[h:max(h, (nrow(df5) - t))] | |
return(list(df = manual_bin(data, yname, xname, cuts = cuts), | |
cuts = cuts)) | |
} | |
arb_bin(df, bad, rev_util) | |
# bin rule freq dist mv_cnt bad_freq bad_rate woe iv ks | |
# 01 $X <= 24 2653 0.4545 0 414 0.1560 -0.3320 0.0452 13.6285 | |
# 02 $X > 24 & $X <= 36 597 0.1023 0 96 0.1608 -0.2963 0.0082 16.3969 | |
# 03 $X > 36 & $X <= 40 182 0.0312 0 32 0.1758 -0.1890 0.0011 16.9533 | |
# 04 $X > 40 & $X <= 58 669 0.1146 0 137 0.2048 -0.0007 0.0000 16.9615 | |
# 05 $X > 58 & $X <= 60 77 0.0132 0 16 0.2078 0.0177 0.0000 16.9381 | |
# 06 $X > 60 & $X <= 72 408 0.0699 0 95 0.2328 0.1636 0.0020 15.7392 | |
# 07 $X > 72 & $X <= 73 34 0.0058 0 8 0.2353 0.1773 0.0002 15.6305 | |
# 08 $X > 73 & $X <= 75 62 0.0106 0 16 0.2581 0.2999 0.0010 15.2839 | |
# 09 $X > 75 & $X <= 83 246 0.0421 0 70 0.2846 0.4340 0.0089 13.2233 | |
# 10 $X > 83 & $X <= 96 376 0.0644 0 116 0.3085 0.5489 0.0225 9.1266 | |
# 11 $X > 96 & $X <= 98 50 0.0086 0 17 0.3400 0.6927 0.0049 8.4162 | |
# 12 $X > 98 483 0.0827 0 179 0.3706 0.8263 0.0695 0.0000 | |
arb_bin(df, bad, tot_derog) | |
# bin rule freq dist mv_cnt bad_freq bad_rate woe iv ks | |
# 00 is.na($X) 213 0.0365 213 70 0.3286 0.6416 0.0178 2.7716 | |
# 01 $X <= 0 2850 0.4883 0 367 0.1288 -0.5559 0.1268 20.0442 | |
# 02 $X > 0 & $X <= 1 891 0.1526 0 193 0.2166 0.0704 0.0008 18.9469 | |
# 03 $X > 1 & $X <= 2 478 0.0819 0 121 0.2531 0.2740 0.0066 16.5222 | |
# 04 $X > 2 & $X <= 3 332 0.0569 0 86 0.2590 0.3050 0.0058 14.6321 | |
# 05 $X > 3 & $X <= 23 1064 0.1823 0 353 0.3318 0.6557 0.0931 0.4370 | |
# 06 $X > 23 9 0.0015 0 6 0.6667 2.0491 0.0090 0.0000 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment