statcompute/do_arb_bin.R

## do_arb_bin.R
arb_bin <- function(data, y, x) {
  yname <- deparse(substitute(y))
  xname <- deparse(substitute(x))
  df1 <- subset(data, !is.na(data[[xname]]) & data[[yname]] %in% c(0, 1), select = c(xname, yname))
  df2 <- data.frame(y = df1[[yname]], x = df1[[xname]])
  spc <- cor(df2[, 2], df2[, 1], method = "spearman", use = "complete.obs")
  mdl <- Rborist::Rborist(as.matrix(df2$x), df2$y, noValidate = T, nTree = 1, regMono = spc / abs(spc),
                          ctgCensus = "prob", minInfo = exp(-100), nSamp = nrow(df2) , withRepl = F)
  df3 <- data.frame(y = df2$y, x = df2$x, yhat = predict(mdl, newdata = as.matrix(df2$x), ctgCensus = "prob")$yPred)
  df4 <- Reduce(rbind,
           lapply(split(df3, df3$yhat),
             function(x) data.frame(maxx = max(x$x), yavg = mean(x$y), yhat = round(mean(x$yhat), 8))))
  df5 <- df4[order(df4$maxx), ]
  h <- ifelse(df5[["yavg"]][1] %in% c(0, 1), 2, 1)
  t <- ifelse(df5[["yavg"]][nrow(df5)] %in% c(0, 1), 2, 1)
  cuts <- df5$maxx[h:max(h, (nrow(df5) - t))]
  return(list(df   = manual_bin(data, yname, xname, cuts = cuts),
              cuts = cuts))
}

arb_bin(df, bad, rev_util)
# bin                           rule freq   dist mv_cnt bad_freq bad_rate     woe     iv      ks
#  01                       $X <= 24 2653 0.4545      0      414   0.1560 -0.3320 0.0452 13.6285
#  02             $X > 24 & $X <= 36  597 0.1023      0       96   0.1608 -0.2963 0.0082 16.3969
#  03             $X > 36 & $X <= 40  182 0.0312      0       32   0.1758 -0.1890 0.0011 16.9533
#  04             $X > 40 & $X <= 58  669 0.1146      0      137   0.2048 -0.0007 0.0000 16.9615
#  05             $X > 58 & $X <= 60   77 0.0132      0       16   0.2078  0.0177 0.0000 16.9381
#  06             $X > 60 & $X <= 72  408 0.0699      0       95   0.2328  0.1636 0.0020 15.7392
#  07             $X > 72 & $X <= 73   34 0.0058      0        8   0.2353  0.1773 0.0002 15.6305
#  08             $X > 73 & $X <= 75   62 0.0106      0       16   0.2581  0.2999 0.0010 15.2839
#  09             $X > 75 & $X <= 83  246 0.0421      0       70   0.2846  0.4340 0.0089 13.2233
#  10             $X > 83 & $X <= 96  376 0.0644      0      116   0.3085  0.5489 0.0225  9.1266
#  11             $X > 96 & $X <= 98   50 0.0086      0       17   0.3400  0.6927 0.0049  8.4162
#  12                        $X > 98  483 0.0827      0      179   0.3706  0.8263 0.0695  0.0000

arb_bin(df, bad, tot_derog)
# bin                           rule freq   dist mv_cnt bad_freq bad_rate     woe     iv      ks
#  00                      is.na($X)  213 0.0365    213       70   0.3286  0.6416 0.0178  2.7716
#  01                        $X <= 0 2850 0.4883      0      367   0.1288 -0.5559 0.1268 20.0442
#  02               $X > 0 & $X <= 1  891 0.1526      0      193   0.2166  0.0704 0.0008 18.9469
#  03               $X > 1 & $X <= 2  478 0.0819      0      121   0.2531  0.2740 0.0066 16.5222
#  04               $X > 2 & $X <= 3  332 0.0569      0       86   0.2590  0.3050 0.0058 14.6321
#  05              $X > 3 & $X <= 23 1064 0.1823      0      353   0.3318  0.6557 0.0931  0.4370
#  06                        $X > 23    9 0.0015      0        6   0.6667  2.0491 0.0090  0.0000
	arb_bin <- function(data, y, x) {
	yname <- deparse(substitute(y))
	xname <- deparse(substitute(x))
	df1 <- subset(data, !is.na(data[[xname]]) & data[[yname]] %in% c(0, 1), select = c(xname, yname))
	df2 <- data.frame(y = df1[[yname]], x = df1[[xname]])
	spc <- cor(df2[, 2], df2[, 1], method = "spearman", use = "complete.obs")
	mdl <- Rborist::Rborist(as.matrix(df2$x), df2$y, noValidate = T, nTree = 1, regMono = spc / abs(spc),
	ctgCensus = "prob", minInfo = exp(-100), nSamp = nrow(df2) , withRepl = F)
	df3 <- data.frame(y = df2$y, x = df2$x, yhat = predict(mdl, newdata = as.matrix(df2$x), ctgCensus = "prob")$yPred)
	df4 <- Reduce(rbind,
	lapply(split(df3, df3$yhat),
	function(x) data.frame(maxx = max(x$x), yavg = mean(x$y), yhat = round(mean(x$yhat), 8))))
	df5 <- df4[order(df4$maxx), ]
	h <- ifelse(df5[["yavg"]][1] %in% c(0, 1), 2, 1)
	t <- ifelse(df5[["yavg"]][nrow(df5)] %in% c(0, 1), 2, 1)
	cuts <- df5$maxx[h:max(h, (nrow(df5) - t))]
	return(list(df = manual_bin(data, yname, xname, cuts = cuts),
	cuts = cuts))
	}

	arb_bin(df, bad, rev_util)
	# bin rule freq dist mv_cnt bad_freq bad_rate woe iv ks
	# 01 $X <= 24 2653 0.4545 0 414 0.1560 -0.3320 0.0452 13.6285
	# 02 $X > 24 & $X <= 36 597 0.1023 0 96 0.1608 -0.2963 0.0082 16.3969
	# 03 $X > 36 & $X <= 40 182 0.0312 0 32 0.1758 -0.1890 0.0011 16.9533
	# 04 $X > 40 & $X <= 58 669 0.1146 0 137 0.2048 -0.0007 0.0000 16.9615
	# 05 $X > 58 & $X <= 60 77 0.0132 0 16 0.2078 0.0177 0.0000 16.9381
	# 06 $X > 60 & $X <= 72 408 0.0699 0 95 0.2328 0.1636 0.0020 15.7392
	# 07 $X > 72 & $X <= 73 34 0.0058 0 8 0.2353 0.1773 0.0002 15.6305
	# 08 $X > 73 & $X <= 75 62 0.0106 0 16 0.2581 0.2999 0.0010 15.2839
	# 09 $X > 75 & $X <= 83 246 0.0421 0 70 0.2846 0.4340 0.0089 13.2233
	# 10 $X > 83 & $X <= 96 376 0.0644 0 116 0.3085 0.5489 0.0225 9.1266
	# 11 $X > 96 & $X <= 98 50 0.0086 0 17 0.3400 0.6927 0.0049 8.4162
	# 12 $X > 98 483 0.0827 0 179 0.3706 0.8263 0.0695 0.0000

	arb_bin(df, bad, tot_derog)
	# bin rule freq dist mv_cnt bad_freq bad_rate woe iv ks
	# 00 is.na($X) 213 0.0365 213 70 0.3286 0.6416 0.0178 2.7716
	# 01 $X <= 0 2850 0.4883 0 367 0.1288 -0.5559 0.1268 20.0442
	# 02 $X > 0 & $X <= 1 891 0.1526 0 193 0.2166 0.0704 0.0008 18.9469
	# 03 $X > 1 & $X <= 2 478 0.0819 0 121 0.2531 0.2740 0.0066 16.5222
	# 04 $X > 2 & $X <= 3 332 0.0569 0 86 0.2590 0.3050 0.0058 14.6321
	# 05 $X > 3 & $X <= 23 1064 0.1823 0 353 0.3318 0.6557 0.0931 0.4370
	# 06 $X > 23 9 0.0015 0 6 0.6667 2.0491 0.0090 0.0000