# WenSui Liu statcompute

## use_grnn.R
# Load the modelling data and the two helper scripts used by this snippet.
df <- readRDS("df.rds")
source("mob.R")
source("grnnet.R")

# PRE-PROCESS THE DATA WITH MOB PACKAGE
# batch_bin() is not defined in this file (presumably it comes from mob.R);
# the meaning of its second argument (3) should be confirmed there.
bin_out <- batch_bin(df, 3)
# Print the BinSum table with its rows sorted by descending iv.
bin_out$BinSum[order(-bin_out$BinSum$iv), ]
#            var nbin unique miss min  median     max      ks     iv
#   bureau_score   34    315  315 443   692.5     848 35.2651 0.8357
#   tot_rev_line   20   3617  477   0 10573.0  205395 26.8943 0.4442

## dummies.R
# Load the modelling data and the binning helpers.
df <- readRDS("df.rds")
source("mob.R")
# batch_bin() is not defined in this file (presumably it comes from mob.R);
# the meaning of its second argument (3) should be confirmed there.
bin_out <- batch_bin(df, 3)
# Print the BinSum table with its rows sorted by descending iv.
bin_out$BinSum[order(-bin_out$BinSum$iv), ]
#           var nbin unique miss min  median     max      ks     iv
#  bureau_score   34    315  315 443   692.5     848 35.2651 0.8357
#  tot_rev_line   20   3617  477   0 10573.0  205395 26.8943 0.4442
# age_oldest_tr   25    460  216   1   137.0     588 20.3646 0.2714
#     tot_derog    7     29  213   0     0.0      32 20.0442 0.2599
#           ltv   17    145    1   0   100.0     176 16.8807 0.1911

## wt_grnn.R
# Make the Boston data set from the MASS package available.
data("Boston", package = "MASS")

### BUILD THE UNWEIGHTED DATA.FRAME: FIVE COPIES OF ROWS 1-100 STACKED
### ON TOP OF THE FULL DATA, SO THOSE ROWS APPEAR AS DUPLICATES
df1 <- rbind(Boston[rep(seq_len(100), times = 5), ], Boston)
nrow(df1)
# 1006
# Standardised columns 1-13 are the inputs; column 14 is the response.
X1 <- scale(df1[, seq_len(13)])
Y1 <- df1[[14]]
# grnn.fit() is not defined in this snippet (presumably it comes from
# grnnet.R, which is sourced in other snippets).
N1 <- grnn.fit(X1, Y1)

## do_arb_bin.R
# arb_bin(data, y, x): y and x are given as unquoted column names of data.
# Only the first part of the body is visible in this excerpt; it prepares the
# data, fits a one-tree Rborist model of y on x and collects its predictions.
# NOTE(review): the definition is cut off inside the Reduce() call below, so
# the return value cannot be documented from here.
arb_bin <- function(data, y, x) {
  # Turn the unquoted y / x arguments into column-name strings.
  yname <- deparse(substitute(y))
  xname <- deparse(substitute(x))
  # Keep rows where x is not NA and y is 0 or 1; keep only the x and y columns.
  df1 <- subset(data, !is.na(data[[xname]]) & data[[yname]] %in% c(0, 1), select = c(xname, yname))
  # Copy into a frame with the fixed column names y and x.
  df2 <- data.frame(y = df1[[yname]], x = df1[[xname]])
  # Spearman rank correlation between x and y.
  spc <- cor(df2[, 2], df2[, 1], method = "spearman", use = "complete.obs")
  # spc / abs(spc) is 1 or -1 according to the sign of spc (NaN when spc is 0)
  # and is passed as regMono -- presumably a monotonicity constraint; confirm
  # against the Rborist documentation.
  # NOTE(review): T and F are ordinary reassignable variables; TRUE / FALSE
  # would be safer here.
  mdl <- Rborist::Rborist(as.matrix(df2$x), df2$y, noValidate = T, nTree = 1, regMono = spc / abs(spc),
                          ctgCensus = "prob", minInfo = exp(-100), nSamp = nrow(df2) , withRepl = F)
  # Attach the model's predictions for the same x values as yhat.
  df3 <- data.frame(y = df2$y, x = df2$x, yhat = predict(mdl, newdata = as.matrix(df2$x), ctgCensus = "prob")$yPred)
  # NOTE(review): the source is truncated in the middle of this expression.
  df4 <- Reduce(rbind,

## ca_glm.R
# Read the credit-count data, then keep the rows whose CARDHLDR equals 1
# (which() also discards rows where the comparison is NA).
df1 <- read.csv(file = "/mnt/d/projects/data/credit_count.txt")
df2 <- df1[which(df1[["CARDHLDR"]] == 1), ]

# ca_glm(fml, data, family, nchunk): fits glm(fml, family = family) separately
# on chunks of the rows of data, using a PSOCK cluster of nchunk workers.
# NOTE(review): the definition is cut off after the `est` assignment in this
# excerpt, so how the per-chunk estimates are combined and what is returned
# cannot be documented from here.
ca_glm <- function(fml, data, family, nchunk) {
  # Start nchunk worker processes.
  cls <- parallel::makeCluster(nchunk, type = "PSOCK")
  # Split the row indices across the workers; each worker returns its rows.
  df1 <- parallel::parLapplyLB(cls, parallel::clusterSplit(cls, seq(nrow(data))),
                              function(c_) data[c_,])
  # Copy fml, family and data from this call's frame to every worker.
  parallel::clusterExport(cls, c("fml", "family", "data"), envir = environment())
  # Per chunk: the first two columns of the glm coefficient table, plus a
  # third column holding the chunk's share of the total row count.
  est <- parallel::parLapplyLB(cls, df1,
                               function(d_) cbind(coef(summary(glm(fml, data = d_, family = family)))[, 1:2], nrow(d_) / nrow(data)))
  # NOTE(review): no parallel::stopCluster(cls) is visible in this excerpt;
  # confirm the cluster is released in the part of the function not shown.

## use_qtl_lgd.R
# Example call; qtl_lgd() is defined outside this excerpt. lgd and ltv are
# passed unquoted. The console output pasted below shows a $df table with one
# row per bin, each described by a rule on $X.
qtl_lgd(df, lgd, ltv)
#$df
#     bin                                   rule freq   dist mv_cnt mean_y     woe     iv      ks
# 1    01                     $X <= 0.2442486803  320 0.1257      0 0.0948 -1.0370 0.0987  9.5173
# 2    02 $X > 0.2442486803 & $X <= 0.3994659888  318 0.1250      0 0.0994 -0.9850 0.0900 18.6516
# 3    03 $X > 0.3994659888 & $X <= 0.5314432946  318 0.1250      0 0.1265 -0.7135 0.0515 25.8646
# 4    04 $X > 0.5314432946 & $X <= 0.6594855396  318 0.1250      0 0.1283 -0.6974 0.0494 32.9504
# 5    05 $X > 0.6594855396 & $X <= 0.7917383883  318 0.1250      0 0.1769 -0.3182 0.0116 36.5819
# 6    06 $X > 0.7917383883 & $X <= 0.9243704807  320 0.1257      0 0.2788  0.2683 0.0097 32.9670
# 7    07 $X > 0.9243704807 & $X <= 1.0800711662  317 0.1246      0 0.4028  0.8251 0.1020 20.6104

## use_wtwoe.R
# Load the accepts data; the pasted output below shows it includes a
# `weight` column.
df <- readRDS("archive/accepts.rds")

# Show the first row to illustrate the column layout.
head(df, 1)
# bankruptcy bad app_id tot_derog tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt tot_rev_line rev_util bureau_score purch_price  msrp
#          0   0   1001         6      7            46         NaN        NaN          NaN          NaN        0          747       19678 17160
# down_pyt purpose loan_term loan_amt ltv tot_income used_ind weight
#   947.15   LEASE        36 18730.85 109       4800        0   4.75

### BY ITERATIVE PARTITION ###
# Load the helper script; its contents are not visible in this excerpt.
source("wqtl_bin.R")

## use_woe.R
# Load the modelling data.
df <- readRDS("df.rds")
### SHOWING THE RESPONSE IN THE LAST COLUMN ###
head(df, 2)
#tot_derog tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt tot_rev_line rev_util bureau_score ltv tot_income bad
#        6      7            46         NaN        NaN          NaN          NaN        0          747 109    4800.00   0
#        0     21           153           6          1           97         4637        2          744  97    5833.33   0

# batch_bin() is not defined in this file (presumably it comes from mob.R);
# the meaning of its second argument (3) should be confirmed there.
source("mob.R")
bin_out <- batch_bin(df, 3)
# Print the BinSum element of the result.
bin_out$BinSum

## wqtl_out.R
# Example call: append a weight column w (1 where bad == 1, otherwise 5) to df
# and pass bad, tot_derog and w unquoted to wqtl_bin(), which is defined
# outside this excerpt (presumably in wqtl_bin.R).
wqtl_bin(cbind(df, w = ifelse(df$bad == 1, 1, 5)), bad, tot_derog, w)
#$df
#    bin                           rule  cnt  freq   dist mv_wt bad_freq bad_rate     woe     iv      ks
#1    00                      is.na($X)  213   785 0.0322   785       70   0.0892  0.6416 0.0178  2.7716
#2    01                        $X <= 1 3741 16465 0.6748     0      560   0.0340 -0.3811 0.0828 18.9469
#3    02               $X > 1 & $X <= 2  478  1906 0.0781     0      121   0.0635  0.2740 0.0066 16.5222
#4    03               $X > 2 & $X <= 4  587  2231 0.0914     0      176   0.0789  0.5078 0.0298 10.6623
#5    04                         $X > 4  818  3014 0.1235     0      269   0.0893  0.6426 0.0685  0.0000
#$cuts
#[1] 1 2 4

## wts_bin.R
# Call qtl_bin() (defined outside this excerpt) with bad and tot_derog passed
# unquoted, and keep the result.
derog_bin <- qtl_bin(df, bad, tot_derog)

# Print the result; the console output is pasted below.
derog_bin
#$df
#   bin                           rule freq   dist mv_cnt bad_freq bad_rate     woe     iv      ks
#    00                      is.na($X)  213 0.0365    213       70   0.3286  0.6416 0.0178  2.7716
#    01                        $X <= 1 3741 0.6409      0      560   0.1497 -0.3811 0.0828 18.9469
#    02               $X > 1 & $X <= 2  478 0.0819      0      121   0.2531  0.2740 0.0066 16.5222
#    03               $X > 2 & $X <= 4  587 0.1006      0      176   0.2998  0.5078 0.0298 10.6623
#    04                         $X > 4  818 0.1401      0      269   0.3289  0.6426 0.0685  0.0000
	# Load the modelling data and the two helper scripts used by this snippet.
	df <- readRDS("df.rds")
	source("mob.R")
	source("grnnet.R")

	# PRE-PROCESS THE DATA WITH MOB PACKAGE
	# batch_bin() is not defined in this file (presumably it comes from mob.R);
	# the meaning of its second argument (3) should be confirmed there.
	bin_out <- batch_bin(df, 3)
	# Print the BinSum table with its rows sorted by descending iv.
	bin_out$BinSum[order(-bin_out$BinSum$iv), ]
	# var nbin unique miss min median max ks iv
	# bureau_score 34 315 315 443 692.5 848 35.2651 0.8357
	# tot_rev_line 20 3617 477 0 10573.0 205395 26.8943 0.4442
	# Make the Boston data set from the MASS package available.
	data("Boston", package = "MASS")

	### BUILD THE UNWEIGHTED DATA.FRAME: FIVE COPIES OF ROWS 1-100 STACKED
	### ON TOP OF THE FULL DATA, SO THOSE ROWS APPEAR AS DUPLICATES
	df1 <- rbind(Boston[rep(seq_len(100), times = 5), ], Boston)
	nrow(df1)
	# 1006
	# Standardised columns 1-13 are the inputs; column 14 is the response.
	X1 <- scale(df1[, seq_len(13)])
	Y1 <- df1[[14]]
	# grnn.fit() is not defined in this snippet (presumably it comes from
	# grnnet.R, which is sourced in other snippets).
	N1 <- grnn.fit(X1, Y1)
	# arb_bin(data, y, x): y and x are given as unquoted column names of data.
	# Only the first part of the body is visible in this excerpt; it prepares the
	# data, fits a one-tree Rborist model of y on x and collects its predictions.
	# NOTE(review): the definition is cut off inside the Reduce() call below, so
	# the return value cannot be documented from here.
	arb_bin <- function(data, y, x) {
	# Turn the unquoted y / x arguments into column-name strings.
	yname <- deparse(substitute(y))
	xname <- deparse(substitute(x))
	# Keep rows where x is not NA and y is 0 or 1; keep only the x and y columns.
	df1 <- subset(data, !is.na(data[[xname]]) & data[[yname]] %in% c(0, 1), select = c(xname, yname))
	# Copy into a frame with the fixed column names y and x.
	df2 <- data.frame(y = df1[[yname]], x = df1[[xname]])
	# Spearman rank correlation between x and y.
	spc <- cor(df2[, 2], df2[, 1], method = "spearman", use = "complete.obs")
	# spc / abs(spc) is 1 or -1 according to the sign of spc (NaN when spc is 0)
	# and is passed as regMono -- presumably a monotonicity constraint; confirm
	# against the Rborist documentation.
	# NOTE(review): T and F are ordinary reassignable variables; TRUE / FALSE
	# would be safer here.
	mdl <- Rborist::Rborist(as.matrix(df2$x), df2$y, noValidate = T, nTree = 1, regMono = spc / abs(spc),
	ctgCensus = "prob", minInfo = exp(-100), nSamp = nrow(df2) , withRepl = F)
	# Attach the model's predictions for the same x values as yhat.
	df3 <- data.frame(y = df2$y, x = df2$x, yhat = predict(mdl, newdata = as.matrix(df2$x), ctgCensus = "prob")$yPred)
	# NOTE(review): the source is truncated in the middle of this expression.
	df4 <- Reduce(rbind,
	# Read the credit-count data, then keep the rows whose CARDHLDR equals 1
	# (which() also discards rows where the comparison is NA).
	df1 <- read.csv(file = "/mnt/d/projects/data/credit_count.txt")
	df2 <- df1[which(df1[["CARDHLDR"]] == 1), ]

	# ca_glm(fml, data, family, nchunk): fits glm(fml, family = family) separately
	# on chunks of the rows of data, using a PSOCK cluster of nchunk workers.
	# NOTE(review): the definition is cut off after the `est` assignment in this
	# excerpt, so how the per-chunk estimates are combined and what is returned
	# cannot be documented from here.
	ca_glm <- function(fml, data, family, nchunk) {
	# Start nchunk worker processes.
	cls <- parallel::makeCluster(nchunk, type = "PSOCK")
	# Split the row indices across the workers; each worker returns its rows.
	df1 <- parallel::parLapplyLB(cls, parallel::clusterSplit(cls, seq(nrow(data))),
	function(c_) data[c_,])
	# Copy fml, family and data from this call's frame to every worker.
	parallel::clusterExport(cls, c("fml", "family", "data"), envir = environment())
	# Per chunk: the first two columns of the glm coefficient table, plus a
	# third column holding the chunk's share of the total row count.
	est <- parallel::parLapplyLB(cls, df1,
	function(d_) cbind(coef(summary(glm(fml, data = d_, family = family)))[, 1:2], nrow(d_) / nrow(data)))
	# NOTE(review): no parallel::stopCluster(cls) is visible in this excerpt;
	# confirm the cluster is released in the part of the function not shown.
	# Example call; qtl_lgd() is defined outside this excerpt. lgd and ltv are
	# passed unquoted. The console output pasted below shows a $df table with one
	# row per bin, each described by a rule on $X.
	qtl_lgd(df, lgd, ltv)
	#$df
	# bin rule freq dist mv_cnt mean_y woe iv ks
	# 1 01 $X <= 0.2442486803 320 0.1257 0 0.0948 -1.0370 0.0987 9.5173
	# 2 02 $X > 0.2442486803 & $X <= 0.3994659888 318 0.1250 0 0.0994 -0.9850 0.0900 18.6516
	# 3 03 $X > 0.3994659888 & $X <= 0.5314432946 318 0.1250 0 0.1265 -0.7135 0.0515 25.8646
	# 4 04 $X > 0.5314432946 & $X <= 0.6594855396 318 0.1250 0 0.1283 -0.6974 0.0494 32.9504
	# 5 05 $X > 0.6594855396 & $X <= 0.7917383883 318 0.1250 0 0.1769 -0.3182 0.0116 36.5819
	# 6 06 $X > 0.7917383883 & $X <= 0.9243704807 320 0.1257 0 0.2788 0.2683 0.0097 32.9670
	# 7 07 $X > 0.9243704807 & $X <= 1.0800711662 317 0.1246 0 0.4028 0.8251 0.1020 20.6104
	# Load the accepts data; the pasted output below shows it includes a
	# `weight` column.
	df <- readRDS("archive/accepts.rds")

	# Show the first row to illustrate the column layout.
	head(df, 1)
	# bankruptcy bad app_id tot_derog tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt tot_rev_line rev_util bureau_score purch_price msrp
	# 0 0 1001 6 7 46 NaN NaN NaN NaN 0 747 19678 17160
	# down_pyt purpose loan_term loan_amt ltv tot_income used_ind weight
	# 947.15 LEASE 36 18730.85 109 4800 0 4.75

	### BY ITERATIVE PARTITION ###
	# Load the helper script; its contents are not visible in this excerpt.
	source("wqtl_bin.R")
	# Load the modelling data.
	df <- readRDS("df.rds")
	### SHOWING THE RESPONSE IN THE LAST COLUMN ###
	head(df, 2)
	#tot_derog tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt tot_rev_line rev_util bureau_score ltv tot_income bad
	# 6 7 46 NaN NaN NaN NaN 0 747 109 4800.00 0
	# 0 21 153 6 1 97 4637 2 744 97 5833.33 0

	# batch_bin() is not defined in this file (presumably it comes from mob.R);
	# the meaning of its second argument (3) should be confirmed there.
	source("mob.R")
	bin_out <- batch_bin(df, 3)
	# Print the BinSum element of the result.
	bin_out$BinSum
	# Example call: append a weight column w (1 where bad == 1, otherwise 5) to df
	# and pass bad, tot_derog and w unquoted to wqtl_bin(), which is defined
	# outside this excerpt (presumably in wqtl_bin.R).
	wqtl_bin(cbind(df, w = ifelse(df$bad == 1, 1, 5)), bad, tot_derog, w)
	#$df
	# bin rule cnt freq dist mv_wt bad_freq bad_rate woe iv ks
	#1 00 is.na($X) 213 785 0.0322 785 70 0.0892 0.6416 0.0178 2.7716
	#2 01 $X <= 1 3741 16465 0.6748 0 560 0.0340 -0.3811 0.0828 18.9469
	#3 02 $X > 1 & $X <= 2 478 1906 0.0781 0 121 0.0635 0.2740 0.0066 16.5222
	#4 03 $X > 2 & $X <= 4 587 2231 0.0914 0 176 0.0789 0.5078 0.0298 10.6623
	#5 04 $X > 4 818 3014 0.1235 0 269 0.0893 0.6426 0.0685 0.0000
	#$cuts
	#[1] 1 2 4
	# Call qtl_bin() (defined outside this excerpt) with bad and tot_derog passed
	# unquoted, and keep the result.
	derog_bin <- qtl_bin(df, bad, tot_derog)

	# Print the result; the console output is pasted below.
	derog_bin
	#$df
	# bin rule freq dist mv_cnt bad_freq bad_rate woe iv ks
	# 00 is.na($X) 213 0.0365 213 70 0.3286 0.6416 0.0178 2.7716
	# 01 $X <= 1 3741 0.6409 0 560 0.1497 -0.3811 0.0828 18.9469
	# 02 $X > 1 & $X <= 2 478 0.0819 0 121 0.2531 0.2740 0.0066 16.5222
	# 03 $X > 2 & $X <= 4 587 0.1006 0 176 0.2998 0.5078 0.0298 10.6623
	# 04 $X > 4 818 0.1401 0 269 0.3289 0.6426 0.0685 0.0000