Skip to content

Instantly share code, notes, and snippets.

View statcompute's full-sized avatar

WenSui Liu statcompute

  • San Antonio, TX
View GitHub Profile
df <- readRDS("df.rds")
source("mob.R")
source("grnnet.R")
# PRE-PROCESS THE DATA WITH MOB PACKAGE
bin_out <- batch_bin(df, 3)
bin_out$BinSum[order(-bin_out$BinSum$iv), ]
# var nbin unique miss min median max ks iv
# bureau_score 34 315 315 443 692.5 848 35.2651 0.8357
# tot_rev_line 20 3617 477 0 10573.0 205395 26.8943 0.4442
df <- readRDS("df.rds")
source("mob.R")
bin_out <- batch_bin(df, 3)
bin_out$BinSum[order(-bin_out$BinSum$iv), ]
# var nbin unique miss min median max ks iv
# bureau_score 34 315 315 443 692.5 848 35.2651 0.8357
# tot_rev_line 20 3617 477 0 10573.0 205395 26.8943 0.4442
# age_oldest_tr 25 460 216 1 137.0 588 20.3646 0.2714
# tot_derog 7 29 213 0 0.0 32 20.0442 0.2599
# ltv 17 145 1 0 100.0 176 16.8807 0.1911
data(Boston, package = "MASS")
### CONSTRUCT THE UNWEIGHTED DATA.FRAME WITH DUPLICATES
df1 <- rbind(Boston[rep(seq(100), 5), ], Boston)
nrow(df1)
# 1006
X1 <- scale(df1[, 1:13])
Y1 <- df1[, 14]
N1 <- grnn.fit(X1, Y1)
arb_bin <- function(data, y, x) {
yname <- deparse(substitute(y))
xname <- deparse(substitute(x))
df1 <- subset(data, !is.na(data[[xname]]) & data[[yname]] %in% c(0, 1), select = c(xname, yname))
df2 <- data.frame(y = df1[[yname]], x = df1[[xname]])
spc <- cor(df2[, 2], df2[, 1], method = "spearman", use = "complete.obs")
mdl <- Rborist::Rborist(as.matrix(df2$x), df2$y, noValidate = T, nTree = 1, regMono = spc / abs(spc),
ctgCensus = "prob", minInfo = exp(-100), nSamp = nrow(df2) , withRepl = F)
df3 <- data.frame(y = df2$y, x = df2$x, yhat = predict(mdl, newdata = as.matrix(df2$x), ctgCensus = "prob")$yPred)
df4 <- Reduce(rbind,
df1 <- read.csv("/mnt/d/projects/data/credit_count.txt")
df2 <- df1[which(df1$CARDHLDR == 1), ]
ca_glm <- function(fml, data, family, nchunk) {
cls <- parallel::makeCluster(nchunk, type = "PSOCK")
df1 <- parallel::parLapplyLB(cls, parallel::clusterSplit(cls, seq(nrow(data))),
function(c_) data[c_,])
parallel::clusterExport(cls, c("fml", "family", "data"), envir = environment())
est <- parallel::parLapplyLB(cls, df1,
function(d_) cbind(coef(summary(glm(fml, data = d_, family = family)))[, 1:2], nrow(d_) / nrow(data)))
qtl_lgd(df, lgd, ltv)
#$df
# bin rule freq dist mv_cnt mean_y woe iv ks
# 1 01 $X <= 0.2442486803 320 0.1257 0 0.0948 -1.0370 0.0987 9.5173
# 2 02 $X > 0.2442486803 & $X <= 0.3994659888 318 0.1250 0 0.0994 -0.9850 0.0900 18.6516
# 3 03 $X > 0.3994659888 & $X <= 0.5314432946 318 0.1250 0 0.1265 -0.7135 0.0515 25.8646
# 4 04 $X > 0.5314432946 & $X <= 0.6594855396 318 0.1250 0 0.1283 -0.6974 0.0494 32.9504
# 5 05 $X > 0.6594855396 & $X <= 0.7917383883 318 0.1250 0 0.1769 -0.3182 0.0116 36.5819
# 6 06 $X > 0.7917383883 & $X <= 0.9243704807 320 0.1257 0 0.2788 0.2683 0.0097 32.9670
# 7 07 $X > 0.9243704807 & $X <= 1.0800711662 317 0.1246 0 0.4028 0.8251 0.1020 20.6104
df <- readRDS("archive/accepts.rds")
head(df, 1)
# bankruptcy bad app_id tot_derog tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt tot_rev_line rev_util bureau_score purch_price msrp
# 0 0 1001 6 7 46 NaN NaN NaN NaN 0 747 19678 17160
# down_pyt purpose loan_term loan_amt ltv tot_income used_ind weight
# 947.15 LEASE 36 18730.85 109 4800 0 4.75
### BY ITERATIVE PARTITION ###
source("wqtl_bin.R")
df <- readRDS("df.rds")
### SHOWING THE RESPONSE IN THE LAST COLUMN ###
head(df, 2)
#tot_derog tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt tot_rev_line rev_util bureau_score ltv tot_income bad
# 6 7 46 NaN NaN NaN NaN 0 747 109 4800.00 0
# 0 21 153 6 1 97 4637 2 744 97 5833.33 0
source("mob.R")
bin_out <- batch_bin(df, 3)
bin_out$BinSum
wqtl_bin(cbind(df, w = ifelse(df$bad == 1, 1, 5)), bad, tot_derog, w)
#$df
# bin rule cnt freq dist mv_wt bad_freq bad_rate woe iv ks
#1 00 is.na($X) 213 785 0.0322 785 70 0.0892 0.6416 0.0178 2.7716
#2 01 $X <= 1 3741 16465 0.6748 0 560 0.0340 -0.3811 0.0828 18.9469
#3 02 $X > 1 & $X <= 2 478 1906 0.0781 0 121 0.0635 0.2740 0.0066 16.5222
#4 03 $X > 2 & $X <= 4 587 2231 0.0914 0 176 0.0789 0.5078 0.0298 10.6623
#5 04 $X > 4 818 3014 0.1235 0 269 0.0893 0.6426 0.0685 0.0000
#$cuts
#[1] 1 2 4
@statcompute
statcompute / wts_bin.R
Created April 21, 2019 20:58
binning outcome with weights
derog_bin <- qtl_bin(df, bad, tot_derog)
derog_bin
#$df
# bin rule freq dist mv_cnt bad_freq bad_rate woe iv ks
# 00 is.na($X) 213 0.0365 213 70 0.3286 0.6416 0.0178 2.7716
# 01 $X <= 1 3741 0.6409 0 560 0.1497 -0.3811 0.0828 18.9469
# 02 $X > 1 & $X <= 2 478 0.0819 0 121 0.2531 0.2740 0.0066 16.5222
# 03 $X > 2 & $X <= 4 587 0.1006 0 176 0.2998 0.5078 0.0298 10.6623
# 04 $X > 4 818 0.1401 0 269 0.3289 0.6426 0.0685 0.0000