# Gist statcompute/f1936fdc4ac892caeca8dcb27b0b1087 -- last active May 4, 2019 14:40.
# GitHub notice: this file contains bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open
# the file in an editor that reveals hidden Unicode characters.
# Load the modelling data set from the serialized file "df.rds" (path is
# resolved against the current working directory).
df <- readRDS("df.rds")

### SHOWING THE RESPONSE IN THE LAST COLUMN ###
# Preview the first two rows; the sample output below shows `bad` as the last
# column.
head(df, 2)
# tot_derog tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt tot_rev_line rev_util bureau_score ltv tot_income bad
#         6      7            46         NaN        NaN          NaN          NaN        0          747 109    4800.00   0
#         0     21           153           6          1           97         4637        2          744  97    5833.33   0
# Load helper definitions from mob.R. batch_bin() is not defined in this file,
# so it is presumably provided by that script -- verify against mob.R.
source("mob.R")

# Bin the variables of df. The meaning of the second argument (3) is defined
# by batch_bin() itself and cannot be confirmed from this file.
bin_out <- batch_bin(df, 3)

# Per-variable binning summary; sample output from the original run:
bin_out$BinSum
#       var nbin unique miss min median max      ks     iv
# tot_derog    7     29  213   0    0.0  32 20.0442 0.2599
#    tot_tr   15     67  213   0   16.0  77 17.3002 0.1425
# ......
# Names of the (up to) six variables with the highest information value.
# order() places NA IVs last, and head() never pads with NA rows when fewer
# than six variables are available (unlike indexing with 1:6).
iv_rank <- order(bin_out$BinSum[["iv"]], decreasing = TRUE)
top <- as.character(head(bin_out$BinSum[["var"]][iv_rank], 6))

# Draw one panel per variable on a 2 x 3 grid: WoE of each bin against the
# logit of the bin's bad rate. The previous graphics settings are captured so
# that the grid layout does not leak into later plots.
old_par <- par(mfrow = c(2, 3))
for (var_name in top) {
  bin_df <- bin_out$BinLst[[var_name]]$df
  plot(
    bin_df[["woe"]],
    qlogis(bin_df[["bad_rate"]]),  # qlogis(p) == log(p / (1 - p))
    type = "b",
    main = var_name,
    cex.main = 2,
    xlab = paste("woe of", var_name),
    ylab = "logit(bad)",
    cex = 2,
    col = "red"
  )
}
par(old_par)
# Apply the bins in bin_out$BinLst to df. batch_woe() is not defined in this
# file (presumably it comes from mob.R -- verify). Per the sample str() output
# below, the result's $df element holds an idx_ column plus one woe.<variable>
# column per binned variable.
df_woe <- batch_woe(df, bin_out$BinLst)
str(df_woe$df)
# 'data.frame': 5837 obs. of 12 variables:
#  $ idx_           : int 1 2 3 4 5 6 7 8 9 10 ...
#  $ woe.tot_derog  : num 0.656 -0.556 -0.556 0.274 0.274 ...
#  $ woe.tot_tr     : num 0.407 -0.322 -0.4 -0.322 0.303 ...
#  ......
### PARSE VARIABLES WITH IV > 0.1 ###
# which() discards NA comparisons, so a variable with a missing IV cannot end
# up in the formula as the bogus term "woe.NA".
iv_keep <- which(bin_out$BinSum[["iv"]] > 0.1)
x1 <- paste("woe", bin_out$BinSum[["var"]][iv_keep], sep = ".")
# "woe.tot_derog" "woe.tot_tr" "woe.age_oldest_tr" "woe.tot_rev_line" "woe.rev_util" "woe.bureau_score" "woe.ltv"
if (length(x1) == 0) {
  stop("no variable has IV > 0.1; nothing to model", call. = FALSE)
}

# Modelling frame shared by both fits: the response next to the WoE columns.
# NOTE(review): cbind() pairs rows by position, so this assumes df_woe$df is
# in the same row order as df -- confirm against batch_woe().
mdl_df <- cbind(bad = df$bad, df_woe$df)

# First pass: logistic regression on every variable that cleared the IV cut.
fml1 <- reformulate(x1, response = "bad")
sum1 <- summary(glm(fml1, data = mdl_df, family = "binomial"))

### PARSE SIGNIFICANT VARIABLES WITH P-VALUE < 0.05 ###
# Column 4 of the coefficient table is the p-value. The intercept is removed
# by name rather than by position: dropping the first significant row would
# discard a real predictor whenever the intercept itself is not significant.
coef_tbl <- sum1$coefficients
sig_terms <- row.names(coef_tbl)[which(coef_tbl[, 4] < 0.05)]
x2 <- setdiff(sig_terms, "(Intercept)")
# "woe.age_oldest_tr" "woe.tot_rev_line" "woe.rev_util" "woe.bureau_score" "woe.ltv"
if (length(x2) == 0) {
  stop("no predictor is significant at the 0.05 level", call. = FALSE)
}

# Second pass: refit using only the significant predictors.
fml2 <- reformulate(x2, response = "bad")
mdl2 <- glm(fml2, data = mdl_df, family = "binomial")
# Coefficient table reported in the original run (presumably summary(mdl2)):
#                   Estimate Std. Error z value Pr(>|z|)
# (Intercept)       -1.38600    0.03801 -36.461  < 2e-16 ***
# woe.age_oldest_tr  0.30376    0.08176   3.715 0.000203 ***
# woe.tot_rev_line   0.42935    0.06793   6.321 2.61e-10 ***
# woe.rev_util       0.29150    0.08721   3.342 0.000831 ***
# woe.bureau_score   0.83568    0.04974  16.803  < 2e-16 ***
# woe.ltv            0.97789    0.09121  10.721  < 2e-16 ***
# ROC curve of mdl2's fitted probabilities against the observed response on
# the same data the model was fitted to (an in-sample measure).
pROC::roc(response = df$bad, predictor = fitted(mdl2))
# Area under the curve: 0.7751
# (GitHub page footer, not part of the script: "Sign up for free to join this
# conversation on GitHub. Already have an account? Sign in to comment")