Skip to content

Instantly share code, notes, and snippets.

@statcompute
Last active August 25, 2019 05:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save statcompute/d9dd6b2d9a4690b8260dee724bbfdea8 to your computer and use it in GitHub Desktop.
Save statcompute/d9dd6b2d9a4690b8260dee724bbfdea8 to your computer and use it in GitHub Desktop.
df <- readRDS("df.rds")
source("mob.R")
bin_out <- batch_bin(df, 3)
bin_out$BinSum[order(-bin_out$BinSum$iv), ]
# var nbin unique miss min median max ks iv
# bureau_score 34 315 315 443 692.5 848 35.2651 0.8357
# tot_rev_line 20 3617 477 0 10573.0 205395 26.8943 0.4442
# age_oldest_tr 25 460 216 1 137.0 588 20.3646 0.2714
# tot_derog 7 29 213 0 0.0 32 20.0442 0.2599
# ltv 17 145 1 0 100.0 176 16.8807 0.1911
# rev_util 12 101 0 0 30.0 100 16.9615 0.1635
# tot_tr 15 67 213 0 16.0 77 17.3002 0.1425
# tot_rev_debt 8 3880 477 0 3009.5 96260 8.8722 0.0847
# tot_rev_tr 4 21 636 0 3.0 24 9.0779 0.0789
# tot_income 17 1639 5 0 3400.0 8147167 10.3386 0.0775
# tot_open_tr 7 26 1416 0 5.0 26 6.8695 0.0282
# ONLY SELECT VARIABLES WITH IV > 0.1
dummies <- data.frame(
bad = df$bad,
tot_derog = ifelse(is.na(df$tot_derog), mean(df$tot_derog, na.rm = T), df$tot_derog),
dummy.tot_derog = ifelse(is.na(df$tot_derog), 1, 0),
tot_tr = ifelse(is.na(df$tot_tr), mean(df$tot_tr, na.rm = T), df$tot_tr),
dummy.tot_tr = ifelse(is.na(df$tot_tr), 1, 0),
age_oldest_tr = ifelse(is.na(df$age_oldest_tr), mean(df$age_oldest_tr, na.rm = T), df$age_oldest_tr),
dummy.age_oldest_tr = ifelse(is.na(df$age_oldest_tr), 1, 0),
tot_rev_line = ifelse(is.na(df$tot_rev_line), mean(df$tot_rev_line, na.rm = T), df$tot_rev_line),
dummy.tot_rev_line = ifelse(is.na(df$tot_rev_line), 1, 0),
rev_util = ifelse(is.na(df$rev_util), mean(df$rev_util, na.rm = T), df$rev_util),
dummy.rev_util = ifelse(is.na(df$rev_util), 1, 0),
bureau_score = ifelse(is.na(df$bureau_score), mean(df$bureau_score, na.rm = T), df$bureau_score),
dummy.bureau_score = ifelse(is.na(df$bureau_score), 1, 0),
ltv = ifelse(is.na(df$ltv), mean(df$ltv, na.rm = T), df$ltv),
dummy.ltv = ifelse(is.na(df$ltv), 1, 0))
dm1 <- summary(glm(bad ~ ., data = dummies, family = "binomial"))
dx1 <- paste(row.names(dm1$coefficients)[dm1$coefficients[, 4] < 0.05][-1])
dl1 <- as.formula(paste("bad", paste(dx1, collapse = " + "), sep = " ~ "))
dm2 <- glm(dl1, data = dummies, family = "binomial")
# Estimate Std. Error z value Pr(>|z|)
#(Intercept) 5.827e+00 5.651e-01 10.311 < 2e-16 ***
#age_oldest_tr -1.595e-03 4.526e-04 -3.523 0.000426 ***
#tot_rev_line -1.684e-05 2.910e-06 -5.785 7.25e-09 ***
#dummy.tot_rev_line 5.314e-01 1.434e-01 3.707 0.000210 ***
#rev_util 3.183e-03 1.171e-03 2.718 0.006574 **
#bureau_score -1.390e-02 8.075e-04 -17.209 < 2e-16 ***
#dummy.bureau_score 7.339e-01 1.518e-01 4.835 1.33e-06 ***
#ltv 2.451e-02 2.249e-03 10.895 < 2e-16 ***
roc1 <- pROC::roc(response = df$bad, predictor = fitted(mdl2))
roc2 <- pROC::roc(response = df$bad, predictor = fitted(dm2))
pROC::roc.test(roc1, roc2, method = "delong", paired = T)
# DeLong's test for two correlated ROC curves
# data: roc1 and roc2
# Z = 4.2369, p-value = 2.266e-05
# alternative hypothesis: true difference in AUC is not equal to 0
# sample estimates:
# AUC of roc1 AUC of roc2
# 0.7751298 0.7679757
pscl::vuong(mdl2, dm2)
# Vuong Non-Nested Hypothesis Test-Statistic:
# Vuong z-statistic H_A p-value
# Raw 4.275484 model1 > model2 9.5361e-06
# AIC-corrected 4.565833 model1 > model2 2.4876e-06
# BIC-corrected 5.534434 model1 > model2 1.5612e-08
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment