Last active
August 25, 2019 05:04
-
-
Save statcompute/d9dd6b2d9a4690b8260dee724bbfdea8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
df <- readRDS("df.rds") | |
source("mob.R") | |
bin_out <- batch_bin(df, 3) | |
bin_out$BinSum[order(-bin_out$BinSum$iv), ] | |
# var nbin unique miss min median max ks iv | |
# bureau_score 34 315 315 443 692.5 848 35.2651 0.8357 | |
# tot_rev_line 20 3617 477 0 10573.0 205395 26.8943 0.4442 | |
# age_oldest_tr 25 460 216 1 137.0 588 20.3646 0.2714 | |
# tot_derog 7 29 213 0 0.0 32 20.0442 0.2599 | |
# ltv 17 145 1 0 100.0 176 16.8807 0.1911 | |
# rev_util 12 101 0 0 30.0 100 16.9615 0.1635 | |
# tot_tr 15 67 213 0 16.0 77 17.3002 0.1425 | |
# tot_rev_debt 8 3880 477 0 3009.5 96260 8.8722 0.0847 | |
# tot_rev_tr 4 21 636 0 3.0 24 9.0779 0.0789 | |
# tot_income 17 1639 5 0 3400.0 8147167 10.3386 0.0775 | |
# tot_open_tr 7 26 1416 0 5.0 26 6.8695 0.0282 | |
# ONLY SELECT VARIABLES WITH IV > 0.1 | |
dummies <- data.frame( | |
bad = df$bad, | |
tot_derog = ifelse(is.na(df$tot_derog), mean(df$tot_derog, na.rm = T), df$tot_derog), | |
dummy.tot_derog = ifelse(is.na(df$tot_derog), 1, 0), | |
tot_tr = ifelse(is.na(df$tot_tr), mean(df$tot_tr, na.rm = T), df$tot_tr), | |
dummy.tot_tr = ifelse(is.na(df$tot_tr), 1, 0), | |
age_oldest_tr = ifelse(is.na(df$age_oldest_tr), mean(df$age_oldest_tr, na.rm = T), df$age_oldest_tr), | |
dummy.age_oldest_tr = ifelse(is.na(df$age_oldest_tr), 1, 0), | |
tot_rev_line = ifelse(is.na(df$tot_rev_line), mean(df$tot_rev_line, na.rm = T), df$tot_rev_line), | |
dummy.tot_rev_line = ifelse(is.na(df$tot_rev_line), 1, 0), | |
rev_util = ifelse(is.na(df$rev_util), mean(df$rev_util, na.rm = T), df$rev_util), | |
dummy.rev_util = ifelse(is.na(df$rev_util), 1, 0), | |
bureau_score = ifelse(is.na(df$bureau_score), mean(df$bureau_score, na.rm = T), df$bureau_score), | |
dummy.bureau_score = ifelse(is.na(df$bureau_score), 1, 0), | |
ltv = ifelse(is.na(df$ltv), mean(df$ltv, na.rm = T), df$ltv), | |
dummy.ltv = ifelse(is.na(df$ltv), 1, 0)) | |
dm1 <- summary(glm(bad ~ ., data = dummies, family = "binomial")) | |
dx1 <- paste(row.names(dm1$coefficients)[dm1$coefficients[, 4] < 0.05][-1]) | |
dl1 <- as.formula(paste("bad", paste(dx1, collapse = " + "), sep = " ~ ")) | |
dm2 <- glm(dl1, data = dummies, family = "binomial") | |
# Estimate Std. Error z value Pr(>|z|) | |
#(Intercept) 5.827e+00 5.651e-01 10.311 < 2e-16 *** | |
#age_oldest_tr -1.595e-03 4.526e-04 -3.523 0.000426 *** | |
#tot_rev_line -1.684e-05 2.910e-06 -5.785 7.25e-09 *** | |
#dummy.tot_rev_line 5.314e-01 1.434e-01 3.707 0.000210 *** | |
#rev_util 3.183e-03 1.171e-03 2.718 0.006574 ** | |
#bureau_score -1.390e-02 8.075e-04 -17.209 < 2e-16 *** | |
#dummy.bureau_score 7.339e-01 1.518e-01 4.835 1.33e-06 *** | |
#ltv 2.451e-02 2.249e-03 10.895 < 2e-16 *** | |
roc1 <- pROC::roc(response = df$bad, predictor = fitted(mdl2)) | |
roc2 <- pROC::roc(response = df$bad, predictor = fitted(dm2)) | |
pROC::roc.test(roc1, roc2, method = "delong", paired = T) | |
# DeLong's test for two correlated ROC curves | |
# data: roc1 and roc2 | |
# Z = 4.2369, p-value = 2.266e-05 | |
# alternative hypothesis: true difference in AUC is not equal to 0 | |
# sample estimates: | |
# AUC of roc1 AUC of roc2 | |
# 0.7751298 0.7679757 | |
pscl::vuong(mdl2, dm2) | |
# Vuong Non-Nested Hypothesis Test-Statistic: | |
# Vuong z-statistic H_A p-value | |
# Raw 4.275484 model1 > model2 9.5361e-06 | |
# AIC-corrected 4.565833 model1 > model2 2.4876e-06 | |
# BIC-corrected 5.534434 model1 > model2 1.5612e-08 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment