Skip to content

Instantly share code, notes, and snippets.

@tonglu
Last active December 30, 2015 09:58
Show Gist options
  • Save tonglu/7812420 to your computer and use it in GitHub Desktop.
Save tonglu/7812420 to your computer and use it in GitHub Desktop.
update_model_score using new weekly data
restoreLevels <- function(datum, model_varnames, model_levels, klasses) {
#model_varnames = .GBM_model$var.names,
#model_levels = .GBM_model$var.levels,
#klasses = attr(.GBM_model$Terms, 'dataClasses')) {
grab_levels <- function(colname) {
level_index <- which(model_varnames == colname)
if (length(level_index) == 0) return(NULL)
model_levels[[level_index]]
}
.datum <<- datum
as.data.frame(sapply(colnames(datum), function(colname) {
column <- datum[[colname]]
if (inherits(column, 'factor')) {
print (pp("ALREADY FACTOR: #{colname}"))
levs <- grab_levels(colname)
if (is.null(levs)) return(column)
else if (is.na(column)) { return(factor('Missing', levels = levs)) }
factor(column, levels = levs)
}
else if (colname %in% names(klasses)
&& klasses[[colname]] == 'factor') {
levs <- grab_levels(colname)
if (is.null(levs)) return(column)
numeric_to_factor(as.numeric(column), levs)
} else column
}, simplify = FALSE), stringsAsFactors = FALSE)
}
library(stringr)
numeric_to_factor <- function(num, levs, na.to.missing = TRUE) {
if (length(levs) == 0) stop('Zero levels provided')
if (length(num) > 1)
return(sapply(num, function(n) { numeric_to_factor(n, levs, na.to.missing) }))
if (na.to.missing && is.na(num))
return(factor('Missing', levels = union(levs, 'Missing')))
if (as.character(num) %in% levs)
return(factor(as.character(num), levels = levs))
in_range_bools <- sapply(levs, function(lev) {
lev <- as.character(lev)
lev <- str_replace_all(lev, " ", "")
lev_split <- strsplit(lev, ",")[[1]]
if (length(lev_split) < 2) {
old_opts <- options(warn = -1)
on.exit(options(old_opts))
level_to_num <- as.numeric(as.character(lev))
return(!is.na(level_to_num) && level_to_num == num)
}
left_bound <- as.numeric(substr(lev_split[1], 2, nchar(lev_split[1])))
right_bound <- as.numeric(substr(lev_split[2], 1, nchar(lev_split[2]) - 1))
left_operator <- if (substr(lev, 1, 1) == '(') `<` else `<=`
right_operator <- if (substr(lev, tmp <- nchar(lev), tmp) == ')') `>` else `>=`
left_operator(left_bound, num) && right_operator(right_bound, num)
})
if (sum(in_range_bools) == 0) return(factor('Missing', levels = union(levs, 'Missing')))
else return(factor(levs[in_range_bools], levels = levs))
}
update_model_score <- function(data_file = "Dec13", GBM_object = gg){
#### check prediction performance on new data ####
new_data <- read.csv(pp("#{Avant.datapath}#{data_file}.csv"))
#oct30 <- drop_bad_loans(oct30)
diff_loan_id <<- setdiff(new_data$loan_id, raw_data2$loan_id)
write.csv(diff_loan_id, pp('new_loan_ids_#{data_file}.csv'))
#write.csv(oct30[which(oct30$loan_id %in% diff_loan_id), c('loan_id', 'initial_default_indicator')], 'new_loans_dep_var.csv')
new_data_2 <- tweak_special_variables('avant', new_data)
new_data_2$loan_purpose <- factor(new_data_2$loan_purpose)
levels(new_data_2$loan_purpose) <- append(levels(new_data_2$loan_purpose), 'Missing')
new_data_2$loan_purpose[which(is.na(new_data_2$loan_purpose))] <- 'Missing'
new_data_2$source <- factor(new_data_2$source)
rl <- restoreLevels(new_data_2[which(new_data_2$loan_id %in% diff_loan_id),], GBM_object$var.names, GBM_object$var.levels, attr(GBM_object$Terms, 'dataClasses') )
test_new_data <<- rl[ , GBM_object$var.names[!GBM_object$var.names %in% "dep_var"]]
print(head(test_new_data))
pred_probs_test_GBM_new_data <- predict.gbm(object = GBM_object, newdata = test_new_data, 5798, type="response")
output <- data.frame(cbind(rl$loan_id, rl$initial_default_indicator, pred_probs_test_GBM_new_data))
colnames(output) <- c("loan_id", "dep_var", "score")
write.csv(output, pp('pred_probs_test_GBM_#{data_file}.csv'))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment