Skip to content

Instantly share code, notes, and snippets.

@szilard
Last active August 25, 2017 18:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save szilard/c204bd4fb1d30a2464c931eff74aad80 to your computer and use it in GitHub Desktop.
A little framework for experimenting with the impact of various methods for dealing with unbalanced classes for machine learning
## partial credit :) to @earino for the idea
# Load libraries: lightgbm (GBM training), data.table (fast CSV I/O and
# aggregation), ROCR (AUC computation)
library(lightgbm)
library(data.table)
library(ROCR)
# Read train/test from disk -- absolute paths are machine-specific
# (presumably the airline on-time benchmark data; verify against the host)
d0_train <- fread("/var/data/bm-ml/train-10m.csv")
d0_test <- fread("/var/data/bm-ml/test.csv")
# Stack train+test so the categorical encoding below is consistent across both
d0 <- rbind(d0_train, d0_test)
# Number of predictor columns; the last column (p+1) is the target
p <- ncol(d0)-1
# Binarize the target: "Y" -> 1, anything else -> 0
d0$dep_delayed_15min <- ifelse(d0$dep_delayed_15min=="Y",1,0)
d0_wrules <- lgb.prepare_rules(d0) # lightgbm special treat of categoricals
d0 <- d0_wrules$data
# Column names that were encoded as categoricals (passed to lgb.train later)
cols_cats <- names(d0_wrules$rules)
# The combined dataset acts as the "population" from which samples are drawn;
# split it back into train/test numeric matrices (lightgbm wants matrices).
# TODO: obtain a larger test set so samples can be drawn from it as well.
n_train <- nrow(d0_train)
d1_train <- as.matrix(d0[seq_len(n_train), ])
d1_test <- as.matrix(d0[(n_train + 1):nrow(d0), ])

# Target sample sizes for the majority (negative) and minority (positive) class
size_neg <- 1e6
size_pos <- 1e4

# Accumulator for per-run results, and the number of repeated resampling runs
d_res <- data.table()
n_rs <- 10
# Repeat the sampling + training experiment n_rs times so mean/sd of AUC can
# be reported per method.
# The class split of the training "population" is loop-invariant, so it is
# computed once here instead of on every iteration (RNG stream is unaffected:
# the filtering consumes no random numbers).
d1_train_Y_all <- d1_train[d1_train[, p + 1] == 1, ]
d1_train_N_all <- d1_train[d1_train[, p + 1] == 0, ]
for (k in seq_len(n_rs)) {
  print(k)
  # NOTE(review): size_neg rows are drawn from BOTH classes here. This assumes
  # the positive class has at least size_neg rows (plausible for the 10M
  # airline data, but sample() errors otherwise) -- TODO confirm for other data.
  d1_train_Y <- d1_train_Y_all[sample(nrow(d1_train_Y_all), size_neg), ]
  d1_train_N <- d1_train_N_all[sample(nrow(d1_train_N_all), size_neg), ]
  d_train_list <- list()
  # Scenario 1: as if we could get a large sample from the positive class
  d_train_list[["1-hidden-bigpos"]] <- rbind(d1_train_Y, d1_train_N)
  # Scenario 2: unbalanced data as it would be seen in practice
  d2_train_Y <- d1_train_Y[sample(nrow(d1_train_Y), size_pos), ]
  d2_train_N <- d1_train_N
  d_train_list[["2-unbalanced"]] <- rbind(d2_train_Y, d2_train_N)
  ## various rebalancing methods (may use d2_train_Y, d2_train_N only):
  # Scenario 3a: balance by undersampling the negative class
  d3_train_Y <- d2_train_Y
  d3_train_N <- d2_train_N[sample(nrow(d2_train_N), size_pos), ]
  d_train_list[["3a-undersampling-neg"]] <- rbind(d3_train_Y, d3_train_N)
  # # ... (template for further rebalancing variants)
  # d3_train_Y <- d2_train_Y
  # d3_train_N <- d2_train_N
  # d_train_list[["3b-..."]] <- rbind(d3_train_Y, d3_train_N)
  # Train on each prepared dataset and evaluate on the SAME untouched test set
  for (m in names(d_train_list)) {
    d_train <- d_train_list[[m]]
    dlgb_train <- lgb.Dataset(data = d_train[, 1:p], label = d_train[, p + 1])
    runtm <- system.time({
      md <- lgb.train(data = dlgb_train, objective = "binary",
                      num_threads = parallel::detectCores() / 2,
                      nrounds = 100, learning_rate = 0.1, num_leaves = 1024,
                      categorical_feature = cols_cats,
                      verbose = 0)
    })[[3]]  # element 3 = elapsed (wall-clock) seconds
    phat <- predict(md, data = d1_test[, 1:p])
    rocr_pred <- prediction(phat, d1_test[, p + 1])
    auc <- performance(rocr_pred, "auc")@y.values[[1]]
    # Growing by rbind is acceptable here: only n_rs * length(d_train_list)
    # rows total, so the quadratic-append cost is negligible.
    d_res <- rbind(d_res, data.frame(method = m, auc = auc, runtm = runtm))
  }
}
# Aggregate per method: mean and standard deviation of AUC across the runs
d_res_sum <- d_res[, .(avg = mean(auc), sd = sd(auc)), by = method]
print(d_res_sum)

# Plot mean AUC per method with +/- 1 sd error bars
library(ggplot2)
ggplot(d_res_sum, aes(x = method)) +
  geom_point(aes(y = avg)) +
  geom_errorbar(aes(ymin = avg - sd, ymax = avg + sd), width = 0.1)
@szilard
Copy link
Author

szilard commented Aug 25, 2017

screen shot 2017-08-25 at 11 36 35 am

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment