Last active
August 25, 2017 18:39
-
-
Save szilard/c204bd4fb1d30a2464c931eff74aad80 to your computer and use it in GitHub Desktop.
A little framework for experimenting with the impact of various methods for dealing with unbalanced classes for machine learning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## partial credit :) to @earino for the idea | |
library(lightgbm) | |
library(data.table) | |
library(ROCR) | |
# Load the train/test splits and stack them so the categorical encoding
# below is consistent across both parts.
d0_train <- fread("/var/data/bm-ml/train-10m.csv")
d0_test <- fread("/var/data/bm-ml/test.csv")
d0 <- rbind(d0_train, d0_test)

p <- ncol(d0) - 1   # number of feature columns (label assumed to be last)

# Recode the label to numeric 0/1 ("Y" -> 1).
d0$dep_delayed_15min <- as.numeric(d0$dep_delayed_15min == "Y")

# lightgbm's special treatment of categoricals: map character/factor
# columns to integer codes and keep the mapping rules for reference.
d0_wrules <- lgb.prepare_rules(d0)
d0 <- d0_wrules$data
cols_cats <- names(d0_wrules$rules)
# "Population" matrices from which each replicate draws its samples.
# TODO: get a bigger test set to be able to get samples from that too
n_train <- nrow(d0_train)
d1_train <- as.matrix(d0[seq_len(n_train), ])
d1_test <- as.matrix(d0[(n_train + 1):nrow(d0), ])

size_neg <- 1000e3  # negatives drawn per replicate
size_pos <- 10e3    # positives kept in the "realistic" unbalanced set

d_res <- data.table()  # (method, auc, runtime) rows accumulate here
n_rs <- 10             # number of resampling replicates
## Resampling experiment: for each replicate, draw fresh class samples,
## build several differently-balanced training sets, fit lightgbm on each,
## and record test AUC plus training wall-clock time.
res_list <- list()
for (k in seq_len(n_rs)) {
  print(k)

  # Split the population by class (label lives in column p+1).
  d1_train_Y <- d1_train[d1_train[, p + 1] == 1, ]
  d1_train_N <- d1_train[d1_train[, p + 1] == 0, ]
  # NOTE(review): size_neg rows are drawn from BOTH classes here, so the
  # positive class must contain at least size_neg rows — verify for new data.
  d1_train_Y <- d1_train_Y[sample(nrow(d1_train_Y), size_neg), ]
  d1_train_N <- d1_train_N[sample(nrow(d1_train_N), size_neg), ]

  d_train_list <- list()
  # if we could get a large sample from the positive class
  d_train_list[["1-hidden-bigpos"]] <- rbind(d1_train_Y, d1_train_N)
  # unbalanced data that would be seen in practice
  d2_train_Y <- d1_train_Y[sample(nrow(d1_train_Y), size_pos), ]
  d2_train_N <- d1_train_N
  d_train_list[["2-unbalanced"]] <- rbind(d2_train_Y, d2_train_N)

  ## various rebalancing methods (can use d2_train_Y, d2_train_N only):
  # balance by undersampling the negative class
  d3_train_Y <- d2_train_Y
  d3_train_N <- d2_train_N[sample(nrow(d2_train_N), size_pos), ]
  d_train_list[["3a-undersampling-neg"]] <- rbind(d3_train_Y, d3_train_N)
  # # ... further rebalancing variants can be added here
  # d3_train_Y <- d2_train_Y
  # d3_train_N <- d2_train_N
  # d_train_list[["3b-..."]] <- rbind(d3_train_Y, d3_train_N)

  # Fit one model per training-set variant and score it on the test set.
  for (m in names(d_train_list)) {
    d_train <- d_train_list[[m]]
    dlgb_train <- lgb.Dataset(data = d_train[, 1:p], label = d_train[, p + 1])
    runtm <- system.time({
      md <- lgb.train(data = dlgb_train, objective = "binary",
                      # integer thread count: detectCores() may be odd,
                      # and num_threads should not be fractional
                      num_threads = max(1L, parallel::detectCores() %/% 2L),
                      nrounds = 100, learning_rate = 0.1, num_leaves = 1024,
                      categorical_feature = cols_cats,
                      verbose = 0)
    })[[3]]  # elapsed (wall-clock) seconds
    phat <- predict(md, data = d1_test[, 1:p])
    rocr_pred <- prediction(phat, d1_test[, p + 1])
    auc <- performance(rocr_pred, "auc")@y.values[[1]]
    res_list[[length(res_list) + 1]] <-
      data.frame(method = m, auc = auc, runtm = runtm)
  }
}
# Bind once at the end instead of growing d_res with rbind() inside the
# loop (avoids a full copy of the accumulator on every iteration).
d_res <- rbindlist(res_list)
# Summarise AUC per method across the replicates (mean +/- sd).
d_res_sum <- d_res[, .(avg = mean(auc), sd = sd(auc)), by = method]
d_res_sum

library(ggplot2)
# One point per method with an error bar of one standard deviation.
ggplot(d_res_sum, aes(x = method)) +
  geom_point(aes(y = avg)) +
  geom_errorbar(aes(ymin = avg - sd, ymax = avg + sd), width = 0.1)
Author
szilard
commented
Aug 25, 2017
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment