Created
October 11, 2016 12:46
-
-
Save yabyzq/7a213d09b4ab4571b2d86e3320bc5b76 to your computer and use it in GitHub Desktop.
R - Unbalanced dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Sampling methods
#1. Undersampling
#2. Oversampling
#3. Synthetic - SMOTE
#Ensembling methods
#1. BalanceCascade: keep removing majority-class examples until none is misclassified
#2. EasyEnsemble: ensemble of different balanced models
#Cost-based method - cost of FN >> cost of FP
#Tomek links remove majority-class examples near the decision border
#Generating an unbalanced dataset from iris:
#train has 7 'virginica' rows (minority) vs 50 other rows (majority).
#Fix vs original: train.virginica and test.virginica were sampled
#independently from the same 50 rows, so the same minority rows could
#appear in both sets (data leakage). Draw 14 distinct rows, split 7/7.
iris.virginica <- subset(iris, Species == 'virginica')
virginica_ind <- sample(seq_len(nrow(iris.virginica)), 14, replace = FALSE)
train.virginica <- iris.virginica[virginica_ind[1:7], ]
test.virginica <- iris.virginica[virginica_ind[8:14], ]

#50/50 split of the two majority species (100 rows -> 50/50).
iris.other <- subset(iris, Species != 'virginica')
train_ind <- sample(seq_len(nrow(iris.other)), size = floor(0.5 * nrow(iris.other)))
train.other <- iris.other[train_ind, ]
test.other <- iris.other[-train_ind, ]

#Recode Species into a binary factor 'virginica' with levels N/Y
#(same result as the original 0/1 -> relabel dance, done once).
make_binary_target <- function(df) {
  df$virginica <- factor(ifelse(df$Species == 'virginica', 'Y', 'N'),
                         levels = c('N', 'Y'))
  df$Species <- NULL
  df
}
train <- make_binary_target(rbind(train.virginica, train.other))
test <- make_binary_target(rbind(test.virginica, test.other))
#Baseline: logistic regression fit directly on the unbalanced training set.
model <- glm(virginica ~ ., data = train, family = binomial(link = 'logit'))
prediction <- predict(model, newdata = test, type = "response")
confusion <- table(prediction > 0.5, test$virginica)
print(confusion)
#Accuracy = correct predictions / test size.
sum(diag(confusion)) / nrow(test)
#SMOTE: synthesise new minority-class examples, then refit the model.
library(DMwR)
train.smote <- SMOTE(virginica ~ ., train, perc.over = 600, perc.under = 100)
table(train.smote$virginica)
model <- glm(virginica ~ ., data = train.smote, family = binomial(link = 'logit'))
prediction <- predict(model, newdata = test, type = "response")
confusion <- table(prediction > 0.5, test$virginica)
print(confusion)
sum(diag(confusion)) / nrow(test)
#ROSE: oversample the minority class by duplication, then refit.
library(ROSE)
train.over <- ovun.sample(virginica ~ ., data = train, method = "over")$data
table(train.over$virginica)
model <- glm(virginica ~ ., data = train.over, family = binomial(link = 'logit'))
prediction <- predict(model, newdata = test, type = "response")
confusion <- table(prediction > 0.5, test$virginica)
print(confusion)
sum(diag(confusion)) / nrow(test)
#Undersampling: drop majority-class rows until the classes balance.
train.under <- ovun.sample(virginica ~ ., data = train, method = "under")$data
table(train.under$virginica)
model <- glm(virginica ~ ., data = train.under, family = binomial(link = 'logit'))
prediction <- predict(model, newdata = test, type = "response")
confusion <- table(prediction > 0.5, test$virginica)
print(confusion)
sum(diag(confusion)) / nrow(test)
#Combined over- and under-sampling.
#Fix vs original: method = "under" was passed here, exactly duplicating
#the previous experiment; the intended combined strategy is method = "both"
#(per ROSE docs it defaults to p = 0.5 and N = nrow(data) — verify against
#the installed ROSE version).
train.both <- ovun.sample(virginica ~ ., data = train, method = "both")$data
table(train.both$virginica)
model <- glm(virginica ~ ., family = binomial(link = 'logit'), data = train.both)
prediction <- predict(model, newdata = test, type = "response")
table(prediction > 0.5, test$virginica)
sum(diag(table(prediction > 0.5, test$virginica))) / nrow(test)
#ROSE synthetic generation: build a smoothed-bootstrap balanced sample.
train.rose <- ROSE(virginica ~ ., data = train, seed = 1)$data
table(train.rose$virginica)
model <- glm(virginica ~ ., data = train.rose, family = binomial(link = 'logit'))
prediction <- predict(model, newdata = test, type = "response")
confusion <- table(prediction > 0.5, test$virginica)
print(confusion)
sum(diag(confusion)) / nrow(test)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment