R - Unbalanced dataset
#Sampling methods
#1. Undersampling - drop majority-class examples
#2. Oversampling - replicate minority-class examples
#3. Synthetic - SMOTE creates new minority-class examples by interpolating between neighbours
#Ensembling methods
#1. BalanceCascade - keep removing majority-class examples until none is misclassified
#2. EasyEnsemble - ensemble models trained on different balanced subsamples (sketch at the end of this script)
#Cost-based method - weight errors so that cost(FN) >> cost(FP) (weighted-glm sketch at the end of this script)
#Tomek links - remove majority-class examples near the decision border (sketch below)
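#A minimal base-R sketch of Tomek-link cleaning, shown on a small toy subset of iris.
#The toy data and the 'minority' column are illustrative assumptions, not part of the
#original gist. A Tomek link is a pair of points from different classes that are each
#other's nearest neighbours; removing the majority member of each link cleans the border.
toy <- rbind(subset(iris, Species == 'versicolor'),
             head(subset(iris, Species == 'virginica'), 10))
toy$minority <- factor(ifelse(toy$Species == 'virginica', 'Y', 'N'))
d <- as.matrix(dist(toy[, 1:4]))
diag(d) <- Inf                                      #ignore self-distances
nn <- apply(d, 1, which.min)                        #nearest neighbour of each point
mutual <- nn[nn] == seq_len(nrow(toy))              #mutual nearest neighbours...
link <- mutual & toy$minority != toy$minority[nn]   #...from different classes = Tomek link
drop <- which(link & toy$minority == 'N')           #majority member of each link
toy.tomek <- if (length(drop) > 0) toy[-drop, ] else toy
table(toy$minority)
table(toy.tomek$minority)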
#Generate an unbalanced training set: 7 virginica vs. 50 of the other species
iris.virginica <- subset(iris, Species == 'virginica')
virginica.ind <- sample(seq_len(nrow(iris.virginica)), 14, replace = FALSE)
train.virginica <- iris.virginica[virginica.ind[1:7], ]    #7 virginica for training
test.virginica <- iris.virginica[virginica.ind[8:14], ]    #7 disjoint virginica for testing
iris.other <- subset(iris, Species != 'virginica')
train_ind <- sample(seq_len(nrow(iris.other)), size = floor(0.5 * nrow(iris.other)))
train.other <- iris.other[train_ind, ]
test.other <- iris.other[-train_ind, ]
#Recode the target as a binary factor (Y = virginica, N = other) and drop Species
train <- rbind(train.virginica, train.other)
train$virginica <- factor(ifelse(train$Species == 'virginica', 'Y', 'N'))
train$Species <- NULL
test <- rbind(test.virginica, test.other)
test$virginica <- factor(ifelse(test$Species == 'virginica', 'Y', 'N'))
test$Species <- NULL
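#Confirm the class imbalance before modelling
table(train$virginica)
table(test$virginica)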
#Baseline: logistic regression without any resampling
model <- glm(virginica ~ ., family = binomial(link = 'logit'), data = train)
prediction <- predict(model, newdata = test, type = "response")
table(prediction > 0.5, test$virginica)                          #confusion matrix at a 0.5 cutoff
sum(diag(table(prediction > 0.5, test$virginica))) / nrow(test)  #overall accuracy
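#Accuracy alone is misleading when the classes are unbalanced, so also look at the
#minority-class precision and recall. A minimal base-R sketch; the variable names
#below are illustrative assumptions.
predicted.y <- prediction > 0.5
actual.y <- test$virginica == 'Y'
precision <- sum(predicted.y & actual.y) / sum(predicted.y)
recall <- sum(predicted.y & actual.y) / sum(actual.y)
c(precision = precision, recall = recall,
  F1 = 2 * precision * recall / (precision + recall))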
#Using SMOTE (synthetic minority oversampling)
library(DMwR)  #note: DMwR has since been archived on CRAN and may need to be installed from the archive
#perc.over controls how many synthetic minority cases are generated, perc.under how many majority cases are kept
train.smote <- SMOTE(virginica ~ ., train, perc.over = 600, perc.under = 100)
table(train.smote$virginica)
model <- glm(virginica ~ ., family = binomial(link = 'logit'), data = train.smote)
prediction <- predict(model, newdata = test, type = "response")
table(prediction > 0.5, test$virginica)
sum(diag(table(prediction > 0.5, test$virginica))) / nrow(test)
#Using the ROSE package
library(ROSE)
#Random oversampling of the minority class
train.over <- ovun.sample(virginica ~ ., data = train, method = "over")$data
table(train.over$virginica)
model <- glm(virginica ~ ., family = binomial(link = 'logit'), data = train.over)
prediction <- predict(model, newdata = test, type = "response")
table(prediction > 0.5, test$virginica)
sum(diag(table(prediction > 0.5, test$virginica))) / nrow(test)
#Random undersampling of the majority class
train.under <- ovun.sample(virginica ~ ., data = train, method = "under")$data
table(train.under$virginica)
model <- glm(virginica ~ ., family = binomial(link = 'logit'), data = train.under)
prediction <- predict(model, newdata = test, type = "response")
table(prediction > 0.5, test$virginica)
sum(diag(table(prediction > 0.5, test$virginica))) / nrow(test)
#Combination of over- and undersampling
train.both <- ovun.sample(virginica ~ ., data = train, method = "both")$data
table(train.both$virginica)
model <- glm(virginica ~ ., family = binomial(link = 'logit'), data = train.both)
prediction <- predict(model, newdata = test, type = "response")
table(prediction > 0.5, test$virginica)
sum(diag(table(prediction > 0.5, test$virginica))) / nrow(test)
#ROSE itself: generate a synthetic balanced sample via smoothed bootstrapping
train.rose <- ROSE(virginica ~ ., data = train, seed = 1)$data
table(train.rose$virginica)
model <- glm(virginica ~ ., family = binomial(link = 'logit'), data = train.rose)
prediction <- predict(model, newdata = test, type = "response")
table(prediction > 0.5, test$virginica)
sum(diag(table(prediction > 0.5, test$virginica))) / nrow(test)
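#A minimal EasyEnsemble-style sketch (an illustration under assumptions, not part of the
#original gist): fit several logistic regressions, each on a different balanced
#undersample from ovun.sample(), and average their predicted probabilities.
n.models <- 10
probs <- sapply(seq_len(n.models), function(i) {
  bal <- ovun.sample(virginica ~ ., data = train, method = "under", seed = i)$data
  m <- glm(virginica ~ ., family = binomial(link = 'logit'), data = bal)
  predict(m, newdata = test, type = "response")
})
prediction.ens <- rowMeans(probs)
table(prediction.ens > 0.5, test$virginica)
sum(diag(table(prediction.ens > 0.5, test$virginica))) / nrow(test)
#A minimal cost-sensitive sketch (the 5:1 cost ratio is an illustrative assumption):
#give minority-class rows a larger case weight in glm(), which acts like replicating them.
w <- ifelse(train$virginica == 'Y', 5, 1)
model.cost <- glm(virginica ~ ., family = binomial(link = 'logit'),
                  data = train, weights = w)
prediction.cost <- predict(model.cost, newdata = test, type = "response")
table(prediction.cost > 0.5, test$virginica)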