Skip to content

Instantly share code, notes, and snippets.

@ceshine
Last active October 30, 2017 06:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ceshine/f5ca012e193a70dfb82e63d5c2ff0921 to your computer and use it in GitHub Desktop.
Save ceshine/f5ca012e193a70dfb82e63d5c2ff0921 to your computer and use it in GitHub Desktop.
checkpoint::checkpoint("2017-10-26")
pacman::p_load(data.table)
pacman::p_load(caret)
pacman::p_load(ggplot2)
set.seed(998)
mushrooms <- fread("mushrooms.csv", stringsAsFactors=T)
mushrooms[, .N, by=class]
mushrooms[, eval("veil-type") := NULL]
mushrooms[, target := as.factor(class == "e")]
mushrooms[, class := NULL]
inTraining <- createDataPartition(mushrooms$target, p = .75, list = FALSE)
mushrooms.training <-mushrooms[inTraining]
mushrooms.testing <- mushrooms[-inTraining]
odor <- mushrooms.training[, .N, by=.(odor, target)]
ggplot(odor, aes(x=odor, y=N, fill=target)) + geom_bar(stat="identity", position="stack") +
theme_wsj() + guides(fill=guide_legend(title="Edible")) + ggtitle("Odor Distribution") +
scale_x_discrete(labels = c("almond","creosote","foul", "anise","musty","none","pungent","spicy", "fishy"))
gill.color <- mushrooms.training[, .N, by=.(color=get("gill-color"), target)]
ggplot(gill.color, aes(x=color, y=N, fill=target)) + geom_bar(stat="identity", position="stack") +
theme_wsj() + guides(fill=guide_legend(title="Edible"))
spore.print.color <- mushrooms.training[, .N, by=.(color=get("spore-print-color"), target)]
ggplot(spore.print.color, aes(x=color, y=N, fill=target)) + geom_bar(stat="identity", position="stack") +
theme_wsj() + guides(fill=guide_legend(title="Edible"))
odor.gill.color <- mushrooms.training[, .(M=mean(as.numeric(target)-1)), by=.(odor, color=get("gill-color"))]
ggplot(odor.gill.color, aes(odor, color)) + geom_tile(aes(fill=M)) + theme_tufte() +
scale_x_discrete(labels = c("almond","creosote","foul", "anise","musty","none","pungent","spicy", "fishy")) +
ggtitle("Odor & Gill Color Distribution") + guides(fill=guide_legend(title="Edible Rate"))
odor.spore.color <- mushrooms.training[, .(M=mean(as.numeric(target)-1)), by=.(odor, color=get("spore-print-color"))]
ggplot(odor.spore.color, aes(odor, color)) + geom_tile(aes(fill=M)) + theme_tufte() +
scale_x_discrete(labels = c("almond","creosote","foul", "anise","musty","none","pungent","spicy", "fishy")) +
ggtitle("Odor & Spore Print Color Distribution") + guides(fill=guide_legend(title="Edible Rate"))
checkpoint::checkpoint("2017-10-26")
pacman::p_load(ranger)
pacman::p_load(data.table)
pacman::p_load(caret)
pacman::p_load(ggplot2)
pacman::p_load(ggthemes)
set.seed(998)
mushrooms <- fread("mushrooms.csv", stringsAsFactors=T)
mushrooms[, .N, by=class]
mushrooms[, target := as.factor(class == "e")]
mushrooms[, class := NULL]
mushrooms[, eval("veil-type") := NULL]
inTraining <- createDataPartition(mushrooms$target, p = .75, list = FALSE)
mushrooms.training <- data.frame(mushrooms[inTraining])
mushrooms.testing <- data.frame(mushrooms[-inTraining])
get_importance <- function(mtry=6, seeds=191:200, importance='permutation'){
importances <- c()
for(seed in seeds){
ranger_foo <- ranger(
formula = target ~ .,
data = mushrooms.training,
importance = importance,
num.trees = 250,
mtry = mtry,
min.node.size = 10,
respect.unordered.factors = 'order',
seed = seed,
num.threads = 4
)
test.preds <- predict(ranger_foo, mushrooms.testing)
accuracy <- sum(test.preds$predictions == mushrooms.testing$target) / nrow(mushrooms.testing)
cat("val accuracy:", accuracy, "\n")
importances <- rbind(importances, ranger_foo$variable.importance)
# importances[order(-importances)] / max(importances)
}
importances.table <- data.table(importances)
return(importances.table[
, names(importances.table)[order(-sapply(importances.table, mean))], with=F])
}
plot_importances <- function(importances.table, top=NULL, ymin=0, ymax=.3){
if(is.null(top)){top <- ncol(importances.table)}
importances.long <- melt(importances.table[,c(1:top),with=F],
measure.vars=colnames(importances.table)[c(1:top)])
return(
ggplot(importances.long, aes(x=variable, y=value)) + geom_boxplot() + theme_wsj() +
theme(axis.text.x=element_text(angle=-15, size=12, vjust=.5), title=element_text(size=18)) + ylim(ymin, ymax) +
labs(x="Feature", y="Importance")
)
}
importances.perm.table <- get_importance(2, 181:200, "permutation")
plot_importances(importances.perm.table, top=10, ymax=.3) + ggtitle("Permutation Importance with mtry=2")
importances.perm.table <- get_importance(6, 181:200, "permutation")
plot_importances(importances.perm.table, top=10, ymax=.3) + ggtitle("Permutation Importance with mtry=6")
importances.perm.table <- get_importance(21, 181:200, "permutation")
plot_importances(importances.perm.table, top=10, ymax=.5) + ggtitle("Permutation Importance with mtry=21")
importances.perm.table <- get_importance(2, 181:200, "impurity")
plot_importances(importances.perm.table, top=10, ymax=1000) + ggtitle("Gini Importance with mtry=2")
importances.perm.table <- get_importance(6, 181:200, "impurity")
plot_importances(importances.perm.table, top=10, ymax=2000) + ggtitle("Gini Importance with mtry=6")
importances.perm.table <- get_importance(21, 181:200, "impurity")
plot_importances(importances.perm.table, top=10, ymax=3000) + ggtitle("Gini Importance with mtry=21")
mushrooms.training$odor <- NULL
importances.perm.table <- get_importance(2, 181:200, "permutation")
plot_importances(importances.perm.table, top=10, ymax=.3) + ggtitle("Permutation Importance(-odor) with mtry=2")
checkpoint::checkpoint("2017-10-26")
pacman::p_load(data.table)
pacman::p_load(xgboost)
pacman::p_load(caret)
pacman::p_load(ggplot2)
pacman::p_load(ranger)
set.seed(998)
mushrooms <- fread("mushrooms.csv", stringsAsFactors=T)
mushrooms[, .N, by=class]
mushrooms[, eval("veil-type") := NULL]
mushrooms[, target := as.factor(class == "e")]
mushrooms[, class := NULL]
inTraining <- createDataPartition(mushrooms$target, p = .75, list = FALSE)
mushrooms.training <- mushrooms[inTraining]
mushrooms.testing <- mushrooms[-inTraining]
inTraining <- createDataPartition(mushrooms.training$target, p = .8, list = FALSE)
mushrooms.validation <- mushrooms.training[-inTraining]
mushrooms.training <- mushrooms.training[inTraining]
get_xgb_importance <- function(dtrain, dval, feature_names, seeds=191:200, num_round=10, param){
importances <- c()
watchlist <- list(train=dtrain, eval = dval)
for(seed in seeds){
param$seed <- seed
num_round <- 10
bst <- xgb.train(param, dtrain, num_round, watchlist,
objective = "binary:logistic", eval_metric = "error",
maximize = F)
importances <- rbind(importances, xgb.importance(feature_names, model=bst))
}
return(data.table(importances))
}
plot_xgb_importances <- function(importances.table, top=NULL, ymin=0, ymax=.3, importance="Gain"){
feature.means <- importances.table[, .(M=mean(get(importance))), by=Feature]
feature.means <- feature.means[order(-feature.means$M)]
if(!is.null(top)){feature.means <- feature.means[1:top]}
importances.table <- merge(importances.table[, .(Feature, V=get(importance))], feature.means)
importances.table[, Feature := factor(Feature, level=feature.means$Feature)]
return(
ggplot(importances.table, aes(x=Feature, y=V)) +
geom_boxplot() + theme_wsj() +
theme(axis.text.x=element_text(angle=-15, size=12, vjust=.5)) + ylim(ymin, ymax) +
labs(x="Feature", y="Importance")
)
}
get_rf_importance <- function(df.train, df.val, mtry=6, seeds=191:200, importance='permutation'){
importances <- c()
for(seed in seeds){
ranger_foo <- ranger(
formula = target ~ .,
data = df.train,
importance = importance,
num.trees = 250,
mtry = mtry,
min.node.size = 1,
seed = seed,
num.threads = 4
)
test.preds <- predict(ranger_foo, df.val)
accuracy <- sum(test.preds$predictions == df.val$target) / nrow(df.val)
cat("val accuracy:", accuracy, "\n")
importances <- rbind(importances, ranger_foo$variable.importance)
}
importances.table <- data.table(importances)
return(importances.table[
, names(importances.table)[order(-sapply(importances.table, mean))], with=F])
}
plot_rf_importances <- function(importances.table, top=NULL, ymin=0, ymax=.3){
if(is.null(top)){top <- ncol(importances.table)}
importances.long <- melt(importances.table[,c(1:top),with=F],
measure.vars=colnames(importances.table)[c(1:top)])
return(
ggplot(importances.long, aes(x=variable, y=value)) + geom_boxplot() + theme_wsj() +
theme(axis.text.x=element_text(angle=-15, size=12, vjust=.5), title=element_text(size=16)) + ylim(ymin, ymax) +
labs(x="Feature", y="Importance")
)
}
dtrain <- xgb.DMatrix(
model.matrix(~ .-1, mushrooms.training[, -"target"]),
label = as.integer(mushrooms.training$target)-1)
dval <- xgb.DMatrix(
model.matrix(~ .-1, mushrooms.validation[, -"target"]),
label = as.integer(mushrooms.validation$target)-1)
param <- list(max_depth=6, eta=1, nthread=1, silent=1, subsample=.8, colsample_bytree=.8, lambda=0)
importances.table <- get_xgb_importance(
dtrain, dval, colnames(data.frame(model.matrix(~ ., mushrooms.training[, -"target"]))),
1:20, num_round=10, param=param)
plot_xgb_importances(importances.table, top=10, ymax=.75, importance="Gain") +
ggtitle("(XGB) Gain")
plot_xgb_importances(importances.table, top=10, ymax=.3, importance="Cover") +
ggtitle("(XGB) Cover")
plot_xgb_importances(importances.table, top=10, ymax=.2, importance="Frequency") +
ggtitle("(XGB) Frequency/Split")
ohdata <- data.frame(model.matrix(~ .-1, mushrooms.training[, -"target"]))
ohdata$target <- as.numeric(mushrooms.training$target) - 1
ohdata.validation <- data.frame(model.matrix(~ .-1, mushrooms.validation[, -"target"]))
ohdata.validation$target <- as.numeric(mushrooms.validation$target) - 1
importances.perm.table <- get_rf_importance(
ohdata, ohdata.validation,
60, 181:200, "permutation")
plot_rf_importances(importances.perm.table, top=10, ymax=.2) +
ggtitle("(RF)Permutation Importance with mtry=60")
# Numeric encoding
dtrain <- xgb.DMatrix(
as.matrix(sapply(mushrooms.training[, -"target"], as.numeric)),
label = as.integer(mushrooms.training$target)-1)
dval <- xgb.DMatrix(
as.matrix(sapply(mushrooms.validation[, -"target"], as.numeric)),
label = as.integer(mushrooms.validation$target)-1)
param <- list(max_depth=3, eta=1, nthread=1, silent=1, subsample=.8, colsample_bytree=.8, lambda=0)
importances.table <- get_xgb_importance(
dtrain, dval, colnames(mushrooms.testing[, -c(22)]), 1:50, num_round=10, param)
plot_xgb_importances(importances.table, top=10, ymax=1, importance="Gain") +
ggtitle("(XGB) Gain")
plot_xgb_importances(importances.table, top=10, ymax=.75, importance="Cover") +
ggtitle("(XGB) Cover")
plot_xgb_importances(importances.table, top=10, ymax=.5, importance="Frequency") +
ggtitle("(XGB) Frequency/Split")
importances.perm.table <- get_rf_importance(
data.frame(sapply(mushrooms.training, as.numeric)),
data.frame(sapply(mushrooms.validation, as.numeric)),
6, 181:200, "permutation")
plot_rf_importances(importances.perm.table, top=10, ymax=.2) +
ggtitle("(RF)Permutation Importance with mtry=6")
importances.perm.table <- get_rf_importance(
data.frame(sapply(mushrooms.training, as.numeric)),
data.frame(sapply(mushrooms.validation, as.numeric)),
16, 181:200, "permutation")
plot_rf_importances(importances.perm.table, top=10, ymax=.2) +
ggtitle("(RF)Permutation Importance with mtry=16")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment