code for blog post about visualizing predictions within feature space over a number of different models. https://matthewdharris.com/2016/04/19/predicting-in-feature-space/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### FUNCTIONS | |
# a quick function that plots the lowess response curve for binary data | |
plot.logreg <- function(dat){ | |
dat$obs <- as.numeric(as.character(dat$obs)) # convert y factor to {0,1} digits | |
clr <- ifelse(dat$obs == 1, "orange", "blue") | |
pp <- ggplot(dat, aes(x = pred, y = obs)) + | |
geom_point(color = "gray30", alpha = 0.3) + | |
geom_jitter(width = 0.1, height = 0.11, color = clr) + | |
geom_rug(color = clr) + | |
geom_smooth(formula = y ~ x, se = FALSE, color = "gray25") + | |
geom_vline(xintercept = 0.5, color = "gray50", linetype = "dashed") + | |
scale_y_continuous(limits = c(-0.1,1.1), breaks = seq(0,1,0.2), | |
labels = seq(0,1,0.2)) + | |
scale_x_continuous(limits = c(-0.05,1.05), breaks = seq(0,1,0.2), | |
labels = seq(0,1,0.2)) + | |
ylab("Presence/Absence") + | |
xlab("Predicted Probability") + | |
theme_bw() | |
return(pp) | |
} | |
# Root Mean Square Error function | |
RMSE <- function(obs, pred){ | |
rmse <- sqrt(mean((obs-pred)^2)) | |
return(rmse) | |
} | |
# A quick plot function to looking at binary density overlap | |
plot.dens <- function(dens_dat){ | |
library("reshape2") | |
vars <- ncol(dens_dat)/2 | |
dens_melt <- reshape2::melt(dens_dat) | |
x_num <- NULL | |
for(i in 1:vars){ | |
x_num <- c(x_num, rep(i, nrow(dens_dat)*2)) | |
} | |
dens_melt$x_num <- x_num | |
dens_melt$presence <- rep(rep(c(1,0), each = nrow(dens_dat)),vars) | |
pp <- ggplot(dens_melt, aes(x = value, group = variable, | |
fill = as.factor(presence))) + | |
geom_density(alpha = 0.5) + | |
facet_grid(x_num ~ ., scales = "free") + | |
theme_bw() | |
return(pp) | |
} | |
# function to sample data for train/test sets | |
get.train.test.sets <- function(dat, train_v_test_fraction, sites_per_train, sites_per_test, | |
absence_presence_balance, data_reduction_fraction){ | |
# set up train and test site SITENO lists | |
site_names <- unique(dat$SITENO)[-length(unique(dat$SITENO))] # remove "background" | |
train_sites <- site_names[sample(seq_along(site_names),length(site_names)*train_v_test_fraction)] | |
test_sites <- site_names[!(site_names %in% train_sites)] | |
# reduce train and test site data volume, but still sampling from test or train | |
if(sites_per_train == "all") sites_per_train <- length(train_sites) | |
if(sites_per_test == "all") sites_per_test <- length(test_sites) | |
tr_sites <- filter(dat, presence == 1, SITENO %in% sample(train_sites, sites_per_train)) | |
te_sites <- filter(dat, presence == 1, SITENO %in% sample(test_sites , sites_per_test)) | |
# sample a 3 X multiplied site background sample for test and train | |
tr_background <- filter(dat, presence == 0) %>% | |
sample_n(nrow(tr_sites)*absence_presence_balance) | |
te_background <- filter(dat, presence == 0) %>% | |
sample_n(nrow(te_sites)*absence_presence_balance) | |
# make test and train presence/absence data sets // reduce data volume significantly as % of cells | |
train <- rbind_all(list(tr_sites, tr_background)) %>% | |
sample_frac(data_reduction_fraction) | |
test <- rbind_all(list(te_sites, te_background)) %>% | |
sample_frac(data_reduction_fraction) | |
return(list(train = train, test = test)) | |
} | |
# function for making final plot data for function space predictions | |
# a bit complicated, but needs to account for all the different predict methods | |
# each class of model has a bit of a different twist, so this function accounts for that | |
plot.prediction.grid <- function(model, pred_grid, model_type, x_test, x_train, | |
test_data_set, train_data_set){ | |
# for Logistic Regression | |
if(class(model)[1] == "glm"){ | |
p_test <- predict(model, newdata = pred_grid, type = "response") | |
p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim") | |
} else if(class(model)[1] == "randomForest"){ | |
# for RF | |
p_test <- predict(model, newdata = pred_grid, type = "prob") | |
p_test <- data.frame(pred = p_test[,"1"], obs = 2, pred_grid, dset = "sim") | |
} else if(class(model)[1] == "svm"){ | |
# for SVM-OC | |
if(length(unique(train_data_set$obs)) == 1){ | |
p_test <- predict(model, newdata = pred_grid) | |
p_test <- ifelse(p_test == TRUE, 1, 0) | |
p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim") | |
} else if(length(unique(train_data_set$obs)) == 2){ | |
# for SVM | |
p_test <- predict(model, newdata = pred_grid,probability=TRUE) | |
p_test <- data.frame(pred = attr(p_test, "probabilities")[,"1"], obs = 2, pred_grid, dset = "sim") | |
} | |
} else if(class(model)[1] == "ksvm"){ | |
# one class | |
p_test <- predict(model, newdata = pred_grid) | |
p_test <- ifelse(p_test == TRUE, 1, 0) | |
p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim") | |
} else if(class(model)[1] == "xgb.Booster"){ | |
# for XGB | |
p_test <- predict(model, newdata = as.matrix(pred_grid)) | |
p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim") | |
} else if(class(model)[1] == "MaxEnt"){ | |
# for MAXENT | |
p_test <- predict(model, as.matrix(pred_grid)) | |
p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim") | |
} | |
plot_test <- data.frame(test_data_set, x_test) | |
if(length(unique(train_data_set$obs)) == 1){ | |
plot_train <- data.frame(train_data_set, oc_x_train) | |
} else if(length(unique(train_data_set$obs)) == 2){ | |
plot_train <- data.frame(train_data_set, x_train) | |
} | |
plot_dat <- data.frame(rbind(plot_train, plot_test), | |
dset = rep(c("train", "test"), | |
times = c(nrow(plot_train), nrow(plot_test)))) | |
plot_dat <- rbind(plot_dat, p_test) | |
plot_dat$V1 <- as.numeric(scale(plot_dat$V1, center = TRUE)) | |
plot_dat$V2 <- as.numeric(scale(plot_dat$V2, center = TRUE)) | |
return(plot_dat) | |
} | |
# Not in function | |
'%ni%' <- Negate('%in%') | |
### PACKAGES | |
library("ggplot2") | |
library("ggalt") | |
library("dplyr") | |
library("e1071") | |
library("caret") # only for confmatrix | |
library("AUC") | |
library("data.table") | |
library("randomForest") | |
library("kernlab") | |
library("viridis") | |
library("xgboost") | |
library("dismo") | |
library("rJava") | |
library("xtable") | |
library("knitr") | |
### CODE | |
## Get Real Data | |
og_dat <- fread("~/YOUR_DATA.csv") | |
# remove some uneeded columns | |
dat <- data.frame(og_dat) | |
dat <- dat[!(substr(dat$SITENO, 1, 4) == "36XX"),] | |
dat <- dat[, c("cd_conf", "tpi_250c", "presence", "SITENO")] | |
colnames(dat) <- c("V1", "V2", "presence", "SITENO") | |
# set up data partition variables | |
train_v_test_fraction <- 0.75 # 0 to 1 | |
sites_per_train <- "all" # integer or "all" | |
sites_per_test <- "all" # integer or "all" | |
absence_presence_balance <- 1.1 # 1 for balanced; typically 3 | |
data_reduction_fraction <- 0.2 # 0 to 1 | |
# retrive a train/test set as list | |
test_train <- get.train.test.sets(dat, train_v_test_fraction, sites_per_train, sites_per_test, | |
absence_presence_balance, data_reduction_fraction) | |
# split train/test list into individual train and test objects | |
train <- test_train[["train"]] | |
test <- test_train[["test"]] | |
train <- data.frame(y = train$presence, x1 = train[,1], x2 = train[,2]) | |
test <- data.frame(y = test$presence, x1 = test[,1], x2 = test[,2]) | |
# plot density just for fun | |
p_dat <- data.frame(rbind(test_train[["train"]], test_train[["test"]])) | |
x1_1 <- p_dat[which(p_dat$presence == 1),1] | |
x1_0 <- sample(p_dat[which(p_dat$presence == 0),1], length(x1_1)) | |
x2_1 <- p_dat[which(p_dat$presence == 1),2] | |
x2_0 <- sample(p_dat[which(p_dat$presence == 0),2], length(x2_1)) | |
plot.dens(data.frame(x1_1, x1_0, x2_1,x2_0)) | |
# ## Or... Simulate Data | |
# y1 <- rep(1,100) | |
# y0 <- rep(0,100) | |
# x1_1 <- rnorm(100,1,1) | |
# x1_0 <- rnorm(100,3,1) | |
# x2_1 <- rnorm(100,5,1) | |
# x2_0 <- rnorm(100,6,1) | |
# # needs to go 1 then 0 for each x_i | |
# plot.dens(data.frame(x1_1, x1_0, x2_1,x2_0)) | |
# dat <- data.frame(y = c(y1,y0), x1 = c(x1_1,x1_0), x2 = c(x2_1, x2_0)) | |
# train <- sample_frac(dat,0.5) | |
# train_index <- as.numeric(row.names(train)) | |
# test <- dat[-train_index,] | |
# for vector representation | |
x_train <- train[,-1] # remove "y" | |
y_train <- train$y | |
y_train_num <- as.numeric(as.character(y_train)) | |
x_test <- test[,-1] # remove "y" | |
y_test <- test$y | |
y_test_num <- as.numeric(as.character(y_test)) | |
# for one-class-classification | |
oc_train <- train[which(train$y == 1),] | |
oc_x_train <- oc_train[,-1] # remove "y" | |
oc_y_train <- as.numeric(as.character(oc_train$y)) | |
## Build Models | |
# basic Logistic Regression | |
m1 <- glm(y ~ ., data = train, family = "binomial") | |
# SVM - defaults C-classification, Radial basis function (RBF) | |
m2 <- svm(x = x_train, y = as.factor(y_train), probability = TRUE) | |
# SVM - various kernels and hyperparameters | |
wts <- table(as.factor(y_train)) / length(y_train) | |
m3 <- svm(x = x_train, y = as.factor(y_train), probability = TRUE, | |
cost = 5, kernel = "radial", class.weights = wts) | |
# SVM - one-classification | |
m4 <- svm(x = oc_x_train, y = oc_y_train, data = train, | |
type = "one-classification", nu = 0.1, cost = 1) | |
# RF | |
m5 <- randomForest(x = x_train, y = as.factor(y_train), | |
ntree = 500, nodesize = 100 ) | |
# kernlab one-svc | |
oc_dat <- data.frame(y = oc_y_train, oc_x_train) | |
m6 <- ksvm(y ~ .,data=oc_dat,kernel="rbfdot", type = "one-svc", | |
kpar=list(sigma=0.05),C=1, nu = 0.1) | |
# XGB | |
m7 <- xgboost(data = as.matrix(train[,-1]), label = train$y, max.depth = 4, eta = 0.1, | |
nround = 10, objective = "binary:logistic") | |
# MAXENT | |
m8 <- maxent(x = x_train, p = y_train) | |
## Predict models | |
# basic logistic regression | |
p1 <- predict(m1, x_train, type = "response") | |
p1 <- data.frame(pred = p1, obs = y_train_num) | |
p1_test <- predict(m1, newdata = test, type = "response") | |
p1_test <- data.frame(pred = p1_test, obs = y_test_num) | |
# SVM - degaults | |
p2 <- predict(m2, x_train, probability=TRUE) | |
p2 <- data.frame(pred = attr(p2, "probabilities")[,"1"], obs = y_train_num) | |
p2_test <- predict(m2, newdata = x_test, probability=TRUE) | |
p2_test <- data.frame(pred = attr(p2_test, "probabilities")[,"1"], obs = y_test_num) | |
# SVM - various kernels & k = 5 fold CV | |
p3 <- predict(m3, x_train, probability=TRUE) | |
p3 <- data.frame(pred = attr(p3, "probabilities")[,"1"], obs = y_train_num) | |
p3_test <- predict(m3, newdata = x_test, probability=TRUE) | |
p3_test <- data.frame(pred = attr(p3_test, "probabilities")[,"1"], obs = y_test_num) | |
# SVM - one-class-classification | |
p4 <- predict(m4, oc_x_train) | |
p4 <- ifelse(p4 == TRUE, 1, 0) | |
p4 <- data.frame(pred = as.numeric(p4), obs = oc_y_train) | |
p4_test <- predict(m4, newdata = x_test) | |
p4_test <- ifelse(p4_test == TRUE, 1, 0) | |
p4_test <- data.frame(pred = as.numeric(p4_test), obs = y_test_num) | |
# RF | |
p5 <- predict(m5, x_train, type="prob") | |
p5 <- data.frame(pred = p5[,"1"], obs = y_train_num) | |
p5_test <- predict(m5, newdata = x_test, type="prob") | |
p5_test <- data.frame(pred = p5_test[,"1"], obs = y_test_num) | |
# kernlab oc SVM | |
p6 <- predict(m6, oc_x_train) | |
p6 <- ifelse(p6 == TRUE, 1, 0) | |
p6 <- data.frame(pred = p6, obs = oc_y_train) | |
p6_test <- predict(m6, newdata = x_test) | |
p6_test <- ifelse(p6_test == TRUE, 1, 0) | |
p6_test <- data.frame(pred = p6_test, obs = y_test_num) | |
# XGB | |
p7 <- predict(m7, as.matrix(x_train)) | |
p7 <- data.frame(pred = p7, obs = y_train) | |
p7_test <- predict(m7, newdata = as.matrix(x_test)) | |
p7_test <- data.frame(pred = p7_test, obs = y_test_num) | |
# MAXENT | |
p8 <- predict(m8, as.matrix(x_train)) | |
p8 <- data.frame(pred = p8, obs = y_train) | |
p8_test <- predict(m8, as.matrix(x_test)) | |
p8_test <- data.frame(pred = p8_test, obs = y_test_num) | |
## Test results | |
# RMSE (my function, not caret function) | |
p1_RMSE_train <- round(RMSE(p1$obs, p1$pred),3) | |
p1_RMSE_test <- round(RMSE(p1_test$obs, p1_test$pred),3) | |
p2_RMSE_train <- round(RMSE(p2$obs, p2$pred),3) | |
p2_RMSE_test <- round(RMSE(p2_test$obs, p2_test$pred),3) | |
p3_RMSE_train <- round(RMSE(p3$obs, p3$pred),3) | |
p3_RMSE_test <- round(RMSE(p3_test$obs, p3_test$pred),3) | |
p4_RMSE_train <- round(RMSE(p4$obs, p4$pred),3) | |
p4_RMSE_test <- round(RMSE(p4_test$obs, p4_test$pred),3) | |
p5_RMSE_train <- round(RMSE(p5$obs, p5$pred),3) | |
p5_RMSE_test <- round(RMSE(p5_test$obs, p5_test$pred),3) | |
p6_RMSE_train <- round(RMSE(p6$obs, p6$pred),3) | |
p6_RMSE_test <- round(RMSE(p6_test$obs, p6_test$pred),3) | |
p7_RMSE_train <- round(RMSE(p7$obs, p7$pred),3) | |
p7_RMSE_test <- round(RMSE(p7_test$obs, p7_test$pred),3) | |
p8_RMSE_train <- round(RMSE(p8$obs, p8$pred),3) | |
p8_RMSE_test <- round(RMSE(p8_test$obs, p8_test$pred),3) | |
RMSE_scores <- data.frame(models = c("GLM", "SVM RBF", "SVM Poly", "SVM OCC", | |
"RF", "one-svc", "XGB", "MAXENT"), | |
RMSE_train = c(p1_RMSE_train, p2_RMSE_train, | |
p3_RMSE_train, p4_RMSE_train, | |
p5_RMSE_train, p6_RMSE_train, | |
p7_RMSE_train, p8_RMSE_train), | |
RMSE_test = c(p1_RMSE_test, p2_RMSE_test, | |
p3_RMSE_test, p4_RMSE_test, | |
p5_RMSE_test, p6_RMSE_test, | |
p7_RMSE_test, p8_RMSE_test)) | |
# AUC - AUC package | |
p1_roc <- roc(p1$pred, as.factor(p1$obs)) | |
p1_train_auc <- round(auc(p1_roc),3) | |
p1_test_roc <- roc(p1_test$pred, as.factor(p1_test$obs)) | |
p1_test_auc <- round(auc(p1_test_roc),3) | |
p2_roc <- roc(p2$pred, as.factor(p2$obs)) | |
p2_train_auc <- round(auc(p2_roc),3) | |
p2_test_roc <- roc(p2_test$pred, as.factor(p2_test$obs)) | |
p2_test_auc <- round(auc(p2_test_roc),3) | |
p3_roc <- roc(p3$pred, as.factor(p3$obs)) | |
p3_train_auc <- round(auc(p3_roc),3) | |
p3_test_roc <- roc(p3_test$pred, as.factor(p3_test$obs)) | |
p3_test_auc <- round(auc(p3_test_roc),3) | |
p5_roc <- roc(p5$pred, as.factor(p5$obs)) | |
p5_train_auc <- round(auc(p5_roc),3) | |
p5_test_roc <- roc(p5_test$pred, as.factor(p5_test$obs)) | |
p5_test_auc <- round(auc(p5_test_roc),3) | |
p7_roc <- roc(p7$pred, as.factor(p7$obs)) | |
p7_train_auc <- round(auc(p7_roc),3) | |
p7_test_roc <- roc(p7_test$pred, as.factor(p7_test$obs)) | |
p7_test_auc <- round(auc(p7_test_roc),3) | |
p8_roc <- roc(p8$pred, as.factor(p8$obs)) | |
p8_train_auc <- round(auc(p8_roc),3) | |
p8_test_roc <- roc(p8_test$pred, as.factor(p8_test$obs)) | |
p8_test_auc <- round(auc(p8_test_roc),3) | |
auc_scores <- data.frame(models = c("GLM", "SVM RBF", "SVM Poly", "RF", "XGB", "MAXENT"), | |
AUC_train = c(p1_train_auc, p2_train_auc, | |
p3_train_auc, p5_train_auc, | |
p7_train_auc, p8_train_auc), | |
AUC_test = c(p1_test_auc, p2_test_auc, | |
p3_test_auc, p5_test_auc, | |
p7_test_auc, p8_test_auc)) | |
kable(RMSE_scores) | |
# print(xtable(RMSE_scores, digits = c(3)), type = "html", include.rownames=FALSE) | |
kable(auc_scores) | |
# print(xtable(auc_scores, digits = c(3)), type = "html", include.rownames=FALSE) | |
# confusion matrix | |
(c4 <- table(p4)) # only 1 ref class | |
(c4_test <- confusionMatrix(p4_test$pred, p4_test$obs)) | |
# plot lowess response curve for interest | |
plot(plot.logreg(p1)) | |
plot(plot.logreg(p1_test)) | |
plot(plot.logreg(p2)) | |
plot(plot.logreg(p2_test)) | |
plot(plot.logreg(p3)) | |
plot(plot.logreg(p3_test)) | |
# plot(plot.logreg(p4)) # no probabilities, not effective plot | |
# plot(plot.logreg(p4_test)) # same as above | |
plot(plot.logreg(p5)) | |
plot(plot.logreg(p5_test)) | |
plot(plot.logreg(p7)) | |
plot(plot.logreg(p7_test)) | |
plot(plot.logreg(p8)) | |
plot(plot.logreg(p8_test)) | |
### Testing ggplot multivariate discrimination plot | |
model_num <- 2 | |
model <- get(paste0("m", model_num)) | |
test_data_set <- get(paste0("p", model_num, "_test")) | |
train_data_set <- get(paste0("p", model_num)) | |
# create grid of feature space for prediction | |
pred_grid <- expand.grid(V1 = seq(min(p_dat$V1), max(p_dat$V1), | |
abs((min(p_dat$V1) -max(p_dat$V1)) /50)), | |
V2 = seq(min(p_dat$V2), max(p_dat$V2), | |
abs((min(p_dat$V2) -max(p_dat$V2)) /50))) | |
pred_grid <- data.frame(pred_grid) | |
# Predict feature space and join with train/test data and predictions | |
# results in dataframe formated for plotting | |
plot_dat <- plot.prediction.grid(model, pred_grid, mod_type, x_test, x_train, | |
test_data_set, train_data_set) | |
# plot the feature space and prediction | |
p <- ggplot() + | |
# feature space | |
geom_raster(data = filter(plot_dat, dset == "sim"), | |
aes(x = V1, y = V2, fill = pred), alpha = 0.9, | |
interpolate = TRUE) + | |
# viridis color of feature space | |
scale_fill_viridis(option="inferno", guide = guide_legend(title = "Prob."), | |
breaks = seq(0,1,0.1), labels = seq(0,1,0.1), | |
direction = 1) + | |
# background as points if desired | |
# geom_point(data = filter(plot_dat, obs == 0 & dset == "test"), | |
# aes(x = V1, y = V2), alpha = 0.05, color = "gray50") + | |
# background as density (ggalt package) | |
geom_bkde2d(data = filter(plot_dat, obs == 0 & dset == "test"), | |
aes(x = V1, y = V2), alpha = 0.6, color = "gray80", | |
bandwidth=c(0.1, 0.8)) + | |
# training point locations | |
geom_point(data = filter(plot_dat, obs == 1 & dset == "train"), | |
shape = 20, aes(x = V1, y = V2), color = "red", | |
alpha = 0.025) + | |
#test point locations | |
geom_point(data = filter(plot_dat, obs == 1 & dset == "test"), | |
shape = 20, aes(x = V1, y = V2), color = "green", | |
alpha = 0.03) + | |
# theme | |
theme_bw() + | |
# labs, needs developer version of ggplot as of 04/19/16 | |
labs(title="Archaeological Site Presence Probability", | |
subtitle="MAXENT, Test AUC = 0.833", | |
caption="Data: Pennsylvania pre-contact archaeological site locations", | |
x = "Cost Distance to Confluence", | |
y = "Terrain Position Index") + | |
# make it pretty | |
theme( | |
panel.border = element_rect(colour = "gray90"), | |
axis.text.x = element_text(angle = 0, size = 6, family = "Trebuchet MS"), | |
axis.text.y = element_text(size = 6, family = "Trebuchet MS"), | |
axis.title = element_text(size = 8, family = "Trebuchet MS"), | |
plot.caption = element_text(size = 8, hjust=0, | |
family = "Trebuchet MS"), | |
plot.title=element_text(family="TrebuchetMS-Bold"), | |
plot.subtitle=element_text(family="TrebuchetMS-Italic"), | |
legend.text = element_text(size = 8, family = "Trebuchet MS") | |
) | |
plot(p) | |
ggsave(filename = "MAXENT.png", width = 6, height = 4) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment