Skip to content

Instantly share code, notes, and snippets.

@mrecos
Created April 19, 2016 20:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mrecos/6fd3fac9087c60fb8bbf1a3361946d6c to your computer and use it in GitHub Desktop.
Save mrecos/6fd3fac9087c60fb8bbf1a3361946d6c to your computer and use it in GitHub Desktop.
code for blog post about visualizing predictions within feature space over a number of different models. https://matthewdharris.com/2016/04/19/predicting-in-feature-space/
### FUNCTIONS
# a quick function that plots the lowess response curve for binary data
plot.logreg <- function(dat){
dat$obs <- as.numeric(as.character(dat$obs)) # convert y factor to {0,1} digits
clr <- ifelse(dat$obs == 1, "orange", "blue")
pp <- ggplot(dat, aes(x = pred, y = obs)) +
geom_point(color = "gray30", alpha = 0.3) +
geom_jitter(width = 0.1, height = 0.11, color = clr) +
geom_rug(color = clr) +
geom_smooth(formula = y ~ x, se = FALSE, color = "gray25") +
geom_vline(xintercept = 0.5, color = "gray50", linetype = "dashed") +
scale_y_continuous(limits = c(-0.1,1.1), breaks = seq(0,1,0.2),
labels = seq(0,1,0.2)) +
scale_x_continuous(limits = c(-0.05,1.05), breaks = seq(0,1,0.2),
labels = seq(0,1,0.2)) +
ylab("Presence/Absence") +
xlab("Predicted Probability") +
theme_bw()
return(pp)
}
# Root Mean Square Error function
RMSE <- function(obs, pred){
rmse <- sqrt(mean((obs-pred)^2))
return(rmse)
}
# A quick plot function to looking at binary density overlap
plot.dens <- function(dens_dat){
library("reshape2")
vars <- ncol(dens_dat)/2
dens_melt <- reshape2::melt(dens_dat)
x_num <- NULL
for(i in 1:vars){
x_num <- c(x_num, rep(i, nrow(dens_dat)*2))
}
dens_melt$x_num <- x_num
dens_melt$presence <- rep(rep(c(1,0), each = nrow(dens_dat)),vars)
pp <- ggplot(dens_melt, aes(x = value, group = variable,
fill = as.factor(presence))) +
geom_density(alpha = 0.5) +
facet_grid(x_num ~ ., scales = "free") +
theme_bw()
return(pp)
}
# function to sample data for train/test sets
get.train.test.sets <- function(dat, train_v_test_fraction, sites_per_train, sites_per_test,
absence_presence_balance, data_reduction_fraction){
# set up train and test site SITENO lists
site_names <- unique(dat$SITENO)[-length(unique(dat$SITENO))] # remove "background"
train_sites <- site_names[sample(seq_along(site_names),length(site_names)*train_v_test_fraction)]
test_sites <- site_names[!(site_names %in% train_sites)]
# reduce train and test site data volume, but still sampling from test or train
if(sites_per_train == "all") sites_per_train <- length(train_sites)
if(sites_per_test == "all") sites_per_test <- length(test_sites)
tr_sites <- filter(dat, presence == 1, SITENO %in% sample(train_sites, sites_per_train))
te_sites <- filter(dat, presence == 1, SITENO %in% sample(test_sites , sites_per_test))
# sample a 3 X multiplied site background sample for test and train
tr_background <- filter(dat, presence == 0) %>%
sample_n(nrow(tr_sites)*absence_presence_balance)
te_background <- filter(dat, presence == 0) %>%
sample_n(nrow(te_sites)*absence_presence_balance)
# make test and train presence/absence data sets // reduce data volume significantly as % of cells
train <- rbind_all(list(tr_sites, tr_background)) %>%
sample_frac(data_reduction_fraction)
test <- rbind_all(list(te_sites, te_background)) %>%
sample_frac(data_reduction_fraction)
return(list(train = train, test = test))
}
# function for making final plot data for function space predictions
# a bit complicated, but needs to account for all the different predict methods
# each class of model has a bit of a different twist, so this function accounts for that
plot.prediction.grid <- function(model, pred_grid, model_type, x_test, x_train,
test_data_set, train_data_set){
# for Logistic Regression
if(class(model)[1] == "glm"){
p_test <- predict(model, newdata = pred_grid, type = "response")
p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim")
} else if(class(model)[1] == "randomForest"){
# for RF
p_test <- predict(model, newdata = pred_grid, type = "prob")
p_test <- data.frame(pred = p_test[,"1"], obs = 2, pred_grid, dset = "sim")
} else if(class(model)[1] == "svm"){
# for SVM-OC
if(length(unique(train_data_set$obs)) == 1){
p_test <- predict(model, newdata = pred_grid)
p_test <- ifelse(p_test == TRUE, 1, 0)
p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim")
} else if(length(unique(train_data_set$obs)) == 2){
# for SVM
p_test <- predict(model, newdata = pred_grid,probability=TRUE)
p_test <- data.frame(pred = attr(p_test, "probabilities")[,"1"], obs = 2, pred_grid, dset = "sim")
}
} else if(class(model)[1] == "ksvm"){
# one class
p_test <- predict(model, newdata = pred_grid)
p_test <- ifelse(p_test == TRUE, 1, 0)
p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim")
} else if(class(model)[1] == "xgb.Booster"){
# for XGB
p_test <- predict(model, newdata = as.matrix(pred_grid))
p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim")
} else if(class(model)[1] == "MaxEnt"){
# for MAXENT
p_test <- predict(model, as.matrix(pred_grid))
p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim")
}
plot_test <- data.frame(test_data_set, x_test)
if(length(unique(train_data_set$obs)) == 1){
plot_train <- data.frame(train_data_set, oc_x_train)
} else if(length(unique(train_data_set$obs)) == 2){
plot_train <- data.frame(train_data_set, x_train)
}
plot_dat <- data.frame(rbind(plot_train, plot_test),
dset = rep(c("train", "test"),
times = c(nrow(plot_train), nrow(plot_test))))
plot_dat <- rbind(plot_dat, p_test)
plot_dat$V1 <- as.numeric(scale(plot_dat$V1, center = TRUE))
plot_dat$V2 <- as.numeric(scale(plot_dat$V2, center = TRUE))
return(plot_dat)
}
# Not in function
'%ni%' <- Negate('%in%')
### PACKAGES
library("ggplot2")
library("ggalt")
library("dplyr")
library("e1071")
library("caret") # only for confmatrix
library("AUC")
library("data.table")
library("randomForest")
library("kernlab")
library("viridis")
library("xgboost")
library("dismo")
library("rJava")
library("xtable")
library("knitr")
### CODE
## Get Real Data
og_dat <- fread("~/YOUR_DATA.csv")
# remove some uneeded columns
dat <- data.frame(og_dat)
dat <- dat[!(substr(dat$SITENO, 1, 4) == "36XX"),]
dat <- dat[, c("cd_conf", "tpi_250c", "presence", "SITENO")]
colnames(dat) <- c("V1", "V2", "presence", "SITENO")
# set up data partition variables
train_v_test_fraction <- 0.75 # 0 to 1
sites_per_train <- "all" # integer or "all"
sites_per_test <- "all" # integer or "all"
absence_presence_balance <- 1.1 # 1 for balanced; typically 3
data_reduction_fraction <- 0.2 # 0 to 1
# retrive a train/test set as list
test_train <- get.train.test.sets(dat, train_v_test_fraction, sites_per_train, sites_per_test,
absence_presence_balance, data_reduction_fraction)
# split train/test list into individual train and test objects
train <- test_train[["train"]]
test <- test_train[["test"]]
train <- data.frame(y = train$presence, x1 = train[,1], x2 = train[,2])
test <- data.frame(y = test$presence, x1 = test[,1], x2 = test[,2])
# plot density just for fun
p_dat <- data.frame(rbind(test_train[["train"]], test_train[["test"]]))
x1_1 <- p_dat[which(p_dat$presence == 1),1]
x1_0 <- sample(p_dat[which(p_dat$presence == 0),1], length(x1_1))
x2_1 <- p_dat[which(p_dat$presence == 1),2]
x2_0 <- sample(p_dat[which(p_dat$presence == 0),2], length(x2_1))
plot.dens(data.frame(x1_1, x1_0, x2_1,x2_0))
# ## Or... Simulate Data
# y1 <- rep(1,100)
# y0 <- rep(0,100)
# x1_1 <- rnorm(100,1,1)
# x1_0 <- rnorm(100,3,1)
# x2_1 <- rnorm(100,5,1)
# x2_0 <- rnorm(100,6,1)
# # needs to go 1 then 0 for each x_i
# plot.dens(data.frame(x1_1, x1_0, x2_1,x2_0))
# dat <- data.frame(y = c(y1,y0), x1 = c(x1_1,x1_0), x2 = c(x2_1, x2_0))
# train <- sample_frac(dat,0.5)
# train_index <- as.numeric(row.names(train))
# test <- dat[-train_index,]
# for vector representation
x_train <- train[,-1] # remove "y"
y_train <- train$y
y_train_num <- as.numeric(as.character(y_train))
x_test <- test[,-1] # remove "y"
y_test <- test$y
y_test_num <- as.numeric(as.character(y_test))
# for one-class-classification
oc_train <- train[which(train$y == 1),]
oc_x_train <- oc_train[,-1] # remove "y"
oc_y_train <- as.numeric(as.character(oc_train$y))
## Build Models
# basic Logistic Regression
m1 <- glm(y ~ ., data = train, family = "binomial")
# SVM - defaults C-classification, Radial basis function (RBF)
m2 <- svm(x = x_train, y = as.factor(y_train), probability = TRUE)
# SVM - various kernels and hyperparameters
wts <- table(as.factor(y_train)) / length(y_train)
m3 <- svm(x = x_train, y = as.factor(y_train), probability = TRUE,
cost = 5, kernel = "radial", class.weights = wts)
# SVM - one-classification
m4 <- svm(x = oc_x_train, y = oc_y_train, data = train,
type = "one-classification", nu = 0.1, cost = 1)
# RF
m5 <- randomForest(x = x_train, y = as.factor(y_train),
ntree = 500, nodesize = 100 )
# kernlab one-svc
oc_dat <- data.frame(y = oc_y_train, oc_x_train)
m6 <- ksvm(y ~ .,data=oc_dat,kernel="rbfdot", type = "one-svc",
kpar=list(sigma=0.05),C=1, nu = 0.1)
# XGB
m7 <- xgboost(data = as.matrix(train[,-1]), label = train$y, max.depth = 4, eta = 0.1,
nround = 10, objective = "binary:logistic")
# MAXENT
m8 <- maxent(x = x_train, p = y_train)
## Predict models
# basic logistic regression
p1 <- predict(m1, x_train, type = "response")
p1 <- data.frame(pred = p1, obs = y_train_num)
p1_test <- predict(m1, newdata = test, type = "response")
p1_test <- data.frame(pred = p1_test, obs = y_test_num)
# SVM - degaults
p2 <- predict(m2, x_train, probability=TRUE)
p2 <- data.frame(pred = attr(p2, "probabilities")[,"1"], obs = y_train_num)
p2_test <- predict(m2, newdata = x_test, probability=TRUE)
p2_test <- data.frame(pred = attr(p2_test, "probabilities")[,"1"], obs = y_test_num)
# SVM - various kernels & k = 5 fold CV
p3 <- predict(m3, x_train, probability=TRUE)
p3 <- data.frame(pred = attr(p3, "probabilities")[,"1"], obs = y_train_num)
p3_test <- predict(m3, newdata = x_test, probability=TRUE)
p3_test <- data.frame(pred = attr(p3_test, "probabilities")[,"1"], obs = y_test_num)
# SVM - one-class-classification
p4 <- predict(m4, oc_x_train)
p4 <- ifelse(p4 == TRUE, 1, 0)
p4 <- data.frame(pred = as.numeric(p4), obs = oc_y_train)
p4_test <- predict(m4, newdata = x_test)
p4_test <- ifelse(p4_test == TRUE, 1, 0)
p4_test <- data.frame(pred = as.numeric(p4_test), obs = y_test_num)
# RF
p5 <- predict(m5, x_train, type="prob")
p5 <- data.frame(pred = p5[,"1"], obs = y_train_num)
p5_test <- predict(m5, newdata = x_test, type="prob")
p5_test <- data.frame(pred = p5_test[,"1"], obs = y_test_num)
# kernlab oc SVM
p6 <- predict(m6, oc_x_train)
p6 <- ifelse(p6 == TRUE, 1, 0)
p6 <- data.frame(pred = p6, obs = oc_y_train)
p6_test <- predict(m6, newdata = x_test)
p6_test <- ifelse(p6_test == TRUE, 1, 0)
p6_test <- data.frame(pred = p6_test, obs = y_test_num)
# XGB
p7 <- predict(m7, as.matrix(x_train))
p7 <- data.frame(pred = p7, obs = y_train)
p7_test <- predict(m7, newdata = as.matrix(x_test))
p7_test <- data.frame(pred = p7_test, obs = y_test_num)
# MAXENT
p8 <- predict(m8, as.matrix(x_train))
p8 <- data.frame(pred = p8, obs = y_train)
p8_test <- predict(m8, as.matrix(x_test))
p8_test <- data.frame(pred = p8_test, obs = y_test_num)
## Test results
# RMSE (my function, not caret function)
p1_RMSE_train <- round(RMSE(p1$obs, p1$pred),3)
p1_RMSE_test <- round(RMSE(p1_test$obs, p1_test$pred),3)
p2_RMSE_train <- round(RMSE(p2$obs, p2$pred),3)
p2_RMSE_test <- round(RMSE(p2_test$obs, p2_test$pred),3)
p3_RMSE_train <- round(RMSE(p3$obs, p3$pred),3)
p3_RMSE_test <- round(RMSE(p3_test$obs, p3_test$pred),3)
p4_RMSE_train <- round(RMSE(p4$obs, p4$pred),3)
p4_RMSE_test <- round(RMSE(p4_test$obs, p4_test$pred),3)
p5_RMSE_train <- round(RMSE(p5$obs, p5$pred),3)
p5_RMSE_test <- round(RMSE(p5_test$obs, p5_test$pred),3)
p6_RMSE_train <- round(RMSE(p6$obs, p6$pred),3)
p6_RMSE_test <- round(RMSE(p6_test$obs, p6_test$pred),3)
p7_RMSE_train <- round(RMSE(p7$obs, p7$pred),3)
p7_RMSE_test <- round(RMSE(p7_test$obs, p7_test$pred),3)
p8_RMSE_train <- round(RMSE(p8$obs, p8$pred),3)
p8_RMSE_test <- round(RMSE(p8_test$obs, p8_test$pred),3)
RMSE_scores <- data.frame(models = c("GLM", "SVM RBF", "SVM Poly", "SVM OCC",
"RF", "one-svc", "XGB", "MAXENT"),
RMSE_train = c(p1_RMSE_train, p2_RMSE_train,
p3_RMSE_train, p4_RMSE_train,
p5_RMSE_train, p6_RMSE_train,
p7_RMSE_train, p8_RMSE_train),
RMSE_test = c(p1_RMSE_test, p2_RMSE_test,
p3_RMSE_test, p4_RMSE_test,
p5_RMSE_test, p6_RMSE_test,
p7_RMSE_test, p8_RMSE_test))
# AUC - AUC package
p1_roc <- roc(p1$pred, as.factor(p1$obs))
p1_train_auc <- round(auc(p1_roc),3)
p1_test_roc <- roc(p1_test$pred, as.factor(p1_test$obs))
p1_test_auc <- round(auc(p1_test_roc),3)
p2_roc <- roc(p2$pred, as.factor(p2$obs))
p2_train_auc <- round(auc(p2_roc),3)
p2_test_roc <- roc(p2_test$pred, as.factor(p2_test$obs))
p2_test_auc <- round(auc(p2_test_roc),3)
p3_roc <- roc(p3$pred, as.factor(p3$obs))
p3_train_auc <- round(auc(p3_roc),3)
p3_test_roc <- roc(p3_test$pred, as.factor(p3_test$obs))
p3_test_auc <- round(auc(p3_test_roc),3)
p5_roc <- roc(p5$pred, as.factor(p5$obs))
p5_train_auc <- round(auc(p5_roc),3)
p5_test_roc <- roc(p5_test$pred, as.factor(p5_test$obs))
p5_test_auc <- round(auc(p5_test_roc),3)
p7_roc <- roc(p7$pred, as.factor(p7$obs))
p7_train_auc <- round(auc(p7_roc),3)
p7_test_roc <- roc(p7_test$pred, as.factor(p7_test$obs))
p7_test_auc <- round(auc(p7_test_roc),3)
p8_roc <- roc(p8$pred, as.factor(p8$obs))
p8_train_auc <- round(auc(p8_roc),3)
p8_test_roc <- roc(p8_test$pred, as.factor(p8_test$obs))
p8_test_auc <- round(auc(p8_test_roc),3)
auc_scores <- data.frame(models = c("GLM", "SVM RBF", "SVM Poly", "RF", "XGB", "MAXENT"),
AUC_train = c(p1_train_auc, p2_train_auc,
p3_train_auc, p5_train_auc,
p7_train_auc, p8_train_auc),
AUC_test = c(p1_test_auc, p2_test_auc,
p3_test_auc, p5_test_auc,
p7_test_auc, p8_test_auc))
kable(RMSE_scores)
# print(xtable(RMSE_scores, digits = c(3)), type = "html", include.rownames=FALSE)
kable(auc_scores)
# print(xtable(auc_scores, digits = c(3)), type = "html", include.rownames=FALSE)
# confusion matrix
(c4 <- table(p4)) # only 1 ref class
(c4_test <- confusionMatrix(p4_test$pred, p4_test$obs))
# plot lowess response curve for interest
plot(plot.logreg(p1))
plot(plot.logreg(p1_test))
plot(plot.logreg(p2))
plot(plot.logreg(p2_test))
plot(plot.logreg(p3))
plot(plot.logreg(p3_test))
# plot(plot.logreg(p4)) # no probabilities, not effective plot
# plot(plot.logreg(p4_test)) # same as above
plot(plot.logreg(p5))
plot(plot.logreg(p5_test))
plot(plot.logreg(p7))
plot(plot.logreg(p7_test))
plot(plot.logreg(p8))
plot(plot.logreg(p8_test))
### Testing ggplot multivariate discrimination plot
model_num <- 2
model <- get(paste0("m", model_num))
test_data_set <- get(paste0("p", model_num, "_test"))
train_data_set <- get(paste0("p", model_num))
# create grid of feature space for prediction
pred_grid <- expand.grid(V1 = seq(min(p_dat$V1), max(p_dat$V1),
abs((min(p_dat$V1) -max(p_dat$V1)) /50)),
V2 = seq(min(p_dat$V2), max(p_dat$V2),
abs((min(p_dat$V2) -max(p_dat$V2)) /50)))
pred_grid <- data.frame(pred_grid)
# Predict feature space and join with train/test data and predictions
# results in dataframe formated for plotting
plot_dat <- plot.prediction.grid(model, pred_grid, mod_type, x_test, x_train,
test_data_set, train_data_set)
# plot the feature space and prediction
p <- ggplot() +
# feature space
geom_raster(data = filter(plot_dat, dset == "sim"),
aes(x = V1, y = V2, fill = pred), alpha = 0.9,
interpolate = TRUE) +
# viridis color of feature space
scale_fill_viridis(option="inferno", guide = guide_legend(title = "Prob."),
breaks = seq(0,1,0.1), labels = seq(0,1,0.1),
direction = 1) +
# background as points if desired
# geom_point(data = filter(plot_dat, obs == 0 & dset == "test"),
# aes(x = V1, y = V2), alpha = 0.05, color = "gray50") +
# background as density (ggalt package)
geom_bkde2d(data = filter(plot_dat, obs == 0 & dset == "test"),
aes(x = V1, y = V2), alpha = 0.6, color = "gray80",
bandwidth=c(0.1, 0.8)) +
# training point locations
geom_point(data = filter(plot_dat, obs == 1 & dset == "train"),
shape = 20, aes(x = V1, y = V2), color = "red",
alpha = 0.025) +
#test point locations
geom_point(data = filter(plot_dat, obs == 1 & dset == "test"),
shape = 20, aes(x = V1, y = V2), color = "green",
alpha = 0.03) +
# theme
theme_bw() +
# labs, needs developer version of ggplot as of 04/19/16
labs(title="Archaeological Site Presence Probability",
subtitle="MAXENT, Test AUC = 0.833",
caption="Data: Pennsylvania pre-contact archaeological site locations",
x = "Cost Distance to Confluence",
y = "Terrain Position Index") +
# make it pretty
theme(
panel.border = element_rect(colour = "gray90"),
axis.text.x = element_text(angle = 0, size = 6, family = "Trebuchet MS"),
axis.text.y = element_text(size = 6, family = "Trebuchet MS"),
axis.title = element_text(size = 8, family = "Trebuchet MS"),
plot.caption = element_text(size = 8, hjust=0,
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
plot.subtitle=element_text(family="TrebuchetMS-Italic"),
legend.text = element_text(size = 8, family = "Trebuchet MS")
)
plot(p)
ggsave(filename = "MAXENT.png", width = 6, height = 4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment