mrecos/Feature_space_prediction.r

## Feature_space_prediction.r
### FUNCTIONS
# a quick function that plots the lowess response curve for binary data
plot.logreg <- function(dat){
  dat$obs <- as.numeric(as.character(dat$obs)) # convert y factor to {0,1} digits
  clr <- ifelse(dat$obs == 1, "orange", "blue")
  pp <- ggplot(dat, aes(x = pred, y = obs)) +
    geom_point(color = "gray30", alpha = 0.3) +
    geom_jitter(width = 0.1, height = 0.11, color = clr) +
    geom_rug(color = clr) +
    geom_smooth(formula = y ~ x, se = FALSE, color = "gray25") +
    geom_vline(xintercept = 0.5, color = "gray50", linetype = "dashed") +
    scale_y_continuous(limits = c(-0.1,1.1), breaks = seq(0,1,0.2),
                         labels = seq(0,1,0.2)) +
    scale_x_continuous(limits = c(-0.05,1.05), breaks = seq(0,1,0.2),
                       labels = seq(0,1,0.2)) +
    ylab("Presence/Absence") +
    xlab("Predicted Probability") +
    theme_bw()
  return(pp)
}
# Root Mean Square Error function
RMSE <- function(obs, pred){
  rmse <- sqrt(mean((obs-pred)^2))
  return(rmse)
}
# A quick plot function to looking at binary density overlap
plot.dens <- function(dens_dat){
  library("reshape2")
  vars <- ncol(dens_dat)/2
  dens_melt <- reshape2::melt(dens_dat)
  x_num <- NULL
  for(i in 1:vars){
     x_num <- c(x_num, rep(i, nrow(dens_dat)*2))
  }
  dens_melt$x_num <- x_num
  dens_melt$presence <- rep(rep(c(1,0), each = nrow(dens_dat)),vars)
  pp <- ggplot(dens_melt, aes(x = value, group = variable,
                              fill = as.factor(presence))) +
    geom_density(alpha = 0.5) +
    facet_grid(x_num ~ ., scales = "free") +
    theme_bw()
  return(pp)
}

# function to sample data for train/test sets
get.train.test.sets <- function(dat, train_v_test_fraction, sites_per_train, sites_per_test,
                                absence_presence_balance, data_reduction_fraction){
  # set up train and test site SITENO lists
  site_names <- unique(dat$SITENO)[-length(unique(dat$SITENO))] # remove "background"
  train_sites <- site_names[sample(seq_along(site_names),length(site_names)*train_v_test_fraction)]
  test_sites <- site_names[!(site_names %in% train_sites)]

  # reduce train and test site data volume, but still sampling from test or train
  if(sites_per_train == "all") sites_per_train <- length(train_sites)
  if(sites_per_test == "all") sites_per_test <- length(test_sites)
  tr_sites <- filter(dat, presence == 1, SITENO %in% sample(train_sites, sites_per_train))
  te_sites <- filter(dat, presence == 1, SITENO %in% sample(test_sites , sites_per_test))


  # sample a 3 X multiplied site background sample for test and train
  tr_background <- filter(dat, presence == 0) %>%
    sample_n(nrow(tr_sites)*absence_presence_balance)
  te_background <- filter(dat, presence == 0) %>%
    sample_n(nrow(te_sites)*absence_presence_balance)

  # make test and train presence/absence data sets // reduce data volume significantly as % of cells
  train <- rbind_all(list(tr_sites, tr_background)) %>%
    sample_frac(data_reduction_fraction)
  test <- rbind_all(list(te_sites, te_background)) %>%
    sample_frac(data_reduction_fraction)

  return(list(train = train, test = test))
}

# function for making final plot data for function space predictions
# a bit complicated, but needs to account for all the different predict methods
# each class of model has a bit of a different twist, so this function accounts for that
plot.prediction.grid <- function(model, pred_grid, model_type, x_test, x_train,
                                 test_data_set, train_data_set){
  # for Logistic Regression
  if(class(model)[1] == "glm"){
    p_test <- predict(model, newdata = pred_grid, type = "response")
    p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim")
  } else if(class(model)[1]  == "randomForest"){
    # for RF
    p_test <- predict(model, newdata = pred_grid, type = "prob")
    p_test <- data.frame(pred = p_test[,"1"], obs = 2, pred_grid, dset = "sim")
  } else if(class(model)[1]  == "svm"){
    # for SVM-OC
    if(length(unique(train_data_set$obs)) == 1){
      p_test <- predict(model, newdata = pred_grid)
      p_test <- ifelse(p_test == TRUE, 1, 0)
      p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim")
    } else if(length(unique(train_data_set$obs)) == 2){
      # for SVM
      p_test <- predict(model, newdata = pred_grid,probability=TRUE)
      p_test <- data.frame(pred = attr(p_test, "probabilities")[,"1"], obs = 2, pred_grid, dset = "sim")
    }
  } else if(class(model)[1]  == "ksvm"){
    # one class
    p_test <- predict(model, newdata = pred_grid)
    p_test <- ifelse(p_test == TRUE, 1, 0)
    p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim")
  } else if(class(model)[1]  == "xgb.Booster"){
    # for XGB
    p_test <- predict(model, newdata = as.matrix(pred_grid))
    p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim")
  } else if(class(model)[1]  == "MaxEnt"){
  # for MAXENT
  p_test <- predict(model, as.matrix(pred_grid))
  p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim")
  }
  plot_test <- data.frame(test_data_set, x_test)
  if(length(unique(train_data_set$obs)) == 1){
    plot_train <- data.frame(train_data_set, oc_x_train)
  } else if(length(unique(train_data_set$obs)) == 2){
    plot_train <- data.frame(train_data_set, x_train)
  }
  plot_dat <- data.frame(rbind(plot_train, plot_test),
                         dset = rep(c("train", "test"),
                                    times = c(nrow(plot_train), nrow(plot_test))))
  plot_dat <- rbind(plot_dat, p_test)
  plot_dat$V1 <- as.numeric(scale(plot_dat$V1, center = TRUE))
  plot_dat$V2 <- as.numeric(scale(plot_dat$V2, center = TRUE))
  return(plot_dat)
}

# Not in function
'%ni%' <- Negate('%in%')


### PACKAGES
library("ggplot2")
library("ggalt")
library("dplyr")
library("e1071")
library("caret") # only for confmatrix
library("AUC")
library("data.table")
library("randomForest")
library("kernlab")
library("viridis")
library("xgboost")
library("dismo")
library("rJava")
library("xtable")
library("knitr")

### CODE
## Get Real Data
og_dat <- fread("~/YOUR_DATA.csv")
# remove some uneeded columns
dat <- data.frame(og_dat)
dat <- dat[!(substr(dat$SITENO, 1, 4) == "36XX"),]
dat <- dat[, c("cd_conf", "tpi_250c", "presence", "SITENO")]
colnames(dat) <- c("V1", "V2", "presence", "SITENO")
# set up data partition variables
train_v_test_fraction <- 0.75   # 0 to 1
sites_per_train <- "all"     # integer or "all"
sites_per_test <- "all"       # integer or "all"
absence_presence_balance <- 1.1  # 1 for balanced; typically 3
data_reduction_fraction <- 0.2 # 0 to 1
# retrive a train/test set as list
test_train <- get.train.test.sets(dat, train_v_test_fraction, sites_per_train, sites_per_test,
                                  absence_presence_balance, data_reduction_fraction)
# split train/test list into individual train and test objects
train <- test_train[["train"]]
test <- test_train[["test"]]
train <- data.frame(y = train$presence, x1 = train[,1], x2 = train[,2])
test <- data.frame(y = test$presence, x1 = test[,1], x2 = test[,2])
# plot density just for fun
p_dat <- data.frame(rbind(test_train[["train"]], test_train[["test"]]))
x1_1 <- p_dat[which(p_dat$presence == 1),1]
x1_0 <- sample(p_dat[which(p_dat$presence == 0),1], length(x1_1))
x2_1 <- p_dat[which(p_dat$presence == 1),2]
x2_0 <- sample(p_dat[which(p_dat$presence == 0),2], length(x2_1))
plot.dens(data.frame(x1_1, x1_0, x2_1,x2_0))

# ## Or... Simulate Data
# y1 <- rep(1,100)
# y0 <- rep(0,100)
# x1_1 <- rnorm(100,1,1)
# x1_0 <- rnorm(100,3,1)
# x2_1 <- rnorm(100,5,1)
# x2_0 <- rnorm(100,6,1)
# # needs to go 1 then 0 for each x_i
# plot.dens(data.frame(x1_1, x1_0, x2_1,x2_0))
# dat <- data.frame(y = c(y1,y0), x1 = c(x1_1,x1_0), x2 = c(x2_1, x2_0))
# train <- sample_frac(dat,0.5)
# train_index <- as.numeric(row.names(train))
# test <- dat[-train_index,]

# for vector representation
x_train <- train[,-1] # remove "y"
y_train <- train$y
y_train_num <- as.numeric(as.character(y_train))
x_test <- test[,-1] # remove "y"
y_test <- test$y
y_test_num <- as.numeric(as.character(y_test))
# for one-class-classification
oc_train <- train[which(train$y == 1),]
oc_x_train <- oc_train[,-1] # remove "y"
oc_y_train <- as.numeric(as.character(oc_train$y))

  ## Build Models
  # basic Logistic Regression
  m1 <- glm(y ~ ., data = train, family = "binomial")
  # SVM - defaults C-classification, Radial basis function (RBF)
  m2 <- svm(x = x_train, y = as.factor(y_train), probability = TRUE)
  # SVM - various kernels and hyperparameters
  wts <- table(as.factor(y_train)) / length(y_train)
  m3 <- svm(x = x_train, y = as.factor(y_train), probability = TRUE,
            cost = 5, kernel = "radial", class.weights = wts)
  # SVM - one-classification
  m4 <- svm(x = oc_x_train, y = oc_y_train, data = train,
            type = "one-classification", nu = 0.1, cost = 1)
  # RF
  m5 <- randomForest(x = x_train, y = as.factor(y_train),
                     ntree = 500, nodesize = 100 )
  # kernlab one-svc
  oc_dat <- data.frame(y = oc_y_train, oc_x_train)
  m6 <-  ksvm(y ~ .,data=oc_dat,kernel="rbfdot", type = "one-svc",
              kpar=list(sigma=0.05),C=1, nu = 0.1)
  # XGB
  m7 <- xgboost(data = as.matrix(train[,-1]), label = train$y, max.depth = 4, eta = 0.1,
                nround = 10, objective = "binary:logistic")
  # MAXENT
  m8 <- maxent(x = x_train, p = y_train)

  ## Predict models
  # basic logistic regression
  p1 <- predict(m1, x_train, type = "response")
  p1 <- data.frame(pred = p1, obs = y_train_num)
  p1_test <- predict(m1, newdata = test, type = "response")
  p1_test <- data.frame(pred = p1_test, obs = y_test_num)
  # SVM - degaults
  p2 <- predict(m2, x_train, probability=TRUE)
  p2 <- data.frame(pred = attr(p2, "probabilities")[,"1"], obs = y_train_num)
  p2_test <- predict(m2, newdata = x_test, probability=TRUE)
  p2_test <- data.frame(pred = attr(p2_test, "probabilities")[,"1"], obs = y_test_num)
  # SVM - various kernels & k = 5 fold CV
  p3 <- predict(m3, x_train, probability=TRUE)
  p3 <- data.frame(pred = attr(p3, "probabilities")[,"1"], obs = y_train_num)
  p3_test <- predict(m3, newdata = x_test, probability=TRUE)
  p3_test <- data.frame(pred = attr(p3_test, "probabilities")[,"1"], obs = y_test_num)
  # SVM - one-class-classification
  p4 <- predict(m4, oc_x_train)
  p4 <- ifelse(p4 == TRUE, 1, 0)
  p4 <- data.frame(pred = as.numeric(p4), obs = oc_y_train)
  p4_test <- predict(m4, newdata = x_test)
  p4_test <- ifelse(p4_test == TRUE, 1, 0)
  p4_test <- data.frame(pred = as.numeric(p4_test), obs = y_test_num)
  # RF
  p5 <- predict(m5, x_train, type="prob")
  p5 <- data.frame(pred = p5[,"1"], obs = y_train_num)
  p5_test <- predict(m5, newdata = x_test, type="prob")
  p5_test <- data.frame(pred = p5_test[,"1"], obs = y_test_num)
  # kernlab oc SVM
  p6 <- predict(m6, oc_x_train)
  p6 <- ifelse(p6 == TRUE, 1, 0)
  p6 <- data.frame(pred = p6, obs = oc_y_train)
  p6_test <- predict(m6, newdata = x_test)
  p6_test <- ifelse(p6_test == TRUE, 1, 0)
  p6_test <- data.frame(pred = p6_test, obs = y_test_num)
  # XGB
  p7 <- predict(m7, as.matrix(x_train))
  p7 <- data.frame(pred = p7, obs = y_train)
  p7_test <- predict(m7, newdata = as.matrix(x_test))
  p7_test <- data.frame(pred = p7_test, obs = y_test_num)
  # MAXENT
  p8 <- predict(m8, as.matrix(x_train))
  p8 <- data.frame(pred = p8, obs = y_train)
  p8_test <- predict(m8, as.matrix(x_test))
  p8_test <- data.frame(pred = p8_test, obs = y_test_num)


  ## Test results
  # RMSE (my function, not caret function)
  p1_RMSE_train <- round(RMSE(p1$obs, p1$pred),3)
  p1_RMSE_test <- round(RMSE(p1_test$obs, p1_test$pred),3)
  p2_RMSE_train <- round(RMSE(p2$obs, p2$pred),3)
  p2_RMSE_test <- round(RMSE(p2_test$obs, p2_test$pred),3)
  p3_RMSE_train <- round(RMSE(p3$obs, p3$pred),3)
  p3_RMSE_test <- round(RMSE(p3_test$obs, p3_test$pred),3)
  p4_RMSE_train <- round(RMSE(p4$obs, p4$pred),3)
  p4_RMSE_test <- round(RMSE(p4_test$obs, p4_test$pred),3)
  p5_RMSE_train <- round(RMSE(p5$obs, p5$pred),3)
  p5_RMSE_test <- round(RMSE(p5_test$obs, p5_test$pred),3)
  p6_RMSE_train <- round(RMSE(p6$obs, p6$pred),3)
  p6_RMSE_test <- round(RMSE(p6_test$obs, p6_test$pred),3)
  p7_RMSE_train <- round(RMSE(p7$obs, p7$pred),3)
  p7_RMSE_test <- round(RMSE(p7_test$obs, p7_test$pred),3)
  p8_RMSE_train <- round(RMSE(p8$obs, p8$pred),3)
  p8_RMSE_test <- round(RMSE(p8_test$obs, p8_test$pred),3)
  RMSE_scores <- data.frame(models = c("GLM", "SVM RBF", "SVM Poly", "SVM OCC",
                                       "RF", "one-svc", "XGB", "MAXENT"),
                           RMSE_train = c(p1_RMSE_train, p2_RMSE_train,
                                          p3_RMSE_train, p4_RMSE_train,
                                          p5_RMSE_train, p6_RMSE_train,
                                          p7_RMSE_train, p8_RMSE_train),
                           RMSE_test = c(p1_RMSE_test, p2_RMSE_test,
                                         p3_RMSE_test, p4_RMSE_test,
                                         p5_RMSE_test, p6_RMSE_test,
                                         p7_RMSE_test, p8_RMSE_test))
  # AUC - AUC package
  p1_roc <- roc(p1$pred, as.factor(p1$obs))
  p1_train_auc <- round(auc(p1_roc),3)
  p1_test_roc <- roc(p1_test$pred, as.factor(p1_test$obs))
  p1_test_auc <- round(auc(p1_test_roc),3)
  p2_roc <- roc(p2$pred, as.factor(p2$obs))
  p2_train_auc <- round(auc(p2_roc),3)
  p2_test_roc <- roc(p2_test$pred, as.factor(p2_test$obs))
  p2_test_auc <- round(auc(p2_test_roc),3)
  p3_roc <- roc(p3$pred, as.factor(p3$obs))
  p3_train_auc <- round(auc(p3_roc),3)
  p3_test_roc <- roc(p3_test$pred, as.factor(p3_test$obs))
  p3_test_auc <- round(auc(p3_test_roc),3)
  p5_roc <- roc(p5$pred, as.factor(p5$obs))
  p5_train_auc <- round(auc(p5_roc),3)
  p5_test_roc <- roc(p5_test$pred, as.factor(p5_test$obs))
  p5_test_auc <- round(auc(p5_test_roc),3)
  p7_roc <- roc(p7$pred, as.factor(p7$obs))
  p7_train_auc <- round(auc(p7_roc),3)
  p7_test_roc <- roc(p7_test$pred, as.factor(p7_test$obs))
  p7_test_auc <- round(auc(p7_test_roc),3)
  p8_roc <- roc(p8$pred, as.factor(p8$obs))
  p8_train_auc <- round(auc(p8_roc),3)
  p8_test_roc <- roc(p8_test$pred, as.factor(p8_test$obs))
  p8_test_auc <- round(auc(p8_test_roc),3)
  auc_scores <- data.frame(models = c("GLM", "SVM RBF", "SVM Poly", "RF", "XGB", "MAXENT"),
                           AUC_train = c(p1_train_auc, p2_train_auc,
                                         p3_train_auc, p5_train_auc,
                                         p7_train_auc, p8_train_auc),
                           AUC_test = c(p1_test_auc, p2_test_auc,
                                        p3_test_auc, p5_test_auc,
                                        p7_test_auc, p8_test_auc))
  kable(RMSE_scores)
  # print(xtable(RMSE_scores, digits = c(3)), type = "html", include.rownames=FALSE)
  kable(auc_scores)
  # print(xtable(auc_scores, digits = c(3)), type = "html", include.rownames=FALSE)

# confusion matrix
(c4 <- table(p4)) # only 1 ref class
(c4_test <- confusionMatrix(p4_test$pred, p4_test$obs))

# plot lowess response curve for interest
plot(plot.logreg(p1))
plot(plot.logreg(p1_test))
plot(plot.logreg(p2))
plot(plot.logreg(p2_test))
plot(plot.logreg(p3))
plot(plot.logreg(p3_test))
# plot(plot.logreg(p4)) # no probabilities, not effective plot
# plot(plot.logreg(p4_test)) # same as above
plot(plot.logreg(p5))
plot(plot.logreg(p5_test))
plot(plot.logreg(p7))
plot(plot.logreg(p7_test))
plot(plot.logreg(p8))
plot(plot.logreg(p8_test))


### Testing ggplot multivariate discrimination plot
model_num <- 2
model <- get(paste0("m", model_num))
test_data_set <- get(paste0("p", model_num, "_test"))
train_data_set <- get(paste0("p", model_num))
# create grid of feature space for prediction
pred_grid <- expand.grid(V1 = seq(min(p_dat$V1), max(p_dat$V1),
                                  abs((min(p_dat$V1) -max(p_dat$V1)) /50)),
                         V2 = seq(min(p_dat$V2), max(p_dat$V2),
                                  abs((min(p_dat$V2) -max(p_dat$V2)) /50)))
pred_grid <- data.frame(pred_grid)
# Predict feature space and join with train/test data and predictions
# results in dataframe formated for plotting
plot_dat <- plot.prediction.grid(model, pred_grid, mod_type, x_test, x_train,
                                 test_data_set, train_data_set)
# plot the feature space and prediction
p <- ggplot() +
  # feature space
  geom_raster(data = filter(plot_dat, dset == "sim"),
               aes(x = V1, y = V2, fill = pred), alpha = 0.9,
              interpolate = TRUE) +
  # viridis color of feature space
  scale_fill_viridis(option="inferno", guide = guide_legend(title = "Prob."),
                     breaks = seq(0,1,0.1), labels = seq(0,1,0.1),
                     direction = 1) +
  # background as points if desired
  # geom_point(data = filter(plot_dat, obs == 0 & dset == "test"),
             # aes(x = V1, y = V2), alpha = 0.05, color = "gray50") +
  # background as density (ggalt package)
  geom_bkde2d(data = filter(plot_dat, obs == 0 & dset == "test"),
              aes(x = V1, y = V2), alpha = 0.6, color = "gray80",
              bandwidth=c(0.1, 0.8)) +
  # training point locations
  geom_point(data = filter(plot_dat, obs == 1 & dset == "train"),
             shape = 20, aes(x = V1, y = V2), color = "red",
             alpha = 0.025) +
  #test point locations
  geom_point(data = filter(plot_dat, obs == 1 & dset == "test"),
             shape = 20, aes(x = V1, y = V2), color = "green",
             alpha = 0.03) +
  # theme
  theme_bw() +
  # labs, needs developer version of ggplot as of 04/19/16
  labs(title="Archaeological Site Presence Probability",
       subtitle="MAXENT, Test AUC = 0.833",
       caption="Data: Pennsylvania pre-contact archaeological site locations",
       x = "Cost Distance to Confluence",
       y = "Terrain Position Index") +
  # make it pretty
  theme(
    panel.border = element_rect(colour = "gray90"),
    axis.text.x = element_text(angle = 0, size = 6, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 6, family = "Trebuchet MS"),
    axis.title = element_text(size = 8, family = "Trebuchet MS"),
    plot.caption = element_text(size = 8, hjust=0,
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    plot.subtitle=element_text(family="TrebuchetMS-Italic"),
    legend.text = element_text(size = 8, family = "Trebuchet MS")
  )

plot(p)
ggsave(filename = "MAXENT.png", width = 6, height = 4)
	### FUNCTIONS
	# a quick function that plots the lowess response curve for binary data
	plot.logreg <- function(dat){
	dat$obs <- as.numeric(as.character(dat$obs)) # convert y factor to {0,1} digits
	clr <- ifelse(dat$obs == 1, "orange", "blue")
	pp <- ggplot(dat, aes(x = pred, y = obs)) +
	geom_point(color = "gray30", alpha = 0.3) +
	geom_jitter(width = 0.1, height = 0.11, color = clr) +
	geom_rug(color = clr) +
	geom_smooth(formula = y ~ x, se = FALSE, color = "gray25") +
	geom_vline(xintercept = 0.5, color = "gray50", linetype = "dashed") +
	scale_y_continuous(limits = c(-0.1,1.1), breaks = seq(0,1,0.2),
	labels = seq(0,1,0.2)) +
	scale_x_continuous(limits = c(-0.05,1.05), breaks = seq(0,1,0.2),
	labels = seq(0,1,0.2)) +
	ylab("Presence/Absence") +
	xlab("Predicted Probability") +
	theme_bw()
	return(pp)
	}
	# Root Mean Square Error function
	RMSE <- function(obs, pred){
	rmse <- sqrt(mean((obs-pred)^2))
	return(rmse)
	}
	# A quick plot function to looking at binary density overlap
	plot.dens <- function(dens_dat){
	library("reshape2")
	vars <- ncol(dens_dat)/2
	dens_melt <- reshape2::melt(dens_dat)
	x_num <- NULL
	for(i in 1:vars){
	x_num <- c(x_num, rep(i, nrow(dens_dat)*2))
	}
	dens_melt$x_num <- x_num
	dens_melt$presence <- rep(rep(c(1,0), each = nrow(dens_dat)),vars)
	pp <- ggplot(dens_melt, aes(x = value, group = variable,
	fill = as.factor(presence))) +
	geom_density(alpha = 0.5) +
	facet_grid(x_num ~ ., scales = "free") +
	theme_bw()
	return(pp)
	}

	# function to sample data for train/test sets
	get.train.test.sets <- function(dat, train_v_test_fraction, sites_per_train, sites_per_test,
	absence_presence_balance, data_reduction_fraction){
	# set up train and test site SITENO lists
	site_names <- unique(dat$SITENO)[-length(unique(dat$SITENO))] # remove "background"
	train_sites <- site_names[sample(seq_along(site_names),length(site_names)*train_v_test_fraction)]
	test_sites <- site_names[!(site_names %in% train_sites)]

	# reduce train and test site data volume, but still sampling from test or train
	if(sites_per_train == "all") sites_per_train <- length(train_sites)
	if(sites_per_test == "all") sites_per_test <- length(test_sites)
	tr_sites <- filter(dat, presence == 1, SITENO %in% sample(train_sites, sites_per_train))
	te_sites <- filter(dat, presence == 1, SITENO %in% sample(test_sites , sites_per_test))


	# sample a 3 X multiplied site background sample for test and train
	tr_background <- filter(dat, presence == 0) %>%
	sample_n(nrow(tr_sites)*absence_presence_balance)
	te_background <- filter(dat, presence == 0) %>%
	sample_n(nrow(te_sites)*absence_presence_balance)

	# make test and train presence/absence data sets // reduce data volume significantly as % of cells
	train <- rbind_all(list(tr_sites, tr_background)) %>%
	sample_frac(data_reduction_fraction)
	test <- rbind_all(list(te_sites, te_background)) %>%
	sample_frac(data_reduction_fraction)

	return(list(train = train, test = test))
	}

	# function for making final plot data for function space predictions
	# a bit complicated, but needs to account for all the different predict methods
	# each class of model has a bit of a different twist, so this function accounts for that
	plot.prediction.grid <- function(model, pred_grid, model_type, x_test, x_train,
	test_data_set, train_data_set){
	# for Logistic Regression
	if(class(model)[1] == "glm"){
	p_test <- predict(model, newdata = pred_grid, type = "response")
	p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim")
	} else if(class(model)[1] == "randomForest"){
	# for RF
	p_test <- predict(model, newdata = pred_grid, type = "prob")
	p_test <- data.frame(pred = p_test[,"1"], obs = 2, pred_grid, dset = "sim")
	} else if(class(model)[1] == "svm"){
	# for SVM-OC
	if(length(unique(train_data_set$obs)) == 1){
	p_test <- predict(model, newdata = pred_grid)
	p_test <- ifelse(p_test == TRUE, 1, 0)
	p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim")
	} else if(length(unique(train_data_set$obs)) == 2){
	# for SVM
	p_test <- predict(model, newdata = pred_grid,probability=TRUE)
	p_test <- data.frame(pred = attr(p_test, "probabilities")[,"1"], obs = 2, pred_grid, dset = "sim")
	}
	} else if(class(model)[1] == "ksvm"){
	# one class
	p_test <- predict(model, newdata = pred_grid)
	p_test <- ifelse(p_test == TRUE, 1, 0)
	p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim")
	} else if(class(model)[1] == "xgb.Booster"){
	# for XGB
	p_test <- predict(model, newdata = as.matrix(pred_grid))
	p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim")
	} else if(class(model)[1] == "MaxEnt"){
	# for MAXENT
	p_test <- predict(model, as.matrix(pred_grid))
	p_test <- data.frame(pred = p_test, obs = 2, pred_grid, dset = "sim")
	}
	plot_test <- data.frame(test_data_set, x_test)
	if(length(unique(train_data_set$obs)) == 1){
	plot_train <- data.frame(train_data_set, oc_x_train)
	} else if(length(unique(train_data_set$obs)) == 2){
	plot_train <- data.frame(train_data_set, x_train)
	}
	plot_dat <- data.frame(rbind(plot_train, plot_test),
	dset = rep(c("train", "test"),
	times = c(nrow(plot_train), nrow(plot_test))))
	plot_dat <- rbind(plot_dat, p_test)
	plot_dat$V1 <- as.numeric(scale(plot_dat$V1, center = TRUE))
	plot_dat$V2 <- as.numeric(scale(plot_dat$V2, center = TRUE))
	return(plot_dat)
	}

	# Not in function
	'%ni%' <- Negate('%in%')


	### PACKAGES
	library("ggplot2")
	library("ggalt")
	library("dplyr")
	library("e1071")
	library("caret") # only for confmatrix
	library("AUC")
	library("data.table")
	library("randomForest")
	library("kernlab")
	library("viridis")
	library("xgboost")
	library("dismo")
	library("rJava")
	library("xtable")
	library("knitr")

	### CODE
	## Get Real Data
	og_dat <- fread("~/YOUR_DATA.csv")
	# remove some uneeded columns
	dat <- data.frame(og_dat)
	dat <- dat[!(substr(dat$SITENO, 1, 4) == "36XX"),]
	dat <- dat[, c("cd_conf", "tpi_250c", "presence", "SITENO")]
	colnames(dat) <- c("V1", "V2", "presence", "SITENO")
	# set up data partition variables
	train_v_test_fraction <- 0.75 # 0 to 1
	sites_per_train <- "all" # integer or "all"
	sites_per_test <- "all" # integer or "all"
	absence_presence_balance <- 1.1 # 1 for balanced; typically 3
	data_reduction_fraction <- 0.2 # 0 to 1
	# retrive a train/test set as list
	test_train <- get.train.test.sets(dat, train_v_test_fraction, sites_per_train, sites_per_test,
	absence_presence_balance, data_reduction_fraction)
	# split train/test list into individual train and test objects
	train <- test_train[["train"]]
	test <- test_train[["test"]]
	train <- data.frame(y = train$presence, x1 = train[,1], x2 = train[,2])
	test <- data.frame(y = test$presence, x1 = test[,1], x2 = test[,2])
	# plot density just for fun
	p_dat <- data.frame(rbind(test_train[["train"]], test_train[["test"]]))
	x1_1 <- p_dat[which(p_dat$presence == 1),1]
	x1_0 <- sample(p_dat[which(p_dat$presence == 0),1], length(x1_1))
	x2_1 <- p_dat[which(p_dat$presence == 1),2]
	x2_0 <- sample(p_dat[which(p_dat$presence == 0),2], length(x2_1))
	plot.dens(data.frame(x1_1, x1_0, x2_1,x2_0))

	# ## Or... Simulate Data
	# y1 <- rep(1,100)
	# y0 <- rep(0,100)
	# x1_1 <- rnorm(100,1,1)
	# x1_0 <- rnorm(100,3,1)
	# x2_1 <- rnorm(100,5,1)
	# x2_0 <- rnorm(100,6,1)
	# # needs to go 1 then 0 for each x_i
	# plot.dens(data.frame(x1_1, x1_0, x2_1,x2_0))
	# dat <- data.frame(y = c(y1,y0), x1 = c(x1_1,x1_0), x2 = c(x2_1, x2_0))
	# train <- sample_frac(dat,0.5)
	# train_index <- as.numeric(row.names(train))
	# test <- dat[-train_index,]

	# for vector representation
	x_train <- train[,-1] # remove "y"
	y_train <- train$y
	y_train_num <- as.numeric(as.character(y_train))
	x_test <- test[,-1] # remove "y"
	y_test <- test$y
	y_test_num <- as.numeric(as.character(y_test))
	# for one-class-classification
	oc_train <- train[which(train$y == 1),]
	oc_x_train <- oc_train[,-1] # remove "y"
	oc_y_train <- as.numeric(as.character(oc_train$y))

	## Build Models
	# basic Logistic Regression
	m1 <- glm(y ~ ., data = train, family = "binomial")
	# SVM - defaults C-classification, Radial basis function (RBF)
	m2 <- svm(x = x_train, y = as.factor(y_train), probability = TRUE)
	# SVM - various kernels and hyperparameters
	wts <- table(as.factor(y_train)) / length(y_train)
	m3 <- svm(x = x_train, y = as.factor(y_train), probability = TRUE,
	cost = 5, kernel = "radial", class.weights = wts)
	# SVM - one-classification
	m4 <- svm(x = oc_x_train, y = oc_y_train, data = train,
	type = "one-classification", nu = 0.1, cost = 1)
	# RF
	m5 <- randomForest(x = x_train, y = as.factor(y_train),
	ntree = 500, nodesize = 100 )
	# kernlab one-svc
	oc_dat <- data.frame(y = oc_y_train, oc_x_train)
	m6 <- ksvm(y ~ .,data=oc_dat,kernel="rbfdot", type = "one-svc",
	kpar=list(sigma=0.05),C=1, nu = 0.1)
	# XGB
	m7 <- xgboost(data = as.matrix(train[,-1]), label = train$y, max.depth = 4, eta = 0.1,
	nround = 10, objective = "binary:logistic")
	# MAXENT
	m8 <- maxent(x = x_train, p = y_train)

	## Predict models
	# basic logistic regression
	p1 <- predict(m1, x_train, type = "response")
	p1 <- data.frame(pred = p1, obs = y_train_num)
	p1_test <- predict(m1, newdata = test, type = "response")
	p1_test <- data.frame(pred = p1_test, obs = y_test_num)
	# SVM - degaults
	p2 <- predict(m2, x_train, probability=TRUE)
	p2 <- data.frame(pred = attr(p2, "probabilities")[,"1"], obs = y_train_num)
	p2_test <- predict(m2, newdata = x_test, probability=TRUE)
	p2_test <- data.frame(pred = attr(p2_test, "probabilities")[,"1"], obs = y_test_num)
	# SVM - various kernels & k = 5 fold CV
	p3 <- predict(m3, x_train, probability=TRUE)
	p3 <- data.frame(pred = attr(p3, "probabilities")[,"1"], obs = y_train_num)
	p3_test <- predict(m3, newdata = x_test, probability=TRUE)
	p3_test <- data.frame(pred = attr(p3_test, "probabilities")[,"1"], obs = y_test_num)
	# SVM - one-class-classification
	p4 <- predict(m4, oc_x_train)
	p4 <- ifelse(p4 == TRUE, 1, 0)
	p4 <- data.frame(pred = as.numeric(p4), obs = oc_y_train)
	p4_test <- predict(m4, newdata = x_test)
	p4_test <- ifelse(p4_test == TRUE, 1, 0)
	p4_test <- data.frame(pred = as.numeric(p4_test), obs = y_test_num)
	# RF
	p5 <- predict(m5, x_train, type="prob")
	p5 <- data.frame(pred = p5[,"1"], obs = y_train_num)
	p5_test <- predict(m5, newdata = x_test, type="prob")
	p5_test <- data.frame(pred = p5_test[,"1"], obs = y_test_num)
	# kernlab oc SVM
	p6 <- predict(m6, oc_x_train)
	p6 <- ifelse(p6 == TRUE, 1, 0)
	p6 <- data.frame(pred = p6, obs = oc_y_train)
	p6_test <- predict(m6, newdata = x_test)
	p6_test <- ifelse(p6_test == TRUE, 1, 0)
	p6_test <- data.frame(pred = p6_test, obs = y_test_num)
	# XGB
	p7 <- predict(m7, as.matrix(x_train))
	p7 <- data.frame(pred = p7, obs = y_train)
	p7_test <- predict(m7, newdata = as.matrix(x_test))
	p7_test <- data.frame(pred = p7_test, obs = y_test_num)
	# MAXENT
	p8 <- predict(m8, as.matrix(x_train))
	p8 <- data.frame(pred = p8, obs = y_train)
	p8_test <- predict(m8, as.matrix(x_test))
	p8_test <- data.frame(pred = p8_test, obs = y_test_num)


	## Test results
	# RMSE (my function, not caret function)
	p1_RMSE_train <- round(RMSE(p1$obs, p1$pred),3)
	p1_RMSE_test <- round(RMSE(p1_test$obs, p1_test$pred),3)
	p2_RMSE_train <- round(RMSE(p2$obs, p2$pred),3)
	p2_RMSE_test <- round(RMSE(p2_test$obs, p2_test$pred),3)
	p3_RMSE_train <- round(RMSE(p3$obs, p3$pred),3)
	p3_RMSE_test <- round(RMSE(p3_test$obs, p3_test$pred),3)
	p4_RMSE_train <- round(RMSE(p4$obs, p4$pred),3)
	p4_RMSE_test <- round(RMSE(p4_test$obs, p4_test$pred),3)
	p5_RMSE_train <- round(RMSE(p5$obs, p5$pred),3)
	p5_RMSE_test <- round(RMSE(p5_test$obs, p5_test$pred),3)
	p6_RMSE_train <- round(RMSE(p6$obs, p6$pred),3)
	p6_RMSE_test <- round(RMSE(p6_test$obs, p6_test$pred),3)
	p7_RMSE_train <- round(RMSE(p7$obs, p7$pred),3)
	p7_RMSE_test <- round(RMSE(p7_test$obs, p7_test$pred),3)
	p8_RMSE_train <- round(RMSE(p8$obs, p8$pred),3)
	p8_RMSE_test <- round(RMSE(p8_test$obs, p8_test$pred),3)
	RMSE_scores <- data.frame(models = c("GLM", "SVM RBF", "SVM Poly", "SVM OCC",
	"RF", "one-svc", "XGB", "MAXENT"),
	RMSE_train = c(p1_RMSE_train, p2_RMSE_train,
	p3_RMSE_train, p4_RMSE_train,
	p5_RMSE_train, p6_RMSE_train,
	p7_RMSE_train, p8_RMSE_train),
	RMSE_test = c(p1_RMSE_test, p2_RMSE_test,
	p3_RMSE_test, p4_RMSE_test,
	p5_RMSE_test, p6_RMSE_test,
	p7_RMSE_test, p8_RMSE_test))
	# AUC - AUC package
	p1_roc <- roc(p1$pred, as.factor(p1$obs))
	p1_train_auc <- round(auc(p1_roc),3)
	p1_test_roc <- roc(p1_test$pred, as.factor(p1_test$obs))
	p1_test_auc <- round(auc(p1_test_roc),3)
	p2_roc <- roc(p2$pred, as.factor(p2$obs))
	p2_train_auc <- round(auc(p2_roc),3)
	p2_test_roc <- roc(p2_test$pred, as.factor(p2_test$obs))
	p2_test_auc <- round(auc(p2_test_roc),3)
	p3_roc <- roc(p3$pred, as.factor(p3$obs))
	p3_train_auc <- round(auc(p3_roc),3)
	p3_test_roc <- roc(p3_test$pred, as.factor(p3_test$obs))
	p3_test_auc <- round(auc(p3_test_roc),3)
	p5_roc <- roc(p5$pred, as.factor(p5$obs))
	p5_train_auc <- round(auc(p5_roc),3)
	p5_test_roc <- roc(p5_test$pred, as.factor(p5_test$obs))
	p5_test_auc <- round(auc(p5_test_roc),3)
	p7_roc <- roc(p7$pred, as.factor(p7$obs))
	p7_train_auc <- round(auc(p7_roc),3)
	p7_test_roc <- roc(p7_test$pred, as.factor(p7_test$obs))
	p7_test_auc <- round(auc(p7_test_roc),3)
	p8_roc <- roc(p8$pred, as.factor(p8$obs))
	p8_train_auc <- round(auc(p8_roc),3)
	p8_test_roc <- roc(p8_test$pred, as.factor(p8_test$obs))
	p8_test_auc <- round(auc(p8_test_roc),3)
	auc_scores <- data.frame(models = c("GLM", "SVM RBF", "SVM Poly", "RF", "XGB", "MAXENT"),
	AUC_train = c(p1_train_auc, p2_train_auc,
	p3_train_auc, p5_train_auc,
	p7_train_auc, p8_train_auc),
	AUC_test = c(p1_test_auc, p2_test_auc,
	p3_test_auc, p5_test_auc,
	p7_test_auc, p8_test_auc))
	kable(RMSE_scores)
	# print(xtable(RMSE_scores, digits = c(3)), type = "html", include.rownames=FALSE)
	kable(auc_scores)
	# print(xtable(auc_scores, digits = c(3)), type = "html", include.rownames=FALSE)

	# confusion matrix
	(c4 <- table(p4)) # only 1 ref class
	(c4_test <- confusionMatrix(p4_test$pred, p4_test$obs))

	# plot lowess response curve for interest
	plot(plot.logreg(p1))
	plot(plot.logreg(p1_test))
	plot(plot.logreg(p2))
	plot(plot.logreg(p2_test))
	plot(plot.logreg(p3))
	plot(plot.logreg(p3_test))
	# plot(plot.logreg(p4)) # no probabilities, not effective plot
	# plot(plot.logreg(p4_test)) # same as above
	plot(plot.logreg(p5))
	plot(plot.logreg(p5_test))
	plot(plot.logreg(p7))
	plot(plot.logreg(p7_test))
	plot(plot.logreg(p8))
	plot(plot.logreg(p8_test))


	### Testing ggplot multivariate discrimination plot
	model_num <- 2
	model <- get(paste0("m", model_num))
	test_data_set <- get(paste0("p", model_num, "_test"))
	train_data_set <- get(paste0("p", model_num))
	# create grid of feature space for prediction
	pred_grid <- expand.grid(V1 = seq(min(p_dat$V1), max(p_dat$V1),
	abs((min(p_dat$V1) -max(p_dat$V1)) /50)),
	V2 = seq(min(p_dat$V2), max(p_dat$V2),
	abs((min(p_dat$V2) -max(p_dat$V2)) /50)))
	pred_grid <- data.frame(pred_grid)
	# Predict feature space and join with train/test data and predictions
	# results in dataframe formated for plotting
	plot_dat <- plot.prediction.grid(model, pred_grid, mod_type, x_test, x_train,
	test_data_set, train_data_set)
	# plot the feature space and prediction
	p <- ggplot() +
	# feature space
	geom_raster(data = filter(plot_dat, dset == "sim"),
	aes(x = V1, y = V2, fill = pred), alpha = 0.9,
	interpolate = TRUE) +
	# viridis color of feature space
	scale_fill_viridis(option="inferno", guide = guide_legend(title = "Prob."),
	breaks = seq(0,1,0.1), labels = seq(0,1,0.1),
	direction = 1) +
	# background as points if desired
	# geom_point(data = filter(plot_dat, obs == 0 & dset == "test"),
	# aes(x = V1, y = V2), alpha = 0.05, color = "gray50") +
	# background as density (ggalt package)
	geom_bkde2d(data = filter(plot_dat, obs == 0 & dset == "test"),
	aes(x = V1, y = V2), alpha = 0.6, color = "gray80",
	bandwidth=c(0.1, 0.8)) +
	# training point locations
	geom_point(data = filter(plot_dat, obs == 1 & dset == "train"),
	shape = 20, aes(x = V1, y = V2), color = "red",
	alpha = 0.025) +
	#test point locations
	geom_point(data = filter(plot_dat, obs == 1 & dset == "test"),
	shape = 20, aes(x = V1, y = V2), color = "green",
	alpha = 0.03) +
	# theme
	theme_bw() +
	# labs, needs developer version of ggplot as of 04/19/16
	labs(title="Archaeological Site Presence Probability",
	subtitle="MAXENT, Test AUC = 0.833",
	caption="Data: Pennsylvania pre-contact archaeological site locations",
	x = "Cost Distance to Confluence",
	y = "Terrain Position Index") +
	# make it pretty
	theme(
	panel.border = element_rect(colour = "gray90"),
	axis.text.x = element_text(angle = 0, size = 6, family = "Trebuchet MS"),
	axis.text.y = element_text(size = 6, family = "Trebuchet MS"),
	axis.title = element_text(size = 8, family = "Trebuchet MS"),
	plot.caption = element_text(size = 8, hjust=0,
	family = "Trebuchet MS"),
	plot.title=element_text(family="TrebuchetMS-Bold"),
	plot.subtitle=element_text(family="TrebuchetMS-Italic"),
	legend.text = element_text(size = 8, family = "Trebuchet MS")
	)

	plot(p)
	ggsave(filename = "MAXENT.png", width = 6, height = 4)