# Code for blog post: A post to compare a bunch of visualizations against the boxplot.
# "not in" infix operator: the logical negation of %in%
`%ni%` <- function(...) !(`%in%`(...))
# read the input CSV (replace the placeholder path with your own file) and
# wrap the fread() result in data.frame()
dat <- data.frame(fread("YOUR DATA LOCATION.CSV"))
### Data is an n x p dataframe where the p columns contain:
# [presence] as a 0/1 binary presence/absence indicator
# [SITENO] which is a character field that has the group level name for presence and "background" for background samples
# any number of variables that you want to model as indicators of presence/absence
# the n rows are individual measurements of the variables at sampling locations within SITENO spatial groups.
# to replicate these, any dataframe with 0/1 [presence], chr group [SITENO], and numeric variables should work.
# remove some unneeded columns (keeps every column whose name is NOT in the list)
# NOTE(review): the c(...) list and the closing `]` are not terminated on this
# line; the rest of the statement appears to be missing from this copy of the
# script -- restore it from the original before running.
dat_trimmed <- dat[,colnames(dat) %ni% c("V1", "tpi_sd250c", "tpi_cls250c",
# set up data partition variables (passed to get.train.test.sets below)
train_v_test_fraction <- 0.8 # 0 to 1; share of site names sampled into the training group
sites_per_train <- "all" # integer or "all"; "all" is replaced by the number of training sites
sites_per_test <- "all" # integer or "all"; "all" is replaced by the number of test sites
absence_presence_balance <- 3 # 1 for balanced; typically 3
data_reduction_fraction <- 1 # 0 to 1
runs <- 10 # resample repeats (number of iterations of the resampling loop)
# function to sample data for train/test sets
#
# Splits the site names found in dat$SITENO into a training and a test group,
# pulls the presence rows (presence == 1) for each group, and returns a list
# with elements `train` and `test`.
#
# Arguments:
#   dat                       data frame with `presence` and `SITENO` columns
#   train_v_test_fraction     fraction of site names sampled for training
#   sites_per_train           number of training sites to keep, or "all"
#   sites_per_test            number of test sites to keep, or "all"
#   absence_presence_balance  not referenced in the lines visible here; presumably
#                             the background:presence sampling ratio -- TODO confirm
#   data_reduction_fraction   not referenced in the lines visible here; presumably
#                             a final down-sampling fraction -- TODO confirm
#
# NOTE(review): the four pipelines that build tr_background, te_background,
# train and test each end in `%>%` with no following step, and the function's
# closing `}` is absent; those lines appear to be missing from this copy.
get.train.test.sets <- function(dat, train_v_test_fraction, sites_per_train, sites_per_test,
absence_presence_balance, data_reduction_fraction){
# set up train and test site SITENO lists
# NOTE(review): this drops the LAST unique SITENO value, which only removes
# "background" if that value happens to come last in the data -- confirm.
site_names <- unique(dat$SITENO)[-length(unique(dat$SITENO))] # remove "background"
# random subset of the site names for training; the rest become test sites
train_sites <- site_names[sample(seq_along(site_names),length(site_names)*train_v_test_fraction)]
test_sites <- site_names[!(site_names %in% train_sites)]
# reduce train and test site data volume, but still sampling from test or train
if(sites_per_train == "all") sites_per_train <- length(train_sites)
if(sites_per_test == "all") sites_per_test <- length(test_sites)
# presence rows belonging to a random draw of sites_per_* sites from each group
tr_sites <- filter(dat, presence == 1, SITENO %in% sample(train_sites, sites_per_train))
te_sites <- filter(dat, presence == 1, SITENO %in% sample(test_sites , sites_per_test))
# sample a 3 X multiplied site background sample for test and train
# (both start from all presence == 0 rows; the sampling step itself is not visible here)
tr_background <- filter(dat, presence == 0) %>%
te_background <- filter(dat, presence == 0) %>%
# make test and train presence/absence data sets // reduce data volume significantly as % of cells
# NOTE(review): rbind_all() is deprecated in later dplyr releases in favour of
# bind_rows() -- confirm against the dplyr version in use.
train <- rbind_all(list(tr_sites, tr_background)) %>%
test <- rbind_all(list(te_sites, te_background)) %>%
return(list(train = train, test = test))
# container for results; one block of rows is appended per resample
IV_results <- data.frame()
# loop through resamples to get results
# NOTE(review): the create_infotables(...) call below is not closed and the
# loop's closing `}` is absent; those lines appear to be missing from this copy.
# NOTE(review): growing IV_results with rbind() on every iteration copies the
# accumulated frame each time; acceptable for a small number of runs.
for(i in 1:runs){
# retrieve a train/test set as list
test_train <- get.train.test.sets(dat_trimmed, train_v_test_fraction, sites_per_train, sites_per_test,
absence_presence_balance, data_reduction_fraction)
# split train/test list into individual train and test objects
train <- test_train[["train"]]
test <- test_train[["test"]]
### variable importance with Information::create_infotables
# train is passed as `data`, test as `valid`, with "presence" as the response
IV <- create_infotables(data = train,
valid = test,
y = "presence",
# to see single resample results, use these functions
# knitr::kable(IV$Summary)
# variable <- "e_hyd_min"
# knitr::kable(IV$Tables[[variable]])
# plot_infotables(IV, variable, show_values=TRUE)
# tag this resample's summary table with its run number and append it
iv_df <- data.frame(IV$Summary, run = i)
IV_results <- rbind(IV_results, iv_df)
# table representations
# per-variable summary of AdjIV across resamples: min, 5th percentile, mean,
# median, 95th percentile and max
# NOTE(review): the pipeline ends in `%>%` with no following step; the final
# step appears to be missing from this copy.
IV_summary <- group_by(IV_results, Variable) %>%
summarise(Minimum = min(AdjIV),
p5 = quantile(AdjIV, probs = 0.05),
Mean = mean(AdjIV),
Median = median(AdjIV),
p95 = quantile(AdjIV, probs = 0.95),
Maximum = max(AdjIV)) %>%
# round every column except the first to 2 decimals
# NOTE(review): the extra `0` in `[,-1,0]` is a third subscript (the `drop`
# position for data frames) -- likely unintended; confirm.
IV_summary[,-1] <- round(IV_summary[,-1,0],2)
# for making latex and HTML table for blog post
# x_IV_summary <- xtable(IV_summary)
# print(x_IV_summary)
# print(xtable(x_IV_summary), type = "html")
# line with line end version
# one line per Variable showing AdjIV against resample number, with a repelled
# text label for each variable attached to its final-run point (run == runs)
p1 <- ggplot(data = IV_results, aes(x = run, y = AdjIV, group = Variable, color = Variable)) +
geom_line() +
theme_bw() +
geom_label_repel(data = IV_results %>% filter(run==runs),
aes(label=Variable, fill=Variable),
nudge_x = 2.5, nudge_y = 0, size=2, color='white',
force=5, segment.color='#bbbbbb', max.iter = 3000) +
# extend the x axis past the last run (labels are nudged to the right)
expand_limits(x=(runs + 2)) +
# NOTE(review): the subtitle hard-codes "10" while the loop length is `runs`
labs(title="Variable Information Value for Site Presence & Absence",
subtitle="Values for 10 Resamples",
x = "Resample",
y = "Adjusted Information Value (IV)") +
scale_y_continuous(breaks = seq(-1,1,0.25)) +
scale_x_continuous(breaks = seq(0,runs,1)) +
# NOTE(review): the `name = value` lines below look like the arguments of a
# theme(...) call whose opening `theme(` and closing `)` lines are absent from
# this copy -- restore them before running.
legend.position = "none",
panel.border = element_rect(colour = "gray90"),
axis.text.x = element_text(angle = 0, size = 8, hjust = 1, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS", face = "bold"),
plot.title = element_text(family="TrebuchetMS-Bold"),
plot.subtitle = element_text(family="TrebuchetMS-Italic")
# no `plot` argument: ggsave() writes the most recently created/modified plot
ggsave(filename = "IV_lines.png", width = 6, height = 4)
# lines as splines
# same data as the line chart, drawn as x-splines through the per-run points
p2 <- ggplot(data = IV_results, aes(x = run, y = AdjIV, group = Variable, color = Variable)) +
geom_xspline(spline_shape=-0.4, size=0.5) +
geom_point() +
theme_bw() +
labs(title="Variable Information Value for Site Presence & Absence",
subtitle="Values for 10 Resamples",
x = "Resample",
y = "Adjusted Information Value (IV)") +
scale_y_continuous(breaks = seq(-1,1,0.25)) +
scale_x_continuous(breaks = seq(0,runs,1)) +
# NOTE(review): the `name = value` lines below look like the arguments of a
# theme(...) call whose opening `theme(` and closing `)` lines are absent from
# this copy -- restore them before running.
legend.position = "none",
panel.border = element_rect(colour = "gray90"),
axis.text.x = element_text(angle = 0, size = 8, hjust = 1, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS", face = "bold"),
plot.title = element_text(family="TrebuchetMS-Bold"),
plot.subtitle = element_text(family="TrebuchetMS-Italic")
# no `plot` argument: ggsave() writes the most recently created/modified plot
ggsave(filename = "IV_splines.png", width = 6, height = 4)
# ordered boxplot version
# mean AdjIV per variable, used to order the factor levels (highest mean first)
# NOTE(review): this pipeline ends in `%>%` with no following step, and the
# factor(...) call below is not closed; lines appear to be missing from this copy.
mean_adjIV <- group_by(IV_results, Variable) %>%
summarise(mean = mean(AdjIV)) %>%
# Variable_ordered is reused as the x / facet variable by later plots
IV_results$Variable_ordered <-factor(IV_results$Variable,
levels = mean_adjIV[order(mean_adjIV$mean, decreasing = TRUE),
# one box per variable with a dashed reference line at AdjIV = 0
p3 <- ggplot(data = IV_results, aes(x = Variable_ordered, y = AdjIV)) +
geom_hline(yintercept = 0, color = "gray70", linetype = "dashed") +
geom_boxplot(fill = "skyblue", outlier.colour = "gray20", outlier.shape = 1) +
theme_bw() +
labs(title="Variable Information Value for Site Presence & Absence",
subtitle="Values for 10 Resamples Ordered by Mean IV",
x = "Variable",
y = "Adjusted Information Value (IV)") +
scale_y_continuous(breaks = seq(-1,1,0.25)) +
# NOTE(review): the `name = value` lines below look like the arguments of a
# theme(...) call whose opening `theme(` and closing `)` lines are absent from
# this copy -- restore them before running.
legend.position = "none",
panel.border = element_rect(colour = "gray90"),
axis.text.x = element_text(angle = 90, size = 8, hjust = 1, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS", face = "bold"),
plot.title = element_text(family="TrebuchetMS-Bold"),
plot.subtitle = element_text(family="TrebuchetMS-Italic")
# no `plot` argument: ggsave() writes the most recently created/modified plot
ggsave(filename = "IV_boxplot.png", width = 6, height = 4)
# bar plot with error
# standard error of the mean: square root of (sample variance / sample size)
se <- function(x) {
  n_obs <- length(x)
  sqrt(var(x) / n_obs)
}
# per-variable mean AdjIV with +/- one standard error bounds (Upper / Lower)
# NOTE(review): this pipeline ends in `%>%` with no following step, and the
# factor(...) call below is not closed; lines appear to be missing from this copy.
IV_bar_summary <- group_by(IV_results, Variable) %>%
summarise(SE = se(AdjIV),
Mean = mean(AdjIV),
Upper = Mean + SE,
Lower = Mean - SE) %>%
# order variables by decreasing mean for the x axis
IV_bar_summary$Variable_ordered <-factor(IV_bar_summary$Variable,
levels = IV_bar_summary[order(IV_bar_summary$Mean, decreasing = TRUE),
# bars show the mean; error bars span Lower..Upper
p4 <- ggplot(data = IV_bar_summary, aes(x = Variable_ordered, y = Mean)) +
geom_hline(yintercept = 0, color = "gray70", linetype = "dashed") +
geom_bar(stat = "identity", fill = "skyblue") +
geom_errorbar(aes(ymax = Upper, ymin = Lower), width=0.25) +
# geom_boxplot(data = IV_results, aes(x = Variable_ordered, y = AdjIV), fill = "skyblue", outlier.colour = "gray20", outlier.shape = 1)
theme_bw() +
labs(title="Variable Information Value for Site Presence & Absence",
subtitle="Values for 10 Resamples Ordered by Mean IV, Showing Standard Error",
x = "Variable",
y = "Adjusted Information Value (IV)") +
# scale_y_continuous(breaks = seq(-1,1,0.25)) +
# NOTE(review): the `name = value` lines below look like the arguments of a
# theme(...) call whose opening `theme(` and closing `)` lines are absent from
# this copy -- restore them before running.
legend.position = "none",
panel.border = element_rect(colour = "gray90"),
axis.text.x = element_text(angle = 90, size = 8, hjust = 1, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS", face = "bold"),
plot.title = element_text(family="TrebuchetMS-Bold"),
plot.subtitle = element_text(family="TrebuchetMS-Italic")
# no `plot` argument: ggsave() writes the most recently created/modified plot
ggsave(filename = "IV_Bar.png", width = 6, height = 4)
# Density overlap
# overlaid, semi-transparent AdjIV density curves, one per Variable
p5 <- ggplot(data = IV_results, aes(x = AdjIV, group = Variable, fill = Variable)) +
geom_density(alpha = 0.33) +
theme_bw() +
labs(title="Variable Information Value for Site Presence & Absence",
subtitle="Values for 10 Resamples",
x = "Adjusted Information Value (IV)",
y = "Density") +
scale_x_continuous(breaks = seq(-1,1,0.25)) +
# NOTE(review): the `name = value` lines below look like the arguments of a
# theme(...) call whose opening `theme(` and closing `)` lines are absent from
# this copy -- restore them before running.
legend.position = "none",
panel.border = element_rect(colour = "gray90"),
axis.text.x = element_text(angle = 0, size = 8, hjust = 1, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS", face = "bold"),
plot.title = element_text(family="TrebuchetMS-Bold"),
plot.subtitle = element_text(family="TrebuchetMS-Italic")
# no `plot` argument: ggsave() writes the most recently created/modified plot
ggsave(filename = "IV_density.png", width = 6, height = 4)
# faceted densities
# one AdjIV density panel per variable, stacked as rows (facet_grid), with a
# dashed vertical reference line at 0; relies on IV_results$Variable_ordered
p6 <- ggplot(data = IV_results, aes(x = AdjIV, group = Variable_ordered)) +
geom_density(fill = "skyblue") +
geom_vline(xintercept = 0, color = "gray50", linetype = "dashed") +
theme_bw() +
labs(title="Variable Information Value for Site Presence & Absence",
subtitle="Values for 10 Resamples",
x = "Adjusted Information Value (IV)",
y = "Density") +
scale_x_continuous(breaks = seq(-1,1,0.25)) +
facet_grid(Variable_ordered~.) +
# NOTE(review): the `name = value` lines below look like the arguments of a
# theme(...) call whose opening `theme(` and closing `)` lines are absent from
# this copy -- restore them before running.
legend.position = "none",
panel.grid = element_blank(),
strip.background = element_rect(colour = "gray50", fill = "white"),
strip.text.y = element_text(colour = "black", size = 6, face = "bold", family = "Trebuchet MS"),
panel.border = element_rect(colour = "gray90"),
axis.text.x = element_text(angle = 0, size = 8, hjust = 1, family = "Trebuchet MS"),
axis.text.y = element_blank(),
axis.title = element_text(size = 10, family = "Trebuchet MS", face = "bold"),
plot.title = element_text(family="TrebuchetMS-Bold"),
plot.subtitle = element_text(family="TrebuchetMS-Italic"),
axis.ticks.y = element_blank()
# no `plot` argument: ggsave() writes the most recently created/modified plot
ggsave(filename = "IV_density_facet.png", width = 8, height = 10)
# facet wrapped densities
# same per-variable density panels, laid out in a 4-column grid (facet_wrap)
# NOTE(review): the name p7 is assigned again by the bivariate plot further
# down, which overwrites this object.
p7 <- ggplot(data = IV_results, aes(x = AdjIV, group = Variable_ordered)) +
geom_density(fill = "skyblue") +
geom_vline(xintercept = 0, color = "gray50", linetype = "dashed") +
theme_bw() +
labs(title="Variable Information Value for Site Presence & Absence",
subtitle="Values for 10 Resamples",
x = "Adjusted Information Value (IV)",
y = "Density") +
scale_x_continuous(breaks = seq(-1,1,0.5)) +
facet_wrap( ~ Variable_ordered, ncol = 4) +
# NOTE(review): the `name = value` lines below look like the arguments of a
# theme(...) call whose opening `theme(` and closing `)` lines are absent from
# this copy -- restore them before running.
legend.position = "none",
panel.grid = element_blank(),
strip.background = element_rect(colour = "gray50", fill = "white"),
strip.text = element_text(colour = "black", size = 9, face = "bold", family = "Trebuchet MS"),
panel.border = element_rect(colour = "gray90"),
axis.text.x = element_text(angle = 0, size = 8, hjust = 1, family = "Trebuchet MS"),
axis.text.y = element_blank(),
axis.title = element_text(size = 10, family = "Trebuchet MS", face = "bold"),
plot.title = element_text(family="TrebuchetMS-Bold"),
plot.subtitle = element_text(family="TrebuchetMS-Italic"),
axis.ticks.y = element_blank()
# no `plot` argument: ggsave() writes the most recently created/modified plot
ggsave(filename = "IV_density_facet_wrap.png", width = 6, height = 6)
# bivariate
# ordered by AdjIV because that is what we care about in the end; not raw IV
# scatter of IV against PENALTY per resample with a linear fit, one panel per
# variable (4 rows of panels)
# NOTE(review): this reuses the name p7 and so overwrites the facet-wrapped
# density plot assigned earlier.
p7 <- ggplot(IV_results, aes(x = IV, y = PENALTY,
group = Variable_ordered, color = Variable_ordered)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "gray50") +
theme_bw() +
labs(title="Variable Information vs. CV Penalty for Site Presence & Absence",
subtitle="Values for 10 Resamples, Ordered by Mean AdjIV ",
x = "Information Value (IV)",
y = "CV Penalty") +
facet_wrap(~Variable_ordered, nrow = 4) +
# NOTE(review): the `name = value` lines below look like the arguments of a
# theme(...) call whose opening `theme(` and closing `)` lines are absent from
# this copy -- restore them before running.
legend.position = "none",
# panel.grid = element_blank(),
strip.background = element_rect(colour = "gray50", fill = "white"),
strip.text.y = element_text(colour = "black", size = 8, face = "bold", family = "Trebuchet MS"),
axis.text.x = element_text(angle = 0, size = 8, hjust = 1, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS", face = "bold"),
plot.title = element_text(family="TrebuchetMS-Bold"),
plot.subtitle = element_text(family="TrebuchetMS-Italic")
# no `plot` argument: ggsave() writes the most recently created/modified plot
# (the file name contains a space after "IV_")
ggsave(filename = "IV_ bivariate.png", width = 8, height = 6)
# Compare ranked variables plot
# per-variable means of AdjIV and IV, used to rank the variables both ways
# NOTE(review): this pipeline ends in `%>%` with no following step; the final
# step appears to be missing from this copy.
sort_adjIV <- group_by(IV_results, Variable) %>%
summarise(mean_AdjIV = mean(AdjIV),
mean_IV = mean(IV)) %>%
# variable names ordered by decreasing mean IV and by decreasing mean AdjIV
IV_ord <- sort_adjIV[order(sort_adjIV$mean_IV, decreasing = TRUE),"Variable"]
AdjIV_ord <- sort_adjIV[order(sort_adjIV$mean_AdjIV, decreasing = TRUE), "Variable"]
# one row per variable in IV order: IV_rank is its position in the IV ordering,
# AdjIV_rank is the position of that same variable in the AdjIV ordering
var_rank <- data.frame(IV = IV_ord, AdjIV = AdjIV_ord,
IV_rank = seq(1:length(IV_ord)),
AdjIV_rank = match(IV_ord, AdjIV_ord))
# slope-style chart: each variable's name is drawn in both columns at a height
# of 19 - rank and the two positions are joined by a segment
# NOTE(review): 19 is hard-coded; presumably tied to the number of variables
# in the original data -- TODO confirm.
p8 <- ggplot(var_rank) +
geom_text(aes(x="IV", y = 19-IV_rank, label=IV), size=3.2, hjust=1, family="TrebuchetMS-Bold") +
geom_text(aes(x="AdjIV", y = 19-AdjIV_rank, label=IV), size=3.2, hjust=0, family="TrebuchetMS-Bold") +
geom_segment(aes(x="AdjIV", y=19-AdjIV_rank, xend="IV", yend=19-IV_rank), alpha=.5) +
geom_hline(yintercept = -1) +
theme_void() +
# scale_y_continuous(breaks=c(-1,19), labels=c("","")) +
scale_x_discrete(limits = c("IV", "AdjIV")) +
labs(title="Rank Change Betweeen IV and AdjIV",
x = "Metric",
y = "Relative Rank") +
# NOTE(review): the `name = value` lines below look like the arguments of a
# theme(...) call whose opening `theme(` and closing `)` lines are absent from
# this copy -- restore them before running.
axis.text.x = element_text(angle = 0, size = 12, hjust = 1, family = "Trebuchet MS", face = "bold"),
axis.title = element_text(size = 10, family = "Trebuchet MS", face = "bold"),
plot.title = element_text(family="TrebuchetMS-Bold"),
plot.subtitle = element_text(family="TrebuchetMS-Italic"),
axis.title.x = element_text(),
axis.title.y = element_text(angle = 90)
# no `plot` argument: ggsave() writes the most recently created/modified plot
ggsave(filename = "IV_rank_change.png", width = 6, height = 4)
# Violin and Boxplot combination
# a width-scaled violin per variable with a narrow boxplot drawn on top, plus a
# dashed reference line at AdjIV = 0; relies on IV_results$Variable_ordered
p9 <- ggplot(data = IV_results, aes(x = Variable_ordered, y = AdjIV, group = Variable_ordered)) +
geom_hline(yintercept = 0, color = "gray70", linetype = "dashed") +
geom_violin(color = "white", fill = "skyblue", scale = "width", alpha = 0.65) +
geom_boxplot(color = "skyblue3", fill = "dodgerblue4", width = 0.3,
outlier.colour = "dodgerblue4", outlier.shape = 19, size = 0.25) +
# coord_flip() +
theme_bw() +
# NOTE(review): the subtitle mentions standard error, but no error bars are
# drawn in this plot -- wording looks carried over from the bar chart.
labs(title="Variable Information Value for Site Presence & Absence",
subtitle="Values for 10 Resamples Ordered by Mean IV, Showing Standard Error",
x = "Variable",
y = "Adjusted Information Value (IV)") +
scale_y_continuous(breaks = seq(-1,1,0.25)) +
# NOTE(review): the `name = value` lines below look like the arguments of a
# theme(...) call whose opening `theme(` and closing `)` lines are absent from
# this copy -- restore them before running.
legend.position = "none",
panel.border = element_rect(colour = "gray90"),
axis.text.x = element_text(angle = 90, size = 8, hjust = 1, vjust = 0, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS", face = "bold"),
plot.title = element_text(family="TrebuchetMS-Bold"),
plot.subtitle = element_text(family="TrebuchetMS-Italic")
# no `plot` argument: ggsave() writes the most recently created/modified plot
ggsave(filename = "IV_violin_boxplot.png", width = 6, height = 4)
# totally wonky code for stem & Leaf and boxplot combo
# helper: strip leading whitespace from each element of a character vector
trim.leading <- function(x) {
  sub(pattern = "^\\s+", replacement = "", x = x)
}
# simulate data
# four columns of 30 normal draws each (sd 1, mean drawn once per column from
# 3:7) plus an index column g = 1:30
cc <- data.frame(value = replicate(4,rnorm(30,sample(3:7,1),1)), g = 1:30)
# loop over columns to capture stem() output and format
# NOTE(review): the loop's closing `}` is absent from this copy; presumably it
# belongs after the rbind() line -- confirm against the original script.
leaf_bind <- NULL
for(i in 1:(ncol(cc)-1)){
# printed stem-and-leaf display as text lines, with the second line dropped
tmp <- capture.output(stem(cc[,i], width = 100, scale=1, atom = 2))[-2]
# first two characters of each trimmed line, read as a number
ss <- as.numeric(substring(trim.leading(tmp),1,2))
# overwrite the first two entries and the last one relative to their
# neighbours; presumably these are non-stem lines (header / blank) that do not
# parse as numbers -- TODO confirm
ss[1] <- ss[3]-2
ss[2] <- ss[3]-1
ss[length(ss)] <- ss[length(ss)-1]+1
# add formatted S&L to output table with group
sdf = data.frame(tmp, digi = ss, rr=1:length(tmp), g = i)
leaf_bind <- rbind(leaf_bind, sdf)
# hasty clean up: drop the last column (g) and rename the rest 1..k
ccm <- cc[,-(ncol(cc))]
colnames(ccm) <- seq(1:ncol(ccm))
# melt to plot (long format, then rename the two resulting columns)
ccm <- melt(ccm)
colnames(ccm) <- c("g", "value")
# boxplot per simulated group with the captured stem-and-leaf text drawn just
# to the right of it (x = g + 0.1) at the height of each stem value
p10 <- ggplot()+
geom_boxplot(data = ccm, aes(y = value, x = g), color = "skyblue3",
fill = "skyblue", width = 0.2, size = 0.25, alpha = 0.5) +
geom_text(data = leaf_bind, aes(y = digi, x= g+0.1, label=tmp),
hjust=0, fontface = "bold", size = 3, family = "TrebuchetMS") +
theme_bw() +
# widen the x range to the right
expand_limits(x = 5.5) +
labs(title="Boxplots & Steam and Leaf",
subtitle="beacuse, it's what Tukey would do",
x = "Class",
y = "Authentic Value")+
# NOTE(review): the `name = value` lines below look like the arguments of a
# theme(...) call whose opening `theme(` and closing `)` lines are absent from
# this copy -- restore them before running.
plot.title = element_text(family="TrebuchetMS-Bold", size = 20),
plot.subtitle = element_text(family="TrebuchetMS-Italic", size = 15),
axis.title = element_text(size = 10, family = "Trebuchet MS", face = "bold")
# no `plot` argument: ggsave() writes the most recently created/modified plot
ggsave(filename = "IV_bp_stemleaf.png", width = 6, height = 4)
# Tufte box plots
# variant 1: median drawn as a line, whiskers drawn as points, no offset
p11 <- ggplot(data = IV_results, aes(x = Variable_ordered, y = AdjIV)) +
geom_tufteboxplot(median.type = "line", whisker.type = 'point', hoffset = 0) +
theme_tufte(ticks = FALSE) +
labs(title="Variable Information Value for Site Presence & Absence",
subtitle="Values for 10 Resamples Ordered by Mean IV",
x = "Variable",
y = "Adjusted Information Value (IV)") +
scale_y_continuous(breaks = seq(-1,1,0.25)) +
# NOTE(review): the `name = value` lines below look like the arguments of a
# theme(...) call whose opening `theme(` and closing `)` lines are absent from
# this copy -- restore them before running.
axis.text.x = element_text(angle = 90, size = 6, hjust = 1, family = "Trebuchet MS"),
axis.text.y = element_text(size = 6, family = "Trebuchet MS"),
axis.title = element_text(size = 8, family = "Trebuchet MS", face = "bold"),
plot.title = element_text(family="TrebuchetMS-Bold"),
plot.subtitle = element_text(family="TrebuchetMS-Italic")
# no `plot` argument: ggsave() writes the most recently created/modified plot
ggsave(filename = "IV_tufte_boxplot1.png", width = 6, height = 4)
# Tufte box plot, variant 2: geom_tufteboxplot() with its default settings
p12 <- ggplot(data = IV_results, aes(x = Variable_ordered, y = AdjIV)) +
geom_tufteboxplot() +
theme_tufte(ticks = FALSE) +
labs(title="Variable Information Value for Site Presence & Absence",
subtitle="Values for 10 Resamples Ordered by Mean IV",
x = "Variable",
y = "Adjusted Information Value (IV)") +
scale_y_continuous(breaks = seq(-1,1,0.25)) +
# NOTE(review): the `name = value` lines below look like the arguments of a
# theme(...) call whose opening `theme(` and closing `)` lines are absent from
# this copy -- restore them before running.
axis.text.x = element_text(angle = 90, size = 6, hjust = 1, family = "Trebuchet MS"),
axis.text.y = element_text(size = 6, family = "Trebuchet MS"),
axis.title = element_text(size = 8, family = "Trebuchet MS", face = "bold"),
plot.title = element_text(family="TrebuchetMS-Bold"),
plot.subtitle = element_text(family="TrebuchetMS-Italic")
# no `plot` argument: ggsave() writes the most recently created/modified plot
ggsave(filename = "IV_tufte_boxplot2.png", width = 6, height = 4)
# simple dot plot with Tufte sparsity
# every resample's AdjIV drawn as a small point, one column of points per variable
p13 <- ggplot(data = IV_results, aes(x = Variable_ordered, y = AdjIV)) +
geom_point(size = 0.75) +
theme_tufte() +
labs(title="Variable Information Value for Site Presence & Absence",
subtitle="Values for 10 Resamples Ordered by Mean IV",
x = "Variable",
y = "Adjusted Information Value (IV)") +
scale_y_continuous(breaks = seq(-1,1,0.25)) +
# NOTE(review): the `name = value` lines below look like the arguments of a
# theme(...) call whose opening `theme(` and closing `)` lines are absent from
# this copy -- restore them before running.
axis.text.x = element_text(angle = 90, size = 6, hjust = 1, family = "Trebuchet MS"),
axis.text.y = element_text(size = 6, family = "Trebuchet MS"),
axis.title = element_text(size = 8, family = "Trebuchet MS", face = "bold"),
plot.title = element_text(family="TrebuchetMS-Bold"),
plot.subtitle = element_text(family="TrebuchetMS-Italic")
# no `plot` argument: ggsave() writes the most recently created/modified plot
ggsave(filename = "IV_tufte_point.png", width = 6, height = 4)
# Tufte box plot, variant 3: median drawn as a line, other settings default
p14 <- ggplot(data = IV_results, aes(x = Variable_ordered, y = AdjIV)) +
geom_tufteboxplot(median.type = "line") +
theme_tufte(ticks = FALSE) +
labs(title="Variable Information Value for Site Presence & Absence",
subtitle="Values for 10 Resamples Ordered by Mean IV",
x = "Variable",
y = "Adjusted Information Value (IV)") +
scale_y_continuous(breaks = seq(-1,1,0.25)) +
# NOTE(review): the `name = value` lines below look like the arguments of a
# theme(...) call whose opening `theme(` and closing `)` lines are absent from
# this copy -- restore them before running.
axis.text.x = element_text(angle = 90, size = 6, hjust = 1, family = "Trebuchet MS"),
axis.text.y = element_text(size = 6, family = "Trebuchet MS"),
axis.title = element_text(size = 8, family = "Trebuchet MS", face = "bold"),
plot.title = element_text(family="TrebuchetMS-Bold"),
plot.subtitle = element_text(family="TrebuchetMS-Italic")
# no `plot` argument: ggsave() writes the most recently created/modified plot
ggsave(filename = "IV_tufte_boxplot3.png", width = 6, height = 4)
