Skip to content

Instantly share code, notes, and snippets.

@mattbaggott
Created November 6, 2012 07:17
Show Gist options
  • Save mattbaggott/4023209 to your computer and use it in GitHub Desktop.
Save mattbaggott/4023209 to your computer and use it in GitHub Desktop.
Visually weighted regression / Watercolor plots by Felix Schönbrodt
# Copyright 2012 Felix Schönbrodt
# All rights reserved.
#
# FreeBSD License
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER `AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation
# are those of the authors and should not be interpreted as representing
# official policies, either expressed or implied, of the copyright
# holder.
# Version history:
# 0.1: original code
# 0.1.1: changed license to FreeBSD; re-established compability to ggplot2 (new version 0.9.2)
## Visually weighted regression / Watercolor plots
## Idea: Solomon Hsiang, with additional ideas from many blog commenters
# B = number bootstrapped smoothers
# shade: plot the shaded confidence region?
# shade.alpha: should the CI shading fade out at the edges? (by reducing alpha; 0 = no alpha decrease, 0.1 = medium alpha decrease, 0.5 = strong alpha decrease)
# spag: plot spaghetti lines?
# spag.color: color of spaghetti lines
# mweight: should the median smoother be visually weighted?
# show.lm: should the linear regresison line be plotted?
# show.CI: should the 95% CI limits be plotted?
# show.median: should the median smoother be plotted?
# median.col: color of the median smoother
# shape: shape of points
# method: the fitting function for the spaghettis; default: loess
# bw = TRUE: define a default b&w-palette
# slices: number of slices in x and y direction for the shaded region. Higher numbers make a smoother plot, but takes longer to draw. I wouldn'T go beyond 500
# palette: provide a custom color palette for the watercolors
# ylim: restrict range of the watercoloring
# quantize: either "continuous", or "SD". In the latter case, we get three color regions for 1, 2, and 3 SD (an idea of John Mashey)
# add: if add == FALSE, a new ggplot is returned. If add == TRUE, only the elements are returned, which can be added to an existing ggplot (with the '+' operator)
# ...: further parameters passed to the fitting function, in the case of loess, for example, "span = .9", or "family = 'symmetric'"
vwReg <- function(formula, data, title="", B=1000, shade=TRUE, shade.alpha=.1, spag=FALSE, spag.color="darkblue", mweight=TRUE, show.lm=FALSE, show.median = TRUE, median.col = "white", shape = 21, show.CI=FALSE, method=loess, bw=FALSE, slices=200, palette=colorRampPalette(c("#FFEDA0", "#DD0000"), bias=2)(20), ylim=NULL, quantize = "continuous", add=FALSE, ...) {
IV <- all.vars(formula)[2]
DV <- all.vars(formula)[1]
data <- na.omit(data[order(data[, IV]), c(IV, DV)])
if (bw == TRUE) {
palette <- colorRampPalette(c("#EEEEEE", "#999999", "#333333"), bias=2)(20)
}
print("Computing boostrapped smoothers ...")
newx <- data.frame(seq(min(data[, IV]), max(data[, IV]), length=slices))
colnames(newx) <- IV
l0.boot <- matrix(NA, nrow=nrow(newx), ncol=B)
l0 <- method(formula, data)
for (i in 1:B) {
data2 <- data[sample(nrow(data), replace=TRUE), ]
data2 <- data2[order(data2[, IV]), ]
if (class(l0)=="loess") {
m1 <- method(formula, data2, control = loess.control(surface = "i", statistics="a", trace.hat="a"), ...)
} else {
m1 <- method(formula, data2, ...)
}
l0.boot[, i] <- predict(m1, newdata=newx)
}
# compute median and CI limits of bootstrap
library(plyr)
library(reshape2)
CI.boot <- adply(l0.boot, 1, function(x) quantile(x, prob=c(.025, .5, .975, pnorm(c(-3, -2, -1, 0, 1, 2, 3))), na.rm=TRUE))[, -1]
colnames(CI.boot)[1:10] <- c("LL", "M", "UL", paste0("SD", 1:7))
CI.boot$x <- newx[, 1]
CI.boot$width <- CI.boot$UL - CI.boot$LL
# scale the CI width to the range 0 to 1 and flip it (bigger numbers = narrower CI)
CI.boot$w2 <- (CI.boot$width - min(CI.boot$width))
CI.boot$w3 <- 1-(CI.boot$w2/max(CI.boot$w2))
# convert bootstrapped spaghettis to long format
b2 <- melt(l0.boot)
b2$x <- newx[,1]
colnames(b2) <- c("index", "B", "value", "x")
library(ggplot2)
library(RColorBrewer)
# Construct ggplot
# All plot elements are constructed as a list, so they can be added to an existing ggplot
# if add == FALSE: provide the basic ggplot object
p0 <- ggplot(data, aes_string(x=IV, y=DV)) + theme_bw()
# initialize elements with NULL (if they are defined, they are overwritten with something meaningful)
gg.tiles <- gg.poly <- gg.spag <- gg.median <- gg.CI1 <- gg.CI2 <- gg.lm <- gg.points <- gg.title <- NULL
if (shade == TRUE) {
quantize <- match.arg(quantize, c("continuous", "SD"))
if (quantize == "continuous") {
print("Computing density estimates for each vertical cut ...")
flush.console()
if (is.null(ylim)) {
min_value <- min(min(l0.boot, na.rm=TRUE), min(data[, DV], na.rm=TRUE))
max_value <- max(max(l0.boot, na.rm=TRUE), max(data[, DV], na.rm=TRUE))
ylim <- c(min_value, max_value)
}
# vertical cross-sectional density estimate
d2 <- ddply(b2[, c("x", "value")], .(x), function(df) {
res <- data.frame(density(df$value, na.rm=TRUE, n=slices, from=ylim[1], to=ylim[2])[c("x", "y")])
#res <- data.frame(density(df$value, na.rm=TRUE, n=slices)[c("x", "y")])
colnames(res) <- c("y", "dens")
return(res)
}, .progress="text")
maxdens <- max(d2$dens)
mindens <- min(d2$dens)
d2$dens.scaled <- (d2$dens - mindens)/maxdens
## Tile approach
d2$alpha.factor <- d2$dens.scaled^shade.alpha
gg.tiles <- list(geom_tile(data=d2, aes(x=x, y=y, fill=dens.scaled, alpha=alpha.factor)), scale_fill_gradientn("dens.scaled", colours=palette), scale_alpha_continuous(range=c(0.001, 1)))
}
if (quantize == "SD") {
## Polygon approach
SDs <- melt(CI.boot[, c("x", paste0("SD", 1:7))], id.vars="x")
count <- 0
d3 <- data.frame()
col <- c(1,2,3,3,2,1)
for (i in 1:6) {
seg1 <- SDs[SDs$variable == paste0("SD", i), ]
seg2 <- SDs[SDs$variable == paste0("SD", i+1), ]
seg <- rbind(seg1, seg2[nrow(seg2):1, ])
seg$group <- count
seg$col <- col[i]
count <- count + 1
d3 <- rbind(d3, seg)
}
gg.poly <- list(geom_polygon(data=d3, aes(x=x, y=value, color=NULL, fill=col, group=group)), scale_fill_gradientn("dens.scaled", colours=palette, values=seq(-1, 3, 1)))
}
}
print("Build ggplot figure ...")
flush.console()
if (spag==TRUE) {
gg.spag <- geom_path(data=b2, aes(x=x, y=value, group=B), size=0.7, alpha=10/B, color=spag.color)
}
if (show.median == TRUE) {
if (mweight == TRUE) {
gg.median <- geom_path(data=CI.boot, aes(x=x, y=M, alpha=w3^3), size=.6, linejoin="mitre", color=median.col)
} else {
gg.median <- geom_path(data=CI.boot, aes(x=x, y=M), size = 0.6, linejoin="mitre", color=median.col)
}
}
# Confidence limits
if (show.CI == TRUE) {
gg.CI1 <- geom_path(data=CI.boot, aes(x=x, y=UL), size=1, color="red")
gg.CI2 <- geom_path(data=CI.boot, aes(x=x, y=LL), size=1, color="red")
}
# plain linear regression line
if (show.lm==TRUE) {gg.lm <- geom_smooth(method="lm", color="darkgreen", se=FALSE)}
gg.points <- geom_point(data=data, aes_string(x=IV, y=DV), size=1, shape=shape, fill="white", color="black")
if (title != "") {
gg.title <- theme(title=title)
}
gg.elements <- list(gg.tiles, gg.poly, gg.spag, gg.median, gg.CI1, gg.CI2, gg.lm, gg.points, gg.title, theme(legend.position="none"))
if (add == FALSE) {
return(p0 + gg.elements)
} else {
return(gg.elements)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment