Skip to content

Instantly share code, notes, and snippets.

@Protonk
Last active December 13, 2015 19:18
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Protonk/4961378 to your computer and use it in GitHub Desktop.
Save Protonk/4961378 to your computer and use it in GitHub Desktop.
Data Visualization under Uncertainty notes and code

Data Visualization under Uncertainty

I have tried to present these sources in the order they appear in the slides, but I may have missed or reordered one or two.

Sources

xkcd

Enliven project

Science!

Polling

Boxplots and such

Periscopic's visualization

Forecasting

library(reshape2)
library(ggplot2)
library(plyr)
#####
##
## Violin Plots in ggplot2
##
#####
# iris is a standard built-in dataset and a good fit for this example.
# Reshape to long format: one row per Species / measurement name / value.
iris.long <- melt(iris, id.vars = "Species")
# Measurement names such as "Sepal.Length" become the facet titles, so swap
# the dots for spaces.
iris.long[, "variable"] <- gsub("\\.", " ", iris.long[, "variable"])
# Capitalise the first letter of each species name for the axis and legend.
# substr() positions are 1-based, so the first character is (1, 1).
iris.long[, "Species"] <- paste0(
  toupper(substr(iris.long[, "Species"], 1, 1)),
  substring(iris.long[, "Species"], 2)
)
# Boxplots first, as the baseline to compare the violin plots against.
# The iris measurements are recorded in centimetres.
ggplot(data = iris.long, aes(x = Species, fill = Species, y = value)) +
  geom_boxplot() +
  xlab("") +
  ylab("Length or Width in cm") +
  facet_wrap(~ variable) +
  ggtitle("Iris Boxplots")
# Violin plots show a density estimate for each group, which is sometimes
# preferable to boxplots, as we see here. Same axes and facets as the
# boxplots above so the two figures can be compared directly.
ggplot(data = iris.long, aes(x = Species, y = value, fill = Species)) +
  geom_violin() +
  xlab("") +
  ylab("Length or Width in cm") +
  facet_wrap(~ variable) +
  ggtitle("Iris Violin Plots")
#####
##
## Pollster
##
#####
# Download the poll results; as.is = TRUE keeps text columns as character
# vectors rather than converting them to factors.
obama.favorable.df <- read.csv(
  "http://elections.huffingtonpost.com/pollster/obama-favorable-rating.csv",
  header = TRUE,
  as.is = TRUE
)
obama.favorable.df[, "End.Date"] <- as.Date(obama.favorable.df[, "End.Date"],
                                            format = "%Y-%m-%d")
# cut() has a method for dates which is very handy: breaks = "quarter" bins
# each poll by the calendar quarter of its end date. Call the generic and let
# it dispatch rather than naming the method directly.
obama.favorable.df[, "Quarter"] <- cut(obama.favorable.df[, "End.Date"],
                                       breaks = "quarter")
# Not necessary, but cleans things up for inspection
obama.favorable.df <- obama.favorable.df[, c("Pollster",
                                             "Quarter", "End.Date",
                                             "Number.of.Observations",
                                             "Population", "Mode",
                                             "Favorable", "Unfavorable",
                                             "Undecided")]
# Per-quarter median favorability, repeated on every row of that quarter so it
# can be mapped to the boxplot fill colour. na.rm = TRUE keeps one missing
# poll value from turning a whole quarter's fill into NA.
obama.favorable.df <- ddply(obama.favorable.df, "Quarter", transform,
                            FavAvg = median(Favorable, na.rm = TRUE))
# Boxplot by quarters; the fill diverges around 50 (red below, light blue
# above) and the colour legend is suppressed.
ggplot(data = obama.favorable.df,
       aes(x = Quarter, y = Favorable, fill = FavAvg)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 30, size = rel(1.2), vjust = 0.5)) +
  scale_fill_gradient2(low = "red", high = "lightblue", midpoint = 50,
                       guide = "none") +
  ylab("") +
  xlab("") +
  ggtitle("Obama Favorability Ratings")
###
###
### Time Series
###
###
# Start with a single time series: the built-in `airmiles` dataset.
# Fit Holt-Winters exponential smoothing to it; gamma = FALSE switches off the
# seasonal component, so only level and trend are smoothed.
model.ts <- HoltWinters(x = airmiles, gamma = FALSE)
# Forecast 10 periods past the end of the series and ask for the bounds of a
# 95% prediction interval alongside the point forecast.
forecast.ts <- predict(
  object = model.ts,
  n.ahead = 10,
  prediction.interval = TRUE,
  level = 0.95
)
#####
##
## Confidence regions in ggplot2
##
#####
## ggplot2 doesn't handle time series objects natively, so convert each series
## to a data frame with the same columns and stack them.
# One shared set of levels so Status is the same factor in every frame.
status.levels <- c("Observed", "Predicted", "Fitted")
# Upper/Lower only carry information for the forecast rows; the 0s in the
# other two frames are placeholders that the ribbon layer never reads.
sim.df <- data.frame(Time = as.numeric(time(airmiles)),
                     Value = as.numeric(airmiles),
                     Upper = 0,
                     Lower = 0,
                     Status = factor("Observed", levels = status.levels))
# NOTE(review): the forecast time index is shifted back by one period, which
# makes the forecast start at the last observed time instead of one period
# after it. Looks like a deliberate choice so the lines meet -- confirm.
forecast.df <- data.frame(Time = as.numeric(time(forecast.ts) - 1),
                          Value = as.numeric(forecast.ts[, "fit"]),
                          Upper = as.numeric(forecast.ts[, "upr"]),
                          Lower = as.numeric(forecast.ts[, "lwr"]),
                          Status = factor("Predicted", levels = status.levels))
fitted.df <- data.frame(Time = as.numeric(time(model.ts[["fitted"]])),
                        Value = as.numeric(model.ts[["fitted"]][, "xhat"]),
                        Upper = 0,
                        Lower = 0,
                        Status = factor("Fitted", levels = status.levels))
combinedts.df <- rbind(sim.df, forecast.df, fitted.df)
# The interval bounds come straight from predict() (the "upr" and "lwr"
# columns of forecast.ts); nothing is recomputed here.
predicted.rows <- combinedts.df[, "Status"] == "Predicted"
ggplot(data = combinedts.df, aes(x = Time, y = Value, colour = Status)) +
  # Shaded band between the lower and upper bounds, forecast rows only
  geom_ribbon(data = combinedts.df[predicted.rows, ],
              aes(ymin = Lower,
                  ymax = Upper),
              alpha = 0.2) +
  geom_line() +
  # Dashed vertical line at the last observed time point
  geom_vline(xintercept = max(sim.df[, "Time"]), linetype = "longdash") +
  xlab("") +
  ylab("Airline Miles per Year") +
  ggtitle("HW Confidence Intervals")
#####
##
## Contour plots and margins
##
#####
# Draw 3000 points from a bivariate normal centred on (2, 2).
# mvrnorm() lives in MASS, which this script never attaches, so the call is
# namespace-qualified.
# The covariance matrix must be symmetric. The matrix originally passed here,
# matrix(c(10, 1, 3, 3), 2, 2), was not; mvrnorm() decomposes Sigma with
# eigen(symmetric = TRUE), which reads only the lower triangle, so the matrix
# below spells out the covariance that was effectively being used.
# NOTE(review): confirm an off-diagonal covariance of 1 is what was intended.
mvn.sigma <- matrix(c(10, 1,
                      1, 3),
                    nrow = 2, ncol = 2)
mvn <- MASS::mvrnorm(n = 3000, mu = c(2, 2), Sigma = mvn.sigma)
mvn <- data.frame(x = mvn[, 1], y = mvn[, 2])
# very basic 2d contour plot: density contours with the raw points underneath
ggplot(data = mvn, aes(x = x, y = y)) +
  stat_density2d() +
  geom_point(alpha = 0.2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment