I have tried to present these examples in the order they appear in the slides, but I may have missed or re-ordered one or two.
Last active
December 13, 2015 19:18
-
-
Save Protonk/4961378 to your computer and use it in GitHub Desktop.
Data Visualization under Uncertainty notes and code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(reshape2) | |
library(ggplot2) | |
library(plyr) | |
##### | |
## | |
## Violin Plots in ggplot2 | |
## | |
##### | |
# Reshape the built-in iris data to long format: Species is kept as the
# id column, and every other column is stacked into a `variable` (name)
# / `value` (reading) pair.
iris.long <- melt(iris, id.vars = "Species")
# Swap the dots in the measurement names for spaces so they read well as
# facet titles ("Sepal.Length" becomes "Sepal Length").
iris.long[["variable"]] <- gsub(".", " ", iris.long[["variable"]],
                                fixed = TRUE)
# Capitalise the first letter of each species name for display. The
# result is a character vector, so Species stops being a factor here.
iris.long[["Species"]] <- local({
  species <- as.character(iris.long[["Species"]])
  paste0(toupper(substring(species, 1, 1)), substring(species, 2))
})
# Two views of the same long-format iris data, one panel per measurement.
# The box plot summarises each species with quartiles; the violin plot
# draws a density estimate instead, which shows the shape of each
# distribution and is sometimes preferable to a box plot.
#
# The species names on the x axis are self-explanatory, so the x-axis
# title is blanked in both plots. The iris measurements are recorded in
# centimetres, which is what the y-axis label states.
ggplot(data = iris.long, aes(x = Species, y = value, fill = Species)) +
  geom_boxplot() +
  xlab('') +
  ylab("Length or Width in cm") +
  facet_wrap(~ variable) +
  ggtitle("Iris Boxplots")
ggplot(data = iris.long, aes(x = Species, y = value, fill = Species)) +
  geom_violin() +
  xlab('') +
  ylab("Length or Width in cm") +
  facet_wrap(~ variable) +
  ggtitle("Iris Violin Plots")
##### | |
## | |
## Pollster | |
## | |
##### | |
# Download the Obama favorable-rating poll table from HuffPost Pollster.
# as.is = TRUE keeps the text columns as character vectors.
# NOTE(review): this is a live network read over plain http; confirm the
# endpoint is still being served before relying on this section.
obama.favorable.df <- read.csv(
  "http://elections.huffingtonpost.com/pollster/obama-favorable-rating.csv",
  header = TRUE, as.is = TRUE
)
obama.favorable.df[, "End.Date"] <- as.Date(obama.favorable.df[, "End.Date"],
                                            format = "%Y-%m-%d")
# Bin each poll into the calendar quarter that contains its end date.
# cut() is generic and dispatches to its Date method on a Date vector, so
# the method does not need to be called by name.
obama.favorable.df[, "Quarter"] <- cut(obama.favorable.df[, "End.Date"],
                                       breaks = "quarter")
# Not necessary, but keeping only these columns cleans things up for
# inspection.
obama.favorable.df <- obama.favorable.df[, c("Pollster",
                                             "Quarter", "End.Date",
                                             "Number.of.Observations",
                                             "Population", "Mode",
                                             "Favorable", "Unfavorable",
                                             "Undecided")]
# Attach the per-quarter median favorable rating to every row of that
# quarter. Despite its name, FavAvg is a median; it drives the fill
# colour of each box below.
obama.favorable.df <- ddply(obama.favorable.df, "Quarter", transform,
                            FavAvg = median(Favorable))
# One box per quarter, shaded on a diverging scale centred at 50: red
# below, light blue above. The colour legend is suppressed with
# guide = "none" (guide = FALSE is deprecated in current ggplot2).
ggplot(data = obama.favorable.df,
       aes(x = Quarter, y = Favorable, fill = FavAvg)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 30, size = rel(1.2),
                                   vjust = 0.5)) +
  scale_fill_gradient2(low = "red", high = "lightblue", midpoint = 50,
                       guide = "none") +
  ylab('') + xlab('') + ggtitle("Obama Favorability Ratings")
### | |
### | |
### Time Series | |
### | |
### | |
# Fit a Holt-Winters model to the built-in annual `airmiles` series.
# gamma = FALSE switches off the seasonal component, leaving exponential
# smoothing with a trend term.
model.ts <- HoltWinters(x = airmiles, gamma = FALSE)
# Forecast ten periods ahead. With prediction.interval = TRUE the result
# is a multi-column time series holding the point forecast ("fit") and
# the 95% prediction bounds ("upr", "lwr").
forecast.ts <- predict(
  object = model.ts,
  n.ahead = 10,
  level = 0.95,
  prediction.interval = TRUE
)
##### | |
## | |
## Confidence regions in ggplot2 | |
## | |
##### | |
## ggplot2 doesn't handle time series objects natively, so the observed
## series, the forecast and the in-sample fit are each converted to a
## data frame with the same columns and then stacked.
## Upper/Lower only carry information for the "Predicted" rows; the 0s
## in the other two frames are placeholders that are never plotted.
sim.df <- data.frame(Time = as.numeric(time(airmiles)),
                     Value = as.numeric(airmiles),
                     Upper = 0,
                     Lower = 0,
                     Status = factor("Observed",
                                     levels = c("Observed",
                                                "Predicted",
                                                "Fitted")))
# The forecast's own time index already starts in the period after the
# last observation, so it is used unshifted; offsetting it would draw
# every forecast against the wrong year.
forecast.df <- data.frame(Time = as.numeric(time(forecast.ts)),
                          Value = as.numeric(forecast.ts[, "fit"]),
                          Upper = as.numeric(forecast.ts[, "upr"]),
                          Lower = as.numeric(forecast.ts[, "lwr"]),
                          Status = "Predicted")
# "xhat" is the model's fitted value for each in-sample period.
fitted.df <- data.frame(Time = as.numeric(time(model.ts[["fitted"]])),
                        Value = as.numeric(model.ts[["fitted"]][, "xhat"]),
                        Upper = 0,
                        Lower = 0,
                        Status = "Fitted")
combinedts.df <- rbind(sim.df, forecast.df, fitted.df)
# predict() was asked for prediction intervals, so the upper and lower
# bounds come straight from the forecast object; no critical value or
# standard error has to be computed by hand.
# The ribbon is drawn for the forecast rows only, from the lower bound
# (ymin) to the upper bound (ymax). The dashed vertical line marks 1960,
# the last observed year.
ggplot(data = combinedts.df, aes(x = Time, y = Value, colour = Status)) +
  geom_ribbon(data = subset(combinedts.df, Status == "Predicted"),
              aes(ymin = Lower,
                  ymax = Upper),
              alpha = 0.2) +
  geom_line() + geom_vline(xintercept = 1960, linetype = "longdash") +
  xlab('') + ylab("Airline Miles per Year") +
  ggtitle("HW Confidence Intervals")
##### | |
## | |
## Contour plots and margins | |
## | |
##### | |
# Draw 3000 points from a bivariate normal with mean (2, 2), variances
# 10 and 3, and covariance 1.
# mvrnorm() lives in MASS, which the library() calls at the top of this
# script do not attach, so it is called through its namespace.
# Sigma has to be a symmetric covariance matrix, hence the matching
# off-diagonal entries.
mvn <- MASS::mvrnorm(n = 3000, mu = c(2, 2),
                     Sigma = matrix(c(10, 1, 1, 3), nrow = 2, ncol = 2))
mvn <- data.frame(x = mvn[, 1], y = mvn[, 2])
# Very basic 2d contour plot: density contours with the raw points drawn
# semi-transparently on top.
ggplot(data = mvn, aes(x = x, y = y)) +
  stat_density2d() +
  geom_point(alpha = 0.2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment