Protonk/notes.md

## notes.md

      
    Raw
  

              notes.md
            
          
    Data Visualization under Uncertainty

I have tried to list these sources in the order they appear in the slides, but I may have missed or reordered one or two.
Sources

xkcd


Useless

Enliven project


Original post
Amanda Marcotte
the challenge of data
followup from Enliven

Science!


How Dark Sky works
Professional communication and concision

Polling


Milwaukee Journal Sentinel
Huffington Post's Pollster
any given 538 post

Boxplots and such


Box plot on Wikipedia
Anscombe's quartet

Periscopic's visualization


Gun deaths in 2010

Forecasting


Podcast analytics
Amanda Cox's amazing forecasting visualization
Sea level rise bootstrap


## rplots.R
library(reshape2)
library(ggplot2)
library(plyr)

#####
##
## Violin Plots in ggplot2
##
#####

# Reshape the built-in iris data to long format: Species stays as the id
# column and every other column becomes a (variable, value) pair.
iris.long <- melt(data = iris, id.vars = "Species")

# Measurement names arrive as "Sepal.Length" etc.; swap the dots for spaces
# so the facet labels read naturally.
iris.long$variable <- gsub(".", " ", iris.long$variable, fixed = TRUE)

# Capitalise each species name: upper-case the leading character and glue
# the remainder of the name back on.
species.names <- iris.long$Species
iris.long$Species <- paste0(toupper(substr(species.names, 1, 1)),
                            substring(species.names, 2))

# Two views of the same per-species measurements. A violin plot draws a
# density estimate for each group, so it can show shape that a boxplot's
# summary statistics hide.

# The x axis title is blanked on both plots: the tick labels and the fill
# legend already name the species. The iris measurements are recorded in
# centimetres, so the y axis is labelled in cm.

ggplot(data = iris.long, aes(x = Species, fill = Species, y = value)) +
  geom_boxplot() + xlab('') + ylab("Length or Width in cm") +
  facet_wrap(~ variable) + ggtitle("Iris Boxplots")

# xlab('') here used to be a second ylab() call, which left the default
# x axis title in place while the boxplot version had it blanked.
ggplot(data = iris.long, aes(x = Species, y = value, fill = Species)) +
  geom_violin() + xlab('') + ylab("Length or Width in cm") +
  facet_wrap(~ variable) + ggtitle("Iris Violin Plots")


#####
##
## Pollster
##
#####

# Download the poll-level Obama favorability ratings from Huffington Post
# Pollster. as.is = TRUE keeps text columns as character vectors.
obama.favorable.df <- read.csv(
  "http://elections.huffingtonpost.com/pollster/obama-favorable-rating.csv",
  header = TRUE, as.is = TRUE)


# Parse the ISO-formatted end date of each poll into a Date.
obama.favorable.df[, "End.Date"] <- as.Date(obama.favorable.df[, "End.Date"],
                                            format = "%Y-%m-%d")

# cut() has a method for dates which is very handy. Call the generic and let
# S3 dispatch pick the Date method instead of invoking cut.Date() directly.

obama.favorable.df[, "Quarter"] <- cut(obama.favorable.df[, "End.Date"],
                                       breaks = "quarter")

# Not necessary, but cleans things up for inspection

obama.favorable.df <- obama.favorable.df[, c("Pollster",
                                             "Quarter", "End.Date",
                                             "Number.of.Observations",
                                             "Population", "Mode",
                                             "Favorable", "Unfavorable",
                                             "Undecided")]

# Attach each quarter's median Favorable value to every row of that quarter;
# the boxplot fill colour is mapped to it. (Named FavAvg, but it is a median.)

obama.favorable.df <- ddply(obama.favorable.df, "Quarter", transform,
                            FavAvg = median(Favorable))

# Boxplot by quarters. Each box is filled by its quarter's FavAvg on a
# diverging scale centred at 50: red below, light blue above.
# guide = "none" hides the fill legend; the logical form guide = FALSE is
# deprecated in current ggplot2 releases.

ggplot(data = obama.favorable.df,
       aes(x = Quarter, y = Favorable, fill = FavAvg)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 30, size = rel(1.2), vjust = 0.5)) +
  scale_fill_gradient2(low = "red", high = "lightblue", midpoint = 50,
                       guide = "none") +
  ylab('') + xlab('') + ggtitle("Obama Favorability Ratings")

###
###
### Time Series
###
###

# Start with one time series: the built-in airmiles data.


# Build a model: Holt-Winters exponential smoothing. gamma = FALSE switches
# off the seasonal component, leaving level and trend only.

model.ts <- HoltWinters(airmiles, gamma = FALSE)

# One forecast: 10 periods ahead with 95% prediction bounds. The result is a
# multivariate ts whose columns "fit", "upr" and "lwr" are used below.

forecast.ts <- predict(model.ts, n.ahead = 10,
                       prediction.interval = TRUE,
                       level = 0.95)

#####
##
## Confidence regions in ggplot2
##
#####

## ggplot2 doesn't handle time series objects natively so we convert to df

# Observed series. Upper/Lower are placeholders here: only the forecast rows
# carry real bounds. Status is a factor holding all three levels so that the
# combined frame ends up with one consistent Status column.
sim.df <- data.frame(Time = as.numeric(time(airmiles)),
                     Value = as.numeric(airmiles),
                     Upper = 0,
                     Lower = 0,
                     Status = factor("Observed",
                                     levels = c("Observed",
                                                "Predicted",
                                                "Fitted")))

# Forecast rows keep the time index that predict() assigned, i.e. the periods
# following the last observation. An earlier "- 1" offset on Time drew every
# forecast one period too early.
forecast.df <- data.frame(Time = as.numeric(time(forecast.ts)),
                          Value = as.numeric(forecast.ts[, "fit"]),
                          Upper = as.numeric(forecast.ts[, "upr"]),
                          Lower = as.numeric(forecast.ts[, "lwr"]),
                          Status = factor("Predicted",
                                          levels = levels(sim.df$Status)))

# In-sample fitted values from the model (the "xhat" column).
fitted.df <- data.frame(Time = as.numeric(time(model.ts[["fitted"]])),
                        Value = as.numeric(model.ts[["fitted"]][, "xhat"]),
                        Upper = 0,
                        Lower = 0,
                        Status = factor("Fitted",
                                        levels = levels(sim.df$Status)))

# Stack the three pieces into a single long frame for plotting.
combinedts.df <- rbind(sim.df, forecast.df, fitted.df)

# Shade the interval between the Lower and Upper bounds for the forecast rows
# only; the other rows carry placeholder bounds and are left out of the
# ribbon. ymin takes Lower and ymax takes Upper (they were previously
# swapped). The dashed vertical line sits at 1960.


ggplot(data = combinedts.df, aes(x = Time, y = Value, colour = Status)) +
  geom_ribbon(data = subset(combinedts.df, Status == "Predicted"),
              aes(ymin = Lower,
                  ymax = Upper),
              alpha = 0.2) +
  geom_line() + geom_vline(xintercept = 1960, linetype = "longdash") +
  xlab('') + ylab("Airline Miles per Year") + ggtitle("HW Confidence Intervals")

#####
##
## Contour plots and margins
##
#####

# Draw 3000 points from a bivariate normal with mean (2, 2). mvrnorm() lives
# in MASS, which is not among the packages attached at the top of this
# script, so it is called through its namespace.
# Sigma is written out symmetrically: the earlier matrix(c(10, 1, 3, 3), 2, 2)
# had off-diagonal entries 1 and 3, which is not a valid covariance matrix.
# NOTE(review): covariance 1 keeps the lower-triangle entry, which is the one
# eigen(symmetric = TRUE) is documented to read -- confirm 1 (not 3) was meant.
mvn <- MASS::mvrnorm(n = 3000, mu = c(2, 2),
                     Sigma = matrix(c(10, 1,
                                      1, 3), nrow = 2, ncol = 2))

# Plain two-column data frame so ggplot2 can consume it.
mvn <- data.frame(x = mvn[, 1], y = mvn[, 2])


# Minimal 2-D view of mvn: density contours first, then the individual
# points drawn semi-transparently on top.

ggplot(mvn, aes(x = x, y = y)) +
  stat_density2d() +
  geom_point(alpha = 0.2)
	library(reshape2)
	library(ggplot2)
	library(plyr)

	#####
	##
	## Violin Plots in ggplot2
	##
	#####

	# Reshape the built-in iris data to long format: Species stays as the id
	# column and every other column becomes a (variable, value) pair.
	iris.long <- melt(data = iris, id.vars = "Species")

	# Measurement names arrive as "Sepal.Length" etc.; swap the dots for spaces
	# so the facet labels read naturally.
	iris.long$variable <- gsub(".", " ", iris.long$variable, fixed = TRUE)

	# Capitalise each species name: upper-case the leading character and glue
	# the remainder of the name back on.
	species.names <- iris.long$Species
	iris.long$Species <- paste0(toupper(substr(species.names, 1, 1)),
	                            substring(species.names, 2))

	# Two views of the same per-species measurements. A violin plot draws a
	# density estimate for each group, so it can show shape that a boxplot's
	# summary statistics hide.

	# The x axis title is blanked on both plots: the tick labels and the fill
	# legend already name the species. The iris measurements are recorded in
	# centimetres, so the y axis is labelled in cm.

	ggplot(data = iris.long, aes(x = Species, fill = Species, y = value)) +
	  geom_boxplot() + xlab('') + ylab("Length or Width in cm") +
	  facet_wrap(~ variable) + ggtitle("Iris Boxplots")

	# xlab('') here used to be a second ylab() call, which left the default
	# x axis title in place while the boxplot version had it blanked.
	ggplot(data = iris.long, aes(x = Species, y = value, fill = Species)) +
	  geom_violin() + xlab('') + ylab("Length or Width in cm") +
	  facet_wrap(~ variable) + ggtitle("Iris Violin Plots")


	#####
	##
	## Pollster
	##
	#####

	# Download the poll-level Obama favorability ratings from Huffington Post
	# Pollster. as.is = TRUE keeps text columns as character vectors.
	obama.favorable.df <- read.csv(
	  "http://elections.huffingtonpost.com/pollster/obama-favorable-rating.csv",
	  header = TRUE, as.is = TRUE)


	# Parse the ISO-formatted end date of each poll into a Date.
	obama.favorable.df[, "End.Date"] <- as.Date(obama.favorable.df[, "End.Date"],
	                                            format = "%Y-%m-%d")

	# cut() has a method for dates which is very handy. Call the generic and let
	# S3 dispatch pick the Date method instead of invoking cut.Date() directly.

	obama.favorable.df[, "Quarter"] <- cut(obama.favorable.df[, "End.Date"],
	                                       breaks = "quarter")

	# Not necessary, but cleans things up for inspection

	obama.favorable.df <- obama.favorable.df[, c("Pollster",
	                                             "Quarter", "End.Date",
	                                             "Number.of.Observations",
	                                             "Population", "Mode",
	                                             "Favorable", "Unfavorable",
	                                             "Undecided")]

	# Attach each quarter's median Favorable value to every row of that quarter;
	# the boxplot fill colour is mapped to it. (Named FavAvg, but it is a median.)

	obama.favorable.df <- ddply(obama.favorable.df, "Quarter", transform,
	                            FavAvg = median(Favorable))

	# Boxplot by quarters. Each box is filled by its quarter's FavAvg on a
	# diverging scale centred at 50: red below, light blue above.
	# guide = "none" hides the fill legend; the logical form guide = FALSE is
	# deprecated in current ggplot2 releases.

	ggplot(data = obama.favorable.df,
	       aes(x = Quarter, y = Favorable, fill = FavAvg)) +
	  geom_boxplot() +
	  theme(axis.text.x = element_text(angle = 30, size = rel(1.2), vjust = 0.5)) +
	  scale_fill_gradient2(low = "red", high = "lightblue", midpoint = 50,
	                       guide = "none") +
	  ylab('') + xlab('') + ggtitle("Obama Favorability Ratings")

	###
	###
	### Time Series
	###
	###

	# Start with one time series: the built-in airmiles data.


	# Build a model: Holt-Winters exponential smoothing. gamma = FALSE switches
	# off the seasonal component, leaving level and trend only.

	model.ts <- HoltWinters(airmiles, gamma = FALSE)

	# One forecast: 10 periods ahead with 95% prediction bounds. The result is a
	# multivariate ts whose columns "fit", "upr" and "lwr" are used below.

	forecast.ts <- predict(model.ts, n.ahead = 10,
	                       prediction.interval = TRUE,
	                       level = 0.95)

	#####
	##
	## Confidence regions in ggplot2
	##
	#####

	## ggplot2 doesn't handle time series objects natively so we convert to df

	# Observed series. Upper/Lower are placeholders here: only the forecast rows
	# carry real bounds. Status is a factor holding all three levels so that the
	# combined frame ends up with one consistent Status column.
	sim.df <- data.frame(Time = as.numeric(time(airmiles)),
	                     Value = as.numeric(airmiles),
	                     Upper = 0,
	                     Lower = 0,
	                     Status = factor("Observed",
	                                     levels = c("Observed",
	                                                "Predicted",
	                                                "Fitted")))

	# Forecast rows keep the time index that predict() assigned, i.e. the periods
	# following the last observation. An earlier "- 1" offset on Time drew every
	# forecast one period too early.
	forecast.df <- data.frame(Time = as.numeric(time(forecast.ts)),
	                          Value = as.numeric(forecast.ts[, "fit"]),
	                          Upper = as.numeric(forecast.ts[, "upr"]),
	                          Lower = as.numeric(forecast.ts[, "lwr"]),
	                          Status = factor("Predicted",
	                                          levels = levels(sim.df$Status)))

	# In-sample fitted values from the model (the "xhat" column).
	fitted.df <- data.frame(Time = as.numeric(time(model.ts[["fitted"]])),
	                        Value = as.numeric(model.ts[["fitted"]][, "xhat"]),
	                        Upper = 0,
	                        Lower = 0,
	                        Status = factor("Fitted",
	                                        levels = levels(sim.df$Status)))

	# Stack the three pieces into a single long frame for plotting.
	combinedts.df <- rbind(sim.df, forecast.df, fitted.df)

	# Shade the interval between the Lower and Upper bounds for the forecast rows
	# only; the other rows carry placeholder bounds and are left out of the
	# ribbon. ymin takes Lower and ymax takes Upper (they were previously
	# swapped). The dashed vertical line sits at 1960.


	ggplot(data = combinedts.df, aes(x = Time, y = Value, colour = Status)) +
	  geom_ribbon(data = subset(combinedts.df, Status == "Predicted"),
	              aes(ymin = Lower,
	                  ymax = Upper),
	              alpha = 0.2) +
	  geom_line() + geom_vline(xintercept = 1960, linetype = "longdash") +
	  xlab('') + ylab("Airline Miles per Year") + ggtitle("HW Confidence Intervals")

	#####
	##
	## Contour plots and margins
	##
	#####

	# Draw 3000 points from a bivariate normal with mean (2, 2). mvrnorm() lives
	# in MASS, which is not among the packages attached at the top of this
	# script, so it is called through its namespace.
	# Sigma is written out symmetrically: the earlier matrix(c(10, 1, 3, 3), 2, 2)
	# had off-diagonal entries 1 and 3, which is not a valid covariance matrix.
	# NOTE(review): covariance 1 keeps the lower-triangle entry, which is the one
	# eigen(symmetric = TRUE) is documented to read -- confirm 1 (not 3) was meant.
	mvn <- MASS::mvrnorm(n = 3000, mu = c(2, 2),
	                     Sigma = matrix(c(10, 1,
	                                      1, 3), nrow = 2, ncol = 2))

	# Plain two-column data frame so ggplot2 can consume it.
	mvn <- data.frame(x = mvn[, 1], y = mvn[, 2])


	# Minimal 2-D view of mvn: density contours first, then the individual
	# points drawn semi-transparently on top.

	ggplot(mvn, aes(x = x, y = y)) +
	  stat_density2d() +
	  geom_point(alpha = 0.2)