glamp/election-map-plots.R

## election-map-plots.R
library(XML)
library(uuid)
library(stringr)
library(plyr)
library(reshape2)
library(ggplot2)

f <- "https://raw.githubusercontent.com/chris-taylor/USElection/master/data/electoral-college-votes.csv"
electoral.college <- read.csv(f, header=FALSE)
names(electoral.college) <- c("state", "electoral_votes")
head(electoral.college)

# exclude D.C. from the data pull b/c there aren't any polls!. we'll add it in manually
states <- electoral.college$state[c(1:7, 9:51)]
results <- ldply(states, function(state) {
  url <- "http://www.electionprojection.com/latest-polls/%s-presidential-polls-trump-vs-clinton-vs-johnson-vs-stein.php"
  state.fmt <- gsub(" ", "-", tolower(state))

  url.state <- sprintf(url, state.fmt)
  print(url.state)
  r <- readHTMLTable(url.state, stringsAsFactors=FALSE)[[3]]
  r$state <- state
  r$id <- 1:nrow(r)
  cols <- c("Dates", "Firm", "state", "Clinton", "Trump", "Johnson", "id")
  r <- r[2:nrow(r),][,cols]
  r <- melt(r, id=c("Dates", "Firm", "state", "id"), variable.name="candidate", value.name="vote")

  names(r) <- c("date", "poll", "state", "id", "candidate", "vote")
  r$race <- ""
  cols <- c("date", "race", "state", "poll", "candidate", "vote", "id")
  r <- r[,cols]
  r$vote <- as.numeric(r$vote)
  r
})

# adding D.C. on manually b/c it's slightly different. it also doesn't produce material changes to the
# results
results <- rbind(results, data.frame(
  date='10/20 - 10/28',
  race='',
  state='District of Columbia',
  poll='SurveyMonkey',
  candidate=c("Clinton", "Trump", "Johnson"),
  vote=c(87, 5, 4),
  id=1
))

head(results)
tail(results)
table(results$candidate)
table(results$state)
results <- results[order(results$state, results$id, results$candidate),]

poll.freq <- data.frame(table(results$state))
ggplot(poll.freq, aes(x=Var1, weight=Freq)) +
  geom_bar() +
  coord_flip() +
  scale_y_continuous("# of Polls") +
  scale_x_discrete("State", limits=rev(levels(poll.freq$Var1)))


weight <- function(i) {
  exp(1)*1 / exp(i)
}

w <- data.frame(poll=1:8, weight=weight(1:8))
ggplot(w, aes(x=poll, weight=weight)) +
  geom_bar() +
  scale_x_continuous("nth poll", breaks=1:8) +
  scale_y_continuous("weight")


election.sim <- function() {
  ddply(results, .(state), function(polls.state) {
    polls.state$.id <- NULL
    polls.state$id <- cumsum(!duplicated(polls.state$id))
    polls.state$weight <- weight(polls.state$id)
    polls.state$weighted_vote <- polls.state$vote * polls.state$weight

    tally <- ddply(polls.state, .(candidate), function(p) {
      r <- rnorm(nrow(p), 1, .15)
      data.frame(weighted_vote=sum(p$weighted_vote * r))
    })
    tally <- head(tally, 3)
    tally$estimated_popular_vote <- tally$weighted_vote / sum(tally$weighted_vote)
    tally
  })
}

(election <- election.sim())

colormap <- c(Clinton="#179ee0", Trump="#ff5d40", Johnson="#f6b900")
ggplot(election, aes(x=candidate, weight=estimated_popular_vote, fill=candidate)) +
  geom_bar() +
  facet_wrap(~state) +
  scale_fill_manual(values=colormap) +
  scale_y_continuous(labels = scales::percent, breaks=c(0, 0.25, 0.5, 0.75, 1)) +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.y=element_blank())

library(doParallel)
doParallel::registerDoParallel(cores=8)

simulated.state.results <- ldply(1:10000, function(i) {
  election <- election.sim()
  election.results <- dcast(election, state ~ candidate, value.var="estimated_popular_vote")
  election.results <- merge(election.results, electoral.college, by.x="state", by.y="state", all.x = TRUE)
  candidates <- c("Clinton", "Trump", "Johnson")
  election.results$winner <- candidates[max.col(election.results[,candidates])]
  election.results$sim_id <- UUIDgenerate()
  election.results
}, .progress="text", .parallel=TRUE)


simulated.elections <- ddply(simulated.state.results, .(sim_id), function(simulation) {
  clinton <- sum(ifelse(simulation$Clinton > simulation$Trump & simulation$Clinton > simulation$Johnson, simulation$electoral_votes, 0))
  trump <- sum(ifelse(simulation$Clinton < simulation$Trump & simulation$Trump > simulation$Johnson, simulation$electoral_votes, 0))
  johnson <- sum(ifelse(simulation$Johnson > simulation$Trump & simulation$Johnson > simulation$Trump, simulation$electoral_votes, 0))
  data.frame(
    clinton=clinton,
    trump=trump,
    johnson=johnson,
    winner=ifelse(clinton > trump, "Clinton", "Trump")
  )
}, .progress="text", .parallel=TRUE)

table(simulated.elections$winner)
table(simulated.elections$winner) / nrow(simulated.elections)
summary(simulated.elections$clinton)
summary(simulated.elections$trump)
summary(simulated.elections$johnson)
head(simulated.elections)

ggplot(melt(simulated.elections[,1:4], id.vars = "sim_id"), aes(x=value, fill=variable)) +
  geom_histogram(position="identity", alpha=0.7) +
  scale_fill_manual(values=c(clinton="#179ee0", trump="#ff5d40", johnson="#f6b900"))

ggplot(melt(simulated.elections[,1:3], id.vars = "sim_id"), aes(x=value, fill=variable)) +
  geom_histogram(position="identity", alpha=0.7) +
  scale_fill_manual(values=c(clinton="#179ee0", trump="#ff5d40", johnson="#f6b900"))

ggplot(melt(simulated.elections[,1:3], id.vars = "sim_id"), aes(x=value, fill=variable)) +
  geom_density(position="identity", alpha=0.7) +
  scale_fill_manual(values=c(clinton="#179ee0", trump="#ff5d40", johnson="#f6b900"))

# popular vote
clinton <- sum(simulated.elections$clinton * simulated.elections$electoral_votes)
trump <- sum(simulated.elections$trump * simulated.elections$electoral_votes)
johnson <- sum(simulated.elections$johnson * simulated.elections$electoral_votes)
total <- sum(clinton, trump, johnson)
data.frame(Clinton=clinton/total, Trump=trump/total, Johnson=johnson/total)

simulated.state.results.agg <- ddply(simulated.state.results, .(state), function(state) {
  data.frame(
    state=state$state[1],
    trump=sum(state$winner=="Trump") / nrow(state),
    clinton=sum(state$winner=="Clinton") / nrow(state),
    johnson=sum(state$winner=="Johnson") / nrow(state),
    electoral_votes=state$electoral_votes[1],
    n=nrow(state)
  )
})
simulated.state.results.agg

data.frame(
  Trump=sum(simulated.state.results.agg$electoral_votes * simulated.state.results.agg$trump),
  Clinton=sum(simulated.state.results.agg$electoral_votes * simulated.state.results.agg$clinton),
  Johnson=sum(simulated.state.results.agg$electoral_votes * simulated.state.results.agg$johnson)
)


us.states <- map_data("state")
simulated.state.results.agg$state.mergecol <- as.character(tolower(simulated.state.results.agg$state))
state.plot <- merge(us.states, simulated.state.results.agg, by.x="region", by.y="state.mergecol")

table(us.states$region)
state.plot$winner <- ifelse(state.plot$trump > state.plot$clinton, "Trump", "Clinton")
# binary outcomes for states
ggplot(state.plot, aes(x=long, y=lat, group=group, fill=winner)) +
  geom_polygon(colour="white") +
  scale_fill_manual(values=colormap) +
  coord_map()

# shaded outcomes for states
ggplot(state.plot, aes(x=long, y=lat)) +
  geom_polygon(aes(group=group, fill=trump), colour="grey10") +
  scale_fill_gradient2("", low=colormap["Clinton"], mid="white", high=colormap["Trump"], midpoint=0.5,
                       breaks=c(0, 0.5, 1), labels=c("Clinton", "?", "Trump")) +
  theme_minimal() +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank(),
        legend.text=element_text(size=12, face="bold")) +
  coord_map()

us.regions <- read.csv("~/workspace/github.com/yhat/blog/code-exp/election/regions.csv")

state.plot <- merge(state.plot, us.regions, by.x="state", by.y="state")
head(state.plot)

# http://www.census.gov/econ/census/help/geography/regions_and_divisions.html
# http://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf

ddply(state.plot, .(division), function(div) {
  p <- ggplot(div, aes(x=long, y=lat)) +
    geom_polygon(aes(group=group, fill=trump), colour="grey10") +
    scale_fill_gradient2("", low=colormap["Clinton"], mid="white", high=colormap["Trump"], midpoint=0.5,
                         guide=FALSE) +
    theme_minimal() +
    theme(axis.title.x=element_blank(),
          axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          axis.title.y=element_blank(),
          axis.text.y=element_blank(),
          axis.ticks.y=element_blank(),
          legend.text=element_text(size=12, face="bold")) +
    coord_map() +
    ggtitle(div$division[1])
  filename <- paste0("~/workspace/github.com/yhat/blog/static/img/election-region-", tolower(str_replace_all(div$division[1], " ", "-")), ".png")
  print(p)
  ggsave(filename)
  NULL
})
	library(XML)
	library(uuid)
	library(stringr)
	library(plyr)
	library(reshape2)
	library(ggplot2)

	f <- "https://raw.githubusercontent.com/chris-taylor/USElection/master/data/electoral-college-votes.csv"
	electoral.college <- read.csv(f, header=FALSE)
	names(electoral.college) <- c("state", "electoral_votes")
	head(electoral.college)

	# exclude D.C. from the data pull b/c there aren't any polls!. we'll add it in manually
	states <- electoral.college$state[c(1:7, 9:51)]
	results <- ldply(states, function(state) {
	url <- "http://www.electionprojection.com/latest-polls/%s-presidential-polls-trump-vs-clinton-vs-johnson-vs-stein.php"
	state.fmt <- gsub(" ", "-", tolower(state))

	url.state <- sprintf(url, state.fmt)
	print(url.state)
	r <- readHTMLTable(url.state, stringsAsFactors=FALSE)[[3]]
	r$state <- state
	r$id <- 1:nrow(r)
	cols <- c("Dates", "Firm", "state", "Clinton", "Trump", "Johnson", "id")
	r <- r[2:nrow(r),][,cols]
	r <- melt(r, id=c("Dates", "Firm", "state", "id"), variable.name="candidate", value.name="vote")

	names(r) <- c("date", "poll", "state", "id", "candidate", "vote")
	r$race <- ""
	cols <- c("date", "race", "state", "poll", "candidate", "vote", "id")
	r <- r[,cols]
	r$vote <- as.numeric(r$vote)
	r
	})

	# adding D.C. on manually b/c it's slightly different. it also doesn't produce material changes to the
	# results
	results <- rbind(results, data.frame(
	date='10/20 - 10/28',
	race='',
	state='District of Columbia',
	poll='SurveyMonkey',
	candidate=c("Clinton", "Trump", "Johnson"),
	vote=c(87, 5, 4),
	id=1
	))

	head(results)
	tail(results)
	table(results$candidate)
	table(results$state)
	results <- results[order(results$state, results$id, results$candidate),]

	poll.freq <- data.frame(table(results$state))
	ggplot(poll.freq, aes(x=Var1, weight=Freq)) +
	geom_bar() +
	coord_flip() +
	scale_y_continuous("# of Polls") +
	scale_x_discrete("State", limits=rev(levels(poll.freq$Var1)))


	weight <- function(i) {
	exp(1)*1 / exp(i)
	}

	w <- data.frame(poll=1:8, weight=weight(1:8))
	ggplot(w, aes(x=poll, weight=weight)) +
	geom_bar() +
	scale_x_continuous("nth poll", breaks=1:8) +
	scale_y_continuous("weight")


	election.sim <- function() {
	ddply(results, .(state), function(polls.state) {
	polls.state$.id <- NULL
	polls.state$id <- cumsum(!duplicated(polls.state$id))
	polls.state$weight <- weight(polls.state$id)
	polls.state$weighted_vote <- polls.state$vote * polls.state$weight

	tally <- ddply(polls.state, .(candidate), function(p) {
	r <- rnorm(nrow(p), 1, .15)
	data.frame(weighted_vote=sum(p$weighted_vote * r))
	})
	tally <- head(tally, 3)
	tally$estimated_popular_vote <- tally$weighted_vote / sum(tally$weighted_vote)
	tally
	})
	}

	(election <- election.sim())

	colormap <- c(Clinton="#179ee0", Trump="#ff5d40", Johnson="#f6b900")
	ggplot(election, aes(x=candidate, weight=estimated_popular_vote, fill=candidate)) +
	geom_bar() +
	facet_wrap(~state) +
	scale_fill_manual(values=colormap) +
	scale_y_continuous(labels = scales::percent, breaks=c(0, 0.25, 0.5, 0.75, 1)) +
	theme(axis.title.x=element_blank(),
	axis.text.x=element_blank(),
	axis.ticks.x=element_blank(),
	axis.title.y=element_blank())

	library(doParallel)
	doParallel::registerDoParallel(cores=8)

	simulated.state.results <- ldply(1:10000, function(i) {
	election <- election.sim()
	election.results <- dcast(election, state ~ candidate, value.var="estimated_popular_vote")
	election.results <- merge(election.results, electoral.college, by.x="state", by.y="state", all.x = TRUE)
	candidates <- c("Clinton", "Trump", "Johnson")
	election.results$winner <- candidates[max.col(election.results[,candidates])]
	election.results$sim_id <- UUIDgenerate()
	election.results
	}, .progress="text", .parallel=TRUE)


	simulated.elections <- ddply(simulated.state.results, .(sim_id), function(simulation) {
	clinton <- sum(ifelse(simulation$Clinton > simulation$Trump & simulation$Clinton > simulation$Johnson, simulation$electoral_votes, 0))
	trump <- sum(ifelse(simulation$Clinton < simulation$Trump & simulation$Trump > simulation$Johnson, simulation$electoral_votes, 0))
	johnson <- sum(ifelse(simulation$Johnson > simulation$Trump & simulation$Johnson > simulation$Trump, simulation$electoral_votes, 0))
	data.frame(
	clinton=clinton,
	trump=trump,
	johnson=johnson,
	winner=ifelse(clinton > trump, "Clinton", "Trump")
	)
	}, .progress="text", .parallel=TRUE)

	table(simulated.elections$winner)
	table(simulated.elections$winner) / nrow(simulated.elections)
	summary(simulated.elections$clinton)
	summary(simulated.elections$trump)
	summary(simulated.elections$johnson)
	head(simulated.elections)

	ggplot(melt(simulated.elections[,1:4], id.vars = "sim_id"), aes(x=value, fill=variable)) +
	geom_histogram(position="identity", alpha=0.7) +
	scale_fill_manual(values=c(clinton="#179ee0", trump="#ff5d40", johnson="#f6b900"))

	ggplot(melt(simulated.elections[,1:3], id.vars = "sim_id"), aes(x=value, fill=variable)) +
	geom_histogram(position="identity", alpha=0.7) +
	scale_fill_manual(values=c(clinton="#179ee0", trump="#ff5d40", johnson="#f6b900"))

	ggplot(melt(simulated.elections[,1:3], id.vars = "sim_id"), aes(x=value, fill=variable)) +
	geom_density(position="identity", alpha=0.7) +
	scale_fill_manual(values=c(clinton="#179ee0", trump="#ff5d40", johnson="#f6b900"))

	# popular vote
	clinton <- sum(simulated.elections$clinton * simulated.elections$electoral_votes)
	trump <- sum(simulated.elections$trump * simulated.elections$electoral_votes)
	johnson <- sum(simulated.elections$johnson * simulated.elections$electoral_votes)
	total <- sum(clinton, trump, johnson)
	data.frame(Clinton=clinton/total, Trump=trump/total, Johnson=johnson/total)

	simulated.state.results.agg <- ddply(simulated.state.results, .(state), function(state) {
	data.frame(
	state=state$state[1],
	trump=sum(state$winner=="Trump") / nrow(state),
	clinton=sum(state$winner=="Clinton") / nrow(state),
	johnson=sum(state$winner=="Johnson") / nrow(state),
	electoral_votes=state$electoral_votes[1],
	n=nrow(state)
	)
	})
	simulated.state.results.agg

	data.frame(
	Trump=sum(simulated.state.results.agg$electoral_votes * simulated.state.results.agg$trump),
	Clinton=sum(simulated.state.results.agg$electoral_votes * simulated.state.results.agg$clinton),
	Johnson=sum(simulated.state.results.agg$electoral_votes * simulated.state.results.agg$johnson)
	)


	us.states <- map_data("state")
	simulated.state.results.agg$state.mergecol <- as.character(tolower(simulated.state.results.agg$state))
	state.plot <- merge(us.states, simulated.state.results.agg, by.x="region", by.y="state.mergecol")

	table(us.states$region)
	state.plot$winner <- ifelse(state.plot$trump > state.plot$clinton, "Trump", "Clinton")
	# binary outcomes for states
	ggplot(state.plot, aes(x=long, y=lat, group=group, fill=winner)) +
	geom_polygon(colour="white") +
	scale_fill_manual(values=colormap) +
	coord_map()

	# shaded outcomes for states
	ggplot(state.plot, aes(x=long, y=lat)) +
	geom_polygon(aes(group=group, fill=trump), colour="grey10") +
	scale_fill_gradient2("", low=colormap["Clinton"], mid="white", high=colormap["Trump"], midpoint=0.5,
	breaks=c(0, 0.5, 1), labels=c("Clinton", "?", "Trump")) +
	theme_minimal() +
	theme(axis.title.x=element_blank(),
	axis.text.x=element_blank(),
	axis.ticks.x=element_blank(),
	axis.title.y=element_blank(),
	axis.text.y=element_blank(),
	axis.ticks.y=element_blank(),
	legend.text=element_text(size=12, face="bold")) +
	coord_map()

	us.regions <- read.csv("~/workspace/github.com/yhat/blog/code-exp/election/regions.csv")

	state.plot <- merge(state.plot, us.regions, by.x="state", by.y="state")
	head(state.plot)

	# http://www.census.gov/econ/census/help/geography/regions_and_divisions.html
	# http://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf

	ddply(state.plot, .(division), function(div) {
	p <- ggplot(div, aes(x=long, y=lat)) +
	geom_polygon(aes(group=group, fill=trump), colour="grey10") +
	scale_fill_gradient2("", low=colormap["Clinton"], mid="white", high=colormap["Trump"], midpoint=0.5,
	guide=FALSE) +
	theme_minimal() +
	theme(axis.title.x=element_blank(),
	axis.text.x=element_blank(),
	axis.ticks.x=element_blank(),
	axis.title.y=element_blank(),
	axis.text.y=element_blank(),
	axis.ticks.y=element_blank(),
	legend.text=element_text(size=12, face="bold")) +
	coord_map() +
	ggtitle(div$division[1])
	filename <- paste0("~/workspace/github.com/yhat/blog/static/img/election-region-", tolower(str_replace_all(div$division[1], " ", "-")), ".png")
	print(p)
	ggsave(filename)
	NULL
	})