Code to generate an interactive SVG graph of NZ general election polling.
###
##
## --- New Zealand General Election Polling ---
##
## Plots polling data scraped from Wikipedia.
##
## - GAM -
##
## The smoothed curves are calculated using a generalised additive model (GAM). The
## smoothing parameter is estimated using cross-validation: the curves are fitted to
## subsets of the data, and the parameter that best predicts the held-out data is
## chosen. Note that the error bounds assume that the smoothing parameter is correct,
## i.e. uncertainty in the smoothing parameter estimate is not accounted for.
##
## A separate smoothed curve is estimated for each combination of polling firm and party
## (the election is treated as a poll and given more weight than the opinion polls). The
## shape of the curves for each party is constrained to be the same for every polling
## firm, although the curves can be vertically offset from each other. The displayed
## curve is aligned so that it passes through the election result, but in the interactive
## version you can see the offset curve for each polling firm. (A toy-data sketch of this
## model structure follows this header block.)
##
## - SVG -
##
## The SVG graphics format allows interactivity via JavaScript. Note that the
## SVGAnnotation library comes from http://www.bioconductor.org/ rather than
## the CRAN repositories. It requires Cairo to be available (use
## capabilities("cairo") to check this).
##
###
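## A minimal sketch (illustrative only, wrapped in if(FALSE) so it never runs) of the
## model structure described above: one smooth trend per party shared across polling
## firms, a per-firm vertical offset ("house effect"), and election results given a
## large weight. The data and names here (toy, firm, toy_fit) are simulated and
## hypothetical; the real fit on poll_data happens further down.
if(FALSE) {
library(mgcv)
set.seed(1)
toy <- data.frame(x=rep(1:100, 3), firm=rep(c("Election", "FirmA", "FirmB"), each=100), stringsAsFactors=FALSE)
toy $ y <- 40 + 5*sin(toy $ x/15) + c(Election=0, FirmA=2, FirmB=-1)[toy $ firm] + rnorm(300)
toy_fit <- gam(y ~ s(x) + firm, data=toy, weights=ifelse(toy $ firm == "Election", 1000, 1))
coef(toy_fit)[c("firmFirmA", "firmFirmB")] # estimated house effects relative to the election result
}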
library(stringr)
library(plyr)
library(XML)
library(xts)
library(ggplot2)
library(lubridate)
library(mgcv)
library(reshape)
library(Hmisc)
library(rje)
###
##
## Settings
##
###
# Options
opt_begin <- as.Date("2008-01-01") #as.Date("1996-01-01") #as.Date("2002-06-01")
opt_end <- today()
opt_filename <- "test"
opt_svg <- TRUE # set to FALSE if you can't get SVGAnnotation to work
opt_png <- !opt_svg
opt_filename <- str_c(opt_filename, if(opt_svg) ".svg" else ".png")
opt_bc <- TRUE # bias corrections
opt_bcplot <- FALSE # plot bias-corrected points
opt_bcreport <- TRUE # print bias estimates
opt_bccurves <- opt_svg # curves for each polling firm
opt_events <- FALSE # event markers
opt_thresh <- TRUE # 5% threshold marker
opt_ewt <- 1000 # weighting for election data points
opt_adapt <- FALSE # Adaptive smoother
opt_svgwidth <- 12 # width and height if the output file is svg
opt_svgheight <- 7 # these will be multiplied by 72 for png
opt_xwidth <- 3/2 * opt_svgwidth # 1 / opt_xwidth of the plot is reserved for final results
opt_xheight <- 3/2 * opt_svgheight # 1 / opt_xheight for event markers
opt_dashes <- "[\u002D\u2013\u2014\u2212]"
opt_spaces <- "[\u0020\u00A0]"
opt_plotcurves <- TRUE # plot the smoothed curves
opt_curvepts <- 500 # number of line segments used to draw the "curves"
opt_curvett <- TRUE # tooltips on curves
opt_moe <- 0 # half-width of margin-of-error whiskers around each point (0 = none)
opt_legend <- TRUE # draw the legend
# Wikipedia urls
urls <- str_c("http://en.wikipedia.org/wiki/Opinion_polling_for_the_New_Zealand_general_election,_", c(2005, 2008, 2011))
urls <- c(urls, "http://en.wikipedia.org/wiki/Opinion_polling_for_the_next_New_Zealand_general_election")
# Party details
party_ctl <- read.table(
header = TRUE,
row.names = "Party",
stringsAsFactors = FALSE,
comment = "",
text = "
Party Colour Contrast Include
ACT #FFE401 #FFFF80 -
Conservative #00AEEF #33CCFF -
Destiny #FF0000 #000000 -
Green #098137 #B3FFB3 Y
Labour #FF0000 #FFBAA8 Y
Mana #770808 #FF6E6E -
Maori #EF4A42 #FFCC80 -
National #00529F #CCDDFF Y
'NZ First' #000000 #CCCCCC Y
Progressive #9E9E9E #DDCCDD -
Internet #662C92 #EE77FF -
'Internet/Mana' #662C92 #FF6E6E -
'United Future' #501557 #DD99DD -
'Labour/Green' #FF0000 #B3FFB3 -
")
party_ctl[, "Include"] <- (party_ctl[, "Include"] == "Y")
rownames(party_ctl)[match("Maori", rownames(party_ctl))] <- "M\u0101ori"
if(opt_svg) library(SVGAnnotation, quietly=TRUE)
###
##
## Helper function definitions
##
###
get_data <- function(data_url, quiet=FALSE, i=1) {
if(!quiet) message(str_c("Reading data from ", data_url))
h <- htmlParse(data_url, encoding="UTF-8")
x <- readHTMLTable(h, encoding="UTF-8", stringsAsFactors=FALSE)
x $ toc <- NULL
x <- x[[i]] # the poll data is usually the first table, not counting the table of contents
names(x) <- trim(names(x))
x[,1] <- trim(x[,1])
# If the second column is NA, then we have some sort of non-poll event
r_events <- is.na(x[2])
events <- unlist(str_split(x[r_events, 1], "\n"))
events <- str_split(events, opt_dashes, n=2)
Date <- sapply(events, `[`, 1)
Date <- as.Date(Date, "%d %b %Y")
Event <- sapply(events, `[`, 2)
Event <- trim(Event)
events <- data.frame(Date=Date, Event=Event, stringsAsFactors=FALSE)
# Polls are everything else, minus "heading" rows
r_polls <- !r_events & !(x$Date == "Date")
c_polls <- (names(x) != "")
polls <- x[r_polls, c_polls]
# Conversions
polls $ Poll [str_detect(polls $ Poll, "election")] <- "Election"
polls $ Poll <- as.factor(polls $ Poll)
polls $ Poll <- reorder(polls $ Poll, polls $ Poll, function(x) if(x[1] == "Election") 1 else 2)
polls $ Date <- as.character(polls $ Date)
for(col in names(polls)[-c(1,2)])
polls[col] <- suppressWarnings(as.numeric(polls[[col]])) # suppress conversion to NA warnings
# Return both data frames
attr(polls, "events") <- events
null0 <- function(x) if(is.null(x)) 0 else ifelse(is.na(x), 0, x) # treat absent columns and NA values as zero
# Combinations
polls[['Labour/Green']] <- rep(0, nrow(polls)) +
null0(polls$Labour) +
null0(polls$Green)
IM <- rep(0, nrow(polls)) +
null0(polls$Internet) +
null0(polls$Mana) +
null0(polls$`Internet Mana`)
IM <- ifelse(IM==0, NA, IM)
polls[['Internet/Mana']] <- IM
return(polls)
}
trim <- function(x) {
x <- str_replace_all(x, "\\[(.*?)\\]", "") # remove any Wikipedia footnotes, e.g. [1]
x <- str_trim(x) # trim leading/trailing spaces
return(x)
}
alpha <- function(x, alpha=1) rgb(t(col2rgb(x)), alpha=255*alpha, max=255) # add transparency to a colour
date <- function(x) {
temp <- lapply(str_split(x, opt_dashes), rev) # Four different kinds of dashes used!
# Date that poll coverage ends
a <- sapply(temp, `[`, 1)
a <- str_replace(a, "c\\.", "")
a <- str_replace(a, "Released", "")
a <- str_trim(a)
date_end <- as.Date(a, "%d %b %Y")
# Date that poll coverage starts
b <- sapply(temp, `[`, 2)
b <- str_trim(b)
b <- str_split(b, opt_spaces) # Split on spaces (opt_spaces also matches non-breaking spaces).
year_start <- as.numeric(sapply(b, `[`, 3))
month_start <- match(sapply(b, `[`, 2), month.name)
day_start <- as.numeric(sapply(b, `[`, 1))
date_start <- date_end
s <- !is.na(year_start); if(any(s)) date_start[s] <- update(date_start[s], years=year_start[s])
s <- !is.na(month_start); if(any(s)) date_start[s] <- update(date_start[s], months=month_start[s])
s <- !is.na(day_start); if(any(s)) date_start[s] <- update(date_start[s], days=day_start[s])
# Polls with only month and year
s <- is.na(date_start)
if(any(s)) {
date_start[s] <- as.Date(as.yearmon(a[s]))
date_end[s] <- date_start[s] + new_period(month=1) - new_period(day=1)
}
# Combine and return
date_mean <- as.Date(apply(cbind(date_start, date_end), 1, mean))
data.frame(orig=x, date_start, date_end, date_mean)
}
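## Illustrative only (wrapped in if(FALSE) so it never runs): examples of the date-string
## formats the date() helper above is written to handle. These strings are hypothetical,
## not taken from the actual Wikipedia tables.
if(FALSE) {
date(c(
"9\u201315 March 2014", # range within one month (en dash separator)
"28 February \u2013 4 March 2014", # range spanning two months
"15 March 2014" # a single day
))
# returns a data frame with columns orig, date_start, date_end and date_mean
}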
loop <- function(x, y=x) c(x, rev(y)) # x followed by reversed y; traces a polygon outline (out along the top, back along the bottom)
format_num <- function(x) formatC(x, digits=1, format="f")
format_date <- function(x) format(as.Date(x), "%d %B %Y")
format_date_range <- function(x, y) {
x_day <- day(x)
x_month <- month.name[month(x)]
x_year <- year(x)
y_day <- day(y)
y_month <- month.name[month(y)]
y_year <- year(y)
en <- "\u2013"
ifelse(
x_year != y_year,
str_c(x_day, x_month, x_year, en, y_day, y_month, y_year, sep=" "),
ifelse(
x_month != y_month,
str_c(x_day, x_month, en, y_day, y_month, y_year, sep=" "),
ifelse(
x_day != y_day,
str_c(x_day, en, y_day, y_month, y_year, sep=" "),
str_c(y_day, y_month, y_year, sep=" ")
)
)
)
}
embiggen <- function(node) {
for(i in seq_along(node)) {
xmlAttrs(node[[i]]) <- c(
`onmouseover` = "embiggen(this)",
`onmouseout` = "unembiggen(this)",
`onclick` = "click(this)"
)
}
}
link <- function(pt_node, sm_node, party, poll) {
id <- str_c(party, poll)
for(i in seq_along(pt_node)) {
xmlAttrs(pt_node[[i]]) <- c(
`id` = str_c(id, i),
`onmouseover` = sprintf("link('%s', %i)", id, length(pt_node)),
`onmouseout` = sprintf("unlink('%s', %i)", id, length(pt_node)),
`onclick` = sprintf("clink('%s', %i)", id, length(pt_node))
)
}
if(!is.null(sm_node)) {
xmlAttrs(sm_node) <- c(`id` = id)
modifyStyle(sm_node, `stroke-opacity`="0")
}
}
# Load the polling data for party p into globals (n, x, x0, x1, y, z, xs, new_X, s_elect,
# this_colour, this_contrast) used by the fitting and plotting loops below.
get_party_data <- function(p) {
s <- !is.na(poll_matrix[, p])
n <<- sum(s)
x <<- as.numeric(x_date[s])
x0 <<- poll_data $ date_start[s]
x1 <<- poll_data $ date_end[s]
y <<- poll_matrix[s, p]
z <<- poll_data $ Poll[s]
this_colour <<- party_ctl[p, "Colour"]
this_contrast <<- party_ctl[p, "Contrast"]
xs <<- seq(min(x), max(x), length=opt_curvepts)
new_X <<- data.frame(x=xs, z="Election")
s_elect <<- (z == "Election")
invisible()
}
###
##
## Collect the data.
##
###
data_list <- lapply(urls, get_data)
#load('datalist.rda')
#data_list[[5]] <- data.frame(Poll="3 News Reid Research", Date="25 May 2014", Labour=29.5, National=50.3, `NZ First`=5.6, Green=10.2, check.names=FALSE, stringsAsFactors=FALSE)
#attr(data_list[[5]], "events") <- data.frame(Date=as.Date(Sys.Date()), Event="Today")
poll_data <- Reduce(function(x, y) merge(x, y, all=TRUE), data_list)
event_data_list <- lapply(data_list, attr, "events")
event_data <- Reduce(function(x, y) merge(x, y, all=TRUE), event_data_list)
event_data <- event_data[!is.na(event_data $ Date), ]
poll_data <- cbind(poll_data, date(poll_data $ Date)) # Parse dates
poll_data <- poll_data[!is.na(poll_data $ date_mean), ] # Remove any polls where we can't parse a date
poll_data <- sort_df(poll_data, 'date_mean') # Sort from earliest to latest poll
# Restrict to selected dates
s_date <- (poll_data $ date_start >= opt_begin & poll_data $ date_end <= opt_end)
poll_data <- poll_data[s_date, ]
poll_data <- droplevels(poll_data) # Remove unused polling firms
incl_parties <- rownames(party_ctl)[party_ctl $ Include]
poll_matrix <- as.matrix(poll_data[, incl_parties]) # Restrict to parties of interest
x_date <- poll_data $ date_mean
###
##
## Analysis and plotting.
##
###
if(opt_svg) {
svg(opt_filename, width=opt_svgwidth, height=opt_svgheight)
}
if(opt_png && !opt_svg) {
png(opt_filename, width=opt_svgwidth*72, height=opt_svgheight*72)
#pdf(opt_filename, width=opt_svgwidth, height=opt_svgheight, bg='white')
}
svg_counter <- 1
svg_index <- list()
xw <- as.numeric(opt_end - opt_begin) / (opt_xwidth - 1)
xh <- if(opt_events) max(poll_matrix, na.rm=TRUE) / (opt_xheight - 1) else 0
# This sets up the plot region
matplot(
x_date, poll_matrix,
pch = NA,
main = "New Zealand General Election Polling",
xlim = c(min(x_date), max(x_date) + xw), xlab = "", xaxt = "n",
ylim = c(-xh, max(6, 1.1 * max(poll_matrix, na.rm=TRUE))), ylab = "Party Percentage (%)"
)
axis.Date(1, x_date)
abline(h=0); inc(svg_counter) <- 1
is_election <- (poll_data $ Poll == "Election")
abline(v=x_date[is_election]); inc(svg_counter) <- sum(is_election)
if(opt_thresh) {
abline(h=5, lty=2)
inc(svg_counter) <- 1
}
# Fit the Generalised Additive Model
a_list <- list()
b_list <- list()
cf_list <- list()
final <- character()
for(p in incl_parties) {
get_party_data(p)
##logit - add logit links
#y <- y / 100
if(opt_bc) {
a <- if(opt_adapt) {
gam(y ~ s(x, bs="ad") + z, weights=ifelse(z=="Election", opt_ewt, 1))
#gam(y ~ s(x, bs="ad") + z, weights=ifelse(z=="Election", opt_ewt, 1), family=gaussian(link="logit"))
} else {
gam(y ~ s(x) + z, weights=ifelse(z=="Election", opt_ewt, 1))
###gamm(y ~ s(x) + z)
#gam(y ~ s(x) + z, weights=ifelse(z=="Election", opt_ewt, 1), family=gaussian(link="logit"))
}
a_list[[p]] <- a
###a_list[[p]] <- a$gam
cf <- coef(a)
###cf <- coef(a $ gam)
names(cf)[1] <- "zElection"; cf[1] <- 0
cf_list[[p]] <- cf[str_c("z", levels(poll_data $ Poll)[-1])] # Keep track of bias estimates
names(cf_list[[p]]) <- levels(poll_data $ Poll)[-1]
} else {
a <- if(opt_adapt) {
gam(y ~ s(x, bs="ad"), weights=ifelse(z=="Election", opt_ewt, 1))
#gam(y ~ s(x, bs="ad"), weights=ifelse(z=="Election", opt_ewt, 1), family=gaussian(link="logit"))
} else {
gam(y ~ s(x), weights=ifelse(z=="Election", opt_ewt, 1))
#gam(y ~ s(x), weights=ifelse(z=="Election", opt_ewt, 1), family=gaussian(link="logit"))
}
a_list[[p]] <- a
}
}
bias <- do.call(cbind, cf_list)
colnames(bias)[grep("Maori", colnames(bias))] <- "M\u0101ori"
# Plot spline curves
if(opt_plotcurves) for(p in incl_parties) {
get_party_data(p)
a <- a_list[[p]]
b <- predict(a, newdata=new_X, se=TRUE)
#b$fit <- 100 * expit(b$fit) ##logit
b_list[[p]] <- b
polygon(
loop(xs),
loop(b$fit) + 1.96 * loop(b$se.fit, -b$se.fit),
col = alpha(this_contrast, 1/2),
border = NA
)
lines(xs, b$fit, lwd=3, col=this_colour)
svg_index[[str_c("sm_err:", p)]] <- svg_counter; inc(svg_counter) <- 1
svg_index[[str_c("sm:", p)]] <- svg_counter; inc(svg_counter) <- 1
if(opt_curvett) {
points(xs, b$fit, col=alpha(this_colour, 0), bg=alpha(this_contrast, 0.1), pch=21)
svg_index[[str_c("smpts:", p)]] <- seq(svg_counter, length=opt_curvepts)
inc(svg_counter) <- opt_curvepts
}
}
# Plot data points
for(p in incl_parties) {
get_party_data(p)
if(opt_bc && opt_bcplot) {
cf <- cf_list[[p]]
y[!s_elect] <- y[!s_elect] - cf[as.character(z[!s_elect])] # Bias correction
}
n <- sum(!is.na(y))
if(opt_moe > 0) {
x_me <- ggplot2:::interleave(x, x, NA)
y_me <- ggplot2:::interleave(y-opt_moe, y+opt_moe, NA)
lines(x_me, y_me, col=this_colour)
inc(svg_counter) <- n
}
points(x, y, pch=21, cex=1, col=this_colour, bg=this_contrast)
svg_index[[str_c("pts:", p)]] <- seq(svg_counter, length=n); inc(svg_counter) <- n
}
# Uncorrected smooths
for(p in incl_parties) {
get_party_data(p)
b <- b_list[[p]]
if(opt_bc && opt_bccurves && !opt_bcplot) {
for(i in levels(droplevels(z))[-1]) {
lines(xs, b$fit + bias[i, p], lwd=2, col=alpha(this_colour, 1/2))
svg_index[[sprintf("psm:%s:%s", p, i)]] <- svg_counter; inc(svg_counter) <- 1
}
}
}
# Election results
for(p in incl_parties) {
get_party_data(p)
points(
x[s_elect], y[s_elect],
pch = 21, cex = 2,
col = party_ctl[p, "Colour"],
bg = party_ctl[p, "Contrast"]
)
n_elect <- sum(s_elect)
svg_index[[str_c("election:", p)]] <- seq(svg_counter, length=n_elect); inc(svg_counter) <- n_elect
}
# Plot final estimates
for(p in incl_parties) {
get_party_data(p)
b <- b_list[[p]]
final_x <- tail(x_date, 1)
final_fit <- tail(b $ fit, 1)
final_err <- 1.96 * tail(b $ se.fit, 1)
final[p] <- sprintf("%s \u00B1 %s%%", format_num(final_fit), format_num(final_err))
final_date <- tail(x_date, 1)
right_x <- par("usr")[2]
avail_w <- right_x - as.numeric(final_date)
test_str <- sprintf("%s \u00B1 %s%%_", format_num(800/9), format_num(80/9)) # "88.9 ± 8.9%_"
test_w <- strwidth(test_str, font=4)
text(
final_x, final_fit, final[p],
pos = 4, font = 4, xpd = TRUE,
col = this_colour,
cex = avail_w / test_w
)
}
# Events
if(opt_events) {
s_date <- (event_data $ Date >= opt_begin & event_data $ Date <= opt_end)
event_data <- event_data[s_date, ]
points(event_data $ Date, rep(-c(0, 2, 4, 1, 3), length=nrow(event_data)) * (xh / 4), pch=21, cex=1.5, bg="#FC7928")
svg_index[["events"]] <- seq(svg_counter, length=nrow(event_data)); inc(svg_counter) <- nrow(event_data)
}
# Legend (SVG is broken for this, do it last)
if(opt_legend) {
party_col <- sapply(incl_parties, function(p) {get_party_data(p); this_colour})
party_bg <- sapply(incl_parties, function(p) {get_party_data(p); this_contrast})
legend("topleft", legend=incl_parties, pch=21, col=party_col, pt.bg=party_bg)
for(p in incl_parties) {
svg_index[[str_c("legend:", p)]] <- svg_counter
inc(svg_counter) <- 1
}
}
if(opt_svg || opt_png) dev.off()
# Bias estimates
if(opt_bc && opt_bcreport) print(round(bias, 1))
###
##
## Fancy SVG stuff.
##
###
if(opt_svg) {
doc <- xmlParse(opt_filename)
svg_points <- unlist(getPlotPoints(doc))
addChildren(xmlChildren(doc) $ svg, kids=list(newXMLNode(name="title", "New Zealand General Election Polling")), at=0)
addECMAScripts(doc, I('
function embiggen(path) {
d = path.getAttributeNS(null, "d").match(/-?\\d+(\\.\\d+)?/ig)
x = ( parseFloat(d[0]) + parseFloat(d[4]) ) / 4;
y = parseFloat(d[1]) / 2;
path.setAttributeNS(null, "transform", "scale(2) translate(-" + x + ", -" + y + ")");
}
function unembiggen(path) {
clicked = path.getAttributeNS(null, "desc")
if(clicked != "clicked") {
path.setAttributeNS(null, "transform", "scale(1)");
}
}
function link(desc, n) {
for(i = 1; i <= n; i++) {
path = document.getElementById(desc + i);
embiggen(path);
}
path = document.getElementById(desc);
bigline(path);
}
function unlink(desc, n) {
for(i = 1; i <= n; i++) {
path = document.getElementById(desc + i);
unembiggen(path);
}
path = document.getElementById(desc);
smallline(path);
}
function clink(desc, n) {
for(i =1; i <= n; i++) {
path = document.getElementById(desc + i);
click(path)
}
path = document.getElementById(desc);
click(path)
}
function click(path) {
desc = path.getAttributeNS(null, "desc")
if(desc == "clicked") {
path.setAttributeNS(null, "desc", "");
}
else {
path.setAttributeNS(null, "desc", "clicked");
}
}
function bigline(path) {
path.style.setProperty("stroke-opacity", "0.5");
}
function smallline(path) {
clicked = path.getAttributeNS(null, "desc")
if(clicked != "clicked") {
path.style.setProperty("stroke-opacity", "0");
}
}
'), at=1)
# Poll results
for(p in incl_parties) {
get_party_data(p)
poll_index <- svg_index[[str_c("pts:", p)]]
pt1 <- format_date_range(x0, x1)
pt2 <- z
pt3 <- sprintf("%s: %s%%", p, y)
poll_text <- str_c(pt1, pt2, pt3, sep=", \n")
addToolTips(doc, text=poll_text, path=svg_points[poll_index])
# for(i in levels(poll_data $ Poll)[-1]) {
for(i in levels(droplevels(z))[-1]) {
if(opt_bc && opt_bccurves && !opt_bcplot) {
sm_index <- svg_index[[sprintf("psm:%s:%s", p, i)]]
link(svg_points[poll_index[z==i]], svg_points[[sm_index]], p, i)
}
else {
link(svg_points[poll_index[z==i]], NULL, p, i)
}
}
elect_index <- svg_index[[str_c("election:", p)]]
et1 <- format_date(x[s_elect])
et2 <- "Election"
et3 <- sprintf("%s: %s%%", p, y[s_elect])
elect_text <- str_c(et1, et2, et3, sep="\n")
addToolTips(doc, text=elect_text, path=svg_points[elect_index])
embiggen(svg_points[elect_index])
if(opt_curvett) {
b <- b_list[[p]]
smpts_date <- format_date(xs)
smpts_pc <- str_c(round(b$fit, 1), '%')
smpts_text <- str_c(smpts_date, smpts_pc, sep='\n')
smpts_index <- svg_index[[str_c("smpts:", p)]]
addToolTips(doc, text=smpts_text, path=svg_points[smpts_index])
}
}
# Events
if(opt_events) {
event_index <- svg_index[["events"]]
event_text <- with(event_data, str_c(format_date(Date), "\n", Event))
addToolTips(doc, text=event_text, path=svg_points[event_index])
embiggen(svg_points[event_index])
}
saveXML(doc, opt_filename)
}