Skip to content

Instantly share code, notes, and snippets.

@Rharald
Created November 17, 2012 06:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Rharald/4093843 to your computer and use it in GitHub Desktop.
Save Rharald/4093843 to your computer and use it in GitHub Desktop.
EP2012
# harald 2012-11-11
# heavily based on http://www.python-forum.de/viewtopic.php?f=3&t=20308
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
# Target URL, taken from the command line (argv[1]).
# NOTE(review): no argument check — raises IndexError if the script is
# invoked without a URL.
URI = sys.argv[1]
class P_MainWindow(QWebView):
def __init__(self, url, parent = None):
QWebView.__init__(self, parent)
self.load(QUrl(url))
self.connect(self, SIGNAL("loadFinished(bool)"),
self.print_html)
print "OK!"
def print_html(self):
x = open("temp.html", "w")
x.write(self.page().mainFrame().toHtml())
print "OK"
app.quit()
# Qt event loop: create the application, kick off the page load, and
# block in exec_() until print_html() calls app.quit().
app = QApplication(sys.argv)
w = P_MainWindow(URI)
sys.exit(app.exec_())
## harald, 2012-11-11
# this script downloads all exit polls for the U.S. presidential
# election on 2012-11-06 from CNN and saves the resulting
# data.frame to file
# for later puropses it can be loaded by
# load(file="PresExitPolls2012.Rdata")
library(XML)
library(plyr)
source("getStateData.R")
source("parseEpNode.R")
source("getExitPolls.R")
# In 2012 exit polls weren't conducted in all states.
# "Here is a list of the states that will be excluded from coverage: ..."
# (source: Washington Post, "Networks, AP cancel exit polls in 19 states")
# a bit of editing gives:
nonep.states <- paste(
  "Alaska, Arkansas, Delaware, District of Columbia,",
  "Georgia, Hawaii, Idaho, Kentucky, Louisiana,",
  "Nebraska, North Dakota, Oklahoma, Rhode Island,",
  "South Carolina, South Dakota, Tennessee, Texas,",
  "Utah, West Virginia, Wyoming"
)
excluded <- strsplit(nonep.states, ", ")[[1]]
# CNN's URLs use two-letter codes, so translate full names to
# abbreviations via the built-in state.name/state.abb tables
ep.states <- as.list(state.abb[!(state.name %in% excluded)])
# "US" requests the nationwide poll
ep.states <- c(ep.states, "US")
# Split/apply: one download + parse per state...
all_exit_polls <- lapply(ep.states, getExitPolls)
# ...combine into a single data.frame...
EP <- rbind.fill(all_exit_polls)
# ...and persist; reload later with load(file = "PresExitPolls2012.Rdata")
save(EP, file = "PresExitPolls2012.Rdata")
## harald, 2012-11-11
# a function to retrieve the exit polls for one state:
#
# Args:
#   state: two-letter state abbreviation, or "US" for the national race.
# Returns:
#   A data.frame with one row per exit-poll answer row, a question
#   number (QNo) distinguishing repeated questions, and the state.
getExitPolls <- function(state) {
require(XML)
require(plyr)
# get the raw state data (parsed HTML tree, see getStateData.R)
raw <- getStateData(state)
xpath.expression <- "//div[@class=\"exit_poll\"]"
ep_nodes <- getNodeSet(doc = raw, path = xpath.expression)
# the first two nodes contain the "About Exit Polls" examples
ep_nodes <- ep_nodes[c(-1, -2)]
# convert elements to data.frames
# there will be some warnings about "NAs introduced by coercion"
# those can safely be ignored
EP <- lapply(ep_nodes, parseEpNode)
# drop the NULL frames.
# BUG FIX: the original used EP[-grep("NULL", EP)]. When NO element is
# NULL, grep() returns integer(0), and EP[-integer(0)] selects NOTHING,
# silently discarding every poll. It also relied on a fragile character
# coercion of the whole list. Test each element directly instead.
EP <- EP[!vapply(EP, is.null, logical(1))]
# in some cases the same question has been asked with different
# breakdowns. to distiguish them we'll add a question number
for (i in seq_along(EP)) {
EP[[i]]$QNo <- i
}
EP <- rbind.fill(EP)
EP$state <- state
return(EP)
}
## harald, 2012-11-11
## function to read all data for a single state from CNN
##
## Downloads the (JavaScript-rendered) CNN results page for `state` by
## shelling out to the dl_CNN_EP.py PyQt helper, then parses the dumped
## temp.html into an XML document for use with getNodeSet().
## Args: state - two-letter abbreviation, or "US" for the national race.
## Returns: an internal-nodes HTML tree (XML package).
getStateData <- function(state) {
require(XML)
# compose the URI
base_url <- "http://us.cnn.com/election/2012/"
exit_polls_url <- "results/state/"
state <- toupper(state)
URI <- paste0(base_url, exit_polls_url, state, "/president")
# the nationwide race lives under a different URL path
if (state == "US") {
URI <- "http://us.cnn.com/election/2012/results/race/president"
}
# call python script that downloads the polls from CNN
# R can't do this, because it's dynamically generated html
# print a message
print(paste("processing data for", state))
syscall <- paste("python dl_CNN_EP.py", URI)
# NOTE(review): the exit status of the helper is ignored; a failed
# download would silently re-parse a stale temp.html.
system(syscall)
# the helper always writes to the fixed name temp.html —
# NOTE(review): not safe if two downloads ever run concurrently.
webpage <- readLines("temp.html")
# NOTE(review): htmlParse() receives a character *vector* of lines with
# asText = TRUE; if parsing ever truncates, collapse with
# paste(webpage, collapse = "\n") first — verify against the XML docs.
pagetree <- htmlParse(webpage
, asText = TRUE
, error=function(...){}
, useInternalNodes = TRUE
)
return(pagetree)
}
## harald, 2012-11-11
## function to parse a single node
##
## Converts one <div class="exit_poll"> node into a data.frame: one row
## per answer, one (numeric) column per candidate/breakdown, plus the
## question text. Returns NULL for nodes that don't match the expected
## layout (the empty else branch below) or whose table is empty.
parseEpNode <- function(node) {
require(XML)
require(plyr)
# Heuristic layout check: the 2nd child should hold the question text
# (contains newlines) and the 4th child's 3rd div's ul should carry an
# attribute starting with "TQ".
# NOTE(review): purely position-based and tied to CNN's 2012 markup —
# any template change breaks this silently.
if (grepl("\n", xmlValue(xmlChildren(node)[[2]]))
&& grepl("TQ", xmlAttrs(node[4]$div[3]$ul)[1])
) {
question <- xmlValue(xmlChildren(node)[[2]])
# strip the embedded "Close" link text and all remaining newlines
question <- gsub("\n", "", gsub("\nClose\n", "", question))
# each child of child 4 is one table row; split its text into cells
# on newlines ([-1] drops the first, non-data child)
foo <-
lapply(
xmlChildren(
xmlChildren(node)[[4]]
),
function(x) unlist(strsplit(xmlValue(x),"\n"))
)[-1]
# add a prefix to make the following lapply possible
# (gives the header row the same "name:value" shape as the data rows)
foo[[1]][1] <- paste0("answer:", foo[[1]][1])
foo <- lapply(foo, function(x) unlist(strsplit(x, ":")))
# there's at least one state, Massachussetts, where CNN shows
# an empty exit poll table, so we better check:
if (length(unlist(foo[-1])) == 0) {
return(NULL)
}
# rows -> matrix -> data.frame; the first row still holds the headers
foo.df <-
as.data.frame(
do.call(rbind, foo),
stringsAsFactors = FALSE
)
# there's a problem with residual blanks on some CNN pages
names(foo.df) <- gsub(" ", "", foo.df[1, ])
foo.df <- foo.df[-1, ]
# coerce every column but the answer text to numeric, dropping the
# "%", "(" and ")" decorations first
for (i in 2:length(foo.df)) {
# the warnings produced are legitimate coercions from
# "N/A" to NA. so we can safely suppress them here.
foo.df[, i] <- suppressWarnings(
as.numeric(gsub("[%]|[(]|[)]", "", foo.df[, i]))
)
}
foo.df$question <- question
return(foo.df)
} else {
# layout mismatch: fall through, implicitly returning NULL
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment