Skip to content

Instantly share code, notes, and snippets.

@Rharald
Created November 17, 2012 06:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Rharald/4093843 to your computer and use it in GitHub Desktop.
Save Rharald/4093843 to your computer and use it in GitHub Desktop.
EP2012
# harald 2012-11-11
# heavily based on http://www.python-forum.de/viewtopic.php?f=3&t=20308
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
# Target URL, taken from the command line (argv[1]).
# NOTE(review): no argument check — raises IndexError if the script is
# invoked without a URL.
URI = sys.argv[1]
class P_MainWindow(QWebView):
def __init__(self, url, parent = None):
QWebView.__init__(self, parent)
self.load(QUrl(url))
self.connect(self, SIGNAL("loadFinished(bool)"),
self.print_html)
print "OK!"
def print_html(self):
x = open("temp.html", "w")
x.write(self.page().mainFrame().toHtml())
print "OK"
app.quit()
# Qt event loop: create the application, kick off the page load, and
# block in exec_() until print_html() calls app.quit().
app = QApplication(sys.argv)
w = P_MainWindow(URI)
sys.exit(app.exec_())
## harald, 2012-11-11
# this script downloads all exit polls for the U.S. presidential
# election on 2012-11-06 from CNN and saves the resulting
# data.frame to file
# for later puropses it can be loaded by
# load(file="PresExitPolls2012.Rdata")
library(XML)
library(plyr)
source("getStateData.R")
source("parseEpNode.R")
source("getExitPolls.R")
# In 2012 exit polls weren't conducted in all states.
# "Here is a list of the states that will be excluded from coverage: ..."
# (source: Washington Post, "Networks, AP cancel exit polls in 19 states")
# a bit of editing gives:
nonep.states <- paste(
  "Alaska, Arkansas, Delaware, District of Columbia,",
  "Georgia, Hawaii, Idaho, Kentucky, Louisiana,",
  "Nebraska, North Dakota, Oklahoma, Rhode Island,",
  "South Carolina, South Dakota, Tennessee, Texas,",
  "Utah, West Virginia, Wyoming"
)
excluded <- strsplit(nonep.states, ", ")[[1]]
# CNN's URLs use two-letter codes, so translate full names to
# abbreviations via the built-in state.name/state.abb tables
ep.states <- as.list(state.abb[!(state.name %in% excluded)])
# "US" requests the nationwide poll
ep.states <- c(ep.states, "US")
# Split/apply: one download + parse per state...
all_exit_polls <- lapply(ep.states, getExitPolls)
# ...combine into a single data.frame...
EP <- rbind.fill(all_exit_polls)
# ...and persist; reload later with load(file = "PresExitPolls2012.Rdata")
save(EP, file = "PresExitPolls2012.Rdata")
## harald, 2012-11-11
# a function to retrieve the exit polls for one state:
#
# Args:
#   state: two-letter state abbreviation, or "US" for the national race.
# Returns:
#   A data.frame with one row per exit-poll answer row, a question
#   number (QNo) distinguishing repeated questions, and the state.
getExitPolls <- function(state) {
require(XML)
require(plyr)
# get the raw state data (parsed HTML tree, see getStateData.R)
raw <- getStateData(state)
xpath.expression <- "//div[@class=\"exit_poll\"]"
ep_nodes <- getNodeSet(doc = raw, path = xpath.expression)
# the first two nodes contain the "About Exit Polls" examples
ep_nodes <- ep_nodes[c(-1, -2)]
# convert elements to data.frames
# there will be some warnings about "NAs introduced by coercion"
# those can safely be ignored
EP <- lapply(ep_nodes, parseEpNode)
# drop the NULL frames.
# BUG FIX: the original used EP[-grep("NULL", EP)]. When NO element is
# NULL, grep() returns integer(0), and EP[-integer(0)] selects NOTHING,
# silently discarding every poll. It also relied on a fragile character
# coercion of the whole list. Test each element directly instead.
EP <- EP[!vapply(EP, is.null, logical(1))]
# in some cases the same question has been asked with different
# breakdowns. to distiguish them we'll add a question number
for (i in seq_along(EP)) {
EP[[i]]$QNo <- i
}
EP <- rbind.fill(EP)
EP$state <- state
return(EP)
}
## harald, 2012-11-11
## function to read all data for a single state from CNN
##
## Downloads the (JavaScript-rendered) CNN results page for `state` by
## shelling out to the dl_CNN_EP.py PyQt helper, then parses the dumped
## temp.html into an XML document for use with getNodeSet().
## Args: state - two-letter abbreviation, or "US" for the national race.
## Returns: an internal-nodes HTML tree (XML package).
getStateData <- function(state) {
require(XML)
# compose the URI
base_url <- "http://us.cnn.com/election/2012/"
exit_polls_url <- "results/state/"
state <- toupper(state)
URI <- paste0(base_url, exit_polls_url, state, "/president")
# the nationwide race lives under a different URL path
if (state == "US") {
URI <- "http://us.cnn.com/election/2012/results/race/president"
}
# call python script that downloads the polls from CNN
# R can't do this, because it's dynamically generated html
# print a message
print(paste("processing data for", state))
syscall <- paste("python dl_CNN_EP.py", URI)
# NOTE(review): the exit status of the helper is ignored; a failed
# download would silently re-parse a stale temp.html.
system(syscall)
# the helper always writes to the fixed name temp.html —
# NOTE(review): not safe if two downloads ever run concurrently.
webpage <- readLines("temp.html")
# NOTE(review): htmlParse() receives a character *vector* of lines with
# asText = TRUE; if parsing ever truncates, collapse with
# paste(webpage, collapse = "\n") first — verify against the XML docs.
pagetree <- htmlParse(webpage
, asText = TRUE
, error=function(...){}
, useInternalNodes = TRUE
)
return(pagetree)
}
## harald, 2012-11-11
## function to parse a single node
##
## Converts one <div class="exit_poll"> node into a data.frame: one row
## per answer, one (numeric) column per candidate/breakdown, plus the
## question text. Returns NULL for nodes that don't match the expected
## layout (the empty else branch below) or whose table is empty.
parseEpNode <- function(node) {
require(XML)
require(plyr)
# Heuristic layout check: the 2nd child should hold the question text
# (contains newlines) and the 4th child's 3rd div's ul should carry an
# attribute starting with "TQ".
# NOTE(review): purely position-based and tied to CNN's 2012 markup —
# any template change breaks this silently.
if (grepl("\n", xmlValue(xmlChildren(node)[[2]]))
&& grepl("TQ", xmlAttrs(node[4]$div[3]$ul)[1])
) {
question <- xmlValue(xmlChildren(node)[[2]])
# strip the embedded "Close" link text and all remaining newlines
question <- gsub("\n", "", gsub("\nClose\n", "", question))
# each child of child 4 is one table row; split its text into cells
# on newlines ([-1] drops the first, non-data child)
foo <-
lapply(
xmlChildren(
xmlChildren(node)[[4]]
),
function(x) unlist(strsplit(xmlValue(x),"\n"))
)[-1]
# add a prefix to make the following lapply possible
# (gives the header row the same "name:value" shape as the data rows)
foo[[1]][1] <- paste0("answer:", foo[[1]][1])
foo <- lapply(foo, function(x) unlist(strsplit(x, ":")))
# there's at least one state, Massachussetts, where CNN shows
# an empty exit poll table, so we better check:
if (length(unlist(foo[-1])) == 0) {
return(NULL)
}
# rows -> matrix -> data.frame; the first row still holds the headers
foo.df <-
as.data.frame(
do.call(rbind, foo),
stringsAsFactors = FALSE
)
# there's a problem with residual blanks on some CNN pages
names(foo.df) <- gsub(" ", "", foo.df[1, ])
foo.df <- foo.df[-1, ]
# coerce every column but the answer text to numeric, dropping the
# "%", "(" and ")" decorations first
for (i in 2:length(foo.df)) {
# the warnings produced are legitimate coercions from
# "N/A" to NA. so we can safely suppress them here.
foo.df[, i] <- suppressWarnings(
as.numeric(gsub("[%]|[(]|[)]", "", foo.df[, i]))
)
}
foo.df$question <- question
return(foo.df)
} else {
# layout mismatch: fall through, implicitly returning NULL
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment