Created
November 17, 2012 06:34
-
-
Save Rharald/4093843 to your computer and use it in GitHub Desktop.
EP2012
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# harald 2012-11-11 | |
# heavily based on http://www.python-forum.de/viewtopic.php?f=3&t=20308 | |
import sys | |
from PyQt4.QtCore import * | |
from PyQt4.QtGui import * | |
from PyQt4.QtWebKit import * | |
URI = sys.argv[1] | |
class P_MainWindow(QWebView): | |
def __init__(self, url, parent = None): | |
QWebView.__init__(self, parent) | |
self.load(QUrl(url)) | |
self.connect(self, SIGNAL("loadFinished(bool)"), | |
self.print_html) | |
print "OK!" | |
def print_html(self): | |
x = open("temp.html", "w") | |
x.write(self.page().mainFrame().toHtml()) | |
print "OK" | |
app.quit() | |
app = QApplication(sys.argv) | |
w = P_MainWindow(URI) | |
sys.exit(app.exec_()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## harald, 2012-11-11
# Download all exit polls for the U.S. presidential election of
# 2012-11-06 from CNN and save the combined data.frame to file.
# For later purposes it can be loaded by
#   load(file = "PresExitPolls2012.Rdata")
library(XML)
library(plyr)

source("getStateData.R")
source("parseEpNode.R")
source("getExitPolls.R")

# In 2012 exit polls weren't conducted in all states. The networks
# and AP announced the excluded states in advance (source:
# http://www.washingtonpost.com/blogs/the-fix/wp/2012/10/04/
#   networks-ap-cancel-exit-polls-in-19-states/).
# A bit of editing gives:
excluded <- c(
  "Alaska", "Arkansas", "Delaware", "District of Columbia",
  "Georgia", "Hawaii", "Idaho", "Kentucky", "Louisiana",
  "Nebraska", "North Dakota", "Oklahoma", "Rhode Island",
  "South Carolina", "South Dakota", "Tennessee", "Texas",
  "Utah", "West Virginia", "Wyoming"
)

# CNN's URLs use two-letter abbreviations, so translate the full
# names, then add "US" for the nationwide poll.
ep.states <- as.list(state.abb[!(state.name %in% excluded)])
ep.states <- c(ep.states, "US")

# Split-Apply-Combine: the list is the "Split" already, so "Apply"...
all_exit_polls <- lapply(ep.states, getExitPolls)
# ..."Combine"...
EP <- rbind.fill(all_exit_polls)
# ...and save.
save(EP, file = "PresExitPolls2012.Rdata")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## harald, 2012-11-11
# Retrieve and parse all exit polls for one state.
#
# state:  two-letter state abbreviation ("US" for nationwide)
# return: a data.frame with one row per answer line, with QNo
#         (question number within the state) and state columns
#         appended; NULL when no usable poll node was found.
getExitPolls <- function(state) {
  require(XML)
  require(plyr)
  # get the raw state data (downloaded + parsed CNN page)
  raw <- getStateData(state)
  xpath.expression <- "//div[@class=\"exit_poll\"]"
  ep_nodes <- getNodeSet(doc = raw, path = xpath.expression)
  # the first two nodes contain the "About Exit Polls" examples
  ep_nodes <- ep_nodes[c(-1, -2)]
  # convert elements to data.frames
  # there will be some warnings about "NAs introduced by coercion"
  # those can safely be ignored
  EP <- lapply(ep_nodes, parseEpNode)
  # drop the NULL frames.
  # NB: the original `EP[-grep("NULL", EP)]` emptied the WHOLE list
  # when grep() matched nothing (x[-integer(0)] selects zero
  # elements) and also dropped any frame merely containing the
  # string "NULL" -- test for NULL explicitly instead:
  EP <- Filter(Negate(is.null), EP)
  # in some cases the same question has been asked with different
  # breakdowns. to distinguish them we'll add a question number
  for (i in seq_along(EP)) {
    EP[[i]]$QNo <- i
  }
  EP <- rbind.fill(EP)
  EP$state <- state
  return(EP)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## harald, 2012-11-11
## Read all data for a single state from CNN.
#
# state:  two-letter abbreviation (case-insensitive); "US" selects
#         the nationwide presidential race page instead.
# return: a parsed HTML document (internal nodes) for XPath queries.
getStateData <- function(state) {
  require(XML)
  # compose the URI
  base_url <- "http://us.cnn.com/election/2012/"
  exit_polls_url <- "results/state/"
  state <- toupper(state)
  URI <- paste0(base_url, exit_polls_url, state, "/president")
  if (state == "US") {
    URI <- "http://us.cnn.com/election/2012/results/race/president"
  }
  # call python script that downloads the polls from CNN
  # R can't do this, because it's dynamically generated html;
  # the script writes the rendered DOM to temp.html
  # print a message
  print(paste("processing data for", state))
  syscall <- paste("python dl_CNN_EP.py", URI)
  system(syscall)
  # htmlParse(asText = TRUE) expects a single string, but readLines()
  # returns one element per line -- collapse first so the whole
  # document (not just the first line) is parsed.
  webpage <- paste(readLines("temp.html"), collapse = "\n")
  pagetree <- htmlParse(webpage,
                        asText = TRUE,
                        error = function(...) {},
                        useInternalNodes = TRUE)
  return(pagetree)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## harald, 2012-11-11
## function to parse a single node
##
## Converts one `<div class="exit_poll">` node into a data.frame with
## one row per answer and a `question` column. Returns NULL for empty
## poll tables, and NULL invisibly (via the empty else branch) when
## the node does not look like a real exit-poll table.
parseEpNode <- function(node) {
  require(XML)
  require(plyr)
  # Heuristic filter: child 2 must contain multi-line text (the
  # question) and the <ul> reached via node[4]$div[3]$ul must carry a
  # "TQ" attribute. NOTE(review): positional structure inferred from
  # CNN's 2012 markup -- confirm against a saved page before reuse.
  if (grepl("\n", xmlValue(xmlChildren(node)[[2]]))
      && grepl("TQ", xmlAttrs(node[4]$div[3]$ul)[1])
  ) {
    # question text, stripped of newlines and the "Close" label
    question <- xmlValue(xmlChildren(node)[[2]])
    question <- gsub("\n", "", gsub("\nClose\n", "", question))
    # Child 4 holds the poll table: split each sub-node's text on
    # newlines; the first element is dropped (it is not a data row).
    foo <-
      lapply(
        xmlChildren(
          xmlChildren(node)[[4]]
        ),
        function(x) unlist(strsplit(xmlValue(x),"\n"))
      )[-1]
    # add a prefix to make the following lapply possible
    # (gives the header line the same "label:value" shape as the rest)
    foo[[1]][1] <- paste0("answer:", foo[[1]][1])
    # split every "label:value" pair into its parts
    foo <- lapply(foo, function(x) unlist(strsplit(x, ":")))
    # there's at least one state, Massachussetts, where CNN shows
    # an empty exit poll table, so we better check:
    if (length(unlist(foo[-1])) == 0) {
      return(NULL)
    }
    # stack the rows into a character data.frame
    foo.df <-
      as.data.frame(
        do.call(rbind, foo),
        stringsAsFactors = FALSE
      )
    # there's a problem with residual blanks on some CNN pages:
    # row 1 holds the column labels -- strip blanks, promote to names,
    # then drop that header row from the data
    names(foo.df) <- gsub(" ", "", foo.df[1, ])
    foo.df <- foo.df[-1, ]
    # columns 2..end are percentages like "53%" or "(1,234)" -- strip
    # the decoration and coerce to numeric
    for (i in 2:length(foo.df)) {
      # the warnings produced are legitimate coercions from
      # "N/A" to NA. so we can safely suppress them here.
      foo.df[, i] <- suppressWarnings(
        as.numeric(gsub("[%]|[(]|[)]", "", foo.df[, i]))
      )
    }
    foo.df$question <- question
    return(foo.df)
  } else {
    # not an exit-poll node: fall through, returning NULL invisibly
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment