Skip to content

Instantly share code, notes, and snippets.

@jseabold
Created October 2, 2012 21:13
Show Gist options
  • Save jseabold/3823349 to your computer and use it in GitHub Desktop.
Save jseabold/3823349 to your computer and use it in GitHub Desktop.
Download presidential poll data into a DataFrame
"""
Download all the Presidential Poll Data from Real Clear Politics and
put it in a DataFrame then put a bird on it.
Usage:
data2012_state_recent = download_latest_state_polls()
data2004, data2008, data2012 = download_national_polls()
data2012_state = download_state_polls()
Data will probably need some ex-post cleaning to be useful.
"""
import urllib2
import re
from urllib import urlencode
from time import sleep
from lxml import html
import pandas
GLOBAL_NUMBER_OF_URL_TRIES = 0
states = {
'AK': 'Alaska',
'AL': 'Alabama',
'AR': 'Arkansas',
'AS': 'American Samoa',
'AZ': 'Arizona',
'CA': 'California',
'CO': 'Colorado',
'CT': 'Connecticut',
'DC': 'District of Columbia',
'DE': 'Delaware',
'FL': 'Florida',
'GA': 'Georgia',
'GU': 'Guam',
'HI': 'Hawaii',
'IA': 'Iowa',
'ID': 'Idaho',
'IL': 'Illinois',
'IN': 'Indiana',
'KS': 'Kansas',
'KY': 'Kentucky',
'LA': 'Louisiana',
'MA': 'Massachusetts',
'MD': 'Maryland',
'ME': 'Maine',
'MI': 'Michigan',
'MN': 'Minnesota',
'MO': 'Missouri',
'MP': 'Northern Mariana Islands',
'MS': 'Mississippi',
'MT': 'Montana',
'NA': 'National',
'NC': 'North Carolina',
'ND': 'North Dakota',
'NE': 'Nebraska',
'NH': 'New Hampshire',
'NJ': 'New Jersey',
'NM': 'New Mexico',
'NV': 'Nevada',
'NY': 'New York',
'OH': 'Ohio',
'OK': 'Oklahoma',
'OR': 'Oregon',
'PA': 'Pennsylvania',
'PR': 'Puerto Rico',
'RI': 'Rhode Island',
'SC': 'South Carolina',
'SD': 'South Dakota',
'TN': 'Tennessee',
'TX': 'Texas',
'UT': 'Utah',
'VA': 'Virginia',
'VI': 'Virgin Islands',
'VT': 'Vermont',
'WA': 'Washington',
'WI': 'Wisconsin',
'WV': 'West Virginia',
'WY': 'Wyoming'
}
def get_xml_from_url(url, data={}):
data = urlencode(data)
request = urllib2.Request(url, data=data)
opener = urllib2.build_opener()
try:
response = opener.open(request)
except urllib2.HTTPError, error:
return error
xml_tree = html.parse(response)
return xml_tree
def split_row(row):
return [i.text_content() for i in row.getchildren()]
def get_table(table):
table_data = []
for i, child in enumerate(table.iterchildren()):
if not (i % 2):
date = child.text_content()
else: # iterate through the sub-table
if not table_data: # then get the headers
header = split_row(child[0])
header[0] = re.sub("\xa0\xa0\(.*\)", "", header[0]).strip()
table_data.append(header + ["Date"])
for row in child.getchildren()[1:]:
row = split_row(row)
table_data.append(row + [date])
return pandas.DataFrame(table_data[1:], columns=table_data[0])
def download_latest_state_polls():
url = "http://www.realclearpolitics.com/epolls/latest_polls/president/"
table_data = []
#table should go date, table, date, table, until end, then follow next
#linke
xml_tree = get_xml_from_url(url)
i = 1
# right now there are 3 table classes to get. don't know if this is
# always the case, so code defensively
table = xml_tree.xpath('//div[@id="table-%d"]' % i)
table_frame = get_table(table[0])
while table:
i += 1
table = xml_tree.xpath('//div[@id="table-%d"]' % i)
if table:
table_frame = pandas.concat((table_frame, get_table(table[0])))
return table_frame
def politely_open_url(url):
"""
This 403s a bit while doing the server load balancing, so politely knock
again.
"""
global GLOBAL_NUMBER_OF_URL_TRIES
try:
response = urllib2.urlopen(url)
GLOBAL_NUMBER_OF_URL_TRIES = 0
return response
except urllib2.HTTPError, error:
if GLOBAL_NUMBER_OF_URL_TRIES > 100:
print "More than 100 tries of %s failed" % url
return error
elif error.msg == 'Forbidden':
GLOBAL_NUMBER_OF_URL_TRIES += 1
sleep(1)
return politely_open_url(url)
else:
return error
def download_state_polls():
"""
Use this to download all state polls. Unforunately, there's no years on a lot of the
data. But you should be able to cross-reference with latest.
"""
# need to walk this directory for the states plus DC
url = "http://www.realclearpolitics.com/epolls/2012/president/"
# inside each directory get the one called state_romney_vs_obama*.html
state_xx = [key.lower() for key in states.keys()]
table_data = []
for state in states:
url_link = url + state.lower() + '/'
response = politely_open_url(url_link)
xml_tree = html.parse(response)
state_links = xml_tree.findall('//a')
for link in state_links:
link.make_links_absolute()
link_url = link.get('href')
# i don't think we have to worry about there being more than one
if re.match("http://.+_romney_vs_obama-.+\.html", link_url):
print "Trying to download %s" % link_url
xml_tree = get_xml_from_url(link_url)
try:
# some states like Alaska and Alabama don't have any polls?
table = xml_tree.xpath(
'//div[@id="polling-data-full"]//table')[0]
print "Downloaded %s" % link_url
except:
continue
# this should work for states too
state_table = get_national_tables(table)
state_table["State"] = state
table_data.append(state_table)
# chill out for a second. server seems finnicky
sleep(1)
table_2012 = pandas.concat(table_data)
return table_2012
def get_national_tables(table):
table_data = []
for row in table.iterchildren():
row = split_row(row)
table_data.append(row)
return pandas.DataFrame(table_data[1:], columns=table_data[0])
def download_national_polls():
#NOTE: the 2012 data is likely to update daily
# you can browse around from here, sometimes I get forbidden, sometimes not
# seems to depend which page I come from AFIACT. Refreshing also works.
# I think it's bouncing me around to different servers.
#http://www.realclearpolitics.com/epolls/
# bush vs gore
# can't find anything, just an electoral college map
# bush vs kerry
url = "http://www.realclearpolitics.com/epolls/2004/president/us/general_election_bush_vs_kerry-939.html"
xml_tree = get_xml_from_url(url)
table = xml_tree.xpath('//div[@id="polling-data-full"]/table')[0]
table_2004 = get_national_table(table)
# mccain vs obama
url = "http://www.realclearpolitics.com/epolls/2008/president/us/general_election_mccain_vs_obama-225.html"
xml_tree = get_xml_from_url(url)
table = xml_tree.xpath('//div[@id="polling-data-full"]/table')[0]
table_2008 = get_national_table(table)
url = "http://www.realclearpolitics.com/epolls/2012/president/us/general_election_romney_vs_obama-1171.html"
xml_tree = get_xml_from_url(url)
table = xml_tree.xpath('//div[@id="polling-data-full"]/table')[0]
table_2012 = get_national_table(table)
return table_2004, table_2008, table_2012
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment