Created
October 2, 2012 21:13
-
-
Save jseabold/3823349 to your computer and use it in GitHub Desktop.
Download presidential poll data into a DataFrame
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Download all the Presidential Poll Data from Real Clear Politics and
put it in a DataFrame then put a bird on it.

Usage:

    data2012_state_recent = download_latest_state_polls()
    data2004, data2008, data2012 = download_national_polls()
    data2012_state = download_state_polls()

Data will probably need some ex-post cleaning to be useful.
"""
import urllib2 | |
import re | |
from urllib import urlencode | |
from time import sleep | |
from lxml import html | |
import pandas | |
# Running count of consecutive "Forbidden" retries; read and updated by
# politely_open_url().
GLOBAL_NUMBER_OF_URL_TRIES = 0

# Two-letter abbreviation -> full name. download_state_polls() iterates the
# keys (lower-cased) to build one directory URL per entry. Besides the
# states, the mapping also lists DC, the territories and a "NA" (National)
# pseudo-entry.
states = {
    "AK": "Alaska",
    "AL": "Alabama",
    "AR": "Arkansas",
    "AS": "American Samoa",
    "AZ": "Arizona",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DC": "District of Columbia",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "GU": "Guam",
    "HI": "Hawaii",
    "IA": "Iowa",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "MA": "Massachusetts",
    "MD": "Maryland",
    "ME": "Maine",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MO": "Missouri",
    "MP": "Northern Mariana Islands",
    "MS": "Mississippi",
    "MT": "Montana",
    "NA": "National",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "NE": "Nebraska",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NV": "Nevada",
    "NY": "New York",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "PR": "Puerto Rico",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VA": "Virginia",
    "VI": "Virgin Islands",
    "VT": "Vermont",
    "WA": "Washington",
    "WI": "Wisconsin",
    "WV": "West Virginia",
    "WY": "Wyoming",
}
def get_xml_from_url(url, data=None):
    """
    Fetch ``url`` and return the page parsed by ``lxml.html``.

    Parameters
    ----------
    url : str
        Address of the page to download.
    data : dict, optional
        Form fields to send. When given (and non-empty) they are
        url-encoded and sent as the request body; otherwise a plain GET
        is issued.

    Returns
    -------
    The tree returned by ``html.parse`` on success. If the server answers
    with an HTTP error, the ``urllib2.HTTPError`` instance is *returned*
    (not raised), so callers must be prepared for an object without the
    tree API.
    """
    # urllib2 switches to POST whenever ``data`` is not None -- even for the
    # empty string that urlencode({}) produces -- so pass None when there is
    # nothing to send.
    encoded = urlencode(data) if data else None
    request = urllib2.Request(url, data=encoded)
    opener = urllib2.build_opener()
    try:
        response = opener.open(request)
    except urllib2.HTTPError as error:
        return error
    return html.parse(response)
def split_row(row):
    """Return the text content of every direct child of ``row`` as a list."""
    cells = []
    for cell in row.getchildren():
        cells.append(cell.text_content())
    return cells
def get_table(table):
    """
    Convert one "latest polls" container element into a DataFrame.

    The children of ``table`` are read as alternating pairs: children at
    even positions supply a date (their text content) and children at odd
    positions are sub-tables whose first row is a header and whose
    remaining rows are data.

    The column names come from the header row of the first sub-table, with
    a trailing ``"\\xa0\\xa0(...)"`` decoration stripped from the first
    cell and an extra ``"Date"`` column appended. Every data row gets the
    date of the child immediately preceding its sub-table.

    Returns an empty DataFrame when ``table`` contains no sub-table at all.
    """
    header = None
    table_data = []
    for i, child in enumerate(table.iterchildren()):
        if i % 2 == 0:
            # even position: the date that applies to the next sub-table
            date = child.text_content()
            continue
        if header is None:
            # only the first sub-table's header row is used for the columns
            header = split_row(child[0])
            # Same pattern value as before; the backslashes are doubled so
            # the non-raw literal contains no invalid escape sequences.
            header[0] = re.sub("\xa0\xa0\\(.*\\)", "", header[0]).strip()
            header = header + ["Date"]
        # skip each sub-table's own header row
        for row in child.getchildren()[1:]:
            table_data.append(split_row(row) + [date])
    if header is None:
        # nothing to tabulate; indexing a header here used to raise IndexError
        return pandas.DataFrame()
    return pandas.DataFrame(table_data, columns=header)
def download_latest_state_polls():
    """
    Download the "latest polls" page and return its tables as one DataFrame.

    The page is searched for ``<div id="table-1">``, ``<div id="table-2">``,
    ... until the first missing id; each div is converted with
    ``get_table`` and the results are concatenated in page order.

    Returns an empty DataFrame if the page has no ``table-1`` div.
    """
    url = "http://www.realclearpolitics.com/epolls/latest_polls/president/"
    # each table should go date, table, date, table, until the end
    xml_tree = get_xml_from_url(url)
    # The number of table divs is not known up front (3 at the time of
    # writing), so keep going until an id is not found.
    frames = []
    i = 1
    while True:
        table = xml_tree.xpath('//div[@id="table-%d"]' % i)
        if not table:
            break
        frames.append(get_table(table[0]))
        i += 1
    if not frames:
        # pandas.concat refuses an empty sequence
        return pandas.DataFrame()
    return pandas.concat(frames)
def politely_open_url(url):
    """
    Open ``url``, retrying politely while the server answers "Forbidden".

    This 403s a bit while doing the server load balancing, so politely knock
    again: after each "Forbidden" response wait one second and retry, until
    more than 100 retries have accumulated.

    Returns the response object on success. On any other HTTP error, or
    once the retry budget is exhausted, the ``urllib2.HTTPError`` is
    *returned* rather than raised.

    The retry count lives in the module-level ``GLOBAL_NUMBER_OF_URL_TRIES``
    and is reset to 0 whenever this function returns, so each call starts
    with a fresh budget.
    """
    global GLOBAL_NUMBER_OF_URL_TRIES
    while True:
        try:
            response = urllib2.urlopen(url)
        except urllib2.HTTPError as error:
            if GLOBAL_NUMBER_OF_URL_TRIES > 100:
                print("More than 100 tries of %s failed" % url)
            elif error.msg == 'Forbidden':
                GLOBAL_NUMBER_OF_URL_TRIES += 1
                sleep(1)
                continue
            # Giving up (budget exhausted or a non-403 error): reset the
            # counter so the next URL is not denied its retries.
            GLOBAL_NUMBER_OF_URL_TRIES = 0
            return error
        else:
            GLOBAL_NUMBER_OF_URL_TRIES = 0
            return response
def download_state_polls():
    """
    Use this to download all state polls. Unfortunately, there are no years
    on a lot of the data. But you should be able to cross-reference with
    latest.

    For every key in ``states`` the matching directory page is opened and
    scanned for links that look like ``..._romney_vs_obama-*.html``. Each
    such page is downloaded, its first table under
    ``<div id="polling-data-full">`` is converted with
    ``get_national_tables``, and a ``"State"`` column holding the key is
    added.

    Returns the concatenation of all downloaded tables, or an empty
    DataFrame if none could be downloaded.
    """
    # need to walk this directory for the states plus DC
    url = "http://www.realclearpolitics.com/epolls/2012/president/"
    # inside each directory get the one called state_romney_vs_obama*.html
    poll_page = re.compile(r"http://.+_romney_vs_obama-.+\.html")
    table_data = []
    for state in states:
        url_link = url + state.lower() + '/'
        response = politely_open_url(url_link)
        index_tree = html.parse(response)
        for link in index_tree.findall('//a'):
            link.make_links_absolute()
            link_url = link.get('href')
            # Anchors without an href give None, which re would reject with
            # a TypeError. I don't think we have to worry about there being
            # more than one matching link.
            if not link_url or not poll_page.match(link_url):
                continue
            print("Trying to download %s" % link_url)
            poll_tree = get_xml_from_url(link_url)
            try:
                # Some states like Alaska and Alabama don't have any polls
                # (IndexError). get_xml_from_url hands back an HTTPError
                # object on failure, which has no xpath (AttributeError).
                table = poll_tree.xpath(
                    '//div[@id="polling-data-full"]//table')[0]
            except (AttributeError, IndexError):
                continue
            print("Downloaded %s" % link_url)
            # the national-table parser works for state tables too
            state_table = get_national_tables(table)
            state_table["State"] = state
            table_data.append(state_table)
            # chill out for a second. server seems finicky
            sleep(1)
    if not table_data:
        # pandas.concat refuses an empty sequence
        return pandas.DataFrame()
    return pandas.concat(table_data)
def get_national_tables(table):
    """
    Convert a poll ``<table>`` element into a DataFrame.

    Each direct child of ``table`` is treated as a row and split into its
    cell texts with ``split_row``. The first row supplies the column names;
    the remaining rows are the data.

    Returns an empty DataFrame when ``table`` has no rows at all.
    """
    rows = [split_row(row) for row in table.iterchildren()]
    if not rows:
        # no header row to take column names from
        return pandas.DataFrame()
    return pandas.DataFrame(rows[1:], columns=rows[0])
def _download_poll_table(url):
    """Fetch ``url`` and convert its first full polling-data table."""
    xml_tree = get_xml_from_url(url)
    table = xml_tree.xpath('//div[@id="polling-data-full"]/table')[0]
    return get_national_tables(table)


def download_national_polls():
    """
    Download the national general-election poll tables for 2004, 2008, 2012.

    Returns
    -------
    (table_2004, table_2008, table_2012) : tuple of DataFrame
        Bush vs. Kerry, McCain vs. Obama and Romney vs. Obama, in that
        order.

    Raises
    ------
    IndexError
        If a page has no table under ``<div id="polling-data-full">``.
    AttributeError
        If a download fails, because ``get_xml_from_url`` then hands back
        an HTTPError object that has no ``xpath`` method.
    """
    # NOTE: the 2012 data is likely to update daily.
    # You can browse around from http://www.realclearpolitics.com/epolls/ ;
    # sometimes I get forbidden, sometimes not. It seems to depend on which
    # page I come from AFAICT. Refreshing also works. I think it's bouncing
    # me around to different servers.
    #
    # bush vs gore: can't find anything, just an electoral college map
    urls = (
        # bush vs kerry
        "http://www.realclearpolitics.com/epolls/2004/president/us/general_election_bush_vs_kerry-939.html",
        # mccain vs obama
        "http://www.realclearpolitics.com/epolls/2008/president/us/general_election_mccain_vs_obama-225.html",
        # romney vs obama
        "http://www.realclearpolitics.com/epolls/2012/president/us/general_election_romney_vs_obama-1171.html",
    )
    table_2004, table_2008, table_2012 = [
        _download_poll_table(url) for url in urls
    ]
    return table_2004, table_2008, table_2012
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment