jseabold/get_poll_data.py

## get_poll_data.py
"""
Download all the Presidential Poll Data from Real Clear Politics and
put it in a DataFrame then put a bird on it.

Usage:

    data2012_state_recent = download_latest_state_polls()
    data2004, data2008, data2012 = download_national_polls()
    data2012_state = download_state_polls()

Data will probably need some ex-post cleaning to be useful.
"""

import urllib2
import re
from urllib import urlencode
from time import sleep

from lxml import html
import pandas

GLOBAL_NUMBER_OF_URL_TRIES = 0

states = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

def get_xml_from_url(url, data={}):
    data = urlencode(data)
    request = urllib2.Request(url, data=data)
    opener = urllib2.build_opener()
    try:
        response = opener.open(request)
    except urllib2.HTTPError, error:
        return error
    xml_tree = html.parse(response)
    return xml_tree

def split_row(row):
    return [i.text_content() for i in row.getchildren()]

def get_table(table):
    table_data = []
    for i, child in enumerate(table.iterchildren()):
        if not (i % 2):
            date = child.text_content()
        else: # iterate through the sub-table
            if not table_data: # then get the headers
                header = split_row(child[0])
                header[0] = re.sub("\xa0\xa0\(.*\)", "", header[0]).strip()
                table_data.append(header + ["Date"])
            for row in child.getchildren()[1:]:
                row = split_row(row)
                table_data.append(row + [date])
    return pandas.DataFrame(table_data[1:], columns=table_data[0])


def download_latest_state_polls():
    url = "http://www.realclearpolitics.com/epolls/latest_polls/president/"
    table_data = []
    #table should go date, table, date, table, until end, then follow next
    #linke
    xml_tree = get_xml_from_url(url)
    i = 1
    # right now there are 3 table classes to get. don't know if this is
    # always the case, so code defensively

    table = xml_tree.xpath('//div[@id="table-%d"]' % i)
    table_frame = get_table(table[0])
    while table:
        i += 1
        table = xml_tree.xpath('//div[@id="table-%d"]' % i)
        if table:
            table_frame = pandas.concat((table_frame, get_table(table[0])))

    return table_frame

def politely_open_url(url):
    """
    This 403s a bit while doing the server load balancing, so politely knock
    again.
    """
    global GLOBAL_NUMBER_OF_URL_TRIES
    try:
        response = urllib2.urlopen(url)
        GLOBAL_NUMBER_OF_URL_TRIES = 0
        return response
    except urllib2.HTTPError, error:
        if GLOBAL_NUMBER_OF_URL_TRIES > 100:
            print "More than 100 tries of %s failed" % url
            return error
        elif error.msg == 'Forbidden':
            GLOBAL_NUMBER_OF_URL_TRIES += 1
            sleep(1)
            return politely_open_url(url)
        else:
            return error


def download_state_polls():
    """
    Use this to download all state polls. Unforunately, there's no years on a lot of the
    data. But you should be able to cross-reference with latest.
    """
    # need to walk this directory for the states plus DC
    url = "http://www.realclearpolitics.com/epolls/2012/president/"

    # inside each directory get the one called state_romney_vs_obama*.html
    state_xx = [key.lower() for key in states.keys()]

    table_data = []
    for state in states:
        url_link = url + state.lower() + '/'
        response = politely_open_url(url_link)
        xml_tree = html.parse(response)
        state_links = xml_tree.findall('//a')
        for link in state_links:
            link.make_links_absolute()
            link_url = link.get('href')
            # i don't think we have to worry about there being more than one
            if re.match("http://.+_romney_vs_obama-.+\.html", link_url):
                print "Trying to download %s" % link_url
                xml_tree = get_xml_from_url(link_url)
                try:
                    # some states like Alaska and Alabama don't have any polls?
                    table = xml_tree.xpath(
                            '//div[@id="polling-data-full"]//table')[0]
                    print "Downloaded %s" % link_url
                except:
                    continue
                # this should work for states too
                state_table = get_national_tables(table)
                state_table["State"] = state
                table_data.append(state_table)
        # chill out for a second. server seems finnicky
        sleep(1)
    table_2012 = pandas.concat(table_data)
    return table_2012

def get_national_tables(table):
    table_data = []
    for row in table.iterchildren():
        row = split_row(row)
        table_data.append(row)
    return pandas.DataFrame(table_data[1:], columns=table_data[0])


def download_national_polls():
    #NOTE: the 2012 data is likely to update daily

    # you can browse around from here, sometimes I get forbidden, sometimes not
    # seems to depend which page I come from AFIACT. Refreshing also works.
    # I think it's bouncing me around to different servers.
    #http://www.realclearpolitics.com/epolls/
    # bush vs gore
    # can't find anything, just an electoral college map

    # bush vs kerry
    url = "http://www.realclearpolitics.com/epolls/2004/president/us/general_election_bush_vs_kerry-939.html"
    xml_tree = get_xml_from_url(url)
    table = xml_tree.xpath('//div[@id="polling-data-full"]/table')[0]
    table_2004 = get_national_table(table)
    # mccain vs obama
    url = "http://www.realclearpolitics.com/epolls/2008/president/us/general_election_mccain_vs_obama-225.html"
    xml_tree = get_xml_from_url(url)
    table = xml_tree.xpath('//div[@id="polling-data-full"]/table')[0]
    table_2008 = get_national_table(table)

    url = "http://www.realclearpolitics.com/epolls/2012/president/us/general_election_romney_vs_obama-1171.html"
    xml_tree = get_xml_from_url(url)
    table = xml_tree.xpath('//div[@id="polling-data-full"]/table')[0]
    table_2012 = get_national_table(table)
    return table_2004, table_2008, table_2012
	"""
	Download all the Presidential Poll Data from Real Clear Politics and
	put it in a DataFrame then put a bird on it.

	Usage:

	data2012_state_recent = download_latest_state_polls()
	data2004, data2008, data2012 = download_national_polls()
	data2012_state = download_state_polls()

	Data will probably need some ex-post cleaning to be useful.
	"""

	import urllib2
	import re
	from urllib import urlencode
	from time import sleep

	from lxml import html
	import pandas

	GLOBAL_NUMBER_OF_URL_TRIES = 0

	states = {
	'AK': 'Alaska',
	'AL': 'Alabama',
	'AR': 'Arkansas',
	'AS': 'American Samoa',
	'AZ': 'Arizona',
	'CA': 'California',
	'CO': 'Colorado',
	'CT': 'Connecticut',
	'DC': 'District of Columbia',
	'DE': 'Delaware',
	'FL': 'Florida',
	'GA': 'Georgia',
	'GU': 'Guam',
	'HI': 'Hawaii',
	'IA': 'Iowa',
	'ID': 'Idaho',
	'IL': 'Illinois',
	'IN': 'Indiana',
	'KS': 'Kansas',
	'KY': 'Kentucky',
	'LA': 'Louisiana',
	'MA': 'Massachusetts',
	'MD': 'Maryland',
	'ME': 'Maine',
	'MI': 'Michigan',
	'MN': 'Minnesota',
	'MO': 'Missouri',
	'MP': 'Northern Mariana Islands',
	'MS': 'Mississippi',
	'MT': 'Montana',
	'NA': 'National',
	'NC': 'North Carolina',
	'ND': 'North Dakota',
	'NE': 'Nebraska',
	'NH': 'New Hampshire',
	'NJ': 'New Jersey',
	'NM': 'New Mexico',
	'NV': 'Nevada',
	'NY': 'New York',
	'OH': 'Ohio',
	'OK': 'Oklahoma',
	'OR': 'Oregon',
	'PA': 'Pennsylvania',
	'PR': 'Puerto Rico',
	'RI': 'Rhode Island',
	'SC': 'South Carolina',
	'SD': 'South Dakota',
	'TN': 'Tennessee',
	'TX': 'Texas',
	'UT': 'Utah',
	'VA': 'Virginia',
	'VI': 'Virgin Islands',
	'VT': 'Vermont',
	'WA': 'Washington',
	'WI': 'Wisconsin',
	'WV': 'West Virginia',
	'WY': 'Wyoming'
	}

	def get_xml_from_url(url, data={}):
	data = urlencode(data)
	request = urllib2.Request(url, data=data)
	opener = urllib2.build_opener()
	try:
	response = opener.open(request)
	except urllib2.HTTPError, error:
	return error
	xml_tree = html.parse(response)
	return xml_tree

	def split_row(row):
	return [i.text_content() for i in row.getchildren()]

	def get_table(table):
	table_data = []
	for i, child in enumerate(table.iterchildren()):
	if not (i % 2):
	date = child.text_content()
	else: # iterate through the sub-table
	if not table_data: # then get the headers
	header = split_row(child[0])
	header[0] = re.sub("\xa0\xa0\(.*\)", "", header[0]).strip()
	table_data.append(header + ["Date"])
	for row in child.getchildren()[1:]:
	row = split_row(row)
	table_data.append(row + [date])
	return pandas.DataFrame(table_data[1:], columns=table_data[0])


	def download_latest_state_polls():
	url = "http://www.realclearpolitics.com/epolls/latest_polls/president/"
	table_data = []
	#table should go date, table, date, table, until end, then follow next
	#linke
	xml_tree = get_xml_from_url(url)
	i = 1
	# right now there are 3 table classes to get. don't know if this is
	# always the case, so code defensively

	table = xml_tree.xpath('//div[@id="table-%d"]' % i)
	table_frame = get_table(table[0])
	while table:
	i += 1
	table = xml_tree.xpath('//div[@id="table-%d"]' % i)
	if table:
	table_frame = pandas.concat((table_frame, get_table(table[0])))

	return table_frame

	def politely_open_url(url):
	"""
	This 403s a bit while doing the server load balancing, so politely knock
	again.
	"""
	global GLOBAL_NUMBER_OF_URL_TRIES
	try:
	response = urllib2.urlopen(url)
	GLOBAL_NUMBER_OF_URL_TRIES = 0
	return response
	except urllib2.HTTPError, error:
	if GLOBAL_NUMBER_OF_URL_TRIES > 100:
	print "More than 100 tries of %s failed" % url
	return error
	elif error.msg == 'Forbidden':
	GLOBAL_NUMBER_OF_URL_TRIES += 1
	sleep(1)
	return politely_open_url(url)
	else:
	return error


	def download_state_polls():
	"""
	Use this to download all state polls. Unforunately, there's no years on a lot of the
	data. But you should be able to cross-reference with latest.
	"""
	# need to walk this directory for the states plus DC
	url = "http://www.realclearpolitics.com/epolls/2012/president/"

	# inside each directory get the one called state_romney_vs_obama*.html
	state_xx = [key.lower() for key in states.keys()]

	table_data = []
	for state in states:
	url_link = url + state.lower() + '/'
	response = politely_open_url(url_link)
	xml_tree = html.parse(response)
	state_links = xml_tree.findall('//a')
	for link in state_links:
	link.make_links_absolute()
	link_url = link.get('href')
	# i don't think we have to worry about there being more than one
	if re.match("http://.+_romney_vs_obama-.+\.html", link_url):
	print "Trying to download %s" % link_url
	xml_tree = get_xml_from_url(link_url)
	try:
	# some states like Alaska and Alabama don't have any polls?
	table = xml_tree.xpath(
	'//div[@id="polling-data-full"]//table')[0]
	print "Downloaded %s" % link_url
	except:
	continue
	# this should work for states too
	state_table = get_national_tables(table)
	state_table["State"] = state
	table_data.append(state_table)
	# chill out for a second. server seems finnicky
	sleep(1)
	table_2012 = pandas.concat(table_data)
	return table_2012

	def get_national_tables(table):
	table_data = []
	for row in table.iterchildren():
	row = split_row(row)
	table_data.append(row)
	return pandas.DataFrame(table_data[1:], columns=table_data[0])


	def download_national_polls():
	#NOTE: the 2012 data is likely to update daily

	# you can browse around from here, sometimes I get forbidden, sometimes not
	# seems to depend which page I come from AFIACT. Refreshing also works.
	# I think it's bouncing me around to different servers.
	#http://www.realclearpolitics.com/epolls/
	# bush vs gore
	# can't find anything, just an electoral college map

	# bush vs kerry
	url = "http://www.realclearpolitics.com/epolls/2004/president/us/general_election_bush_vs_kerry-939.html"
	xml_tree = get_xml_from_url(url)
	table = xml_tree.xpath('//div[@id="polling-data-full"]/table')[0]
	table_2004 = get_national_table(table)
	# mccain vs obama
	url = "http://www.realclearpolitics.com/epolls/2008/president/us/general_election_mccain_vs_obama-225.html"
	xml_tree = get_xml_from_url(url)
	table = xml_tree.xpath('//div[@id="polling-data-full"]/table')[0]
	table_2008 = get_national_table(table)

	url = "http://www.realclearpolitics.com/epolls/2012/president/us/general_election_romney_vs_obama-1171.html"
	xml_tree = get_xml_from_url(url)
	table = xml_tree.xpath('//div[@id="polling-data-full"]/table')[0]
	table_2012 = get_national_table(table)
	return table_2004, table_2008, table_2012