Skip to content

Instantly share code, notes, and snippets.

@dannguyen
Created June 17, 2015 22:08
Show Gist options
  • Save dannguyen/483cf7cca8dfc0287ffb to your computer and use it in GitHub Desktop.
Save dannguyen/483cf7cca8dfc0287ffb to your computer and use it in GitHub Desktop.
Sample scrape of the Senate HTML roll call page, just for funsies, using XPath.
# The number of roll call votes that were rejected by a margin of less than 5 votes,
# in the first session of the U.S. Senate in the 114th Congress
from lxml import html
import requests
import re
# Which vote menu page to scrape: Congress number and session within it.
congress_num = 114
session_num = 1
url = ('http://www.senate.gov/legislative/LIS/roll_call_lists/vote_menu_%s_%s.htm'
       % (congress_num, session_num))

# Fetch the page. The timeout keeps the script from hanging indefinitely on an
# unresponsive server, and raise_for_status() prevents an HTTP error page from
# being parsed as if it were the vote table (which would silently print 0).
response = requests.get(url, timeout=30)
response.raise_for_status()
doc = html.fromstring(response.text)

# Unnecessarily convoluted XPath statement, written this way purely as XPath
# practice: find all tr elements that have a 2nd td child whose text contains
# "Rejected".
# http://stackoverflow.com/questions/1457638/xpath-get-nodes-where-child-node-contains-an-attribute
xstr = "//*[@id='contentArea']//table/tr[td[2][contains(text(), 'Rejected')]]"

# Tally of the form "<digits>-<digits>", read below as yeas-nays. Raw string so
# that \d is a regex class rather than an invalid string escape; compiled once
# because it is reused for every row.
tally_re = re.compile(r'(\d+)-(\d+)')

xcount = 0
for r in doc.xpath(xstr):
    # The tally is looked up in the text of the row's first td child.
    tally = tally_re.search(r.find('td').text_content())
    if tally is None:
        # No "N-N" tally in this row; skip it instead of crashing on
        # None.groups().
        continue
    yeas, nays = tally.groups()
    # NOTE(review): this difference is negative whenever yeas exceed nays, so
    # "Rejected" rows where yeas outnumber nays are counted as well. Kept
    # exactly as in the original; confirm it matches the intended meaning of
    # "margin of less than 5 votes".
    if int(nays) - int(yeas) < 5:
        xcount += 1

print(xcount)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment