Skip to content

Instantly share code, notes, and snippets.

@frenata
Last active April 23, 2017 16:30
Show Gist options
  • Save frenata/4c9c30520b0128f6ed180a5c62a8946b to your computer and use it in GitHub Desktop.
Save frenata/4c9c30520b0128f6ed180a5c62a8946b to your computer and use it in GitHub Desktop.
no metadata!
from pupa.scrape import Person, Scraper
import lxml.html
import re
import pdb
class IAPersonScraper(Scraper):
    """Scrape Iowa legislators from https://www.legis.iowa.gov/legislators/.

    The chamber listing pages provide each member's name, district and
    party; the member's own page provides e-mail, phone and address details.
    """

    jurisdiction = 'ia'

    # Extracts the numeric person id from a legislator link's query string.
    _PERSON_ID_RE = re.compile(r"personID=(\d+)")

    def _latest_session_number(self):
        """Return the ``"number"`` of the last declared legislative session.

        The value is used as the ``ga`` query parameter of photo URLs.

        NOTE(review): this assumes pupa replaces the class-level
        ``jurisdiction`` string with the Jurisdiction instance when the
        scraper is constructed -- confirm against the pupa version in use.

        Raises:
            ValueError: if no legislative sessions can be found.
        """
        sessions = getattr(self.jurisdiction, 'legislative_sessions', None)
        if not sessions:
            raise ValueError(
                "cannot determine the general assembly number: the "
                "jurisdiction declares no legislative_sessions")
        return sessions[-1]['number']

    def scrape(self, chamber=None):
        """Yield a Person for every member of one or both chambers.

        Args:
            chamber: ``'upper'`` or ``'lower'``; when falsy both chambers
                are scraped, upper first.
        """
        # Resolve the session once and hand it down, so that every photo
        # URL of this run refers to the same general assembly.
        session_id = self._latest_session_number()
        chambers = [chamber] if chamber else ['upper', 'lower']
        for chamber_name in chambers:
            yield from self.scrape_chamber(chamber_name, session_id)

    def scrape_chamber(self, chamber, session_id=None):
        """Yield a Person for every legislator linked from a chamber listing.

        Args:
            chamber: ``'lower'`` selects the house listing; any other value
                selects the senate listing.
            session_id: general assembly number for photo URLs; looked up
                from the jurisdiction when omitted.
        """
        if session_id is None:
            session_id = self._latest_session_number()

        url = "https://www.legis.iowa.gov/legislators/"
        url += "house" if chamber == "lower" else "senate"

        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)
        table = page.xpath('//table[@id="sortableTable"]')[0]
        for link in table.xpath(".//a[contains(@href, 'legislator')]"):
            yield from self.scrape_member(chamber, link, session_id)

    def scrape_member(self, chamber, link, session_id=None):
        """Build and yield the Person described by one listing-table link.

        Args:
            chamber: chamber classification passed on as ``primary_org``.
            link: the ``<a>`` element of the legislator inside the listing
                table; district and party are read from the 3rd and 4th
                cells of the row that contains it.
            session_id: general assembly number for the photo URL; looked
                up from the jurisdiction when omitted.

        Raises:
            ValueError: if the link's href carries no ``personID``.
        """
        if session_id is None:
            session_id = self._latest_session_number()

        name = link.text.strip()
        leg_url = link.get('href')
        # Strip the cell text so padding whitespace in the markup cannot
        # defeat the party comparison below.
        district = link.xpath("string(../../td[3])").strip()
        party = link.xpath("string(../../td[4])").strip()

        # The site says "Democrat"; the jurisdiction's party list names it
        # "Democratic".
        if party == 'Democrat':
            party = 'Democratic'

        match = self._PERSON_ID_RE.search(leg_url or '')
        if match is None:
            raise ValueError(
                "no personID found in legislator link: %r" % (leg_url,))
        pid = match.group(1)
        photo_url = ("https://www.legis.iowa.gov/photo"
                     "?action=getPhoto&ga=%s&pid=%s" % (session_id, pid))

        leg = Person(
            name=name,
            primary_org=chamber,
            district=district,
            party=party,
            image=photo_url)

        leg.add_link(leg_url)
        leg.add_source(leg_url)

        leg_page = lxml.html.fromstring(self.get(leg_url).text)
        self.scrape_member_page(leg, leg_page)
        yield leg

    def scrape_member_page(self, leg, leg_page):
        """Add the contact details found on a member's page to ``leg``.

        Args:
            leg: the Person being populated (modified in place).
            leg_page: parsed lxml document of the member's page.

        Raises:
            ValueError: if the page does not contain exactly one table
                inside the ``legisIndent divideVert`` div.
        """
        # Row label on the page -> internal field name.
        office_data = {
            "Legislative E-mail:": "email",
            "Home Phone:": "home_phone",
            "Home Address:": "home_addr",
            "Capitol Phone:": "office_phone",
        }
        # Internal field name -> (contact type, note), in the order the
        # details are added to the person.
        contact_plan = (
            ("home_phone", "voice", "District Office"),
            ("home_addr", "address", "District Office"),
            ("email", "email", "Capitol Office"),
            ("office_phone", "voice", "Capitol Office"),
        )

        # Tuple-unpacking enforces that exactly one table matched.
        table, = leg_page.xpath(
            "//div[@class='legisIndent divideVert']/table"
        )

        metainf = {}
        for row in table.xpath(".//tr"):
            cells = [td.text_content().strip() for td in row.xpath("./td")]
            if len(cells) != 2:
                # Only two-cell "label / value" rows are of interest.
                continue
            label, value = cells
            field = office_data.get(label)
            if field is not None:
                metainf[field] = value

        for field, contact_type, note in contact_plan:
            value = metainf.get(field)
            # An empty cell carries no contact information, so skip it
            # rather than record a blank detail.
            if value:
                leg.add_contact_detail(type=contact_type,
                                       value=value,
                                       note=note)
from pupa.scrape import Jurisdiction, Organization
from .people import IAPersonScraper
from .bills import IABillScraper
class Iowa(Jurisdiction):
    """Jurisdiction definition for the Iowa state government.

    Declares the registered scrapers, the recognised parties, the known
    legislative sessions, and the organizations (legislature plus both
    chambers with their numbered posts).
    """

    division_id = "ocd-division/country:us/state:ia"
    classification = "government"
    name = "Iowa"
    url = "https://www.legis.iowa.gov/"

    # Only the people scraper is registered; the bills scraper is disabled.
    scrapers = {
        'people': IAPersonScraper,
        # 'bills': IABillScraper
    }

    parties = [
        {'name': 'Republican'},
        {'name': 'Democratic'}
    ]

    legislative_sessions = [
        {
            "_scraped_name": "General Assembly: 84",
            "end_date": "2013-01-13",
            "identifier": "2011-2012",
            "number": "84",
            "name": "2011-2012 Regular Session",
            "start_date": "2011-01-10",
        },
        {
            "_scraped_name": "General Assembly: 85",
            "identifier": "2013-2014",
            "number": "85",
            "name": "2013-2014 Regular Session",
        },
        {
            "_scraped_name": "General Assembly: 86",
            "identifier": "2015-2016",
            "number": "86",
            "name": "2015-2016 Regular Session",
        },
        {
            "_scraped_name": "General Assembly: 87",
            "identifier": "2017-2018",
            "number": "87",
            "name": "2017-2018 Regular Session",
        }
    ]

    # Session names listed here are declared as ignored; entries
    # (including repeats) are kept exactly as given.
    ignored_scraped_sessions = [
        "Legislative Assembly: 86",
        "General Assembly: 83",
        "General Assembly: 82",
        "General Assembly: 81",
        "General Assembly: 80",
        "General Assembly: 79",
        "General Assembly: 79",
        "General Assembly: 78",
        "General Assembly: 78",
        "General Assembly: 77",
        "General Assembly: 77",
        "General Assembly: 76"
    ]

    def get_organizations(self):
        """Yield the legislature, then the upper and lower chambers.

        Each chamber is created as a child of the legislature and receives
        one post per seat (50 Senate posts, 100 House posts), labelled
        ``"1"`` upwards, with a division id derived from ``division_id``.
        """
        assembly = Organization(name="Iowa General Assembly",
                                classification="legislature")
        senate = Organization("Senate", classification='upper',
                              parent_id=assembly._id)
        house = Organization("House", classification='lower',
                             parent_id=assembly._id)

        # (chamber, member title, division id template, number of seats)
        seating = (
            (senate, "Senator", '{}/sldu:{}', 50),
            (house, "Representative", '{}/sldl:{}', 100),
        )
        for chamber, title, template, seats in seating:
            for seat in range(1, seats + 1):
                chamber.add_post(
                    label=str(seat),
                    role=title,
                    division_id=template.format(self.division_id, seat))

        yield from (assembly, senate, house)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment