Skip to content

Instantly share code, notes, and snippets.

@jwass
Created August 25, 2013 18:08
Show Gist options
  • Save jwass/6335354 to your computer and use it in GitHub Desktop.
Save jwass/6335354 to your computer and use it in GitHub Desktop.
Scrape MA state senate/representative contact info and merge it with the legislator's property info.
import difflib
import json
import re
import bs4
import pandas as pd
import requests
# Captures whatever sits between the first '(' and the last ')' of a string
# (the '.*' is greedy).  Written as a raw string so that '\(' and '\)' reach
# the regex engine as escapes instead of being invalid Python string escapes.
regex = re.compile(r'\((.*)\)')
def parse_contact_info(chamber):
    """Scrape the legislator contact table for one chamber.

    Args:
        chamber: 'senate' selects the Senate listing page; any other value
            selects the House listing page.

    Returns:
        A dict keyed by the lower-cased parenthesised part of each row's
        'Name' cell.  Each value is a dict with the keys 'Phone Number' and
        'Email Address' taken from the same row.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the downloaded page contains no <table> element.
    """
    # Function-scope import: this is the only block that needs io.
    import io

    if chamber == 'senate':
        url = 'https://malegislature.gov/People/Senate'
    else:
        url = 'https://malegislature.gov/People/House'

    # A timeout keeps the script from hanging forever on a stalled
    # connection; raise_for_status stops us from parsing an error page.
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    # pd.read_html on the raw response did not find the table (per the
    # original author), so isolate the first <table> with bs4 and hand only
    # that markup to pandas.  The parser is named explicitly so the result
    # does not depend on which optional parsers happen to be installed.
    soup = bs4.BeautifulSoup(r.content, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError('No table found at {}'.format(url))

    # read_html no longer accepts infer_types, and passing literal HTML is
    # deprecated in favour of a file-like object.
    df = pd.read_html(io.StringIO(table.prettify()))[0]

    # Index the rows by the lower-cased text found in parentheses in 'Name'.
    keys = df['Name'].str.findall(regex).str[0].str.lower()
    df = df.set_index(keys)
    # Rows whose name has no parenthesised part get a NaN key, which cannot
    # be fuzzy-matched as a string later on; drop them.
    df = df[pd.notnull(df.index)]

    df = df[['Phone Number', 'Email Address']]
    return df.T.to_dict()
def run_chamber(chamber):
    """Merge scraped phone/email data into one chamber's property file.

    Loads 'properties/senate.json' (for 'senate') or 'properties/house.json'
    (for anything else), fuzzy-matches each entry's legislator name against
    the names returned by parse_contact_info, stores the matched record's
    phone number and e-mail address under 'PHONE' and 'EMAIL', and rewrites
    the file in place.  Entries named 'OPEN' and entries without a close
    enough match are left untouched; the latter are reported on stdout.

    Args:
        chamber: 'senate' or any other value (treated as the House).
    """
    contact = parse_contact_info(chamber)
    if chamber == 'senate':
        name_field = 'SENATOR'
        filename = 'properties/senate.json'
    else:
        name_field = 'REP'
        filename = 'properties/house.json'

    with open(filename) as f:
        props = json.load(f)

    # difflib wants a sequence of candidate strings; build it once.
    candidates = list(contact)

    # Only the values are needed; .values() works on Python 2 and 3 alike
    # (the former .iteritems() exists on Python 2 only).
    for p in props.values():
        name = p[name_field]
        if name == 'OPEN':
            continue
        name = name.replace(' (D)', '')
        name = name.replace(' (R)', '')
        name = name.lower()

        # Use difflib to find the 'close' name matches. This helps greatly in
        # places where there are middle initials, apostrophes, unicode
        # characters and other stuff that would screw up an exact match
        m = difflib.get_close_matches(name, candidates, n=1, cutoff=0.63)
        if not m:
            print('No match for {}'.format(name))
            continue

        record = contact[m[0]]
        p['PHONE'] = record['Phone Number']
        p['EMAIL'] = record['Email Address']

    # Serialise before opening the file for writing: open(..., 'w')
    # truncates immediately, so a serialisation error after that point
    # would leave the property file empty.
    serialized = json.dumps(props, indent=4, separators=(',', ': '))
    with open(filename, 'w') as f:
        f.write(serialized)
def main():
    """Run the contact merge for the Senate first, then for the House."""
    for chamber in ('senate', 'house'):
        run_chamber(chamber)


# Only run the merge when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment