Python scraper to pull Wisconsin state senator and state representative district contact information and biographies into a text file or csv.
Last active
September 29, 2015 03:57
-
-
Save chrislkeller/1543032 to your computer and use it in GitHub Desktop.
Scrape Wisconsin state representative bios
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrape Wisconsin State Assembly legislator bio pages into output.txt.

For every district from 1 through ``endpoint`` the script downloads the
district's bio page, selects the ``div.indent span`` elements, and both
prints and saves the stripped text of each element (one record per line).
"""
import itertools
import requests
import lxml
from lxml import html
from django.utils.encoding import smart_str, smart_unicode

# Last district number to request; the loop covers 1 through endpoint.
endpoint = 99

# Bio page URL; the district number is substituted for each request.
BIO_URL = ('http://legis.wisconsin.gov/w3asp/contact/legislatorpages.aspx'
           '?house=Assembly&district={0}&display=bio')

# Seconds to wait on the server before giving up, so that one stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 30

# The `with` block guarantees the output file is flushed and closed even if
# a request or the HTML parse raises part-way through the run. The file is
# opened in binary mode because the records are written as UTF-8 bytes.
with open('output.txt', 'wb') as output:
    for district in range(1, endpoint + 1):
        # Fetch the bio page for this district.
        r = requests.get(BIO_URL.format(district), timeout=REQUEST_TIMEOUT)
        # Parse the raw response body into an element tree.
        tree = lxml.html.fromstring(r.content)
        # Every <span> inside <div class="indent"> is treated as one record.
        for el in tree.cssselect("div.indent span"):
            # Collapse the element to its text, trim it, and encode it once
            # so the console and the file receive the same UTF-8 bytes.
            data = el.text_content().strip().encode('utf-8')
            # Echo the record so progress is visible while scraping.
            print(data)
            # Terminate each record with a newline; the text was stripped
            # above, so without it consecutive records would run together.
            output.write(data + b'\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment