# Scrape property tax PINs from the Cook County, IL website.
from lxml import html | |
import requests | |
import sqlite3 | |
import sys | |
# Scrape property listings from the Cook County, IL property-info site and
# store them in the "properties" table of the local SQLite file
# cook_county.db.
#
# The address search is queried once for every combination of a city
# fragment, a leading house-number digit and a leading street-name letter.
# Every link text in the results panel is parsed into one table row.  The
# table is emptied at the start of each run, and "INSERT OR IGNORE" together
# with the UNIQUE constraint on pin drops listings returned by more than one
# query.

CREATE_TABLE_SQL = "CREATE TABLE IF NOT EXISTS properties(pin TEXT(18) UNIQUE, number TEXT(63), street TEXT(255), unit TEXT(255), city TEXT(255), zip TEXT(5))"
INSERT_SQL = "INSERT OR IGNORE INTO properties VALUES (:pin, :hnum, :street, :unit, :city, :zipCode)"
RESULTS_XPATH = '//*[@id="ctl00_PlaceHolderMain_ctl00_resultsPanel"]/a/text()'

CITY_QUERIES = ("e", "u", "p", "d", "l", "r", "chicago")
NUMBER_QUERIES = "0123456789"
STREET_QUERIES = "abcdefghijklmnopqrstuvwxyz"

# Seconds to wait on the server before giving up on a single query, so one
# stalled connection cannot hang the whole scrape.
REQUEST_TIMEOUT = 30


def parse_property(raw):
    """Split one search-result string into its database fields.

    The layout consumed here is
    ``<number> <street>[ Unit <unit>] (<city>, <zip>) - <pin>``.

    Args:
        raw: Text of one result link.

    Returns:
        A dict with the keys ``pin``, ``hnum``, ``street``, ``unit``,
        ``city`` and ``zipCode`` (``unit`` is None when the address has no
        "Unit" part), or None when ``raw`` does not follow the layout.
    """
    # The street itself may contain parentheses, so only the LAST "(" opens
    # the "(city, zip)" group.
    address_part, paren, tail = raw.rpartition('(')
    if not paren:
        return None

    hnum, space, street_part = address_part.strip().partition(' ')
    if not space:
        return None
    street, unit_marker, unit = street_part.partition("Unit")

    city_zip, close_paren, after_paren = tail.partition(')')
    if not close_paren:
        return None
    city, comma, zip_code = city_zip.partition(', ')
    if not comma:
        return None

    # The PIN itself contains dashes; it is everything after the FIRST one.
    _, dash, pin = after_paren.partition('-')
    pin = pin.strip()
    if not dash or not pin:
        return None

    return {
        "pin": pin,
        "hnum": hnum.strip(),
        "street": street.strip(),
        "unit": unit.strip() if unit_marker else None,
        "city": city.strip(),
        "zipCode": zip_code.strip(),
    }


def main():
    """Run every search query and store the parsed listings in SQLite."""
    con = sqlite3.connect("cook_county.db")
    try:
        # Start from an empty table on every run.
        with con:
            con.execute(CREATE_TABLE_SQL)
            con.execute("DELETE FROM properties")

        for qCity in CITY_QUERIES:
            for qNum in NUMBER_QUERIES:
                for qStreet in STREET_QUERIES:
                    sys.stdout.write("Starting %s %s %s . . ." % (qCity, qNum, qStreet))
                    sys.stdout.flush()
                    url = 'http://www.cookcountypropertyinfo.com/Pages/Address-Results.aspx?hnum=' + qNum + '&sname=' + qStreet + '%20&city=' + qCity + '&zip=&unit=&dir='
                    try:
                        page = requests.get(url, timeout=REQUEST_TIMEOUT)
                        page.raise_for_status()
                    except requests.RequestException as err:
                        # Report the failed query and keep going: rows from
                        # earlier queries are already committed.
                        sys.stdout.write("   request failed: %s\n" % err)
                        continue

                    dom = html.fromstring(page.text)
                    propertiesRaw = dom.xpath(RESULTS_XPATH)
                    records = [rec for rec in map(parse_property, propertiesRaw) if rec is not None]

                    # One transaction per results page: committed on success,
                    # rolled back if an insert raises.
                    with con:
                        con.executemany(INSERT_SQL, records)

                    skipped = len(propertiesRaw) - len(records)
                    sys.stdout.write("   %d \trecords parsed!" % len(records))
                    if skipped:
                        sys.stdout.write(" (%d malformed entries skipped)" % skipped)
                    sys.stdout.write("\n")
    finally:
        con.close()
    sys.stdout.write("SCRAPING COMPLETE!\n")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment