Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scrape property tax PINs from the Cook County, IL website.
from lxml import html
import requests
import sqlite3
import sys
# Scraper configuration ------------------------------------------------------

DB_PATH = "cook_county.db"

CREATE_TABLE_SQL = (
    "CREATE TABLE IF NOT EXISTS properties("
    "pin TEXT(18) UNIQUE, number TEXT(63), street TEXT(255), "
    "unit TEXT(255), city TEXT(255), zip TEXT(5))"
)

# Column order matches the table definition above and the tuples produced by
# parse_property(). Duplicate PINs are dropped by the UNIQUE constraint.
INSERT_SQL = "INSERT OR IGNORE INTO properties VALUES (?, ?, ?, ?, ?, ?)"

SEARCH_URL = (
    "http://www.cookcountypropertyinfo.com/Pages/Address-Results.aspx"
    "?hnum={number}&sname={street}%20&city={city}&zip=&unit=&dir="
)
RESULTS_XPATH = '//*[@id="ctl00_PlaceHolderMain_ctl00_resultsPanel"]/a/text()'

# Search fragments combined to enumerate queries against the address search.
CITY_QUERIES = ("e", "u", "p", "d", "l", "r", "chicago")
HOUSE_NUMBER_QUERIES = "0123456789"
STREET_QUERIES = "abcdefghijklmnopqrstuvwxyz"

# Seconds to wait on the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30


def build_url(city, number, street):
    """Return the address-search URL for one (city, number, street) query."""
    return SEARCH_URL.format(number=number, street=street, city=city)


def parse_property(raw):
    """Parse one search-result link text into a database row.

    The text is expected to look like::

        <number> <street>[ Unit <unit>] (<city>, <zip>) - <pin>

    Returns:
        A ``(pin, number, street, unit, city, zip)`` tuple, with ``unit`` set
        to ``None`` when the address has no "Unit" part, or ``None`` when the
        text does not have the expected shape.
    """
    # The last "(" opens the "(city, zip)" group; any earlier parenthesis is
    # part of the street address itself.
    address_part, open_paren, tail = raw.rpartition("(")
    if not open_paren:
        return None

    location, close_paren, pin_part = tail.partition(")")
    if not close_paren:
        return None

    number, space, remainder = address_part.strip().partition(" ")
    if not space:
        return None

    street, unit_marker, unit_text = remainder.partition("Unit")
    unit = unit_text.strip() if unit_marker else None

    city, comma, zip_code = location.partition(", ")
    if not comma:
        return None

    # Only the first "-" separates the address from the PIN; the PIN itself
    # may contain further dashes.
    _, dash, pin = pin_part.partition("-")
    pin = pin.strip()
    if not dash or not pin:
        return None

    return (pin, number.strip(), street.strip(), unit,
            city.strip(), zip_code.strip())


def scrape_query(con, city, number, street):
    """Fetch one search-result page and store every property it lists.

    A failed request is reported and skipped so that a single bad page does
    not abort the whole run. Returns the number of rows parsed from the page.
    """
    # Progress prefix without a newline; the result is appended below.
    sys.stdout.write("Starting %s %s %s . . . " % (city, number, street))
    sys.stdout.flush()

    try:
        page = requests.get(build_url(city, number, street),
                            timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as err:
        print("  request failed: %s" % err)
        return 0

    # lxml refuses to parse an empty document, so treat it as "no results".
    if page.text.strip():
        raw_entries = html.fromstring(page.text).xpath(RESULTS_XPATH)
    else:
        raw_entries = []

    parsed = [parse_property(entry) for entry in raw_entries]
    rows = [row for row in parsed if row is not None]
    skipped = len(parsed) - len(rows)

    # One transaction per page: committed on success, rolled back on error.
    with con:
        con.executemany(INSERT_SQL, rows)

    summary = "  %d \trecords parsed!" % len(rows)
    if skipped:
        summary += " (%d malformed entries skipped)" % skipped
    print(summary)
    return len(rows)


def main():
    """Rebuild the ``properties`` table from a full sweep of search queries."""
    con = sqlite3.connect(DB_PATH)
    try:
        con.execute(CREATE_TABLE_SQL)
        # Start from an empty table on every run.
        with con:
            con.execute("DELETE FROM properties")

        for city in CITY_QUERIES:
            for number in HOUSE_NUMBER_QUERIES:
                for street in STREET_QUERIES:
                    scrape_query(con, city, number, street)

        print("")
        print("SCRAPING COMPLETE!")
    finally:
        con.close()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment