Scrapes all the sheriff's sale data. This takes a looooong time to run.
#!/usr/bin/env python3
# gets sheriff's sale data from http://www.civilsheriff.com/RealEstateSales.asp
#
# example of URL that returns sales:
# http://www.civilsheriff.com/RealEstate/RealEstateBody.asp?MYSaleDate=11/6/2014
#
# the good people at http://www.notrs.com/ who made the sheriff's website could use a lesson or two in web development
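#
# NOTE: MYSaleDate appears to take dates as M/D/YYYY, matching the <option>
# values scraped from the filter page below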
from bs4 import BeautifulSoup
import urllib3
import re
import csv

http = urllib3.PoolManager()

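# the filter page's <select> menu lists every available sale date as an
# <option>; collect all of the date values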
dates = http.request('GET', 'http://www.civilsheriff.com/RealEstate/RealEstateFilter.asp')
dates_soup = BeautifulSoup(dates.data, 'html.parser')
date_vals = []
master = []
header_row = []
for option in dates_soup.find_all('option'):
    date_vals.append(option['value'])

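# fetch the sales table for one date, capture the header row (once), and
# append one list of cell values per sale to master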
def getByDate(date):
    page = http.request('GET', 'http://www.civilsheriff.com/RealEstate/RealEstateBody.asp?MYSaleDate=' + str(date))
    soup = BeautifulSoup(page.data, 'html.parser')
    header = soup.find(bgcolor='#468445')  # the green column-header row
    page_rows = soup.find_all(bgcolor='White')  # one white row per sale
    if header is not None and len(header_row) == 0:
        for td in header.find_all('td'):
            val = td.get_text(strip=True)
            header_row.append(str(val))
    for page_row in page_rows:
        row = []
        cells = []
        for val in page_row.children:
            if val.name == 'td':
                cells.append(val)
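        # 'STOPPED' sales link to a detail card; follow the link and scrape
        # the sale information from it. all other cells are read as plain text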
        for cell in cells:
            cell_values = []

            def process(cell):
                if cell.find('a') and cell.get_text(strip=True) == 'STOPPED':
                    url = str(cell.find('a')['href'])
                    strings = re.search(r'(\w+),(\w+)', url)
                    case_id = strings.group(1)
                    real_estate_id = strings.group(2)
                    stop_page = http.request('GET', 'http://www.civilsheriff.com/RealEstate/RealEstateCard.asp?CaseID=' + case_id + '&RealEstateID=' + real_estate_id)
                    stop_soup = BeautifulSoup(stop_page.data, 'html.parser')
                    sale_info = stop_soup.find('tr', text=re.compile('Sale Information')).find_next_siblings('tr')
                    for info in sale_info:
                        info = info.get_text(strip=True)
                        cell_values.append(str(info))
                else:
                    val = str(cell.get_text(strip=True))
                    cell_values.append(val)

            process(cell)
            row.append(cell_values)
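        # each cell was collected as a list, so cleanUp strips the list
        # punctuation from its repr before writing to CSV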
        def cleanUp(x):
            x = str(x)
            x = re.sub(r"\['", "", x)
            x = re.sub(r"']", "", x)
            x = re.sub(r"'", "", x)
            return x

        row = [cleanUp(c) for c in row]
        master.append(row)
    print('Scraped ' + date)

# test
# test_dates = [u'1/1/1900', u'5/28/2015', u'5/21/2015']
# for date in test_dates:
#     getByDate(date)

for date in date_vals:
    getByDate(date)

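# one header row, then one row per sale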
with open("output.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header_row)
    writer.writerows(master)