Scrapes all the sheriff's sale data. This takes a looooong time to run.
#!/usr/bin/python
# gets sheriff's sale data from http://www.civilsheriff.com/RealEstateSales.asp
#
# example of URL that returns sales:
# http://www.civilsheriff.com/RealEstate/RealEstateBody.asp?MYSaleDate=11/6/2014
#
# the good people at http://www.notrs.com/ who made the sheriff's website could
# use a lesson or two in web development
from bs4 import BeautifulSoup
import urllib3
import re
import csv

http = urllib3.PoolManager()

# the filter page holds a <select> whose <option> values are every sale date
dates = http.request('GET', 'http://www.civilsheriff.com/RealEstate/RealEstateFilter.asp')
dates_soup = BeautifulSoup(dates.data, 'html.parser')

date_vals = []
master = []
header_row = []

for option in dates_soup.find_all('option'):
    date_vals.append(option['value'])


def getByDate(date):
    """Scrape every sale listed for one sale date and append the rows to master."""
    page = http.request('GET', 'http://www.civilsheriff.com/RealEstate/RealEstateBody.asp?MYSaleDate=' + str(date))
    soup = BeautifulSoup(page.data, 'html.parser')
    header = soup.find(bgcolor='#468445')
    page_rows = soup.find_all(bgcolor='White')

    # grab the column headers once, from the first page that has them
    if header is not None and len(header_row) == 0:
        for td in header.find_all('td'):
            val = td.get_text(strip=True)
            header_row.append(str(val))

    for page_row in page_rows:
        row = []
        cells = page_row.find_all('td', recursive=False)

        for cell in cells:
            cell_values = []
            # a STOPPED cell links to a detail card; follow the link and
            # pull the extra sale information from it
            if cell.find('a') and cell.get_text(strip=True) == 'STOPPED':
                url = str(cell.find('a')['href'])
                strings = re.search(r'(\w+),(\w+)', url)
                case_id = strings.group(1)
                real_estate_id = strings.group(2)
                stop_page = http.request('GET', 'http://www.civilsheriff.com/RealEstate/RealEstateCard.asp?CaseID=' + case_id + '&RealEstateID=' + real_estate_id)
                stop_soup = BeautifulSoup(stop_page.data, 'html.parser')
                sale_info = stop_soup.find('tr', text=re.compile('Sale Information')).find_next_siblings('tr')
                for info in sale_info:
                    cell_values.append(str(info.get_text(strip=True)))
            else:
                cell_values.append(str(cell.get_text(strip=True)))

            # flatten the cell's values into one comma-separated string
            row.append(', '.join(cell_values))

        master.append(row)

    print 'Scraped ' + date


# test
# test_dates = [u'1/1/1900', u'5/28/2015', u'5/21/2015']
# for date in test_dates:
#     getByDate(date)

for date in date_vals:
    getByDate(date)

with open('output.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(header_row)
    writer.writerows(master)
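
A quick way to sanity-check a finished run is to read output.csv back and confirm the header plus row count. This snippet isn't part of the original gist; it only assumes the output.csv the script above writes:

#!/usr/bin/python
# reads the scraper's output back and prints a quick summary
import csv

with open('output.csv', 'rb') as f:
    rows = list(csv.reader(f))

print 'columns: ' + ', '.join(rows[0])
print 'scraped ' + str(len(rows) - 1) + ' sale rows'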