Scrapes all the sheriff's sale data. This takes a looooong time to run.
#!/usr/bin/env python3
# gets sheriff's sale data from http://www.civilsheriff.com/RealEstateSales.asp
#
# example of URL that returns sales:
# http://www.civilsheriff.com/RealEstate/RealEstateBody.asp?MYSaleDate=11/6/2014
#
# the good people at http://www.notrs.com/ who made the sheriff's website could use a lesson or two in web development
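#
# NOTE: MYSaleDate appears to take dates as M/D/YYYY, matching the <option>
# values scraped from the filter page below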
from bs4 import BeautifulSoup
import urllib3
import re
import csv

http = urllib3.PoolManager()

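# the filter page's <select> menu lists every available sale date as an
# <option>; collect all of the date values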
dates = http.request('GET', 'http://www.civilsheriff.com/RealEstate/RealEstateFilter.asp')
dates_soup = BeautifulSoup(dates.data, 'html.parser')
date_vals = []
master = []
header_row = []
for option in dates_soup.find_all('option'):
    date_vals.append(option['value'])

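# fetch the sales table for one date, capture the header row (once), and
# append one list of cell values per sale to master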
def getByDate(date):
    page = http.request('GET', 'http://www.civilsheriff.com/RealEstate/RealEstateBody.asp?MYSaleDate=' + str(date))
    soup = BeautifulSoup(page.data, 'html.parser')
    header = soup.find(bgcolor='#468445')  # the green column-header row
    page_rows = soup.find_all(bgcolor='White')  # one white row per sale
    if header is not None and len(header_row) == 0:
        for td in header.find_all('td'):
            val = td.get_text(strip=True)
            header_row.append(str(val))
    for page_row in page_rows:
        row = []
        cells = []
        for val in page_row.children:
            if val.name == 'td':
                cells.append(val)
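        # 'STOPPED' sales link to a detail card; follow the link and scrape
        # the sale information from it. all other cells are read as plain text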
        for cell in cells:
            cell_values = []

            def process(cell):
                if cell.find('a') and cell.get_text(strip=True) == 'STOPPED':
                    url = str(cell.find('a')['href'])
                    strings = re.search(r'(\w+),(\w+)', url)
                    case_id = strings.group(1)
                    real_estate_id = strings.group(2)
                    stop_page = http.request('GET', 'http://www.civilsheriff.com/RealEstate/RealEstateCard.asp?CaseID=' + case_id + '&RealEstateID=' + real_estate_id)
                    stop_soup = BeautifulSoup(stop_page.data, 'html.parser')
                    sale_info = stop_soup.find('tr', text=re.compile('Sale Information')).find_next_siblings('tr')
                    for info in sale_info:
                        info = info.get_text(strip=True)
                        cell_values.append(str(info))
                else:
                    val = str(cell.get_text(strip=True))
                    cell_values.append(val)

            process(cell)
            row.append(cell_values)
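        # each cell was collected as a list, so cleanUp strips the list
        # punctuation from its repr before writing to CSV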
        def cleanUp(x):
            x = str(x)
            x = re.sub(r"\['", "", x)
            x = re.sub(r"']", "", x)
            x = re.sub(r"'", "", x)
            return x

        row = [cleanUp(c) for c in row]
        master.append(row)
    print('Scraped ' + date)

# test
# test_dates = [u'1/1/1900', u'5/28/2015', u'5/21/2015']
# for date in test_dates:
#     getByDate(date)

for date in date_vals:
    getByDate(date)

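# one header row, then one row per sale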
with open("output.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header_row)
    writer.writerows(master)