Create a gist now

Instantly share code, notes, and snippets.

Philadelphia Bail Debt Scraper

One or the other of these is the scraper I used for the Philadelphia Court Debt project. I didn't realize until after I promised everyone in the room my code that I had never published it.

Not for nothing, I make no assertion that this is great programming or even remotely reflects best practices in Python or scraping. There's plenty here that I would do differently if I did it again today.

I don't even know which of these was the final scraper. I could figure that out but then I'd never get around to throwing it online. So here it is, warts and all.

import urllib2
from bs4 import BeautifulSoup
import string
import urlparse
import re
from datetime import datetime
import csv
#import psycopg2
# Directory that receives every CSV this script writes. File names are appended
# by plain string concatenation, so the trailing slash is required.
__base__ = '/home/amanda/Documents/Dropbox/Miscellany/Philadelphia/'
def build_url(l, n=''):
    """Build the URL of one page of collections search results.

    l -- search prefix; the callers in this file pass a single letter.  It is
         followed by '%25', which is a URL-encoded '%' (presumably a wildcard
         for the server-side search -- not verifiable from this file).
    n -- page number, given as a string or an integer.  The default ''
         leaves the page number out.

    Returns the complete URL as a string.
    """
    start_string = 'http://www.courts.phila.gov/collections/index.asp?search='
    # str() lets a caller pass the page number as an int; plain concatenation
    # raised TypeError for anything that was not already a string.
    return start_string + l + '%25&page=' + str(n)
def get_soup(url):
    """Download `url` and return its contents parsed as a BeautifulSoup tree.

    The HTTP response is closed as soon as it has been parsed, so a long
    scraping run does not accumulate open connections.  Errors raised by
    urllib2.urlopen are not caught here and propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        # No parser is named, so bs4 picks one itself; left that way so the
        # parse results stay what they were before.
        soup = BeautifulSoup(response)
    finally:
        response.close()
    return soup
def date_format(datestring):
    """Convert a date string such as '11/5/2003' into a datetime.

    Returns a datetime.datetime on success.  Returns '' when the value cannot
    be read as a month/day/year date.  Failures other than a plain format
    mismatch are also recorded through store_exception.
    """
    # Assumes dates in the format 11/5/2003, which is what this data set uses.
    try:
        return datetime.strptime(datestring, '%m/%d/%Y')
    except ValueError:
        # Text that does not match the format (an empty cell included) yields
        # a blank value and is not logged.
        return ''
    except Exception as error:
        # For example TypeError when datestring is not a string.  The former
        # bare `except:` used `error` and `better_date` before either name was
        # bound, so this path crashed instead of logging.
        store_exception(error, 94, datestring)
        return ''
def store_exception(exception, line_number, some_string='', output_base='base'):
    """Append one record describing an error to the error-log CSV.

    exception   -- the exception (or any message object) to record.
    line_number -- caller-chosen marker saying where the error happened.
    some_string -- the input being processed when the error occurred.  It is
                   optional because callers in this file pass only the first
                   two arguments; without a default that call raised TypeError
                   from inside an except handler.
    output_base -- prefix of the log name; the record is handed to csv_row
                   under the name output_base + '_error_log'.
    """
    data = {
        'timestamp': datetime.now(),
        'line_number': line_number,
        'message': exception,
        'string': some_string,
    }
    data_order = ['timestamp', 'line_number', 'message', 'string']
    output_file = output_base + '_error_log'
    csv_row(data, data_order, output_file)
## Now we store things
## http://docs.python.org/2/library/csv.html?highlight=strings
## use "a" instead of "w" to append. http://docs.python.org/2/tutorial/inputoutput.html#reading-and-writing-files
def csv_row(data, fieldorder, filename, base=__base__):
    """Append one record to the pipe-delimited file base + filename + '.csv'.

    data       -- dict holding one value per column.
    fieldorder -- list of column names, in output order.
    filename   -- file name without directory or extension.
    base       -- directory prefix (including its trailing separator).  It was
                  accepted but ignored before: the path was always built from
                  the module-level __base__.

    The file is opened in append mode for every call; no header row is written.
    """
    full_path = base + filename + '.csv'
    # A parenthesised single string prints identically under the Python 2
    # print statement: "writing" followed by the path.
    print("writing %s" % full_path)
    with open(full_path, 'a+') as csvfile:
        linewriter = csv.DictWriter(csvfile, fieldorder, delimiter='|',
                                    quotechar='"', quoting=csv.QUOTE_MINIMAL)
        linewriter.writerow(data)
def get_pagecount(l):
    """Read the record and page counts shown for search prefix `l`.

    Requests the search URL for `l` without a page number and parses the
    paragraph with id 'search', whose text is expected to look like
    "N Records Found. Displaying page X of Y:".

    Returns a dict with the keys 'rec_count', 'page_no' and 'total_pages';
    every value is the matched text with surrounding whitespace removed (a
    string, not an int).

    Raises ValueError when the paragraph is missing or its text does not match
    the expected pattern (this used to surface as an AttributeError on None).
    """
    url = build_url(l)
    soup = get_soup(url)
    graph = soup.find('p', {'id': 'search'})
    if graph is None:
        raise ValueError("no summary paragraph with id 'search' at %s" % url)
    summary = graph.text.strip()
    # Raw string: the pattern is unchanged, but '\s' no longer depends on
    # Python passing an unknown string escape through untouched.
    found = re.match(r"(.*) Records Found.\s*Displaying page(.*)\s*of(.*):", summary)
    if found is None:
        raise ValueError("unexpected summary text at %s: %r" % (url, summary))
    rec_count, page_no, total_pages = [part.strip() for part in found.groups()]
    return {'rec_count': rec_count, 'page_no': page_no, 'total_pages': total_pages}
def get_all_urls(alphabet, output_base):
    """Return the URL of every results page for each prefix in `alphabet`.

    alphabet    -- iterable of search prefixes (a string is iterated character
                   by character).
    output_base -- prefix for the side files: the per-letter counts go to
                   output_base + '_record_counts' through csv_row.

    For every prefix the page count is looked up once, logged, and then used
    to generate the page URLs 1..total_pages.

    Any exception stops the loop, is recorded through store_exception, and
    the URLs collected up to that point are returned.
    """
    urls = []
    letter = ''  # keeps the error record well-defined if iteration itself fails
    try:
        for letter in alphabet:
            page_count = get_pagecount(letter)
            letter_data = {
                'letter': letter,
                'records': page_count['rec_count'],
                'pages': page_count['total_pages'],
                'start': page_count['page_no'],
            }
            data_order = ['letter', 'records', 'pages', 'start']
            output_file = output_base + '_record_counts'
            csv_row(letter_data, data_order, output_file)
            # Reuse the counts fetched above; calling get_pagecount a second
            # time downloaded the same page again for no new information.
            pages = int(page_count['total_pages'])
            for page in range(1, pages + 1):
                urls.append(build_url(letter, str(page)))
    except Exception as error:
        # The handler used to omit the required third argument, so it raised
        # TypeError itself and hid the real error.  The current letter is
        # logged as the offending input.
        store_exception(error, '51', letter, output_base=output_base)
    return urls
def scrape_soup(soup):
    """Extract one dict per data row from the table with id 'Defendant'.

    soup -- parsed page; only find(), find_all() and get_text() are used.

    Only rows with exactly five td cells are read, in the order name,
    address, payment plan, status, current balance.  Non-breaking spaces are
    turned into ordinary spaces in every field.

    Returns a list of dicts with the keys first_name, last_name, address,
    plan_court, plan_year, plan_number, status and current_balance.  Returns
    an empty list when the page has no such table.

    Malformed cells no longer raise IndexError: a name without a comma gives
    an empty last_name, and a payment plan with fewer than two dashes gives
    empty plan_year / plan_number.
    """
    table = soup.find('table', {'id': 'Defendant'})
    if table is None:
        return []
    table_data = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 5:
            continue
        name, address, payment_plan, status, current_balance = [
            cell.get_text().strip().replace(u'\xa0', u' ') for cell in cells
        ]
        # NOTE(review): the text before the comma is stored as first_name.
        # If the site lists names as "LAST, FIRST" the two are swapped;
        # the mapping is kept as it was -- confirm against real output.
        first_name, _, last_name = name.partition(',')
        plan_details = payment_plan.split('-', 2)
        plan_details += [''] * (3 - len(plan_details))  # pad missing parts
        table_data.append({
            # strip() removes the space that follows the comma in the name
            'first_name': first_name.strip(),
            'last_name': last_name.strip(),
            'address': address,
            'plan_court': plan_details[0],
            'plan_year': plan_details[1],
            'plan_number': plan_details[2],
            'status': status,
            'current_balance': current_balance,
        })
    return table_data
def scrape_table(url, linewriter):
    """Download one results page and write each of its records.

    url        -- address of the results page.
    linewriter -- object with a writerow(dict) method (the caller in this
                  file passes a csv.DictWriter).

    Row parsing is delegated to scrape_soup instead of repeating the same
    cell-by-cell code here, so the two entry points cannot drift apart.  One
    consequence: the whole page is parsed before the first row is written.
    """
    for case_data in scrape_soup(get_soup(url)):
        linewriter.writerow(case_data)
## Making this more efficient by keeping the output file open while all of a letter's pages are scraped.
def scrape_all_pages(alphabet, output_to):
    """Scrape every results page for each prefix in `alphabet` into one CSV.

    alphabet  -- iterable of search prefixes, handled one at a time.
    output_to -- base name: records are appended to
                 __base__ + output_to + '.csv', and the same name is passed
                 on to get_all_urls.

    The output file is opened once per prefix and stays open while all of
    that prefix's pages are written.
    """
    columns = ['last_name', 'first_name', 'address', 'plan_court', 'plan_year',
               'plan_number', 'status', 'current_balance']
    for prefix in alphabet:
        page_urls = get_all_urls(prefix, output_to)
        destination = __base__ + output_to + '.csv'
        with open(destination, 'a+') as out_file:
            writer = csv.DictWriter(out_file, columns, delimiter='|',
                                    quotechar='"', quoting=csv.QUOTE_MINIMAL)
            for page_url in page_urls:
                scrape_table(page_url, writer)
# The 26 uppercase ASCII letters. Presumably meant to be passed as the
# `alphabet` argument of scrape_all_pages; no such call is visible in this file.
alphabet = string.ascii_uppercase
import urllib2
from bs4 import BeautifulSoup
import string
import urlparse
import re
from datetime import datetime
import csv
#import psycopg2
def build_url(l, n=''):
    """Build the URL of one page of name-search results.

    l -- search prefix; the callers in this file pass a single letter.  It is
         followed by '%25', which is a URL-encoded '%' (presumably a wildcard
         for the server-side search -- not verifiable from this file).
    n -- page number, given as a string or an integer.  The default ''
         leaves the page number out.

    Returns the complete URL as a string.
    """
    # str() lets a caller pass the page number as an int; plain concatenation
    # raised TypeError for anything that was not already a string.
    return ('http://www.courts.phila.gov/mtvr/name.asp?search='
            + l + '%25&page=' + str(n))
def get_soup(url):
    """Download `url` and return its contents parsed as a BeautifulSoup tree.

    The HTTP response is closed as soon as it has been parsed, so a long
    scraping run does not accumulate open connections.  Errors raised by
    urllib2.urlopen are not caught here and propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        # No parser is named, so bs4 picks one itself; left that way so the
        # parse results stay what they were before.
        soup = BeautifulSoup(response)
    finally:
        response.close()
    return soup
def date_format(datestring):
    """Convert a date string such as '11/5/2003' into a datetime.

    Returns a datetime.datetime on success.  Returns '' when the value cannot
    be read as a month/day/year date.  Failures other than a plain format
    mismatch are also recorded through store_exception.
    """
    # Assumes dates in the format 11/5/2003, which is what this data set uses.
    try:
        return datetime.strptime(datestring, '%m/%d/%Y')
    except ValueError:
        # Text that does not match the format (an empty cell included) yields
        # a blank value and is not logged.
        return ''
    except Exception as error:
        # For example TypeError when datestring is not a string.  The former
        # bare `except:` used `error` and `better_date` before either name was
        # bound, so this path crashed instead of logging.
        store_exception(error, 94, datestring)
        return ''
def store_exception(exception, line_number, some_string=''):
    """Append one record describing an error to the 'bail_error_log' CSV.

    exception   -- the exception (or any message object) to record.
    line_number -- caller-chosen marker saying where the error happened.
    some_string -- the input being processed when the error occurred.  It is
                   optional because callers in this file pass only the first
                   two arguments; without a default that call raised TypeError
                   from inside an except handler.

    The record is written through csv_row.
    """
    data = {
        'timestamp': datetime.now(),
        'line_number': line_number,
        'message': exception,
        'string': some_string,
    }
    data_order = ['timestamp', 'line_number', 'message', 'string']
    csv_row(data, data_order, 'bail_error_log')
## Now we store things
## http://docs.python.org/2/library/csv.html?highlight=strings
## use "a" instead of "w" to append. http://docs.python.org/2/tutorial/inputoutput.html#reading-and-writing-files
# Directory that receives every CSV this script writes. File names are appended
# by plain string concatenation, so the trailing slash is required.
__base__ = '/home/amanda/Documents/Dropbox/Miscellany/Philadelphia/'


def csv_row(data, fieldorder, filename, base=__base__):
    """Append one record to the pipe-delimited file base + filename + '1127.csv'.

    data       -- dict holding one value per column.
    fieldorder -- list of column names, in output order.
    filename   -- file name without directory; the fixed suffix '1127.csv' is
                  appended to it.
    base       -- directory prefix (including its trailing separator).  It was
                  accepted but ignored before: the path was always built from
                  the module-level __base__.

    The file is opened in append mode for every call; no header row is written.
    """
    full_path = base + filename + '1127.csv'
    # A parenthesised single string prints identically under the Python 2
    # print statement: "writing" followed by the path.
    print("writing %s" % full_path)
    with open(full_path, 'a+') as csvfile:
        linewriter = csv.DictWriter(csvfile, fieldorder, delimiter='|',
                                    quotechar='"', quoting=csv.QUOTE_MINIMAL)
        linewriter.writerow(data)
def pg_row(dictionary):
    """Connect to the local 'Philadelphia' PostgreSQL database, then disconnect.

    dictionary -- accepted but not used yet: this function is an unfinished
                  stub that only opens a connection and a cursor and never
                  writes anything.

    NOTE(review): psycopg2 is looked up as a module-level name, but the
    `import psycopg2` line in this file is commented out, so calling this
    raises NameError until that import is restored.

    An exception from psycopg2.connect propagates when no connection can be
    made.
    """
    # This string used to be bound to `connection` while the lines below read
    # `conn_string`, which raised NameError before any connection attempt.
    conn_string = "host='localhost' dbname='Philadelphia'"
    # print the connection string we will use to connect
    print("Connecting to database\n ->%s" % conn_string)
    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)
    try:
        # conn.cursor will return a cursor object, you can use this cursor to perform queries
        cursor = conn.cursor()
        print("Connected!\n")
        cursor.close()
    finally:
        # Nothing is returned to the caller, so the connection would otherwise
        # stay open with no way to reach it.
        conn.close()
def get_pagecount(l):
    """Read the record and page counts shown for search prefix `l`.

    Requests the search URL for `l` without a page number and parses the
    paragraph with id 'search', whose text is expected to look like
    "N Records Found. Displaying page X of Y:".

    Returns a dict with the keys 'rec_count', 'page_no' and 'total_pages';
    every value is the matched text with surrounding whitespace removed (a
    string, not an int).

    Raises ValueError when the paragraph is missing or its text does not match
    the expected pattern (this used to surface as an AttributeError on None).
    """
    url = build_url(l)
    soup = get_soup(url)
    graph = soup.find('p', {'id': 'search'})
    if graph is None:
        raise ValueError("no summary paragraph with id 'search' at %s" % url)
    summary = graph.text.strip()
    # Raw string: the pattern is unchanged, but '\s' no longer depends on
    # Python passing an unknown string escape through untouched.
    found = re.match(r"(.*) Records Found.\s*Displaying page(.*)\s*of(.*):", summary)
    if found is None:
        raise ValueError("unexpected summary text at %s: %r" % (url, summary))
    rec_count, page_no, total_pages = [part.strip() for part in found.groups()]
    return {'rec_count': rec_count, 'page_no': page_no, 'total_pages': total_pages}
def get_all_urls(alphabet):
    """Return the URL of every results page for each prefix in `alphabet`.

    alphabet -- iterable of search prefixes (a string is iterated character
                by character).

    For every prefix the page count is looked up once, written to the
    'record_counts' file through csv_row, and then used to generate the page
    URLs 1..total_pages.

    Any exception stops the loop, is recorded through store_exception, and
    the URLs collected up to that point are returned.
    """
    urls = []
    letter = ''  # keeps the error record well-defined if iteration itself fails
    try:
        for letter in alphabet:
            page_count = get_pagecount(letter)
            letter_data = {
                'letter': letter,
                'records': page_count['rec_count'],
                'pages': page_count['total_pages'],
                'start': page_count['page_no'],
            }
            data_order = ['letter', 'records', 'pages', 'start']
            csv_row(letter_data, data_order, 'record_counts')
            # Reuse the counts fetched above; calling get_pagecount a second
            # time downloaded the same page again for no new information.
            pages = int(page_count['total_pages'])
            for page in range(1, pages + 1):
                urls.append(build_url(letter, str(page)))
    except Exception as error:
        # The handler used to omit the required third argument, so it raised
        # TypeError itself and hid the real error.  The current letter is
        # logged as the offending input.
        store_exception(error, '51', letter)
    return urls
def _parse_judgment_cell(cell):
    """Split the judgment cell into (number, date, amount, against)."""
    # Collapse every run of whitespace, \xa0 and \xc2 characters in the cell's
    # markup into one space, then parse the cleaned markup again so the text
    # pieces can be read one by one.
    # NOTE(review): \xc2 is presumably the lead byte of a UTF-8 encoded
    # non-breaking space in the Python 2 byte string -- confirm.
    judgment = re.sub(u'[\xa0\xc2\\s]+', ' ', str(cell), flags=re.UNICODE).strip()
    j_strings = list(BeautifulSoup(judgment).td.strings)

    def text_at(index):
        # '' instead of IndexError when the cell has fewer text pieces
        return j_strings[index].strip() if index < len(j_strings) else ''

    judgment_no = text_at(1)
    j_match = re.match(r"Date:(.*) Amount: \$(.*)", text_at(2))
    if j_match:
        judgment_date = date_format(j_match.group(1).strip())
        judgment_amt = j_match.group(2).strip()
    else:
        # Same blank convention date_format uses for unreadable values.
        judgment_date = ''
        judgment_amt = ''
    if len(j_strings) > 4 and 'Judgment against' in j_strings[3]:
        judgment_against = j_strings[4].strip()
    else:
        judgment_against = "n/a"
    return judgment_no, judgment_date, judgment_amt, judgment_against


def scrape_soup(soup):
    """Extract one dict per data row from the table with id 'Defendant'.

    Cells are read in the order: name ("last, first"), docket, CPCMS docket,
    failure-to-appear date, bench-warrant hearing date, judgment details.
    The two dates and the judgment date go through date_format.

    Returns a list of dicts with the keys defendant_first, defendant_last,
    docket, CPCMS_docket, failure_to_appear_numeric,
    bench_warrant_hearing_numeric, judgment_no, judgment_date_numeric,
    judgment_amt_numeric and judgment_against.  Returns an empty list when
    the page has no such table.

    Rows with fewer than six td cells are skipped; the old guard accepted any
    non-empty row and then raised IndexError on the sixth cell.  A name
    without a comma gives an empty first name, and judgment text that does
    not have the expected shape gives blank judgment fields instead of an
    exception.
    """
    table = soup.find('table', {'id': 'Defendant'})
    if table is None:
        return []
    table_data = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 6:
            continue
        name_bits = cells[0].get_text().strip().split(',')
        defendant_last = name_bits[0].strip()
        defendant_first = name_bits[1].strip() if len(name_bits) > 1 else ''
        (judgment_no, judgment_date,
         judgment_amt, judgment_against) = _parse_judgment_cell(cells[5])
        table_data.append({
            'defendant_first': defendant_first,
            'defendant_last': defendant_last,
            'docket': cells[1].get_text().strip(),
            'CPCMS_docket': cells[2].get_text().strip(),
            'failure_to_appear_numeric': date_format(cells[3].get_text().strip()),
            'bench_warrant_hearing_numeric': date_format(cells[4].get_text().strip()),
            'judgment_no': judgment_no,
            'judgment_date_numeric': judgment_date,
            'judgment_amt_numeric': judgment_amt,
            'judgment_against': judgment_against,
        })
    return table_data
def scrape_table(url, linewriter):
    """Download one results page and write each of its records.

    url        -- address of the results page.
    linewriter -- object with a writerow(dict) method (the caller in this
                  file passes a csv.DictWriter).

    Row parsing is delegated to scrape_soup instead of repeating the same
    cell-by-cell code here, so the two entry points cannot drift apart.  One
    consequence: the whole page is parsed before the first row is written.
    """
    for case_data in scrape_soup(get_soup(url)):
        linewriter.writerow(case_data)
## Making this more efficient by keeping the output file open while all of a letter's pages are scraped.
def scrape_all_pages(alphabet):
    """Scrape every results page for each prefix in `alphabet` into one CSV.

    alphabet -- iterable of search prefixes, handled one at a time.

    Records are appended to __base__ + 'optimized_bail_1127.csv'.  The file
    is opened once per prefix and stays open while all of that prefix's pages
    are written.
    """
    columns = ['defendant_first', 'defendant_last', 'docket', 'CPCMS_docket',
               'failure_to_appear_numeric', 'bench_warrant_hearing_numeric',
               'judgment_no', 'judgment_date_numeric', 'judgment_amt_numeric',
               'judgment_against']
    for prefix in alphabet:
        page_urls = get_all_urls(prefix)
        destination = __base__ + 'optimized_bail_1127.csv'
        with open(destination, 'a+') as out_file:
            writer = csv.DictWriter(out_file, columns, delimiter='|',
                                    quotechar='"', quoting=csv.QUOTE_MINIMAL)
            for page_url in page_urls:
                scrape_table(page_url, writer)
# The 26 uppercase ASCII letters. Presumably meant to be passed as the
# `alphabet` argument of scrape_all_pages; no such call is visible in this file.
alphabet = string.ascii_uppercase
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment