amandabee/README.md

## README.md

      
    Raw
  

              README.md
            
          
    One or the other of these is the scraper I used for the Philadelphia Court Debt project. I didn't realize until after I promised everyone in the room my code that I had never published it.
Not for nothing, I make no assertion that this is great programming or even remotely reflects best practices in Python or scraping. There's plenty here that I would do differently if I did it again today.
I don't even know which of these was the final scraper. I could figure that out but then I'd never get around to throwing it online. So here it is, warts and all.

  
## othersample.py
import urllib2
from bs4 import BeautifulSoup
import string
import urlparse
import re
from datetime import datetime
import csv
#import psycopg2

# Directory prefix that csv_row() and scrape_all_pages() put in front of every
# output filename. It is joined by plain string concatenation, so it has to
# end with a path separator.
__base__ = '/home/amanda/Documents/Dropbox/Miscellany/Philadelphia/'

def build_url(l, n=''):
    """Return the collections search URL for the search prefix ``l``.

    ``l`` is placed after ``search=`` and followed by ``%25`` (a URL-encoded
    percent sign).  ``n`` is the page number as a string; with the default
    empty string the ``page=`` parameter is left blank.
    """
    pieces = [
        'http://www.courts.phila.gov/collections/index.asp?search=',
        l,
        '%25&page=',
        n,
    ]
    return ''.join(pieces)

def get_soup(url):
    """Open ``url`` with urllib2 and return the response parsed by BeautifulSoup."""
    response = urllib2.urlopen(url)
    return BeautifulSoup(response)


def date_format(datestring):
    """Parse a ``month/day/year`` string into a ``datetime``.

    Assumes dates in the format 11/5/2003, which is what this data set uses.

    Returns the parsed ``datetime``, or ``''`` when ``datestring`` cannot be
    parsed.  Text that simply is not a date raises ``ValueError`` and is
    mapped to ``''`` silently; any other failure (for example a non-string
    argument) is additionally recorded through store_exception().
    """
    try:
        return datetime.strptime(datestring, '%m/%d/%Y')
    except ValueError:
        # Blank or non-date cell text: expected, so not worth logging.
        return ''
    except Exception as error:
        # The original bare ``except`` used ``error`` without ever binding it
        # in this branch and then returned an unassigned ``better_date``.
        # Bind the exception, log it, and return the same blank marker.
        store_exception(error, 94, datestring)
        return ''

def store_exception(exception, line_number, some_string='', output_base='base'):
    """Append one row describing ``exception`` to the error-log CSV.

    exception   -- the exception (or message) to record
    line_number -- caller-supplied marker for where the failure happened
    some_string -- the input being processed at the time.  It now defaults
                   to ``''`` because get_all_urls() calls this function with
                   only two arguments, which used to raise TypeError.
    output_base -- filename prefix; the row is written to
                   ``<output_base>_error_log`` through csv_row(), which adds
                   the directory and the ``.csv`` extension.
    """
    data_order = ['timestamp', 'line_number', 'message', 'string']
    data = {
        'timestamp'  : datetime.now(),
        'line_number': line_number,
        'message'    : exception,
        'string'     : some_string
    }
    output_file = output_base+'_error_log'
    csv_row(data,data_order,output_file)

## Now we store things
## http://docs.python.org/2/library/csv.html?highlight=strings
## use "a" instead of "w" to append. http://docs.python.org/2/tutorial/inputoutput.html#reading-and-writing-files


def csv_row(data,fieldorder,filename, base=__base__):
    """Append ``data`` as one pipe-delimited row to ``<base><filename>.csv``.

    data       -- dict keyed by the names in ``fieldorder``
    fieldorder -- list of keys giving the column order
    filename   -- output name without directory or extension
    base       -- directory prefix (must end with a path separator).  The
                  original accepted this argument but always wrote under the
                  module-level ``__base__``; it is now honoured.

    The file is opened in append mode for every call and no header row is
    written.
    """
    full_path = base+filename+'.csv'
    # Same output as the old print statement, but this spelling also parses
    # on Python 3.
    print("writing %s" % full_path)
    with open(full_path, 'a+') as csvfile:
        linewriter = csv.DictWriter(csvfile, fieldorder, delimiter='|',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)
        linewriter.writerow(data)
def get_pagecount(l):
    """Look up how many records and result pages exist for search prefix ``l``.

    Fetches the results page built by build_url(l) and parses the text of its
    ``<p id="search">`` element, which is expected to read like
    ``<count> Records Found. Displaying page <n> of <total>:``.

    Returns a dict of stripped strings (not ints) under the keys
    ``rec_count``, ``page_no`` and ``total_pages``.
    """
    summary = get_soup(build_url(l)).find('p',{'id':'search'})
    found = re.match("(.*) Records Found.\s*Displaying page(.*)\s*of(.*):",summary.text.strip())
    captured = found.groups()
    keys = ('rec_count', 'page_no', 'total_pages')
    return dict((key, captured[position].strip()) for position, key in enumerate(keys))

def get_all_urls(alphabet,output_base):
    """Build the URL of every results page for each search letter in ``alphabet``.

    For each letter, the record and page counts reported by get_pagecount()
    are appended to ``<output_base>_record_counts`` (via csv_row) and one URL
    per page, numbered 1 through the reported total, is collected.

    Returns the list of URLs.  A letter that fails is recorded through
    store_exception() and skipped, so the remaining letters are still tried.
    """
    urls = []
    data_order = ['letter' , 'records' , 'pages' , 'start']
    output_file = output_base+'_record_counts'
    for letter in alphabet:
        try:
            # One request per letter: the original fetched the same count
            # page a second time only to re-read ``total_pages``.
            page_count = get_pagecount(letter)
            letter_data = {
                'letter'  : letter,
                'records' : page_count['rec_count'],
                'pages'   : page_count['total_pages'],
                'start'   : page_count['page_no']
            }
            csv_row(letter_data,data_order,output_file)
            pages = int(page_count['total_pages'])
            for page in range(1,pages+1):
                urls.append(build_url(letter, str(page)))
        except Exception as error:
            # The original called store_exception(error, '51') without the
            # required third argument, so the handler itself raised
            # TypeError.  Record the letter that failed, under the same
            # filename prefix as the record counts.
            store_exception(error,'51',letter,output_base)
    return urls

def scrape_soup(soup):
    """Extract every five-cell row of the ``Defendant`` table found in ``soup``.

    Returns a list with one dict per row, keyed by ``first_name``,
    ``last_name``, ``address``, ``plan_court``, ``plan_year``,
    ``plan_number``, ``status`` and ``current_balance``.  Rows whose number
    of ``<td>`` cells is not exactly five are skipped.  Non-breaking spaces
    in the extracted text are replaced with ordinary spaces.
    """
    nbsp, blank = u'\xa0', u' '
    defendant_table = soup.find('table' , {'id' : 'Defendant'})
    collected = []
    for tr in defendant_table.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) != 5:
            continue
        # Name cell: split once, on the first comma.
        # NOTE(review): the piece before the comma is stored as first_name
        # and the piece after it as last_name -- confirm that matches the
        # name order the site actually uses.
        name_parts = tds[0].get_text().strip().replace(nbsp, blank).split(',',1)
        before_comma = name_parts[0]
        after_comma = name_parts[1]
        address = tds[1].get_text().strip()
        # Payment-plan cell: split on the first two dashes into
        # court / year / number.
        plan_parts = tds[2].get_text().strip().replace(nbsp, blank).split('-',2)
        court = plan_parts[0]
        year = plan_parts[1]
        number = plan_parts[2]
        status = tds[3].get_text().strip()
        balance = tds[4].get_text().strip()
        collected.append({
            'first_name'  : before_comma,
            'last_name'  : after_comma,
            'address' : address.replace(nbsp, blank),
            'plan_court' : court,
            'plan_year' :  year,
            'plan_number' : number,
            'status' : status.replace(nbsp, blank),
            'current_balance' : balance.replace(nbsp, blank)
        })
    return collected

def scrape_table(url,linewriter):
    """Fetch ``url`` and write each defendant row on that page via ``linewriter``.

    url        -- URL of one results page (see build_url)
    linewriter -- a csv.DictWriter, or any object with ``writerow``, whose
                  field names match the keys produced by scrape_soup()

    Row parsing is delegated to scrape_soup() instead of being kept here as
    a second, line-for-line copy of the same code, so the two cannot drift
    apart.  Rows are written in table order.
    """
    for case_data in scrape_soup(get_soup(url)):
        linewriter.writerow(case_data)

## Making this more efficient by keeping the output file open while a letter's pages are scraped.
def scrape_all_pages(alphabet,output_to):
    """Scrape every results page for each letter in ``alphabet`` into one CSV.

    Rows are appended to ``<__base__><output_to>.csv``, pipe-delimited and
    without a header row.  The file is opened once per letter and a single
    DictWriter is shared by all of that letter's pages.  ``output_to`` is
    also handed to get_all_urls() as its output prefix.
    """
    columns = ['last_name','first_name' , 'address' , 'plan_court', 'plan_year','plan_number' , 'status' , 'current_balance']
    for initial in alphabet:
        page_urls = get_all_urls(initial, output_to)
        destination = __base__+output_to+'.csv'
        with open(destination, 'a+') as out_file:
            writer = csv.DictWriter(out_file, columns, delimiter='|',
                                    quotechar='"', quoting=csv.QUOTE_MINIMAL)
            for page_url in page_urls:
                scrape_table(page_url,writer)


# The 26 uppercase ASCII letters. Nothing visible in this file calls the
# scraper at module level; presumably this is what gets passed as the
# ``alphabet`` argument of scrape_all_pages() -- confirm with the author.
alphabet = string.ascii_uppercase

## sample.py
import urllib2
from bs4 import BeautifulSoup
import string
import urlparse
import re
from datetime import datetime
import csv
#import psycopg2

def build_url(l,n=''):
    """Return the name-search URL for the search prefix ``l``.

    ``l`` is placed after ``search=`` and followed by ``%25`` (a URL-encoded
    percent sign).  ``n`` is the page number as a string; with the default
    empty string the ``page=`` parameter is left blank.
    """
    pieces = ['http://www.courts.phila.gov/mtvr/name.asp?search=', l, '%25&page=', n]
    return ''.join(pieces)

def get_soup(url):
    """Open ``url`` with urllib2 and return the response parsed by BeautifulSoup."""
    response = urllib2.urlopen(url)
    return BeautifulSoup(response)


def date_format(datestring):
    """Parse a ``month/day/year`` string into a ``datetime``.

    Assumes dates in the format 11/5/2003, which is what this data set uses.

    Returns the parsed ``datetime``, or ``''`` when ``datestring`` cannot be
    parsed.  Text that simply is not a date raises ``ValueError`` and is
    mapped to ``''`` silently; any other failure (for example a non-string
    argument) is additionally recorded through store_exception().
    """
    try:
        return datetime.strptime(datestring, '%m/%d/%Y')
    except ValueError:
        # Blank or non-date cell text: expected, so not worth logging.
        return ''
    except Exception as error:
        # The original bare ``except`` used ``error`` without ever binding it
        # in this branch and then returned an unassigned ``better_date``.
        # Bind the exception, log it, and return the same blank marker.
        store_exception(error, 94, datestring)
        return ''

def store_exception(exception, line_number, some_string=''):
    """Append one row describing ``exception`` to the ``bail_error_log`` CSV.

    exception   -- the exception (or message) to record
    line_number -- caller-supplied marker for where the failure happened
    some_string -- the input being processed at the time.  It now defaults
                   to ``''`` because get_all_urls() calls this function with
                   only two arguments, which used to raise TypeError.

    The row is written through csv_row(), which adds the directory prefix
    and the filename suffix.
    """
    data_order = ['timestamp', 'line_number', 'message', 'string']
    data = {
        'timestamp'  : datetime.now(),
        'line_number': line_number,
        'message'    : exception,
        'string'     : some_string
    }
    csv_row(data,data_order,'bail_error_log')

## Now we store things
## http://docs.python.org/2/library/csv.html?highlight=strings
## use "a" instead of "w" to append. http://docs.python.org/2/tutorial/inputoutput.html#reading-and-writing-files

# Directory prefix for every output file; joined by plain concatenation, so
# it has to end with a path separator.
__base__ = '/home/amanda/Documents/Dropbox/Miscellany/Philadelphia/'


def csv_row(data,fieldorder,filename, base=__base__):
    """Append ``data`` as one pipe-delimited row to ``<base><filename>1127.csv``.

    data       -- dict keyed by the names in ``fieldorder``
    fieldorder -- list of keys giving the column order
    filename   -- output name without directory; ``1127.csv`` is appended
    base       -- directory prefix (must end with a path separator).  The
                  original accepted this argument but always wrote under the
                  module-level ``__base__``; it is now honoured.

    The file is opened in append mode for every call and no header row is
    written.
    """
    full_path = base+filename+'1127.csv'
    # Same output as the old print statement, but this spelling also parses
    # on Python 3.
    print("writing %s" % full_path)
    with open(full_path, 'a+') as csvfile:
        linewriter = csv.DictWriter(csvfile, fieldorder, delimiter='|',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)
        linewriter.writerow(data)

def pg_row(dictionary):
    """Open, and then close, a connection to the local ``Philadelphia`` database.

    NOTE(review): this looks unfinished -- ``dictionary`` is never used and
    nothing is written to the database.  It also needs ``psycopg2``, whose
    import is commented out at the top of this file, so calling it as-is
    raises NameError until that import is enabled.
    """
    # The original assigned this string to ``connection`` but then read
    # ``conn_string``, which did not exist.
    conn_string = "host='localhost' dbname='Philadelphia'"
    # print the connection string we will use to connect
    print("Connecting to database\n    ->%s" % (conn_string))
    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)
    try:
        # conn.cursor will return a cursor object, you can use this cursor to perform queries
        cursor = conn.cursor()
        print("Connected!\n")
        cursor.close()
    finally:
        # Nothing else holds on to the connection, so release it here
        # instead of leaking it.
        conn.close()

def get_pagecount(l):
    """Look up how many records and result pages exist for search prefix ``l``.

    Fetches the results page built by build_url(l) and parses the text of its
    ``<p id="search">`` element, which is expected to read like
    ``<count> Records Found. Displaying page <n> of <total>:``.

    Returns a dict of stripped strings (not ints) under the keys
    ``rec_count``, ``page_no`` and ``total_pages``.
    """
    summary = get_soup(build_url(l)).find('p',{'id':'search'})
    found = re.match("(.*) Records Found.\s*Displaying page(.*)\s*of(.*):",summary.text.strip())
    captured = found.groups()
    keys = ('rec_count', 'page_no', 'total_pages')
    return dict((key, captured[position].strip()) for position, key in enumerate(keys))

def get_all_urls(alphabet):
    """Build the URL of every results page for each search letter in ``alphabet``.

    For each letter, the record and page counts reported by get_pagecount()
    are appended to the ``record_counts`` CSV (via csv_row) and one URL per
    page, numbered 1 through the reported total, is collected.

    Returns the list of URLs.  A letter that fails is recorded through
    store_exception() and skipped, so the remaining letters are still tried.
    """
    urls = []
    data_order = ['letter' , 'records' , 'pages' , 'start']
    for letter in alphabet:
        try:
            # One request per letter: the original fetched the same count
            # page a second time only to re-read ``total_pages``.
            page_count = get_pagecount(letter)
            letter_data = {
                'letter'  : letter,
                'records' : page_count['rec_count'],
                'pages'   : page_count['total_pages'],
                'start'   : page_count['page_no']
            }
            csv_row(letter_data,data_order,'record_counts')
            pages = int(page_count['total_pages'])
            for page in range(1,pages+1):
                urls.append(build_url(letter, str(page)))
        except Exception as error:
            # The original called store_exception(error, '51') without the
            # required third argument, so the handler itself raised
            # TypeError.  Record the letter that failed.
            store_exception(error,'51',letter)
    return urls

def scrape_soup(soup):
    """Extract one record per data row of the ``Defendant`` table in ``soup``.

    Returns a list of dicts keyed by ``defendant_first``, ``defendant_last``,
    ``docket``, ``CPCMS_docket``, ``failure_to_appear_numeric``,
    ``bench_warrant_hearing_numeric``, ``judgment_no``,
    ``judgment_date_numeric``, ``judgment_amt_numeric`` and
    ``judgment_against``.  The three date fields hold whatever date_format()
    returns for the corresponding text.
    """
    table = soup.find('table' , {'id' : 'Defendant'})
    table_data = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Rows with no <td> cells at all are skipped.
        # NOTE(review): every row that does have cells is indexed up to
        # cells[5], so a row with one to five cells raises IndexError.
        if len(cells) > 0:
            # Name cell is split on commas: first piece -> last name,
            # second piece -> first name.
            name_bits = cells[0].get_text().strip().split(',')
            defendant_last = name_bits[0].strip()
            defendant_first = name_bits[1].strip()
            docket = cells[1].get_text().strip()
            CPCMS_docket = cells[2].get_text().strip()
            failure_to_appear_date = date_format(cells[3].get_text().strip())
            bench_warrant_hearing_date = date_format(cells[4].get_text().strip())
            # Serialise the sixth cell back to markup, squash every run of
            # whitespace / \xa0 / \xc2 characters into a single space, and
            # parse the result again so its text nodes can be read by
            # position below.
            # NOTE(review): \xc2 followed by \xa0 is the UTF-8 encoding of a
            # non-breaking space, which is presumably why both appear in the
            # pattern -- confirm before porting this off Python 2.
            judgment = re.sub(u'[\xa0\xc2\s]+',' ',str(cells[5]),flags=re.UNICODE).strip()
            j_strings = list(BeautifulSoup(judgment).td.strings)
            # Text nodes are used by position: [1] is the judgment number,
            # [2] the "Date: ... Amount: $..." text, and [3]/[4] an optional
            # "Judgment against" label followed by its value.
            judgment_no = j_strings[1].strip()
            # NOTE(review): re.match returns None when the text does not fit
            # this pattern, in which case .groups() raises AttributeError.
            j_match = re.match("Date:(.*) Amount: \$(.*)",j_strings[2].strip())
            judgment_date = date_format(j_match.groups()[0].strip())
            judgment_amt = j_match.groups()[1].strip()
            if 'Judgment against' in j_strings[3]:
            	judgment_against = j_strings[4].strip()
            else:
            	judgment_against = "n/a"
            case_data = {
                'defendant_first'           : defendant_first,
                'defendant_last'            : defendant_last,
                'docket'                    : docket,
                'CPCMS_docket'              : CPCMS_docket,
                'failure_to_appear_numeric' : failure_to_appear_date,
                'bench_warrant_hearing_numeric'      : bench_warrant_hearing_date,
                'judgment_no'              : judgment_no,
                'judgment_date_numeric'    : judgment_date,
                'judgment_amt_numeric'     : judgment_amt,
                'judgment_against'         : judgment_against
            }
            table_data.append(case_data)
    return table_data

def scrape_table(url,linewriter):
    """Fetch ``url`` and write each defendant row on that page via ``linewriter``.

    url        -- URL of one results page (see build_url)
    linewriter -- a csv.DictWriter, or any object with ``writerow``, whose
                  field names match the keys produced by scrape_soup()

    Row parsing is delegated to scrape_soup() instead of being kept here as
    a second, line-for-line copy of the same code, so the two cannot drift
    apart.  Rows are written in table order.
    """
    for case_data in scrape_soup(get_soup(url)):
        linewriter.writerow(case_data)

## Making this more efficient by keeping the output file open while a letter's pages are scraped.
def scrape_all_pages(alphabet):
    """Scrape every results page for each letter in ``alphabet`` into one CSV.

    Rows are appended to ``optimized_bail_1127.csv`` under ``__base__``,
    pipe-delimited and without a header row.  The file is opened once per
    letter and a single DictWriter is shared by all of that letter's pages.
    """
    columns = [
        'defendant_first', 'defendant_last', 'docket', 'CPCMS_docket',
        'failure_to_appear_numeric', 'bench_warrant_hearing_numeric',
        'judgment_no', 'judgment_date_numeric', 'judgment_amt_numeric',
        'judgment_against',
    ]
    for initial in alphabet:
        page_urls = get_all_urls(initial)
        destination = __base__+'optimized_bail_1127.csv'
        with open(destination, 'a+') as out_file:
            writer = csv.DictWriter(out_file, columns, delimiter='|',
                                    quotechar='"', quoting=csv.QUOTE_MINIMAL)
            for page_url in page_urls:
                scrape_table(page_url,writer)


# The 26 uppercase ASCII letters. Nothing visible in this file calls the
# scraper at module level; presumably this is what gets passed as the
# ``alphabet`` argument of scrape_all_pages() -- confirm with the author.
alphabet = string.ascii_uppercase
	import urllib2
	from bs4 import BeautifulSoup
	import string
	import urlparse
	import re
	from datetime import datetime
	import csv
	#import psycopg2

	# Directory prefix put in front of every output filename. It is joined by
	# plain string concatenation, so it has to end with a path separator.
	__base__ = '/home/amanda/Documents/Dropbox/Miscellany/Philadelphia/'

	def build_url(l, n=''):
	    """Return the collections search URL for the search prefix ``l``.

	    ``l`` is placed after ``search=`` and followed by ``%25`` (a
	    URL-encoded percent sign).  ``n`` is the page number as a string; with
	    the default empty string the ``page=`` parameter is left blank.

	    The body indentation, which had been stripped from this copy, is
	    restored.
	    """
	    start_string = 'http://www.courts.phila.gov/collections/index.asp?search='
	    return start_string+l+'%25&page='+n

	def get_soup(url):
	    """Open ``url`` with urllib2 and return the response parsed by BeautifulSoup.

	    The body indentation, which had been stripped from this copy, is
	    restored.
	    """
	    return BeautifulSoup(urllib2.urlopen(url))


	def date_format(datestring):
	    """Parse a ``month/day/year`` string into a ``datetime``.

	    Assumes dates in the format 11/5/2003, which is what this data set uses.

	    Returns the parsed ``datetime``, or ``''`` when ``datestring`` cannot
	    be parsed.  Text that simply is not a date raises ``ValueError`` and is
	    mapped to ``''`` silently; any other failure (for example a non-string
	    argument) is additionally recorded through store_exception().
	    """
	    try:
	        return datetime.strptime(datestring, '%m/%d/%Y')
	    except ValueError:
	        # Blank or non-date cell text: expected, so not worth logging.
	        return ''
	    except Exception as error:
	        # The original bare ``except`` used ``error`` without ever binding
	        # it in this branch and then returned an unassigned
	        # ``better_date``.  Bind the exception, log it, and return the
	        # same blank marker.
	        store_exception(error, 94, datestring)
	        return ''

	def store_exception(exception, line_number, some_string='', output_base='base'):
	    """Append one row describing ``exception`` to the error-log CSV.

	    exception   -- the exception (or message) to record
	    line_number -- caller-supplied marker for where the failure happened
	    some_string -- the input being processed at the time.  It now defaults
	                   to ``''`` because get_all_urls() calls this function
	                   with only two arguments.
	    output_base -- filename prefix; the row is written to
	                   ``<output_base>_error_log`` through csv_row(), which
	                   adds the directory and the ``.csv`` extension.
	    """
	    data = {
	        'timestamp'  : datetime.now(),
	        'line_number': line_number,
	        'message'    : exception,
	        'string'     : some_string
	    }
	    data_order = ['timestamp', 'line_number', 'message', 'string']
	    output_file = output_base+'_error_log'
	    csv_row(data,data_order,output_file)

	## Now we store things
	## http://docs.python.org/2/library/csv.html?highlight=strings
	## use "a" instead of "w" to append. http://docs.python.org/2/tutorial/inputoutput.html#reading-and-writing-files



	def csv_row(data,fieldorder,filename, base=__base__):
	    """Append ``data`` as one pipe-delimited row to ``<base><filename>.csv``.

	    data       -- dict keyed by the names in ``fieldorder``
	    fieldorder -- list of keys giving the column order
	    filename   -- output name without directory or extension
	    base       -- directory prefix (must end with a path separator).  The
	                  original accepted this argument but always wrote under
	                  the module-level ``__base__``; it is now honoured.

	    The file is opened in append mode for every call and no header row is
	    written.
	    """
	    full_path = base+filename+'.csv'
	    # Same output as the old print statement, but this spelling also
	    # parses on Python 3.
	    print("writing %s" % full_path)
	    with open(full_path, 'a+') as csvfile:
	        # csv requires a one-character delimiter.  This copy had a
	        # backslash in front of the pipe (two characters), which the csv
	        # module rejects with TypeError.
	        linewriter = csv.DictWriter(csvfile, fieldorder, delimiter='|',
	                                quotechar='"', quoting=csv.QUOTE_MINIMAL)
	        linewriter.writerow(data)

	def get_pagecount(l):
	    """Look up how many records and result pages exist for search prefix ``l``.

	    Fetches the results page built by build_url(l) and parses the text of
	    its ``<p id="search">`` element, which is expected to read like
	    ``<count> Records Found. Displaying page <n> of <total>:``.

	    Returns a dict of stripped strings (not ints) under the keys
	    ``rec_count``, ``page_no`` and ``total_pages``.
	    """
	    url = build_url(l)
	    soup = get_soup(url)
	    graph = soup.find('p',{'id':'search'})
	    # The ``*`` quantifiers had been lost from this copy of the pattern,
	    # leaving groups that match exactly one character, so any count of
	    # two or more digits failed to match.  They are restored here.
	    strings = re.match(r"(.*) Records Found.\s*Displaying page(.*)\s*of(.*):",graph.text.strip()).groups()
	    rec_count = strings[0].strip()
	    page_no = strings[1].strip()
	    total_pages = strings[2].strip()
	    return {'rec_count':rec_count, 'page_no':page_no, 'total_pages':total_pages}

	def get_all_urls(alphabet,output_base):
	    """Build the URL of every results page for each search letter in ``alphabet``.

	    For each letter, the record and page counts reported by get_pagecount()
	    are appended to ``<output_base>_record_counts`` (via csv_row) and one
	    URL per page, numbered 1 through the reported total, is collected.

	    Returns the list of URLs.  A letter that fails is recorded through
	    store_exception() and skipped, so the remaining letters are still tried.
	    """
	    urls = []
	    data_order = ['letter' , 'records' , 'pages' , 'start']
	    output_file = output_base+'_record_counts'
	    for letter in alphabet:
	        try:
	            # One request per letter: the original fetched the same count
	            # page a second time only to re-read ``total_pages``.
	            page_count = get_pagecount(letter)
	            letter_data = {
	                'letter'  : letter,
	                'records' : page_count['rec_count'],
	                'pages'   : page_count['total_pages'],
	                'start'   : page_count['page_no']
	            }
	            csv_row(letter_data,data_order,output_file)
	            pages = int(page_count['total_pages'])
	            for page in range(1,pages+1):
	                urls.append(build_url(letter, str(page)))
	        except Exception as error:
	            # The original called store_exception(error, '51') without
	            # the required third argument, so the handler itself raised
	            # TypeError.  Record the letter that failed, under the same
	            # filename prefix as the record counts.
	            store_exception(error,'51',letter,output_base)
	    return urls

	def scrape_soup(soup):
	    """Extract every five-cell row of the ``Defendant`` table found in ``soup``.

	    Returns a list with one dict per row, keyed by ``first_name``,
	    ``last_name``, ``address``, ``plan_court``, ``plan_year``,
	    ``plan_number``, ``status`` and ``current_balance``.  Rows whose number
	    of ``<td>`` cells is not exactly five are skipped.  Non-breaking spaces
	    in the extracted text are replaced with ordinary spaces.

	    The loop and branch indentation, which had been stripped from this
	    copy, is restored.
	    """
	    table = soup.find('table' , {'id' : 'Defendant'})
	    table_data = []
	    for row in table.find_all('tr'):
	        cells = row.find_all('td')
	        if len(cells) == 5:
	            name = cells[0].get_text().strip()
	            # Split once, on the first comma.
	            # NOTE(review): the piece before the comma is stored as
	            # first_name -- confirm that matches the site's name order.
	            names = name.replace(u'\xa0',u' ').split(',',1)
	            first_name = names[0]
	            last_name = names[1]
	            address = cells[1].get_text().strip()
	            payment_plan = cells[2].get_text().strip()
	            # Split on the first two dashes into court / year / number.
	            plan_details = payment_plan.replace(u'\xa0',u' ').split('-',2)
	            plan_court = plan_details[0]
	            plan_year = plan_details[1]
	            plan_number = plan_details[2]
	            status = cells[3].get_text().strip()
	            current_balance = cells[4].get_text().strip()
	            case_data = {
	                'first_name' : first_name,
	                'last_name' : last_name,
	                'address' : address.replace(u'\xa0',u' '),
	                'plan_court' : plan_court,
	                'plan_year' : plan_year,
	                'plan_number' : plan_number,
	                'status' : status.replace(u'\xa0',u' '),
	                'current_balance' : current_balance.replace(u'\xa0',u' ')
	            }
	            table_data.append(case_data)
	    return table_data

	def scrape_table(url,linewriter):
	    """Fetch ``url`` and write each defendant row on that page via ``linewriter``.

	    url        -- URL of one results page (see build_url)
	    linewriter -- a csv.DictWriter, or any object with ``writerow``, whose
	                  field names match the keys produced by scrape_soup()

	    Row parsing is delegated to scrape_soup() instead of being kept here
	    as a second, line-for-line copy of the same code, so the two cannot
	    drift apart.  Rows are written in table order.
	    """
	    for case_data in scrape_soup(get_soup(url)):
	        linewriter.writerow(case_data)

	## Making this more efficient by keeping the output file open while a letter's pages are scraped.
	def scrape_all_pages(alphabet,output_to):
	    """Scrape every results page for each letter in ``alphabet`` into one CSV.

	    Rows are appended to ``<__base__><output_to>.csv``, pipe-delimited and
	    without a header row.  The file is opened once per letter and a single
	    DictWriter is shared by all of that letter's pages.  ``output_to`` is
	    also handed to get_all_urls() as its output prefix.
	    """
	    fieldorder = ['last_name','first_name' , 'address' , 'plan_court', 'plan_year','plan_number' , 'status' , 'current_balance']
	    for letter in alphabet:
	        pages = get_all_urls(letter, output_to)
	        full_path = __base__+output_to+'.csv'
	        with open(full_path, 'a+') as csvfile:
	            # csv requires a one-character delimiter.  This copy had a
	            # backslash in front of the pipe (two characters), which the
	            # csv module rejects with TypeError.
	            linewriter = csv.DictWriter(csvfile, fieldorder, delimiter='|',
	                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
	            for page in pages:
	                scrape_table(page,linewriter)


	# The 26 uppercase ASCII letters; presumably what gets passed as the
	# ``alphabet`` argument of scrape_all_pages() -- confirm with the author.
	alphabet = string.ascii_uppercase