-
-
Save ericbaranowski/384b7e94448dcb3e9519fa9e5ed7872f to your computer and use it in GitHub Desktop.
Inmate scraping script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2

import requests
from bs4 import BeautifulSoup
# We've now imported the two packages that will do the heavy lifting
# for us, requests and BeautifulSoup
# This is the URL that lists the current inmates.
# Should this URL go away, an archive is available at
# http://perma.cc/2HZR-N38X
url_to_scrape = 'http://apps2.polkcountyiowa.gov/inmatesontheweb/'

# Tell the requests package to retrieve the contents of our page (it'll be
# grabbing what you see when you use the View Source feature in your browser).
# The timeout keeps the script from waiting forever on an unresponsive
# server, and raise_for_status() stops us from parsing an HTTP error page as
# if it were the inmate list.
r = requests.get(url_to_scrape, timeout=30)
r.raise_for_status()

# We now have the source HTML of the page. Let's ask BeautifulSoup
# to parse it for us. Naming the parser explicitly means every machine
# builds the tree with the same parser, whatever else is installed.
soup = BeautifulSoup(r.text, 'html.parser')

# Down below we'll add our inmates to this list. For now,
# it's just a placeholder.
inmates_links = []

# Our source document puts each inmate in an HTML table row. Let's
# loop through all of those rows.
for table_row in soup.select(".inmatesList tr"):
    # Each table row has a set of table cells, or tds. Let's
    # get all of those.
    table_cells = table_row.find_all('td')

    # Our table has one exception -- a row without any cells.
    # Skip it so we only process rows that actually have cells.
    if not table_cells:
        continue

    # By looking at our source (probably easiest in your browser), we can
    # see that the link is in the first td of each row. Let's extract the
    # value of that link here.
    #
    # Should this link pattern change, find an archive of an
    # example at http://perma.cc/RTU7-57DL
    relative_link_to_inmate_details = table_cells[0].find('a')['href']

    # The links to the inmates are relative (they look
    # like Details.aspx?bi=212840). We need to make them absolute links.
    # urljoin resolves the link against our base URL (which conveniently is
    # the same one we used to get the list of inmates).
    absolute_link_to_inmate_details = urljoin(
        url_to_scrape, relative_link_to_inmate_details)

    # We're done getting the link to the inmate details. Let's add it
    # to our list of inmates for later use.
    inmates_links.append(absolute_link_to_inmate_details)
# Down below we'll add our inmates' details to this list. For now,
# it's just a placeholder.
inmates = []

# Loop through the list of inmate links we built.
# Since the inmate list is several hundred links in total,
# we slice just a few off for testing. Here, we take the first ten.
for inmate_link in inmates_links[:10]:
    # Once again we'll use requests to get the HTML of our link
    # and use BeautifulSoup to process it. The timeout prevents an
    # unresponsive server from hanging the script, and raise_for_status()
    # reports an HTTP failure directly instead of letting it surface later
    # as a confusing IndexError while we dig through an error page.
    r = requests.get(inmate_link, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # We'll put the details we want to hang on to in this dictionary
    inmate_details = {}

    # Get all of our table rows in the inmateProfile table
    inmate_profile_rows = soup.select("#inmateProfile tr")

    # Inmate age
    # From looking at the HTML source (using View Source in our browser)
    # we see that age is in the first row and the first table cell (td).
    # We use the strip function to clean up unwanted spaces.
    inmate_details['age'] = inmate_profile_rows[0].find_all('td')[0].text.strip()

    # Inmate race
    # Race and sex are in our same inmateProfile table, we just find
    # the correct row
    inmate_details['race'] = inmate_profile_rows[3].find_all('td')[0].text.strip()

    # Inmate sex
    inmate_details['sex'] = inmate_profile_rows[4].find_all('td')[0].text.strip()

    # Get all of our table rows in the inmateNameDate table
    inmate_name_date_rows = soup.select("#inmateNameDate tr")

    # Inmate name
    inmate_details['name'] = inmate_name_date_rows[1].find_all('td')[0].text.strip()

    # Inmate booking time
    inmate_details['booked_at'] = inmate_name_date_rows[2].find_all('td')[0].text.strip()

    # Get the element with the inmateAddress id; the city is taken from the
    # third newline-separated piece of its text.
    inmate_address_container = soup.select("#inmateAddress")
    inmate_details['city'] = inmate_address_container[0].text.split('\n')[2].strip()

    # Now that we have all of the inmate details extracted and placed in a
    # dictionary, let's append that dictionary to our list
    inmates.append(inmate_details)

    # We don't want to overwhelm the Polk County site. Let's pause for one
    # second between each inmate request.
    time.sleep(1)
# We now have details (in our dictionary) for each inmate. Let's print those out.
# Each print() call gets exactly one argument, so the output is the same
# whether the script runs under Python 2 or Python 3.
for inmate in inmates:
    print('{0}, {1}'.format(inmate['name'], inmate['age']))
    print('{0} {1} from {2}'.format(inmate['race'], inmate['sex'], inmate['city']))
    print('Booked at {0}'.format(inmate['booked_at']))
    # Blank line between inmates
    print('')
# We might want to do more than just print out our inmates though. Maybe
# we want to count how many inmates come from each city and print that out.
inmate_cities = {}
for inmate in inmates:
    city = inmate['city']
    # get() returns 0 for a city we haven't seen yet, so the first
    # sighting stores 1 and every later one adds 1.
    inmate_cities[city] = inmate_cities.get(city, 0) + 1

# Single-argument print() works under both Python 2 and Python 3.
print(inmate_cities)
# Or, count how many inmates there are of each race
inmate_races = {}
for inmate in inmates:
    race = inmate['race']
    # get() returns 0 for a race we haven't seen yet, so the first
    # sighting stores 1 and every later one adds 1.
    inmate_races[race] = inmate_races.get(race, 0) + 1

# Single-argument print() works under both Python 2 and Python 3.
print(inmate_races)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment