Scraping and parsing the last words of Texas executed inmates for religious words; an exercise in web scraping and regexes
"""
Filter Texas executed inmates by whether any of their last words fit in a
list of words commonly associated with religion.
A quick demonstration of the overall patterns in web-scraping, including
using a HTML parser to navigate the DOM and the use of Regex for
hand-entered values. Does none of the file-caching/management that you should
be doing for such a task
"""
from bs4 import BeautifulSoup
from colorama import Fore, Back, Style
from urllib.parse import urljoin
import re
import requests
RELIGION_WORDS = ['pray', 'holy spirit', 'God', 'Lord', r'Christ(?:ian\w*|mas)?',
                  'Islam', r'bless\w*', 'heaven', 'creator', 'Allah', 'M[uo]hammed',
                  'Jesus', 'Bible', 'Scriptures', 'Koran', 'shepherd']
RELIGION_REGEX = re.compile('(%s)' % '|'.join(RELIGION_WORDS), re.IGNORECASE)
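# Illustration (mine, not part of the original gist): the compiled pattern is
# case-insensitive and unanchored, so for example
#   RELIGION_REGEX.findall('Praise the Lord, God bless you')
# returns ['Lord', 'God', 'bless'], while a statement containing none of the
# words yields an empty list. Note there are no \b word boundaries, so 'God'
# would also match inside a longer word.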
# Note, the original URL is
# https://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html
# but Github mirror is faster
SOURCE_URL = 'http://wgetsnaps.github.io/tdcj-executed-offenders/death_row/dr_executed_offenders.html'
def extract_offender_name(soup):
    """`soup` is a parsed last-words inmate page
    returns: name of offender
    """
    otag = soup.find('p', text=re.compile('Offender:'))  # regex is required because of inconsistent spaces
    offender_name = otag.find_next_sibling('p').text
    return offender_name


def extract_last_words(soup):
    """`soup` is a parsed last-words inmate page
    returns: concatenated string of last-word paragraphs
    Needs to search across both p and span because of this strange page:
    https://www.tdcj.state.tx.us/death_row/dr_info/garciagustavolast.html
    """
    otag = soup.find('p', text=re.compile('Last Statement:'))
    if not otag:
        otag = soup.find('span', text=re.compile('Last Statement:')).parent
    ptags = otag.find_next_siblings('p')
    return '\n'.join([p.text for p in ptags])
if __name__ == '__main__':
    inmates = {'religious': [], 'nonreligious': []}
    indexsoup = BeautifulSoup(requests.get(SOURCE_URL).text, 'lxml')
    # gather the last-statement links from the main index page
    for atag in indexsoup.find_all('a', text='Last Statement'):
        if 'no_last_statement.html' not in atag['href']:  # ignore non-statements
            xurl = urljoin(SOURCE_URL, atag['href'])
            print('-------------------')
            print(Fore.WHITE + Style.BRIGHT + Back.BLACK + xurl + Style.RESET_ALL)
            xsoup = BeautifulSoup(requests.get(xurl).text, 'lxml')
            inmate_name = extract_offender_name(xsoup)
            inmate_words = extract_last_words(xsoup)
            # print colorized version
            print("Final words from:",
                  Fore.RED + Style.BRIGHT + Back.YELLOW + inmate_name + Style.RESET_ALL, '\n')
            repl = Fore.WHITE + Style.BRIGHT + Back.BLUE + r'\1' + Style.RESET_ALL
            print(RELIGION_REGEX.sub(repl, inmate_words))
            # judge by count of word appearances
            r_words = RELIGION_REGEX.findall(inmate_words)
            if r_words:
                inmates['religious'].append(inmate_name)
            else:
                inmates['nonreligious'].append(inmate_name)
            # obviously could do more here than just append names
            # to the collection
    print("==============================================================")
    print('Religious', len(inmates['religious']))
    print('Non-Religious', len(inmates['nonreligious']))
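
The docstring waves off file caching, so here is a minimal sketch of what that could look like. The helper name (`fetch_cached`) and the local `cache/` directory are my own choices, not part of the original gist; the idea is simply to key each downloaded page by a hash of its URL so that re-running the script doesn't re-download hundreds of statement pages.

# Hypothetical caching helper (not part of the original script). Swap
# requests.get(url).text for fetch_cached(url) in the loops above to avoid
# repeat downloads across runs.
from hashlib import sha1
from pathlib import Path

import requests

CACHE_DIR = Path('cache')  # illustrative local cache directory

def fetch_cached(url):
    """Return the body of `url`, downloading it only on a cache miss."""
    CACHE_DIR.mkdir(exist_ok=True)
    cache_path = CACHE_DIR / (sha1(url.encode('utf-8')).hexdigest() + '.html')
    if cache_path.exists():
        return cache_path.read_text(encoding='utf-8')
    text = requests.get(url).text
    cache_path.write_text(text, encoding='utf-8')
    return text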
dannguyen commented Apr 20, 2016

Current tally:

  • Religious 209
  • Non-Religious 217

Sample screenshot of stdout:

[screenshot omitted]
