Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@dannguyen
Last active August 10, 2020 18:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save dannguyen/b35713b4751272fc6346dc312f3cf470 to your computer and use it in GitHub Desktop.
Save dannguyen/b35713b4751272fc6346dc312f3cf470 to your computer and use it in GitHub Desktop.
Scraping and parsing the last words of Texas executed inmates for religious words; an exercise in web scraping and regexes
"""
Filter Texas executed inmates by whether any of their last words fit in a
list of words commonly associated with religion.
A quick demonstration of the overall patterns in web scraping, including
using an HTML parser to navigate the DOM and using regexes to match
hand-entered values. Does none of the file caching/management that you should
be doing for such a task.
"""
from bs4 import BeautifulSoup
from colorama import Fore, Back, Style
from urllib.parse import urljoin
import re
import requests
# Words/patterns commonly associated with religion. Entries may contain
# regex syntax (optional suffixes, character classes), so raw strings are
# used for any entry with a backslash.
# BUG FIX: the original list was missing a comma after 'bless\w*', so
# Python's implicit string-literal concatenation silently fused it with
# 'heaven' into the single pattern 'bless\w*heaven' — meaning neither
# "blessed" nor "heaven" on its own could ever match.
RELIGION_WORDS = [
    'pray', 'holy spirit', 'God', 'Lord', r'Christ(?:ian\w*|mas)?', 'Islam',
    r'bless\w*', 'heaven', 'creator', 'Allah', 'M[uo]hammed', 'Jesus',
    'Bible', 'Scriptures', 'Koran', 'shepherd',
]
# Case-insensitive alternation over all words; the capture group lets
# re.sub wrap each match (used for the colorized output in __main__).
RELIGION_REGEX = re.compile('(%s)' % '|'.join(RELIGION_WORDS), re.IGNORECASE)
# Note, the original URL is
# https://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html
# but Github mirror is faster
SOURCE_URL = 'http://wgetsnaps.github.io/tdcj-executed-offenders/death_row/dr_executed_offenders.html'
def extract_offender_name(soup):
    """Return the offender's name from a parsed last-words inmate page.

    `soup` is a BeautifulSoup document for one inmate's last-statement
    page. A regex is used to locate the label because the pages have
    inconsistent whitespace around 'Offender:'.
    """
    label_tag = soup.find('p', text=re.compile('Offender:'))
    return label_tag.find_next_sibling('p').text
def extract_last_words(soup):
    """Return all last-statement paragraphs joined by newlines.

    `soup` is a parsed last-words inmate page. The label is searched for
    in both <p> and <span> tags because of this strange page:
    https://www.tdcj.state.tx.us/death_row/dr_info/garciagustavolast.html
    """
    label_tag = soup.find('p', text=re.compile('Last Statement:'))
    if not label_tag:
        # fall back to the <span> variant; its parent plays the same role
        label_tag = soup.find('span', text=re.compile('Last Statement:')).parent
    paragraphs = label_tag.find_next_siblings('p')
    return '\n'.join(p.text for p in paragraphs)
if __name__ == '__main__':
    # Bucket inmate names by whether their last words matched the regex.
    tally = {'religious': [], 'nonreligious': []}
    index_soup = BeautifulSoup(requests.get(SOURCE_URL).text, 'lxml')
    # ANSI-colored replacement template used to highlight each regex match
    highlight = Fore.WHITE + Style.BRIGHT + Back.BLUE + r'\1' + Style.RESET_ALL
    # gather the last statement links from the main index page
    for link in index_soup.find_all('a', text='Last Statement'):
        if 'no_last_statement.html' in link['href']:
            continue  # ignore inmates with no recorded statement
        statement_url = urljoin(SOURCE_URL, link['href'])
        print('-------------------')
        print(Fore.WHITE + Style.BRIGHT + Back.BLACK + statement_url + Style.RESET_ALL)
        page_soup = BeautifulSoup(requests.get(statement_url).text, 'lxml')
        name = extract_offender_name(page_soup)
        words = extract_last_words(page_soup)
        # print colorized version
        print("Final words from:",
              Fore.RED + Style.BRIGHT + Back.YELLOW + name + Style.RESET_ALL, '\n')
        print(RELIGION_REGEX.sub(highlight, words))
        # judge by count of word appearances
        bucket = 'religious' if RELIGION_REGEX.findall(words) else 'nonreligious'
        tally[bucket].append(name)
        # obviously could do more here than just append names to the collection
    print("==============================================================")
    print('Religious', len(tally['religious']))
    print('Non-Religious', len(tally['nonreligious']))
@dannguyen
Copy link
Author

Current tally:

  • Religious 209
  • Non-Religious 217

Sample screenshot of stdout:

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment