Last active
August 10, 2020 18:56
-
-
Save dannguyen/b35713b4751272fc6346dc312f3cf470 to your computer and use it in GitHub Desktop.
Scraping and parsing the last words of Texas executed inmates for religious words; an exercise in web scraping and regexes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Filter Texas executed inmates by whether any of their last words fit in a | |
list of words commonly associated with religion. | |
A quick demonstration of the overall patterns in web-scraping, including | |
using a HTML parser to navigate the DOM and the use of Regex for | |
hand-entered values. Does none of the file-caching/management that you should | |
be doing for such a task | |
""" | |
from bs4 import BeautifulSoup | |
from colorama import Fore, Back, Style | |
from urllib.parse import urljoin | |
import re | |
import requests | |
# Patterns (matched case-insensitively) for words commonly associated with
# religion. Raw strings are used so regex escapes like \w are not treated as
# invalid string escapes (a SyntaxWarning on modern Python).
RELIGION_WORDS = [
    'pray', 'holy spirit', 'God', 'Lord', r'Christ(?:ian\w*|mas)?', 'Islam',
    # BUG FIX: the original list was missing the comma after this entry, so
    # Python's implicit string concatenation fused it with 'heaven' into the
    # single pattern 'bless\w*heaven' — a bare "heaven" never matched.
    r'bless\w*',
    'heaven', 'creator', 'Allah', r'M[uo]hammed', 'Jesus', 'Bible',
    'Scriptures', 'Koran', 'shepherd',
]
# One capturing alternation group so highlighted output can backreference \1.
RELIGION_REGEX = re.compile('(%s)' % '|'.join(RELIGION_WORDS), re.IGNORECASE)
# Note, the original URL is
# https://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html
# but Github mirror is faster
SOURCE_URL = 'http://wgetsnaps.github.io/tdcj-executed-offenders/death_row/dr_executed_offenders.html'
def extract_offender_name(soup):
    """Return the offender's name from a parsed last-words inmate page.

    `soup` is a BeautifulSoup document for one inmate's last-statement page.
    """
    # A regex (rather than an exact string) is required because the pages
    # have inconsistent whitespace around the "Offender:" label.
    label_tag = soup.find('p', text=re.compile('Offender:'))
    # The name lives in the <p> immediately after the label paragraph.
    return label_tag.find_next_sibling('p').text
def extract_last_words(soup):
    """Return an inmate's last statement as one newline-joined string.

    `soup` is a BeautifulSoup document for one inmate's last-statement page.
    The search spans both <p> and <span> labels because of this strange page:
    https://www.tdcj.state.tx.us/death_row/dr_info/garciagustavolast.html
    """
    label_pattern = re.compile('Last Statement:')
    label_tag = soup.find('p', text=label_pattern)
    if not label_tag:
        # Fallback: the label is a <span>; climb to its containing paragraph.
        label_tag = soup.find('span', text=label_pattern).parent
    # Every paragraph after the label belongs to the statement.
    return '\n'.join(p.text for p in label_tag.find_next_siblings('p'))
if __name__ == '__main__':
    # Bucket inmate names by whether their statement matched a religion word.
    inmates = {'religious': [], 'nonreligious': []}
    indexsoup = BeautifulSoup(requests.get(SOURCE_URL).text, 'lxml')
    # Walk every "Last Statement" link on the main index page.
    for link in indexsoup.find_all('a', text='Last Statement'):
        href = link['href']
        if 'no_last_statement.html' in href:
            continue  # inmate made no statement; nothing to analyze
        xurl = urljoin(SOURCE_URL, href)
        print('-------------------')
        print(Fore.WHITE + Style.BRIGHT + Back.BLACK + xurl + Style.RESET_ALL)
        page = BeautifulSoup(requests.get(xurl).text, 'lxml')
        inmate_name = extract_offender_name(page)
        inmate_words = extract_last_words(page)
        # Print a colorized transcript, highlighting matched religion words.
        print("Final words from:",
              Fore.RED + Style.BRIGHT + Back.YELLOW + inmate_name + Style.RESET_ALL, '\n')
        highlight = Fore.WHITE + Style.BRIGHT + Back.BLUE + r'\1' + Style.RESET_ALL
        print(RELIGION_REGEX.sub(highlight, inmate_words))
        # Judge by presence of any religion-word match.
        bucket = 'religious' if RELIGION_REGEX.findall(inmate_words) else 'nonreligious'
        inmates[bucket].append(inmate_name)
        # obviously could do more here than just append names
        # to the collection
    print("==============================================================")
    print('Religious', len(inmates['religious']))
    print('Non-Religious', len(inmates['nonreligious']))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Current tally:
Sample screenshot of stdout: