Last active
August 10, 2020 18:56
-
-
Save dannguyen/b35713b4751272fc6346dc312f3cf470 to your computer and use it in GitHub Desktop.
Scraping and parsing the last words of Texas executed inmates for religious words; an exercise in web scraping and regexes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Filter Texas executed inmates by whether any of their last words fit in a | |
list of words commonly associated with religion. | |
A quick demonstration of the overall patterns in web-scraping, including | |
using a HTML parser to navigate the DOM and the use of Regex for | |
hand-entered values. Does none of the file-caching/management that you should | |
be doing for such a task | |
""" | |
from bs4 import BeautifulSoup | |
from colorama import Fore, Back, Style | |
from urllib.parse import urljoin | |
import re | |
import requests | |
# Patterns (matched case-insensitively) for words commonly associated with
# religion. Raw strings are used so regex escapes like \w are not treated as
# invalid string escapes (a SyntaxWarning on modern Python).
RELIGION_WORDS = [
    'pray', 'holy spirit', 'God', 'Lord', r'Christ(?:ian\w*|mas)?', 'Islam',
    # BUG FIX: the original list was missing the comma after this entry, so
    # Python's implicit string concatenation fused it with 'heaven' into the
    # single pattern 'bless\w*heaven' — a bare "heaven" never matched.
    r'bless\w*',
    'heaven', 'creator', 'Allah', r'M[uo]hammed', 'Jesus', 'Bible',
    'Scriptures', 'Koran', 'shepherd',
]
# One capturing alternation group so highlighted output can backreference \1.
RELIGION_REGEX = re.compile('(%s)' % '|'.join(RELIGION_WORDS), re.IGNORECASE)
# Note, the original URL is
# https://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html
# but Github mirror is faster
SOURCE_URL = 'http://wgetsnaps.github.io/tdcj-executed-offenders/death_row/dr_executed_offenders.html'
def extract_offender_name(soup):
    """Return the offender's name from a parsed last-words inmate page.

    `soup` is a BeautifulSoup document for one inmate's last-statement page.
    """
    # A regex (rather than an exact string) is required because the pages
    # have inconsistent whitespace around the "Offender:" label.
    label_tag = soup.find('p', text=re.compile('Offender:'))
    # The name lives in the <p> immediately after the label paragraph.
    return label_tag.find_next_sibling('p').text
def extract_last_words(soup):
    """Return an inmate's last statement as one newline-joined string.

    `soup` is a BeautifulSoup document for one inmate's last-statement page.
    The search spans both <p> and <span> labels because of this strange page:
    https://www.tdcj.state.tx.us/death_row/dr_info/garciagustavolast.html
    """
    label_pattern = re.compile('Last Statement:')
    label_tag = soup.find('p', text=label_pattern)
    if not label_tag:
        # Fallback: the label is a <span>; climb to its containing paragraph.
        label_tag = soup.find('span', text=label_pattern).parent
    # Every paragraph after the label belongs to the statement.
    return '\n'.join(p.text for p in label_tag.find_next_siblings('p'))
if __name__ == '__main__':
    # Bucket inmate names by whether their statement matched a religion word.
    inmates = {'religious': [], 'nonreligious': []}
    indexsoup = BeautifulSoup(requests.get(SOURCE_URL).text, 'lxml')
    # Walk every "Last Statement" link on the main index page.
    for link in indexsoup.find_all('a', text='Last Statement'):
        href = link['href']
        if 'no_last_statement.html' in href:
            continue  # inmate made no statement; nothing to analyze
        xurl = urljoin(SOURCE_URL, href)
        print('-------------------')
        print(Fore.WHITE + Style.BRIGHT + Back.BLACK + xurl + Style.RESET_ALL)
        page = BeautifulSoup(requests.get(xurl).text, 'lxml')
        inmate_name = extract_offender_name(page)
        inmate_words = extract_last_words(page)
        # Print a colorized transcript, highlighting matched religion words.
        print("Final words from:",
              Fore.RED + Style.BRIGHT + Back.YELLOW + inmate_name + Style.RESET_ALL, '\n')
        highlight = Fore.WHITE + Style.BRIGHT + Back.BLUE + r'\1' + Style.RESET_ALL
        print(RELIGION_REGEX.sub(highlight, inmate_words))
        # Judge by presence of any religion-word match.
        bucket = 'religious' if RELIGION_REGEX.findall(inmate_words) else 'nonreligious'
        inmates[bucket].append(inmate_name)
        # obviously could do more here than just append names
        # to the collection
    print("==============================================================")
    print('Religious', len(inmates['religious']))
    print('Non-Religious', len(inmates['nonreligious']))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Current tally:
Sample screenshot of stdout: