Created January 9, 2018 22:50
Dear Prudence - The Best Names
Bethilda
Brunhilda
Chegory
Chelizabeth
Drusilla
Elizanthonius
Esteñabeth
Festerling
Flimshaw
Grasputin
Grayabeth
Grelliot
Grenevieve
Grennifer
Hanktimony
Harkonnen
Irulan
Jakery
Jellicle
Karabeth
Margareth the Unbloodied
Meowcutio
Michaelwards
Pallison
Quadrophenia
Rachem
Ralphonse
Rhadamanthus
Rourthenay
Scorinthians
Skeremy
Slartha
Spliffany
Stormbreath
Thector
Triticale
------
Bartleby the Scrivener
Enfield Tennis Academy
Fragonard's The Swing
Darmok and Jalad
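The list above was pulled (with some hand-curation) from archived Dear Prudence columns by the script below, which scrapes each column from Slate and keeps capitalized words that don't appear in the system dictionary.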
from bs4 import BeautifulSoup as bs
import requests
import string


def get_names(urls):
    # Load the system word list to filter out ordinary English words
    with open('/usr/share/dict/web2', 'r') as f:
        words = set(w.strip() for w in f)

    masternames = set()
    for url in urls:
        # Scrape the column text
        r = requests.get(url)
        soup = bs(r.text, 'lxml')
        ptext = ''

        # Format from Dec 2017 onward
        for par in soup.find_all('p', {'class': ['slate-paragraph']}):
            ptext += par.text.strip()

        # Format up to Dec 2017
        for div in soup.find_all('div', {'class': ['parbase']}):
            p = div.find('p')
            if p is not None:
                ptext += p.text.strip()

        # Strip out non-letter characters, keeping spaces and hyphens;
        # apostrophes and quotation marks become word breaks
        text = ''
        for char in ptext:
            if char in string.ascii_letters or char in ' -':
                text += char
            elif char in "’'\"":
                text += ' '

        # Candidate names: capitalized words longer than three letters
        potential_names = set()
        for word in text.split():
            if word[0] in string.ascii_uppercase and len(word) > 3:
                potential_names.add(word)

        # Keep a candidate only if no form of it is in the word list,
        # it contains letters only, and it has at most one capital
        for name in potential_names:
            ncap = sum(1 for l in name if l in string.ascii_uppercase)
            non_letters = len(set(name) - set(string.ascii_letters))
            if (name not in words and name.lower() not in words
                    and name.upper() not in words
                    and non_letters == 0 and ncap <= 1):
                masternames.add(name)

    # Print results
    for s in sorted(masternames):
        print(s)


def get_urls(n=10):
    # Gather links from the archived Dear Prudence index pages. Not sure how
    # far back the format holds; up to at least page 10 the columns are still
    # by Mallory Ortberg and use the same markup.
    urls = []
    for i in range(n):
        url = 'http://www.slate.com/articles/life/dear_prudence.{}.html'.format(i + 1)
        r = requests.get(url)
        soup = bs(r.text, 'lxml')
        for link in soup.find_all('a', {'class': 'primary'}):
            urls.append(link['href'])
    print("At least {} pages of Dear Prudence columns found.".format(len(urls)))
    return urls


if __name__ == "__main__":
    url_list = get_urls()
    get_names(url_list)
    # Some manual curation is still required after this step; it cuts the
    # candidate list down by a factor of several.
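
The closing comment notes that manual curation is still needed. A minimal sketch of one way to make that pass repeatable, assuming a hand-maintained rejects.txt of false positives (the filename and the curate helper are assumptions, not part of the original script):

def curate(candidates, reject_file='rejects.txt'):
    # Hypothetical helper: drop candidates flagged by hand in an earlier pass.
    # 'rejects.txt' is an assumed file with one rejected name per line.
    with open(reject_file) as f:
        rejects = {line.strip() for line in f}
    return sorted(set(candidates) - rejects)

To plug this in, get_names would need to return masternames instead of printing it, so the curated list could be printed afterwards.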