@willettk
Created January 9, 2018 22:50
Dear Prudence - The Best Names
Bethilda
Brunhilda
Chegory
Chelizabeth
Drusilla
Elizanthonius
Esteñabeth
Festerling
Flimshaw
Grasputin
Grayabeth
Grelliot
Grenevieve
Grennifer
Hanktimony
Harkonnen
Irulan
Jakery
Jellicle
Karabeth
Margareth the Unbloodied
Meowcutio
Michaelwards
Pallison
Quadrophenia
Rachem
Ralphonse
Rhadamanthus
Rourthenay
Scorinthians
Skeremy
Slartha
Spliffany
Stormbreath
Thector
Triticale
------
Bartleby the Scrivener
Enfield Tennis Academy
Fragonard's The Swing
Darmok and Jalad
from bs4 import BeautifulSoup as bs
import requests
import string


def get_names(urls):

    # Load the system word list so that ordinary English words can be excluded
    with open('/usr/share/dict/web2', 'r') as f:
        words = set(w.strip() for w in f)

    masternames = set()
    for url in urls:

        # Scrape the column text from the web page
        r = requests.get(url)
        soup = bs(r.text, 'lxml')
        ptext = ''

        # Format from Dec 2017 onward; a trailing space keeps words from
        # merging across paragraph boundaries
        for par in soup.find_all('p', {'class': ['slate-paragraph', ]}):
            ptext += par.text.strip() + ' '

        # Format up to Dec 2017
        for div in soup.find_all('div', {'class': ['parbase', ]}):
            p = div.find('p')
            if p is not None:
                ptext += p.text.strip() + ' '

        # Keep letters, spaces, and hyphens; turn quote characters into
        # spaces so possessives split cleanly; drop everything else
        text = ''
        for char in ptext:
            if char in string.ascii_letters or char in ' -':
                text += char
            elif char in ('’', "'", '"'):
                text += ' '

        # Select only capitalized words longer than three letters
        potential_names = set()
        for word in text.split():
            if word[0] in string.ascii_uppercase and len(word) > 3:
                potential_names.add(word)

        # A candidate survives if no case variant of it is in the word list,
        # it is purely alphabetic, and it has at most one capital letter
        for name in potential_names:
            ncap = sum(1 for c in name if c in string.ascii_uppercase)
            non_letters = len(set(name) - set(string.ascii_letters))
            if (name not in words and name.lower() not in words
                    and name.upper() not in words
                    and non_letters == 0 and ncap <= 1):
                masternames.add(name)

    # Print results
    for s in sorted(masternames):
        print(s)
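# A quick illustration of a simplified version of the filter above on a
# made-up sentence (the sentence and the tiny word list are hypothetical,
# for illustration only):
#
#   >>> demo_words = {'dear', 'prudence', 'cousin', 'hates', 'apples'}
#   >>> for w in 'Dear Prudence my cousin Grelliot hates Apples and NASA'.split():
#   ...     ncap = sum(1 for c in w if c in string.ascii_uppercase)
#   ...     if (w[0] in string.ascii_uppercase and len(w) > 3
#   ...             and w.lower() not in demo_words and ncap <= 1):
#   ...         print(w)
#   Grelliot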
def get_urls(n=10):
    # Collect column links from the archived Dear Prudence index pages.
    # Not sure how far back to go; up to at least n=10 is still Mallory
    # Ortberg writing in the same format.
    urls = []
    for i in range(n):
        url = 'http://www.slate.com/articles/life/dear_prudence.{}.html'.format(i + 1)
        r = requests.get(url)
        soup = bs(r.text, 'lxml')
        for link in soup.find_all('a', {'class': 'primary'}):
            urls.append(link['href'])
    print("At least {} Dear Prudence columns found.".format(len(urls)))
    return urls
if __name__ == "__main__":
url_list = get_urls()
get_names(url_list)
# Some manual curation still required after the last step; cuts down candidates by a factor of several.
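One way that manual curation pass might look, as a minimal sketch: assume the script's printed candidates have been redirected into a file (the filename candidates.txt and the interactive keep/discard loop are both hypothetical, not part of the original script).

def curate(path='candidates.txt'):
    # Walk through the candidates one at a time, keeping only the names
    # confirmed by hand; everything else is discarded
    keepers = []
    with open(path) as f:
        for line in f:
            name = line.strip()
            if not name:
                continue
            if input('{} -- keep? [y/N] '.format(name)).lower().startswith('y'):
                keepers.append(name)
    return sorted(keepers)

Running the main script with its output redirected to candidates.txt and then calling curate() would yield a hand-picked list like the one at the top of this gist.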