Created January 9, 2018 22:50
Dear Prudence - The Best Names
Bethilda
Brunhilda
Chegory
Chelizabeth
Drusilla
Elizanthonius
Esteñabeth
Festerling
Flimshaw
Grasputin
Grayabeth
Grelliot
Grenevieve
Grennifer
Hanktimony
Harkonnen
Irulan
Jakery
Jellicle
Karabeth
Margareth the Unbloodied
Meowcutio
Michaelwards
Pallison
Quadrophenia
Rachem
Ralphonse
Rhadamanthus
Rourthenay
Scorinthians
Skeremy
Slartha
Spliffany
Stormbreath
Thector
Triticale
------
Bartleby the Scrivener
Enfield Tennis Academy
Fragonard's The Swing
Darmok and Jalad
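The list above was pulled (with some hand-curation) from archived Dear Prudence columns by the script below, which scrapes each column from Slate and keeps capitalized words that don't appear in the system dictionary.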
from bs4 import BeautifulSoup as bs
import requests
import string


def get_names(urls):
    # Load the system word list to filter out ordinary English words
    with open('/usr/share/dict/web2', 'r') as f:
        words = set(w.strip() for w in f)

    masternames = set()
    for url in urls:
        # Scrape the column text
        r = requests.get(url)
        soup = bs(r.text, 'lxml')
        ptext = ''

        # Format from Dec 2017 onward
        for par in soup.find_all('p', {'class': ['slate-paragraph']}):
            ptext += par.text.strip()

        # Format up to Dec 2017
        for div in soup.find_all('div', {'class': ['parbase']}):
            p = div.find('p')
            if p is not None:
                ptext += p.text.strip()

        # Strip out non-letter characters, keeping spaces and hyphens;
        # apostrophes and quotation marks become word breaks
        text = ''
        for char in ptext:
            if char in string.ascii_letters or char in ' -':
                text += char
            elif char in "’'\"":
                text += ' '

        # Candidate names: capitalized words longer than three letters
        potential_names = set()
        for word in text.split():
            if word[0] in string.ascii_uppercase and len(word) > 3:
                potential_names.add(word)

        # Keep a candidate only if no form of it is in the word list,
        # it contains letters only, and it has at most one capital
        for name in potential_names:
            ncap = sum(1 for l in name if l in string.ascii_uppercase)
            non_letters = len(set(name) - set(string.ascii_letters))
            if (name not in words and name.lower() not in words
                    and name.upper() not in words
                    and non_letters == 0 and ncap <= 1):
                masternames.add(name)

    # Print results
    for s in sorted(masternames):
        print(s)


def get_urls(n=10):
    # Gather links from the archived Dear Prudence index pages. Not sure how
    # far back the format holds; up to at least page 10 the columns are still
    # by Mallory Ortberg and use the same markup.
    urls = []
    for i in range(n):
        url = 'http://www.slate.com/articles/life/dear_prudence.{}.html'.format(i + 1)
        r = requests.get(url)
        soup = bs(r.text, 'lxml')
        for link in soup.find_all('a', {'class': 'primary'}):
            urls.append(link['href'])
    print("At least {} pages of Dear Prudence columns found.".format(len(urls)))
    return urls


if __name__ == "__main__":
    url_list = get_urls()
    get_names(url_list)
    # Some manual curation is still required after this step; it cuts the
    # candidate list down by a factor of several.
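
The closing comment notes that manual curation is still needed. A minimal sketch of one way to make that pass repeatable, assuming a hand-maintained rejects.txt of false positives (the filename and the curate helper are assumptions, not part of the original script):

def curate(candidates, reject_file='rejects.txt'):
    # Hypothetical helper: drop candidates flagged by hand in an earlier pass.
    # 'rejects.txt' is an assumed file with one rejected name per line.
    with open(reject_file) as f:
        rejects = {line.strip() for line in f}
    return sorted(set(candidates) - rejects)

To plug this in, get_names would need to return masternames instead of printing it, so the curated list could be printed afterwards.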