Skip to content

Instantly share code, notes, and snippets.

@DLu
Last active May 22, 2016 01:05
Show Gist options
  • Save DLu/c686ae69f2446634bed03cfb6c42b725 to your computer and use it in GitHub Desktop.
Save DLu/c686ae69f2446634bed03cfb6c42b725 to your computer and use it in GitHub Desktop.
XKCD 1412 - Wikipedia Article Title Stress Matcher
#!/usr/bin/python
import argparse
import sys
import nltk
import progressbar
# Replicating https://xkcd.com/1412/
# Special thanks to
# http://stackoverflow.com/questions/19015590/discovering-poetic-form-with-nltk-and-cmu-dict
# Pronunciation lookup table, loaded once at import time from NLTK's
# cmudict corpus. parse_stress() looks words up by their lowercased form
# and iterates over the alternative pronunciations stored for each word.
# NOTE(review): presumably requires the cmudict corpus data to be installed
# for NLTK -- confirm before running on a fresh machine.
pronunciations = nltk.corpus.cmudict.dict()
def pronunciation_to_stress(pronunciation):
    """Return the stress pattern of a single pronunciation.

    pronunciation -- an iterable of phoneme strings (e.g. one entry of the
    CMU dictionary such as ['T', 'IY1', 'N', 'EY2', 'JH']).

    Every digit character found in the phonemes is kept, in order, and all
    other characters are dropped, so the example above yields '12'.
    A pronunciation without any digits yields the empty string.
    """
    # Single pass over the characters of every phoneme; no intermediate
    # joined string or temporary list is needed.
    return str(''.join(c for phoneme in pronunciation
                       for c in phoneme if c.isdigit()))
def parse_stress(phrase, loose=False):
    """Return every stress pattern the phrase can be pronounced with.

    phrase -- text to analyse; it is tokenized with nltk.word_tokenize and
              each token is lowercased before being looked up.
    loose  -- when true, a secondary stress ('2') may additionally be read
              as unstressed ('0') or as primary stress ('1'). The
              replacement is applied to a whole word at once, so the 2s of
              one word are never mixed between 0 and 1.

    Stress digits: 1 = primary, 2 = secondary, 0 = no stress.

    Returns a sorted list of unique digit strings, one per possible
    combination of the words' pronunciations, or None as soon as a token
    has no entry in the pronunciation table.
    """
    # A set keeps the alternatives unique: different splits can concatenate
    # to the same string ('1' + '00' and '10' + '0'), and such duplicates
    # would otherwise be multiplied again by every following word.
    alternatives = set([''])
    for word in [token.lower() for token in nltk.word_tokenize(phrase)]:
        if word not in pronunciations:
            return None

        # Collect the distinct stress strings of this word.
        stresses = set()
        for pronunciation in pronunciations[word]:
            stress = pronunciation_to_stress(pronunciation)
            stresses.add(stress)
            if loose:
                stresses.add(stress.replace('2', '0'))
                stresses.add(stress.replace('2', '1'))

        # Extend every pattern found so far with every pattern of this word.
        alternatives = set(prefix + suffix
                           for prefix in alternatives
                           for suffix in stresses)
    return sorted(alternatives)
def translate_wiki(s):
    """Turn a raw page-title line into plain text.

    s -- one title as read from the titles file.

    Surrounding whitespace is stripped, one pair of enclosing double quotes
    is removed, underscores become spaces and parentheses are dropped, e.g.
    'Mercury_(planet)' -> 'Mercury planet'.

    Empty or whitespace-only input returns '' instead of raising IndexError.
    """
    s = s.strip()
    # startswith/endswith are safe on the empty string, unlike s[0] / s[-1].
    if s.startswith('"') and s.endswith('"'):
        s = s[1:-1]
    s = s.replace('_', ' ').replace('(', '').replace(')', '')
    return s
if __name__ == '__main__':
    # Command-line entry point: find every page title whose stress pattern
    # matches the one of the given key, then print the matches either
    # alphabetically or ordered by page views.
    parser = argparse.ArgumentParser()
    parser.add_argument('key', help='Text to match. Example "Teenage Mutant Ninja Turtles"')
    parser.add_argument('page_titles_file', help='Extracted version of https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz')
    parser.add_argument('page_counts_file', nargs='?', help='Extracted version of one log file from https://dumps.wikimedia.org/other/pagecounts-raw/')
    parser.add_argument('-l', '--loose', action='store_true', help='Allow secondary stress to be considered stressed or unstressed syllable')
    parser.add_argument('-p', '--progress', action='store_true', help='print a progress bar')
    args = parser.parse_args()

    pattern = parse_stress(args.key, args.loose)
    if pattern is None:
        # Without a pattern nothing can ever match; report it instead of
        # silently scanning the whole file and printing no results.
        parser.error('cannot determine the stress pattern of %r: '
                     'it contains a word with no known pronunciation' % args.key)
    # Built once here rather than once per title inside the loop.
    pattern = set(pattern)

    # All lines are read up front because the progress bar needs the total.
    with open(args.page_titles_file) as titles_file:
        titles = titles_file.readlines()

    pbar = None
    if args.progress:
        pbar = progressbar.ProgressBar(maxval=len(titles))
        pbar.start()

    # Find matching titles: raw title -> cleaned-up title.
    matches = {}
    for i, line in enumerate(titles):
        if pbar is not None:
            pbar.update(i)
        word = line.strip()
        if not word:
            # Blank lines carry no title.
            continue
        s = translate_wiki(word)
        try:
            candidates = parse_stress(s, args.loose)
        except Exception:
            # Best effort: a title the tokenizer cannot handle is skipped.
            # Unlike a bare except, this still lets Ctrl-C stop the scan.
            continue
        if candidates is not None and pattern.intersection(candidates):
            matches[word] = s
    if pbar is not None:
        pbar.finish()

    # Print titles
    if args.page_counts_file:
        # Sort by page counts (ascending); each usable line consists of
        # four whitespace-separated fields: project, name, size, views.
        indexed = []
        with open(args.page_counts_file) as counts_file:
            for line in counts_file:
                fields = line.split()
                if len(fields) != 4:
                    continue
                name, views = fields[1], fields[3]
                if name not in matches:
                    continue
                try:
                    indexed.append((int(views), matches[name]))
                except ValueError:
                    # Non-numeric view count: ignore the malformed line.
                    continue
        for views, title in sorted(indexed):
            print("%-25s\t%10s" % (title, "{:,}".format(views)))
    else:
        print('\n'.join(sorted(matches.values())))
        # NOTE(review): the original indentation was lost, so it is assumed
        # that the match count belongs to this alphabetical listing only --
        # confirm against the original script.
        print(len(matches))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment