Last active
May 22, 2016 01:05
-
-
Save DLu/c686ae69f2446634bed03cfb6c42b725 to your computer and use it in GitHub Desktop.
XKCD 1412 - Wikipedia Article Title Stress Matcher
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import argparse | |
import sys | |
import nltk | |
import progressbar | |
# Replicating https://xkcd.com/1412/ | |
# Special thanks to | |
# http://stackoverflow.com/questions/19015590/discovering-poetic-form-with-nltk-and-cmu-dict | |
# Module-level pronunciation table loaded once from NLTK's CMU Pronouncing
# Dictionary corpus. It is looked up by lower-cased word below, and each entry
# is iterated as a collection of alternative pronunciations for that word.
# NOTE(review): requires the `cmudict` corpus to be installed for NLTK -- confirm
# the setup instructions mention `nltk.download('cmudict')`.
pronunciations = nltk.corpus.cmudict.dict() | |
def pronunciation_to_stress(pronunciation):
    """Return the stress pattern encoded in a single pronunciation.

    `pronunciation` is an iterable of phoneme strings (for example
    ``['T', 'IY1', 'N', 'EY2', 'JH']``).  Every digit character found in
    those phonemes is kept, in order, and the digits are returned as one
    plain string (``'12'`` for the example above).
    """
    stress_digits = [
        char
        for phoneme in pronunciation
        for char in phoneme
        if char.isdigit()
    ]
    return str(''.join(stress_digits))
def parse_stress(phrase, loose=False):
    """List every stress pattern the phrase could be read with.

    The phrase is tokenized with NLTK and lower-cased, and each token is
    looked up in the module-level `pronunciations` table.  A pattern is a
    string of digits, one per syllable: 1 = primary stress, 2 = secondary
    stress, 0 = no stress.

    When `loose` is true, each pronunciation additionally contributes two
    variants: one with every secondary stress turned into 0 and one with
    every secondary stress turned into 1.

    Returns a list with one entry per combination of per-word patterns, or
    None as soon as a token is not present in `pronunciations`.
    """
    candidates = ['']
    tokens = [token.lower() for token in nltk.word_tokenize(phrase)]
    for token in tokens:
        if token not in pronunciations:
            return None

        # Distinct stress patterns across all known pronunciations of the word.
        word_patterns = set()
        for phonemes in pronunciations[token]:
            pattern = pronunciation_to_stress(phonemes)
            word_patterns.add(pattern)
            if loose:
                word_patterns.update(
                    (pattern.replace('2', '0'), pattern.replace('2', '1'))
                )

        # Extend every phrase-so-far with every pattern of the current word.
        candidates = [
            prefix + suffix
            for suffix in word_patterns
            for prefix in candidates
        ]
    return candidates
def translate_wiki(s):
    """Convert a raw Wikipedia page title into plain words.

    Surrounding whitespace is stripped, one pair of enclosing double quotes
    is removed if present, underscores become spaces and parentheses are
    dropped, e.g. ``'Mercury_(planet)'`` -> ``'Mercury planet'``.

    An empty or whitespace-only title yields the empty string.
    """
    s = s.strip()
    # Slices instead of s[0] / s[-1]: indexing raises IndexError on an empty
    # title, whereas an empty slice simply fails the comparison.
    if s[:1] == '"' and s[-1:] == '"':
        s = s[1:-1]
    return s.replace('_', ' ').replace('(', '').replace(')', '')
if __name__ == '__main__':
    # Command-line entry point: find Wikipedia article titles whose stress
    # pattern matches that of the given key phrase, then print them either
    # alphabetically or ordered by page views.
    parser = argparse.ArgumentParser()
    parser.add_argument('key', help='Text to match. Example "Teenage Mutant Ninja Turtles"')
    parser.add_argument('page_titles_file', help='Extracted version of https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz')
    parser.add_argument('page_counts_file', nargs='?', help='Extracted version of one log file from https://dumps.wikimedia.org/other/pagecounts-raw/')
    parser.add_argument('-l', '--loose', action='store_true', help='Allow secondary stress to be considered stressed or unstressed syllable')
    parser.add_argument('-p', '--progress', action='store_true', help='print a progress bar')
    args = parser.parse_args()

    pattern = parse_stress(args.key, args.loose)
    if pattern is None:
        # Without a key pattern nothing can ever match; report it up front
        # instead of scanning the whole title list for no result.
        parser.error('cannot determine the stress pattern of %r: it contains '
                     'a word that is not in the pronunciation dictionary'
                     % args.key)
    # Built once; every title is intersected against this set.
    pattern = set(pattern)

    with open(args.page_titles_file) as titles_file:
        titles = titles_file.readlines()
    N = len(titles)

    pbar = None
    if args.progress:
        pbar = progressbar.ProgressBar(maxval=N)
        pbar.start()

    # Find matching titles: raw title (as it appears in the dump) -> readable title
    matches = {}
    for i, word in enumerate(titles):
        word = word.strip()
        if pbar is not None:
            pbar.update(i)
        if not word:
            # Blank line in the dump; there is no title to analyse.
            continue
        s = translate_wiki(word)
        try:
            candidates = parse_stress(s, args.loose)
        except UnicodeError:
            # Best effort: titles whose text cannot be tokenized because of
            # encoding problems are skipped rather than aborting the scan.
            continue
        if candidates is None:
            # At least one word of the title is unknown to the dictionary.
            continue
        if pattern.intersection(candidates):
            matches[word] = s

    if pbar is not None:
        pbar.finish()

    # Print titles
    if args.page_counts_file:
        # sort by page counts
        indexed = []
        with open(args.page_counts_file) as counts_file:
            for line in counts_file:
                fields = line.split()
                if len(fields) != 4:
                    # Expected "project name size views"; skip malformed lines.
                    continue
                name, views = fields[1], fields[3]
                if name not in matches:
                    continue
                try:
                    indexed.append((int(views), matches[name]))
                except ValueError:
                    # Non-numeric view count; ignore the line.
                    continue
        for views, title in sorted(indexed):
            print("%-25s\t%10s" % (title, "{:,}".format(views)))
    else:
        print('\n'.join(sorted(matches.values())))
        # NOTE(review): the match count is printed only for the plain listing;
        # confirm it is not also wanted after the page-count table.
        print(len(matches))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment