DLu/xkcd1412.py

## xkcd1412.py
#!/usr/bin/python
import argparse
import sys
import nltk
import progressbar

# Replicating https://xkcd.com/1412/

# Special thanks to
# http://stackoverflow.com/questions/19015590/discovering-poetic-form-with-nltk-and-cmu-dict

pronunciations = nltk.corpus.cmudict.dict()

def pronunciation_to_stress(pronunciation):
    """Return the digits embedded in a pronunciation, joined into one string.

    `pronunciation` is an iterable of phoneme strings such as
    ['T', 'ER1', 'T', 'AH0', 'L', 'Z'].  Every digit character is kept, in
    order of appearance, and everything else is discarded, so that example
    yields '10'.
    """
    digits = (ch for phoneme in pronunciation for ch in phoneme if ch.isdigit())
    return str(''.join(digits))

def parse_stress(phrase, loose=False):
    """Return every candidate stress pattern for `phrase`.

    The phrase is split with nltk.word_tokenize and each lower-cased token is
    looked up in the module-level `pronunciations` dictionary.  Each listed
    pronunciation is reduced to its stress digits (1=primary, 2=secondary,
    0=no stress) and the per-word digit strings are concatenated in every
    combination across the words of the phrase.

    When `loose` is true, each pronunciation also contributes two variants:
    one with every '2' rewritten to '0' and one with every '2' rewritten
    to '1'.

    Returns a list of digit strings, or None as soon as a token is not found
    in `pronunciations`.
    """
    patterns = ['']
    for token in nltk.word_tokenize(phrase):
        word = token.lower()
        if word not in pronunciations:
            return None

        # Distinct stress strings this word can take.
        word_stresses = set()
        for phonemes in pronunciations[word]:
            stress = pronunciation_to_stress(phonemes)
            word_stresses.add(stress)
            if loose:
                word_stresses.update(
                    (stress.replace('2', '0'), stress.replace('2', '1'))
                )

        # Extend every pattern built so far with every option for this word.
        patterns = [
            prefix + suffix
            for suffix in word_stresses
            for prefix in patterns
        ]

    return patterns

def translate_wiki(s):
    """Turn one raw title-list entry into plain phrase text.

    Surrounding whitespace is stripped, one pair of enclosing double quotes
    is removed if present, underscores become spaces and parentheses are
    dropped.

    A blank or whitespace-only entry yields '' instead of raising IndexError.
    """
    s = s.strip()
    # Slices, unlike s[0] / s[-1], are safe on an empty string.
    if s[:1] == '"' and s[-1:] == '"':
        s = s[1:-1]
    return s.replace('_', ' ').replace('(', '').replace(')', '')

if __name__ == '__main__':
    # Command-line entry point: list the titles in page_titles_file that share
    # a stress pattern with `key`, optionally ordered by the view counts found
    # in page_counts_file, then print the number of matching titles.
    parser = argparse.ArgumentParser()
    parser.add_argument('key', help='Text to match. Example "Teenage Mutant Ninja Turtles"')
    parser.add_argument('page_titles_file', help='Extracted version of https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz')
    parser.add_argument('page_counts_file', nargs='?', help='Extracted version of one log file from https://dumps.wikimedia.org/other/pagecounts-raw/')
    parser.add_argument('-l', '--loose', action='store_true', help='Allow secondary stress to be considered stressed or unstressed syllable')
    parser.add_argument('-p', '--progress', action='store_true', help='print a progress bar')

    args = parser.parse_args()
    pattern = parse_stress(args.key, args.loose)
    if pattern is None:
        # parse_stress returns None for a word it cannot look up.  Without
        # this check every comparison below would fail and be skipped, and
        # the script would report zero matches with no explanation.
        parser.error('no known pronunciation for a word in key: %r' % args.key)
    # Build the lookup set once rather than once per title.
    wanted = set(pattern)

    with open(args.page_titles_file) as titles_file:
        titles = titles_file.readlines()

    pbar = None
    if args.progress:
        pbar = progressbar.ProgressBar(maxval=len(titles))
        pbar.start()

    # Find matching titles: raw title -> readable title.
    matches = {}
    for i, word in enumerate(titles):
        word = word.strip()
        if pbar is not None:
            pbar.update(i)
        if not word:
            # Blank lines carry no title.
            continue

        try:
            s = translate_wiki(word)
            candidates = parse_stress(s, args.loose)
        except Exception:
            # Best effort: one title that cannot be processed must not abort
            # the scan.  Unlike a bare except, Ctrl-C still stops the run.
            continue
        if candidates is not None and wanted.intersection(candidates):
            matches[word] = s
    if pbar is not None:
        pbar.finish()

    # Print titles.  Single-argument print(...) produces the same output
    # under the Python 2 print statement and the Python 3 print function.
    if args.page_counts_file:
        # Sort by page counts, ascending.
        indexed = []
        with open(args.page_counts_file) as counts_file:
            for line in counts_file:
                fields = line.split()
                if len(fields) != 4:
                    # Expected exactly: project, page name, size, views.
                    continue
                name, views = fields[1], fields[3]
                if name not in matches:
                    continue
                try:
                    indexed.append((int(views), matches[name]))
                except ValueError:
                    # Non-numeric view count: skip the malformed line.
                    continue
        for views, title in sorted(indexed):
            print("%-25s\t%10s" % (title, "{:,}".format(views)))
    else:
        print('\n'.join(sorted(matches.values())))
    print(len(matches))
	#!/usr/bin/python
	import argparse
	import sys
	import nltk
	import progressbar

	# Replicating https://xkcd.com/1412/

	# Special thanks to
	# http://stackoverflow.com/questions/19015590/discovering-poetic-form-with-nltk-and-cmu-dict

	pronunciations = nltk.corpus.cmudict.dict()

	def pronunciation_to_stress(pronunciation):
	    """Return the digits embedded in a pronunciation, joined into one string.

	    `pronunciation` is an iterable of phoneme strings such as
	    ['T', 'ER1', 'T', 'AH0', 'L', 'Z'].  Every digit character is kept, in
	    order of appearance, and everything else is discarded, so that example
	    yields '10'.
	    """
	    return str(''.join([c for c in ''.join(pronunciation) if c.isdigit()]))

	def parse_stress(phrase, loose=False):
	    """Return every candidate stress pattern for `phrase`.

	    The phrase is split with nltk.word_tokenize and each lower-cased token
	    is looked up in the module-level `pronunciations` dictionary.  Each
	    listed pronunciation is reduced to its stress digits (1=primary,
	    2=secondary, 0=no stress) and the per-word digit strings are
	    concatenated in every combination across the words of the phrase.

	    When `loose` is true, each pronunciation also contributes two variants:
	    one with every '2' rewritten to '0' and one with every '2' rewritten
	    to '1'.

	    Returns a list of digit strings, or None as soon as a token is not
	    found in `pronunciations`.
	    """
	    alternatives = ['']
	    for word in [words.lower() for words in nltk.word_tokenize(phrase)]:
	        if word not in pronunciations:
	            return None

	        # Distinct stress strings this word can take.
	        stresses = set()
	        for pronunciation in pronunciations[word]:
	            stress = pronunciation_to_stress(pronunciation)
	            stresses.add(stress)
	            if loose:
	                stresses.add(stress.replace('2', '0'))
	                stresses.add(stress.replace('2', '1'))

	        # Extend every pattern built so far with every option for this word.
	        L = []
	        for alt_word in stresses:
	            for alt_phrase in alternatives:
	                L.append(alt_phrase + alt_word)
	        alternatives = L

	    return alternatives

	def translate_wiki(s):
	    """Turn one raw title-list entry into plain phrase text.

	    Surrounding whitespace is stripped, one pair of enclosing double quotes
	    is removed if present, underscores become spaces and parentheses are
	    dropped.

	    A blank or whitespace-only entry yields '' instead of raising
	    IndexError.
	    """
	    s = s.strip()
	    # Slices, unlike s[0] / s[-1], are safe on an empty string.
	    if s[:1] == '"' and s[-1:] == '"':
	        s = s[1:-1]
	    s = s.replace('_', ' ').replace('(', '').replace(')', '')
	    return s

	if __name__ == '__main__':
	    # Command-line entry point: list the titles in page_titles_file that
	    # share a stress pattern with `key`, optionally ordered by the view
	    # counts found in page_counts_file, then print the number of matches.
	    parser = argparse.ArgumentParser()
	    parser.add_argument('key', help='Text to match. Example "Teenage Mutant Ninja Turtles"')
	    parser.add_argument('page_titles_file', help='Extracted version of https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz')
	    parser.add_argument('page_counts_file', nargs='?', help='Extracted version of one log file from https://dumps.wikimedia.org/other/pagecounts-raw/')
	    parser.add_argument('-l', '--loose', action='store_true', help='Allow secondary stress to be considered stressed or unstressed syllable')
	    parser.add_argument('-p', '--progress', action='store_true', help='print a progress bar')

	    args = parser.parse_args()
	    pattern = parse_stress(args.key, args.loose)
	    if pattern is None:
	        # parse_stress returns None for a word it cannot look up.  Without
	        # this check every comparison below would fail and be skipped, and
	        # the script would report zero matches with no explanation.
	        parser.error('no known pronunciation for a word in key: %r' % args.key)
	    # Build the lookup set once rather than once per title.
	    wanted = set(pattern)

	    with open(args.page_titles_file) as titles_file:
	        titles = titles_file.readlines()

	    pbar = None
	    if args.progress:
	        pbar = progressbar.ProgressBar(maxval=len(titles))
	        pbar.start()

	    # Find matching titles: raw title -> readable title.
	    matches = {}
	    for i, word in enumerate(titles):
	        word = word.strip()
	        if pbar is not None:
	            pbar.update(i)
	        if not word:
	            # Blank lines carry no title.
	            continue

	        try:
	            s = translate_wiki(word)
	            candidates = parse_stress(s, args.loose)
	        except Exception:
	            # Best effort: one title that cannot be processed must not
	            # abort the scan.  Unlike a bare except, Ctrl-C still works.
	            continue
	        if candidates is not None and wanted.intersection(candidates):
	            matches[word] = s
	    if pbar is not None:
	        pbar.finish()

	    # Print titles.  Single-argument print(...) produces the same output
	    # under the Python 2 print statement and the Python 3 print function.
	    if args.page_counts_file:
	        # Sort by page counts, ascending.
	        indexed = []
	        with open(args.page_counts_file) as counts_file:
	            for line in counts_file:
	                fields = line.split()
	                if len(fields) != 4:
	                    # Expected exactly: project, page name, size, views.
	                    continue
	                name, views = fields[1], fields[3]
	                if name not in matches:
	                    continue
	                try:
	                    indexed.append((int(views), matches[name]))
	                except ValueError:
	                    # Non-numeric view count: skip the malformed line.
	                    continue
	        for views, title in sorted(indexed):
	            print("%-25s\t%10s" % (title, "{:,}".format(views)))
	    else:
	        print('\n'.join(sorted(matches.values())))
	    print(len(matches))