Last active
December 13, 2015 22:58
-
-
Save jorendorff/4987754 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" adlib.py - Randomly generate Mad-Libs-style fill-in-the-blanks games | |
from paragraphs in the Brown Corpus. | |
To use this without affecting your Python install: | |
mkdir adlib | |
cd adlib | |
wget https://gist.github.com/jorendorff/4987754/raw//adlib.py | |
virtualenv venv | |
. venv/bin/activate | |
pip install nltk | |
python -c 'import nltk; nltk.download("brown")' | |
python adlib.py | |
""" | |
import nltk | |
from nltk.corpus import brown | |
from nltk.probability import FreqDist | |
import random | |
# ---------------------------------------------------------------------------- | |
# Algorithm for selecting k random elements from a weighted list. | |
# Cribbed from: http://stackoverflow.com/a/2149533/94977 | |
class Node: | |
# Each node in the heap has a weight, value, and total weight. | |
# The total weight, self.tw, is self.w plus the weight of any children. | |
__slots__ = ['w', 'v', 'tw'] | |
def __init__(self, w, v, tw): | |
self.w, self.v, self.tw = w, v, tw | |
def rws_heap(items): | |
# h is the heap. It's like a binary tree that lives in an array. | |
# It has a Node for each pair in `items`. h[1] is the root. Each | |
# other Node h[i] has a parent at h[i>>1]. Each node has up to 2 | |
# children, h[i<<1] and h[(i<<1)+1]. To get this nice simple | |
# arithmetic, we have to leave h[0] vacant. | |
h = [None] # leave h[0] vacant | |
for w, v in items: | |
h.append(Node(w, v, w)) | |
for i in range(len(h) - 1, 1, -1): # total up the tws | |
h[i>>1].tw += h[i].tw # add h[i]'s total to its parent | |
return h | |
def rws_heap_pop(h): | |
gas = h[1].tw * random.random() # start with a random amount of gas | |
i = 1 # start driving at the root | |
while gas > h[i].w: # while we have enough gas to get past node i: | |
gas -= h[i].w # drive past node i | |
i <<= 1 # move to first child | |
if gas > h[i].tw: # if we have enough gas: | |
gas -= h[i].tw # drive past first child and descendants | |
i += 1 # move to second child | |
w = h[i].w # out of gas! h[i] is the selected node. | |
v = h[i].v | |
h[i].w = 0 # make sure this node isn't chosen again | |
while i: # fix up total weights | |
h[i].tw -= w | |
i >>= 1 | |
return v | |
def random_weighted_sample_no_replacement(items, n): | |
heap = rws_heap(items) # just make a heap... | |
for i in range(n): | |
yield rws_heap_pop(heap) # and pop n items off it. | |
# ---------------------------------------------------------------------------- | |
# our code | |
def adlib(): | |
# Count how often each word appears in the corpus. | |
freq = FreqDist() | |
for word in brown.words(): | |
freq.inc(word.lower()) | |
# Choose a paragraph randomly from the corpus, | |
# making sure to eliminate short paragraphs. | |
# For fun, we'll focus on science fiction. | |
paras = [] | |
for para in brown.tagged_paras(categories='science_fiction', simplify_tags=True): | |
if len(para) > 4 and sum(len(sentence) for sentence in para) > 40: | |
paras.append(para) | |
para = random.choice(paras) | |
# Explanations of the tags that we want to replace. | |
# (Never replace a preposition or a conjunction, for example; | |
# those are no fun because they are such small categories.) | |
# Simplified tags are a little too simple for our purpose, | |
# because singular and plural nouns get the same tag, N, | |
# but whatever. | |
tag_names = dict(ADJ='adjective', ADV='adverb', N='noun', NUM='number', | |
V='verb', VD='past tense verb', VG='-ing verb') | |
# Now choose some random words to replace. | |
# The output of this step is a set of pairs of ints: | |
# (sentence_index, word_index) | |
# --the coordinates of the chosen words within para. | |
# | |
# It would be easy to choose the words completely at random, | |
# but for our purposes, we wish to choose | |
# | |
# We do this using the "random weighted sample" code from | |
# http://stackoverflow.com/a/2149533/94977 | |
# which requires us to build a list of pairs (weight, data) | |
# where `data` is our coordinate pair. | |
# | |
items = [] | |
total_words = 0 | |
for i, sentence in enumerate(para): | |
for j, (word, tag) in enumerate(sentence): | |
total_words += 1 | |
if word.isalpha() and tag in tag_names: | |
# We weight the word based on its rarity; we want to replace rare words. | |
rarity = 1 / (freq.freq(word.lower()) + 0.0001)**2 | |
items.append((rarity, (i, j))) | |
blanks = min(total_words // 7, len(items)) | |
chosen = set(random_weighted_sample_no_replacement(items, blanks)) | |
# Make a list of output words, replacing the chosen words with None. | |
results = [] | |
blank_labels = [] | |
for i, sentence in enumerate(para): | |
for j, (word, tag) in enumerate(sentence): | |
if (i, j) in chosen: | |
results.append(None) | |
blank_labels.append(tag_names[tag]) | |
else: | |
results.append(word) | |
# Print the results. Ordinarily you would use textwrap for this, but it | |
# doesn't quite apply here due to the blanks; also we need some extra | |
# effort to omit spaces appropriately around certain punctuation marks. | |
width = 72 | |
blank = '_' * 15 | |
line1 = '' | |
line2 = '' | |
seen_blanks = 0 | |
space_before = '' | |
for word in results: | |
label = '' | |
if word is None: | |
word = blank | |
label = blank_labels[seen_blanks] | |
seen_blanks += 1 | |
else: | |
if word in ('.', ',', '?', '!', ';', '--', "'", "''"): | |
space_before = '' | |
if len(line1) + len(space_before) + len(word) > width: | |
print line1 | |
print line2.rstrip() | |
line1 = '' | |
line2 = '' | |
space_before = '' | |
line1 += space_before + word | |
line2 += space_before + label.center(len(word)) | |
if word in ('``', '`', '('): | |
space_before = '' | |
else: | |
space_before = ' ' | |
if line1: | |
print line1 | |
print line2.rstrip() | |
if __name__ == '__main__': | |
adlib() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment