Create a gist now

Instantly share code, notes, and snippets.

@jorendorff /
Last active Dec 13, 2015

""" - Randomly generate Mad-Libs-style fill-in-the-blanks games
from paragraphs in the Brown Corpus.
To use this without affecting your Python install:
mkdir adlib
cd adlib
virtualenv venv
. venv/bin/activate
pip install nltk
python -c 'import nltk;"brown")'
import nltk
from nltk.corpus import brown
from nltk.probability import FreqDist
import random
# ----------------------------------------------------------------------------
# Algorithm for selecting k random elements from a weighted list.
# Cribbed from:
class Node:
# Each node in the heap has a weight, value, and total weight.
# The total weight,, is self.w plus the weight of any children.
__slots__ = ['w', 'v', 'tw']
def __init__(self, w, v, tw):
self.w, self.v, = w, v, tw
def rws_heap(items):
# h is the heap. It's like a binary tree that lives in an array.
# It has a Node for each pair in `items`. h[1] is the root. Each
# other Node h[i] has a parent at h[i>>1]. Each node has up to 2
# children, h[i<<1] and h[(i<<1)+1]. To get this nice simple
# arithmetic, we have to leave h[0] vacant.
h = [None] # leave h[0] vacant
for w, v in items:
h.append(Node(w, v, w))
for i in range(len(h) - 1, 1, -1): # total up the tws
h[i>>1].tw += h[i].tw # add h[i]'s total to its parent
return h
def rws_heap_pop(h):
gas = h[1].tw * random.random() # start with a random amount of gas
i = 1 # start driving at the root
while gas > h[i].w: # while we have enough gas to get past node i:
gas -= h[i].w # drive past node i
i <<= 1 # move to first child
if gas > h[i].tw: # if we have enough gas:
gas -= h[i].tw # drive past first child and descendants
i += 1 # move to second child
w = h[i].w # out of gas! h[i] is the selected node.
v = h[i].v
h[i].w = 0 # make sure this node isn't chosen again
while i: # fix up total weights
h[i].tw -= w
i >>= 1
return v
def random_weighted_sample_no_replacement(items, n):
heap = rws_heap(items) # just make a heap...
for i in range(n):
yield rws_heap_pop(heap) # and pop n items off it.
# ----------------------------------------------------------------------------
# our code
def adlib():
# Count how often each word appears in the corpus.
freq = FreqDist()
for word in brown.words():
# Choose a paragraph randomly from the corpus,
# making sure to eliminate short paragraphs.
# For fun, we'll focus on science fiction.
paras = []
for para in brown.tagged_paras(categories='science_fiction', simplify_tags=True):
if len(para) > 4 and sum(len(sentence) for sentence in para) > 40:
para = random.choice(paras)
# Explanations of the tags that we want to replace.
# (Never replace a preposition or a conjunction, for example;
# those are no fun because they are such small categories.)
# Simplified tags are a little too simple for our purpose,
# because singular and plural nouns get the same tag, N,
# but whatever.
tag_names = dict(ADJ='adjective', ADV='adverb', N='noun', NUM='number',
V='verb', VD='past tense verb', VG='-ing verb')
# Now choose some random words to replace.
# The output of this step is a set of pairs of ints:
# (sentence_index, word_index)
# --the coordinates of the chosen words within para.
# It would be easy to choose the words completely at random,
# but for our purposes, we wish to choose
# We do this using the "random weighted sample" code from
# which requires us to build a list of pairs (weight, data)
# where `data` is our coordinate pair.
items = []
total_words = 0
for i, sentence in enumerate(para):
for j, (word, tag) in enumerate(sentence):
total_words += 1
if word.isalpha() and tag in tag_names:
# We weight the word based on its rarity; we want to replace rare words.
rarity = 1 / (freq.freq(word.lower()) + 0.0001)**2
items.append((rarity, (i, j)))
blanks = min(total_words // 7, len(items))
chosen = set(random_weighted_sample_no_replacement(items, blanks))
# Make a list of output words, replacing the chosen words with None.
results = []
blank_labels = []
for i, sentence in enumerate(para):
for j, (word, tag) in enumerate(sentence):
if (i, j) in chosen:
# Print the results. Ordinarily you would use textwrap for this, but it
# doesn't quite apply here due to the blanks; also we need some extra
# effort to omit spaces appropriately around certain punctuation marks.
width = 72
blank = '_' * 15
line1 = ''
line2 = ''
seen_blanks = 0
space_before = ''
for word in results:
label = ''
if word is None:
word = blank
label = blank_labels[seen_blanks]
seen_blanks += 1
if word in ('.', ',', '?', '!', ';', '--', "'", "''"):
space_before = ''
if len(line1) + len(space_before) + len(word) > width:
print line1
print line2.rstrip()
line1 = ''
line2 = ''
space_before = ''
line1 += space_before + word
line2 += space_before +
if word in ('``', '`', '('):
space_before = ''
space_before = ' '
if line1:
print line1
print line2.rstrip()
if __name__ == '__main__':
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment