Skip to content

Instantly share code, notes, and snippets.

@jorendorff
Last active December 13, 2015 22:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jorendorff/4987554 to your computer and use it in GitHub Desktop.
Save jorendorff/4987554 to your computer and use it in GitHub Desktop.
Full script, minus comments, of the mystery program in this blog post: http://pynash.org/2013/02/19/English-playground.html
""" censored.py - Randomly ######## ############## ################## #####
#### ########## ## ### Brown Corpus.
To run this program without affecting your Python install:
mkdir mystery-program
cd mystery-program
wget https://gist.github.com/jorendorff/4987554/raw//censored.py
virtualenv venv
. venv/bin/activate
pip install nltk
python -c 'import nltk; nltk.download("brown")'
python censored.py
"""
from nltk.corpus import brown
from nltk.probability import FreqDist
import random
# ----------------------------------------------------------------------------
# Algorithm for selecting k random elements from a weighted list.
# Cribbed from: http://stackoverflow.com/a/2149533/94977
class Node(object):
    """One entry of the weighted-sampling heap.

    Attributes:
        w  -- this node's own weight
        v  -- the value carried by this node
        tw -- total weight: this node's `w` plus the weight of any children
    """

    # Inherit from `object` explicitly: under Python 2 a class with no bases
    # is an old-style class, and old-style classes ignore __slots__ (every
    # instance would still get a __dict__).  On Python 3 this is a no-op.
    __slots__ = ['w', 'v', 'tw']

    def __init__(self, w, v, tw):
        self.w, self.v, self.tw = w, v, tw
def rws_heap(items):
    """Build the array-backed heap used for weighted sampling.

    `items` is an iterable of (weight, value) pairs.  The result is a list
    holding one Node per pair, laid out like a binary tree stored in an
    array: index 1 is the root, the parent of index i is i >> 1, and the
    children of index i are i << 1 and (i << 1) + 1.  Index 0 is left as
    None so that this index arithmetic stays simple.

    Each node's `tw` ends up as its own weight plus the total weight of
    everything below it.
    """
    heap = [None]  # slot 0 is deliberately unused
    heap.extend(Node(weight, value, weight) for weight, value in items)

    # Visit nodes from the last one back to index 2, so that every child's
    # total is final before it is added to its parent's total.
    for child in reversed(range(2, len(heap))):
        parent = child >> 1
        heap[parent].tw += heap[child].tw
    return heap
def rws_heap_pop(h):
    """Pick one value from heap `h` at random, by weight, and retire its node.

    `h` is a list laid out as a binary tree: h[0] is unused, h[1] is the
    root, and the children of h[i] are h[i << 1] and h[(i << 1) + 1].  Each
    node has a weight `w`, a value `v`, and a total weight `tw` (its own
    weight plus that of its descendants).

    A node is chosen with probability proportional to its weight.  Its
    weight is then set to 0 and `tw` is reduced on the node and all of its
    ancestors, so later calls will not choose it again.

    Returns the chosen node's value.
    Raises IndexError if the heap holds no nodes or has no weight left.
    """
    if len(h) < 2 or h[1].tw <= 0:
        raise IndexError("pop from an empty or exhausted weighted heap")

    gas = h[1].tw * random.random()  # start with a random amount of gas

    i = 1  # start driving at the root
    # The comparisons are >= rather than >: node i owns the half-open range
    # [0, w).  With a strict > a draw of exactly 0 would stop at a node whose
    # weight is already 0, i.e. one that was popped earlier.
    while gas >= h[i].w:       # while we have enough gas to get past node i:
        gas -= h[i].w          # drive past node i
        i <<= 1                # move to first child
        if gas >= h[i].tw:     # if we have enough gas:
            gas -= h[i].tw     # drive past first child and descendants
            i += 1             # move to second child

    w = h[i].w  # out of gas! h[i] is the selected node.
    v = h[i].v

    h[i].w = 0  # make sure this node isn't chosen again
    while i:    # fix up total weights from the node up to the root
        h[i].tw -= w
        i >>= 1
    return v
def random_weighted_sample_no_replacement(items, n):
    """Yield `n` values drawn from `items` by weight, without replacement.

    items -- iterable of (weight, value) pairs
    n     -- how many values to yield; at most the number of pairs

    This is a generator: the heap is built, and `n` is validated, when
    iteration starts rather than when the function is called.

    Raises ValueError (on first iteration) if `n` exceeds the number of
    pairs in `items`, as random.sample() does for an oversized sample.
    """
    heap = rws_heap(items)  # just make a heap...
    available = len(heap) - 1  # heap[0] is a vacant placeholder, not an item
    if n > available:
        raise ValueError(
            "sample size %d is larger than the number of items (%d)"
            % (n, available))
    for _ in range(n):
        yield rws_heap_pop(heap)  # ...and pop n items off it.
# ----------------------------------------------------------------------------
# our code
def _print_worksheet(results, blank_labels, width=72):
    """Print the tokens word-wrapped to `width`, with a label under each blank.

    results      -- the paragraph's tokens in order, with None standing in
                    for each word that was removed
    blank_labels -- one label per None in `results`, in the same order

    Output comes in groups of three lines: the text, the labels centered
    under their blanks, and an empty separator line.
    """
    blank = '_' * 15
    line1 = ''  # text line being assembled
    line2 = ''  # label line, kept column-aligned with line1
    seen_blanks = 0
    space_before = ''
    for word in results:
        label = ''
        if word is None:
            word = blank
            label = blank_labels[seen_blanks]
            seen_blanks += 1
        elif word in ('.', ',', '?', '!', ';', '--', "'", "''"):
            # Closing punctuation attaches to the preceding token.
            space_before = ''

        if len(line1) + len(space_before) + len(word) > width:
            # The token will not fit: flush the current pair of lines.
            # print('') rather than print() so that Python 2, where print is
            # a statement, also emits an empty line instead of "()".
            print(line1)
            print(line2.rstrip())
            print('')
            line1 = ''
            line2 = ''
            space_before = ''

        line1 += space_before + word
        line2 += space_before + label.center(len(word))

        # Opening punctuation attaches to the token that follows it.
        if word in ('``', '`', '('):
            space_before = ''
        else:
            space_before = ' '

    if line1:
        print(line1)
        print(line2.rstrip())
        print('')


def main():
    """Print a random Brown Corpus paragraph with some words blanked out.

    A paragraph from the 'science_fiction' category is chosen at random from
    those with more than 4 sentences and more than 40 tokens.  About one
    token in seven is replaced by a blank, chosen by weighted sampling that
    favors words that are rare in the corpus, and each blank is labelled
    with its part of speech.
    """
    # Case-folded word frequencies over the whole corpus.  Passing the
    # samples to the constructor avoids FreqDist.inc(), which is not
    # available in every NLTK release.
    freq = FreqDist(word.lower() for word in brown.words())

    # NOTE(review): `simplify_tags=True` and the tag names used below look
    # like NLTK 2's simplified tagset; newer NLTK releases appear to select
    # tagsets differently -- confirm against the installed version.
    paras = [
        para
        for para in brown.tagged_paras(categories='science_fiction',
                                       simplify_tags=True)
        if len(para) > 4 and sum(len(sentence) for sentence in para) > 40
    ]
    para = random.choice(paras)

    tag_names = dict(ADJ='adjective', ADV='adverb', N='noun', NUM='number',
                     V='verb', VD='past tense verb', VG='-ing verb')

    # Collect every replaceable word as (weight, (sentence idx, word idx)).
    items = []
    total_words = 0
    for i, sentence in enumerate(para):
        for j, (word, tag) in enumerate(sentence):
            total_words += 1
            if word.isalpha() and tag in tag_names:
                # We weight the word based on its rarity; we want to replace
                # rare words.
                rarity = 1 / (freq.freq(word.lower()) + 0.0001)**2
                items.append((rarity, (i, j)))

    # Blank out roughly one token in seven, but never more than we have
    # candidates for.
    blanks = min(total_words // 7, len(items))
    chosen = set(random_weighted_sample_no_replacement(items, blanks))

    # Flatten the paragraph, putting None where a word was removed and
    # remembering the label for each blank in order.
    results = []
    blank_labels = []
    for i, sentence in enumerate(para):
        for j, (word, tag) in enumerate(sentence):
            if (i, j) in chosen:
                results.append(None)
                blank_labels.append(tag_names[tag])
            else:
                results.append(word)

    _print_worksheet(results, blank_labels, width=72)
# Run the program only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment