Last active
December 13, 2015 22:49
-
-
Save jorendorff/4987554 to your computer and use it in GitHub Desktop.
Full script, minus comments, of the mystery program in this blog post: http://pynash.org/2013/02/19/English-playground.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" censored.py - Randomly ######## ############## ################## #####
#### ########## ## ### Brown Corpus.

To run this program without affecting your Python install:

    mkdir mystery-program
    cd mystery-program
    wget https://gist.github.com/jorendorff/4987554/raw//censored.py
    virtualenv venv
    . venv/bin/activate
    pip install nltk
    python -c 'import nltk; nltk.download("brown")'
    python censored.py
"""
import random

from nltk.corpus import brown
from nltk.probability import FreqDist
# ----------------------------------------------------------------------------
# Algorithm for selecting k random elements from a weighted list.
# Cribbed from: http://stackoverflow.com/a/2149533/94977
class Node(object):
    """One entry of the weighted-selection heap built by ``rws_heap``.

    Attributes (declared in ``__slots__``):
        w:  this node's own weight.
        v:  the value carried by this node.
        tw: total weight -- ``w`` plus the weight of any children.
    """

    # Inheriting from ``object`` makes this a new-style class on Python 2,
    # where ``__slots__`` is ignored on classic classes.  On Python 3 the
    # explicit base changes nothing.
    __slots__ = ('w', 'v', 'tw')

    def __init__(self, w, v, tw):
        self.w = w
        self.v = v
        self.tw = tw

    def __repr__(self):
        return 'Node(w=%r, v=%r, tw=%r)' % (self.w, self.v, self.tw)
def rws_heap(items):
    """Build the array-backed weighted heap for ``items``.

    ``items`` is an iterable of ``(weight, value)`` pairs.  The result is a
    list holding one ``Node`` per pair, laid out as a binary tree stored in
    an array: slot 0 is left vacant so that the root sits at index 1, the
    parent of index ``i`` is ``i >> 1`` and its children (when present) are
    ``i << 1`` and ``(i << 1) + 1``.

    Every node starts with ``tw`` equal to its own weight; the totals of
    the children are then folded into their parents.
    """
    heap = [None]  # index 0 stays empty to keep the index arithmetic simple
    heap.extend(Node(weight, value, weight) for weight, value in items)

    # Walk from the last node back to index 2 so that each child's total is
    # already complete when it is added to its parent.
    for child in reversed(range(2, len(heap))):
        heap[child // 2].tw += heap[child].tw
    return heap
def rws_heap_pop(h):
    """Pick one value at random from the weighted heap ``h`` and retire it.

    ``h`` is a list laid out as ``rws_heap`` builds it: ``h[0]`` is unused,
    ``h[1]`` is the root, and the children of ``h[i]`` are ``h[i << 1]`` and
    ``h[(i << 1) + 1]``.  Each node exposes ``w`` (own weight), ``v``
    (value) and ``tw`` (own weight plus the weight of its descendants).
    Weights are expected to be non-negative.

    A node is chosen with probability proportional to its weight.  Its
    weight is then set to zero so it cannot be chosen again, and the totals
    of the node and all its ancestors are refreshed.

    Returns:
        The ``v`` of the selected node.

    Raises:
        IndexError: if ``h`` holds no nodes, or no node with positive
            weight is left (every value has already been popped).
    """
    size = len(h)
    if size < 2 or h[1].tw <= 0:
        raise IndexError('pop from an empty or exhausted weighted heap')

    def subtree_weight(k):
        # Total weight below (and including) index k; 0 for a missing child.
        return h[k].tw if k < size else 0

    gas = h[1].tw * random.random()  # start with a random amount of gas
    i = 1                            # start driving at the root
    while True:
        left = i << 1
        right = left + 1
        left_tw = subtree_weight(left)
        right_tw = subtree_weight(right)

        # Stop here when the gas runs out inside node i.  The test is
        # ``gas < w`` (not ``gas <= w``) so a node whose weight is 0 can
        # never be selected, even when gas is exactly 0.  Also stop when
        # there is nothing selectable below: any gas left over at that
        # point is floating-point rounding, not real weight.
        if gas < h[i].w or (left_tw <= 0 and right_tw <= 0):
            break

        gas -= h[i].w                # drive past node i
        if left_tw > 0 and (gas < left_tw or right_tw <= 0):
            i = left                 # stop somewhere in the first subtree
        else:
            if left_tw > 0:
                gas -= left_tw       # drive past first child and descendants
            i = right                # only ever enter a non-empty subtree

    v = h[i].v                       # out of gas! h[i] is the selected node.
    h[i].w = 0                       # make sure this node isn't chosen again

    # Fix up total weights from the selected node up to the root.  Totals
    # are rebuilt from the current weights rather than decremented, so
    # rounding error cannot accumulate and an emptied subtree is exactly 0.
    while i:
        h[i].tw = h[i].w + subtree_weight(i << 1) + subtree_weight((i << 1) + 1)
        i >>= 1
    return v
def random_weighted_sample_no_replacement(items, n):
    """Yield up to ``n`` distinct values drawn at random, weighted.

    ``items`` is an iterable of ``(weight, value)`` pairs; it is consumed
    once to build the selection heap.  Values are produced lazily, one per
    iteration, each drawn with probability proportional to its weight among
    the values not yet drawn.

    Only items with a positive weight can be drawn, so the generator stops
    early -- after fewer than ``n`` values -- once they have all been
    produced.  This keeps the "no replacement" promise when ``n`` exceeds
    the number of drawable items, and makes an empty ``items`` yield
    nothing instead of failing on the missing root node.
    """
    heap = rws_heap(items)  # just make a heap...

    # heap[0] is the vacant slot; count the nodes that can actually be drawn.
    drawable = sum(1 for node in heap[1:] if node.w > 0)

    for _ in range(min(n, drawable)):
        yield rws_heap_pop(heap)  # ...and pop items off it.
# ----------------------------------------------------------------------------
# our code
def _format_puzzle(words, blank_labels, width=72, blank='_' * 15):
    """Lay out the puzzle text, with a label centred under every blank.

    Args:
        words: the paragraph's tokens in order; ``None`` marks a blank.
        blank_labels: one label per ``None`` in ``words``, in order.
        width: maximum length of a text line before wrapping.
        blank: the string printed in place of a removed word.

    Returns:
        A list of output lines in pairs: a line of text followed by the
        line of labels that belongs under it (trailing spaces removed).
    """
    # Tokens that attach to the previous word / to the following word.
    no_space_before = ('.', ',', '?', '!', ';', ':', ')', '--', "'", "''")
    no_space_after = ('``', '`', '(')

    lines = []
    text_line = ''
    label_line = ''
    seen_blanks = 0
    space_before = ''
    for word in words:
        label = ''
        if word is None:
            word = blank
            label = blank_labels[seen_blanks]
            seen_blanks += 1
        elif word in no_space_before:
            space_before = ''

        # Wrap before this token if it would overflow the current line.
        # An empty line is never flushed, so an over-long first token does
        # not produce a blank pair of lines.
        if text_line and len(text_line) + len(space_before) + len(word) > width:
            lines.append(text_line)
            lines.append(label_line.rstrip())
            text_line = ''
            label_line = ''
            space_before = ''

        text_line += space_before + word
        # Pad the label line to the same width so labels stay aligned.
        label_line += space_before + label.center(len(word))
        space_before = '' if word in no_space_after else ' '

    if text_line:
        lines.append(text_line)
        lines.append(label_line.rstrip())
    return lines


def main():
    """Print a random science-fiction paragraph with some words blanked out.

    A paragraph of more than 4 sentences and more than 40 tokens is picked
    at random from the Brown Corpus ``science_fiction`` category.  Roughly
    one token in seven is replaced by a blank, favouring words that are rare
    in the corpus as a whole, and each blank is labelled with its part of
    speech on the line below.  The result is written to standard output.
    """
    # Case-folded word frequencies over the whole corpus.
    # NOTE(review): built through the constructor instead of per-word
    # ``freq.inc(...)`` calls; ``FreqDist.inc`` is believed to be absent
    # from NLTK 3 -- confirm against the installed release.
    freq = FreqDist(word.lower() for word in brown.words())

    # NOTE(review): ``simplify_tags=True`` is the keyword this script was
    # written against.  Newer NLTK releases appear to use ``tagset=...``
    # instead, with different tag names -- confirm, and update the keys of
    # ``tag_names`` below if the call is changed.
    paras = [
        candidate
        for candidate in brown.tagged_paras(categories='science_fiction',
                                            simplify_tags=True)
        if len(candidate) > 4
        and sum(len(sentence) for sentence in candidate) > 40
    ]
    para = random.choice(paras)

    # Tags that may be blanked, and the label shown under the blank.
    tag_names = dict(ADJ='adjective', ADV='adverb', N='noun', NUM='number',
                     V='verb', VD='past tense verb', VG='-ing verb')

    # Collect every replaceable word as (weight, (sentence, position)).
    candidates = []
    total_words = 0
    for i, sentence in enumerate(para):
        for j, (word, tag) in enumerate(sentence):
            total_words += 1
            if word.isalpha() and tag in tag_names:
                # We weight the word based on its rarity; we want to
                # replace rare words.
                rarity = 1.0 / (freq.freq(word.lower()) + 0.0001) ** 2
                candidates.append((rarity, (i, j)))

    # About one blank per seven tokens, but never more than we can fill.
    blanks = min(total_words // 7, len(candidates))
    chosen = set(random_weighted_sample_no_replacement(candidates, blanks))

    # Flatten the paragraph, replacing each chosen word with None and
    # remembering its label in reading order.
    words = []
    blank_labels = []
    for i, sentence in enumerate(para):
        for j, (word, tag) in enumerate(sentence):
            if (i, j) in chosen:
                words.append(None)
                blank_labels.append(tag_names[tag])
            else:
                words.append(word)

    # A single parenthesised argument prints the same on Python 2 and 3.
    for line in _format_puzzle(words, blank_labels):
        print(line)
# Run the program only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment