Created
April 29, 2011 02:48
-
-
Save aparrish/947758 to your computer and use it in GitHub Desktop.
dramatically enjambed poems (using nltk POS-tagging)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
import sys | |
from nltk.corpus import brown | |
import random | |
import re | |
def only_tokens(paired_list):
    """Return the first element of every item in *paired_list*, in order.

    The script passes sequences of (token, POS tag) pairs, so the result
    is the list of bare tokens with the tags dropped.
    """
    tokens = []
    for pair in paired_list:
        tokens.append(pair[0])
    return tokens
def clean(s):
    """Tidy a space-joined token string so it reads like normal text.

    Three substitutions are applied in order:
      1. drop one punctuation mark (; ' . , ?) at the very start,
      2. drop any leading whitespace,
      3. remove the single space in front of each of those punctuation
         marks, re-attaching it to the preceding word.

    Returns the cleaned string.
    """
    rewrites = (
        (r"^[;'.,?]", ''),
        (r"^\s+", ''),
        (r" ([;'.,?])", r'\1'),
    )
    for pattern, replacement in rewrites:
        s = re.sub(pattern, replacement, s)
    return s
# Bounds used when slicing n-grams out of each tagged line: the slices
# taken below end up between low_order + 1 and high_order tokens long.
low_order = 4
high_order = 9

# POS tag -> list of n-grams whose first token carries that tag
first_to_gram = {}
# n-gram (tuple of (token, tag) pairs) -> list of POS tags seen right after it
gram_to_after = {}

# Unigram POS tagger built from the tagged sentences of the Brown corpus.
brown_tagged_sents = brown.tagged_sents()
tagger = nltk.UnigramTagger(brown_tagged_sents)
# Break the text on stdin into POS-tagged n-grams of various lengths and
# index them in gram_to_after / first_to_gram.
for line in sys.stdin:
    line = line.strip()
    toks = nltk.word_tokenize(line)
    tagged = tagger.tag(toks)
    for start in range(len(tagged) - high_order):
        for end in range(start + low_order, start + high_order):
            this_gram = tuple(tagged[start:end + 1])

            # Remember the POS tag of the token right after this n-gram;
            # followers whose tag is None are not recorded.
            following_tag = tagged[end + 1][1]
            if following_tag is not None:
                gram_to_after.setdefault(this_gram, []).append(following_tag)

            # Index the n-gram under the POS tag of its first token.
            # N-grams that start at the very beginning of a line are
            # never indexed here.
            if start == 0:
                continue
            leading_tag = tagged[start][1]
            if leading_tag is not None:
                first_to_gram.setdefault(leading_tag, []).append(this_gram)
# Generate 50 poems by chaining n-grams on their part-of-speech tags.
#
# The keys are snapshotted into a list once: random.choice() needs an
# indexable sequence (a Python 3 dict view is not one), and rebuilding the
# list for every poem is wasted work.
starting_grams = list(gram_to_after)
if not starting_grams:
    # random.choice() raises IndexError on an empty sequence, which is what
    # would happen when the input produced no n-grams at all.
    sys.stderr.write("no n-grams could be extracted from the input\n")
else:
    for i in range(50):
        # randomly select an n-gram to open the poem...
        current = random.choice(starting_grams)
        # print(...) with a single parenthesised argument behaves the same
        # as a Python 2 print statement and as the Python 3 print function.
        print(clean(' '.join(only_tokens(current))))
        # ...then chain together up to 13 more lines, each time selecting
        # a random n-gram that begins with a part of speech that followed
        # the n-gram on the previous line
        for j in range(13):
            next_tag = random.choice(gram_to_after[current])
            if next_tag in first_to_gram:
                next_choice = random.choice(first_to_gram[next_tag])
                print(clean(' '.join(only_tokens(next_choice))))
                # NOTE(review): indentation was lost in the text this was
                # recovered from; the else/break is assumed to pair with
                # this inner test (stop once the chosen n-gram has no
                # recorded follower) -- confirm against the original.
                if next_choice in gram_to_after:
                    current = next_choice
                else:
                    break
        # blank line between poems
        print('')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment