Skip to content

Instantly share code, notes, and snippets.

@codingisacopingstrategy
Created November 29, 2010 00:25
Show Gist options
  • Save codingisacopingstrategy/719432 to your computer and use it in GitHub Desktop.
Save codingisacopingstrategy/719432 to your computer and use it in GitHub Desktop.
Suave trigram-based text generation
#!/usr/bin/python
# -*- coding: utf-8 -*-
import nltk
import codecs
import re
# Tokens are runs of word characters, runs of punctuation, or runs of
# newlines. Blank lines are therefore recognised as tokens too (\n+), which
# gives a very different effect in the generated text.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+|[^\w\s]+|\n+')

# Read the corpus as UTF-8. The context manager closes the file handle even
# if reading or tokenizing raises (the handle was previously left open).
with codecs.open("newphilosophers2.txt", "r", "utf-8") as f:
    tokenized_content = tokenizer.tokenize(f.read())

# Order-3 (trigram) model built from the tokenized corpus.
# NOTE(review): nltk.NgramModel appears to be absent from NLTK 3.x -- confirm
# the installed NLTK version still provides it.
content_model = nltk.NgramModel(3, tokenized_content)
class Relevations:
    """Iterator that turns n-gram model output into formatted text chunks.

    Takes an n-gram model (any object with a ``generate(count, context)``
    method returning a list of tokens) and yields nicely formatted text
    indefinitely: every call to ``next()`` formats the current batch of
    tokens and then asks the model for a fresh batch.
    """

    # Opening marks are attached to the token that follows them.
    OPENERS = (u'‘', u'“', u'(')
    # Single-letter tails of contractions ('s, 'm, 't) are attached to the
    # preceding word. A tuple is used so that only these exact tokens match;
    # a substring test against 'smt' would also catch "sm", "mt" and "smt".
    CONTRACTION_TAILS = (u's', u'm', u't')

    def __init__(self, content_model):
        """Store the model and pre-generate the first batch of 400 tokens."""
        self.content_model = content_model
        self.generated = self.content_model.generate(400)
        # Anything that is neither whitespace nor a word character.
        # re.UNICODE makes \w cover non-ASCII letters on Python 2 as well, so
        # a word starting with an accented letter is not taken for punctuation.
        self.e = re.compile(r'[^\w\s]+', re.UNICODE)

    def next(self):
        """Format the current token batch and queue up the following one.

        Returns the tokens joined by single spaces and encoded as UTF-8, with
        punctuation glued to the neighbouring word: openers go in front of the
        next token, hyphens join the previous and next token, and any other
        punctuation (or contraction tail) is appended to the previous token.
        """
        tokens = self.generated
        count = len(tokens)
        formatted = []
        i = 0
        while i < count:
            token = tokens[i]
            if not (self.e.match(token) or token in self.CONTRACTION_TAILS):
                # Ordinary word (or newline run): becomes its own element.
                formatted.append(token)
                i += 1
                continue

            has_following = i + 1 < count
            if token in self.OPENERS:
                if has_following:
                    formatted.append(token + tokens[i + 1])
                    i += 2
                else:
                    # Opener at the very end of the batch: nothing to attach
                    # it to, so emit it on its own instead of raising.
                    formatted.append(token)
                    i += 1
            elif token == u'-' and has_following and formatted:
                formatted[-1] += token + tokens[i + 1]
                i += 2
            elif formatted:
                formatted[-1] += token
                i += 1
            else:
                # Batch starts with punctuation: there is no previous word to
                # glue it to, so it starts the output by itself.
                formatted.append(token)
                i += 1

        # Seed the next batch with the last two tokens as context, then drop
        # them again -- presumably generate() returns the context followed by
        # the newly generated tokens; verify against the model in use.
        self.generated = self.content_model.generate(400, self.generated[-2:])[2:]
        return u' '.join(formatted).encode('utf-8')

    # Python 3 spelling of the iterator protocol; next() stays for Python 2.
    __next__ = next

    def __iter__(self):
        """Return self; the object is its own (endless) iterator."""
        return self
if __name__ == "__main__":
    # Running the module directly only prints a usage hint; the generator is
    # meant to be imported. A single parenthesised argument prints the same
    # text on Python 2 and also parses on Python 3.
    print("""
example usage:
import relevations
tst = relevations.Relevations(relevations.content_model)
print tst.next()
print tst.next()
etc.
""")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment