Skip to content

Instantly share code, notes, and snippets.

@timbennett
Created August 2, 2015 23:29
Show Gist options
  • Save timbennett/1500512037827d309b12 to your computer and use it in GitHub Desktop.
Generate text with a Character-level Language Model
# based entirely on http://nbviewer.ipython.org/gist/yoavg/d76121dfde2618422139 by Yoav Goldberg
# inputfile.txt (passed to train_char_lm below): the corpus you want to learn & generate from
# nletters=1000 (default parameter of generate_text): how many characters you want to generate
# order=10 (passed to train_char_lm): the history length (higher generally gives better output, with diminishing returns above ~7)
# print generate_text(lm, 10): 10 is the order again, but for generation
from collections import *
def train_char_lm(fname, order=4):
    """Train a character-level language model from a text file.

    Reads the corpus at *fname* and returns a dict mapping each
    ``order``-character history string to a list of ``(char, probability)``
    pairs for the characters observed to follow that history.

    Parameters:
        fname: path to the training-corpus text file.
        order: history length in characters (default 4).

    Returns:
        dict mapping str -> list of (str, float) pairs.
    """
    # Ported to Python 3: file() -> open() (the original also leaked the
    # file handle; `with` closes it), xrange -> range, iteritems -> items.
    with open(fname) as f:
        data = f.read()
    lm = defaultdict(Counter)
    # Pad the start with '~' so the very first characters have a full
    # order-length history to condition on.
    pad = "~" * order
    data = pad + data
    for i in range(len(data) - order):
        history, char = data[i:i + order], data[i + order]
        lm[history][char] += 1

    def normalize(counter):
        # Convert raw follow-counts into probabilities that sum to 1.
        total = float(sum(counter.values()))
        return [(c, cnt / total) for c, cnt in counter.items()]

    return {hist: normalize(chars) for hist, chars in lm.items()}
from random import random
def generate_letter(lm, history, order):
    """Sample the next character given the trailing history.

    Parameters:
        lm: model from train_char_lm — dict of history -> [(char, prob), ...].
        history: text generated so far; only the last *order* chars are used.
        order: history length the model was trained with.

    Returns:
        A single sampled character.
    """
    history = history[-order:]
    dist = lm[history]
    # Roulette-wheel sampling: subtract each probability from a uniform
    # draw until the mass is used up.
    x = random()
    for c, v in dist:
        x = x - v
        if x <= 0:
            return c
    # Floating-point rounding can leave x fractionally above zero after the
    # loop (the original then returned None, crashing the caller's string
    # concatenation); fall back to the last character in the distribution.
    return dist[-1][0]
def generate_text(lm, order, nletters=1000):
    """Generate *nletters* characters from a trained language model.

    Parameters:
        lm: model from train_char_lm.
        order: history length the model was trained with.
        nletters: number of characters to generate (default 1000).

    Returns:
        The generated text as a single string.
    """
    # Start from the same '~' padding used during training so the first
    # lookups hit histories the model has actually seen.
    history = "~" * order
    out = []
    for _ in range(nletters):  # range: Python 3 port of xrange
        c = generate_letter(lm, history, order)
        history = history[-order:] + c
        out.append(c)
    return "".join(out)
if __name__ == "__main__":
    # Train on the corpus and print 1000 generated characters.
    # order=10 is the history length; 10 is passed again for generation.
    # (print() call: Python 3 port of the original print statement.)
    lm = train_char_lm("inputfile.txt", order=10)
    print(generate_text(lm, 10))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment