@kevinduh
Created January 9, 2017 05:01
#!/usr/bin/env python
import theano
import theano.tensor as T
import numpy as np
import sys
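# Character-level feedforward neural language model exercise (Theano).
# Pipeline: build a character 5-gram dataset, train a small network to predict
# the 5th character from the previous 4, report train/test perplexity, and
# print random text samples after each epoch.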
def create_ngram_data(input_file, ngram_size):
    '''Reads input_file and returns a character ngram dataset
    where each row is an ngram [char1, char2, char3, ... charN],
    with N=ngram_size, and the characters are mapped to ASCII ids.
    '''
    data = []
    with open(input_file, 'r') as f:
        for line in f:
            for i in xrange(len(line) - ngram_size):
                data.append([ord(c) for c in line[i:i+ngram_size]])
    return data
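# Small illustration (not from the original gist): for the line "abcde\n"
# with ngram_size=3, the loop runs i=0,1,2 and returns
#   [[97, 98, 99], [98, 99, 100], [99, 100, 101]]   # 'abc', 'bcd', 'cde'
# Note the exclusive bound len(line) - ngram_size skips the final,
# newline-containing ngram 'de\n'.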
def perplexity(testdata, ngram_size, inference_model):
    '''Computes perplexity of the provided inference model on an ngram dataset
    '''
    nn = ngram_size - 1
    expn = 0.0
    count = 0
    for d in testdata:
        prob = inference_model(d[0:nn])[0][d[nn]]
        expn -= np.log2(prob)
        count += 1
    return 2 ** (expn / count)
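# In formula form, the function above computes
#   PPL = 2 ** ( -(1/N) * sum_n log2 p(char_n | previous ngram_size-1 chars) )
# i.e. 2 raised to the average negative log2-probability the model assigns
# to each held-out character given its context.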
def generate_sample(initial_input, inference_model):
    '''Generate a random text sample given initial input and an inference model
    '''
    inp = list(initial_input)  # copy, so pop/append below don't mutate the caller's list
    print ' '.join([chr(i) for i in initial_input]),
    for i in range(30):
        # sample the next character id from the model's softmax distribution
        o = np.random.choice(128, 1, p=inference_model(inp)[0])[0]
        #o = inference_model(inp)[0].argmax()  # greedy alternative
        print chr(o),
        inp.pop(0)     # slide the context window: drop the oldest character...
        inp.append(o)  # ...and append the character just generated
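# Example call (hypothetical seed text; any (ngram_size - 1)-character seed works):
#   generate_sample([ord(c) for c in 'the '], inference_model)
# prints the 4 seed characters followed by 30 sampled characters.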
def create_graphs(ngram_size, vocab_size, embedding_size, hidden_size):
    '''Returns the (inference graph, training graph) of a feedforward neural language model.
    Assume a 5-gram setup, where input context = [char1,char2,char3,char4] to predict char5:
    - inference_graph([char1,char2,char3,char4]) outputs the softmax probability distribution for char5
    - train_graph([char1,char2,char3,char4], char5) updates the neural language model and returns the training cost for this sample
    '''
    # embedding matrix
    E = theano.shared(np.random.randn(vocab_size, embedding_size))
    # input_id is a vector [char1,char2,char3,char4], represented by ASCII values
    input_id = T.ivector('input_id')
    # indexes the embedding matrix and concatenates the context embeddings into one row vector
    concat_embedding = E[input_id].reshape((1, -1))
    # TODO1: MLP code here to connect concat_embedding to output
    # output = T.nnet.softmax...
    # ....
    # inference_model = theano.function([input_id], output)
    # TODO2: code here for training. output_id is target to predict, i.e. char5
    # output_id = T.iscalar('output_id')
    # ....
    # train_model = theano.function([input_id, output_id], cost, updates=...
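    # --- One possible completion of TODO1/TODO2: a minimal sketch, NOT the
    # author's solution. It assumes a single tanh hidden layer, a softmax
    # output, cross-entropy cost, and plain SGD; the 0.01 init scale and the
    # 0.1 learning rate are assumed values.
    n_context = ngram_size - 1
    W1 = theano.shared(0.01 * np.random.randn(n_context * embedding_size, hidden_size))
    b1 = theano.shared(np.zeros(hidden_size))
    W2 = theano.shared(0.01 * np.random.randn(hidden_size, vocab_size))
    b2 = theano.shared(np.zeros(vocab_size))
    hidden = T.tanh(T.dot(concat_embedding, W1) + b1)
    output = T.nnet.softmax(T.dot(hidden, W2) + b2)        # shape (1, vocab_size)
    inference_model = theano.function([input_id], output)
    output_id = T.iscalar('output_id')                     # target character, i.e. char5
    cost = -T.log(output[0, output_id])                    # cross-entropy for this sample
    params = [E, W1, b1, W2, b2]
    grads = T.grad(cost, params)
    learning_rate = 0.1  # assumed value
    updates = [(p, p - learning_rate * g) for p, g in zip(params, grads)]
    train_model = theano.function([input_id, output_id], cost, updates=updates)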
    return inference_model, train_model
# 0: run program: "python nlm.py textfile.txt" where textfile.txt is a plain ASCII text file
input_text = sys.argv[1]
# 1: create dataset. we'll build 5-gram language models, i.e. given 4 characters, predict the 5th
ngram_size = 5
nn = ngram_size - 1  # context length
data = create_ngram_data(input_text, ngram_size)
# 80/20 train/test split
traindata, testdata = data[0:int(len(data)*0.8)], data[int(len(data)*0.8):]
# 2: create computation graphs
inference_graph, train_graph = create_graphs(ngram_size, vocab_size=128, embedding_size=7, hidden_size=25)
# 3: training loop: after each epoch, report cost/perplexity and print 3 samples
for epoch in range(10):
    cumulative_cost = 0
    for d in traindata:
        c = train_graph(d[0:nn], d[nn])
        cumulative_cost += c
    print "Epoch=%d CumulativeCost=%f" % (epoch, cumulative_cost),
    print "TrainPerplexity=%f TestPerplexity=%f" % (perplexity(traindata, ngram_size, inference_graph),
                                                    perplexity(testdata, ngram_size, inference_graph))
    for i in range(3):
        print "sample: ",
        generate_sample(data[i][0:nn], inference_graph)
        print ""