# encoding=utf-8
# Project: learn-pytorch
# Author: xingjunjie github: @gavinxing
# Create Time: 29/07/2017 11:58 AM on PyCharm
# Basic template from http://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F


class CBOW(nn.Module):

    def __init__(self, context_size=2, embedding_size=100, vocab_size=None):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear1 = nn.Linear(embedding_size, vocab_size)

    def forward(self, inputs):
        lookup_embeds = self.embeddings(inputs)
        embeds = lookup_embeds.sum(dim=0)
        out = self.linear1(embeds)
        out = F.log_softmax(out)
        return out


# create your model and train. here are some functions to help you make
# the data ready for use by your module
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

# print(make_context_vector(data[0][0], word_to_ix))  # example


if __name__ == '__main__':
    CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
    EMBEDDING_SIZE = 10

    raw_text = """We are about to study the idea of a computational process.
    Computational processes are abstract beings that inhabit computers.
    As they evolve, processes manipulate other abstract things called data.
    The evolution of a process is directed by a pattern of rules
    called a program. People create programs to direct processes. In effect,
    we conjure the spirits of the computer with our spells.""".split()

    # By deriving a set from `raw_text`, we deduplicate the array
    vocab = set(raw_text)
    vocab_size = len(vocab)

    word_to_ix = {word: i for i, word in enumerate(vocab)}

    data = []
    for i in range(2, len(raw_text) - 2):
        context = [raw_text[i - 2], raw_text[i - 1],
                   raw_text[i + 1], raw_text[i + 2]]
        target = raw_text[i]
        data.append((context, target))

    loss_func = nn.CrossEntropyLoss()
    net = CBOW(CONTEXT_SIZE, embedding_size=EMBEDDING_SIZE, vocab_size=vocab_size)
    optimizer = optim.SGD(net.parameters(), lr=0.01)

    for epoch in range(100):
        total_loss = 0
        for context, target in data:
            context_var = make_context_vector(context, word_to_ix)
            net.zero_grad()
            log_probs = net(context_var)
            loss = loss_func(log_probs, autograd.Variable(
                torch.LongTensor([word_to_ix[target]])
            ))
            loss.backward()
            optimizer.step()
            total_loss += loss.data
        print(total_loss)
`net.zero_grad()` is used to clear the gradients before we backpropagate, so the parameters are tuned correctly. Failing to do so accumulates the gradients across iterations, which leads to erroneous updates. Since there is a backward pass through the network for every instance of data (stochastic gradient descent), we need to clear the existing gradients each time.
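To illustrate the accumulation (a minimal sketch on a recent PyTorch, where plain tensors carry gradients and `autograd.Variable` is no longer needed):

```python
import torch
import torch.nn as nn

# Gradients accumulate in .grad across backward() calls
# unless they are cleared in between.
lin = nn.Linear(2, 1)
x = torch.ones(1, 2)

lin(x).sum().backward()
g1 = lin.weight.grad.clone()

lin(x).sum().backward()  # no zero_grad() in between
print(torch.allclose(lin.weight.grad, 2 * g1))  # True: gradients doubled

lin.zero_grad()  # resets the gradients, as net.zero_grad() does above
```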
Since `nn.CrossEntropyLoss` already combines a log-softmax with the negative log-likelihood loss, I suppose the network should remove the `F.log_softmax` in `forward()`, if only for the sake of speed (keeping both means the softmax is applied twice).
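A sketch of `forward()` under that suggestion, returning raw logits and letting `nn.CrossEntropyLoss` handle the log-softmax:

```python
def forward(self, inputs):
    lookup_embeds = self.embeddings(inputs)  # (context_len, embedding_size)
    embeds = lookup_embeds.sum(dim=0)        # sum the context embeddings
    return self.linear1(embeds)              # raw logits; nn.CrossEntropyLoss
                                             # applies log-softmax internally
```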
Btw, `CONTEXT_SIZE` is unused.
Where do you update the embeddings? It seems to me that this is simply training to predict a word given the context, but I don't see where the embeddings are updated (or even what they would be updated with).
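One way to see this (a note on PyTorch mechanics, not the author's reply): `nn.Embedding` registers its lookup table as a learnable parameter, so it is returned by `net.parameters()`, receives gradients from `loss.backward()`, and is updated by `optimizer.step()` together with the linear layer:

```python
# Every learnable tensor in the model, including the embedding table,
# shows up here and is therefore updated by optimizer.step().
for name, param in net.named_parameters():
    print(name, tuple(param.shape))
# embeddings.weight  (vocab_size, embedding_size)
# linear1.weight     (vocab_size, embedding_size)
# linear1.bias       (vocab_size,)
```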
Running the code gives "RuntimeError: dimension out of range (expected to be in range of [-1, 0], but got 1)". What could be the reason?
To make it work, in `CBOW.forward()` comment out line 24 (`out = F.log_softmax(out)`). Also update line 74 to read `loss = loss_func(log_probs.view(-1,1), autograd.Variable(`.
Make that line 74 read `loss = loss_func(log_probs.view(1,-1), autograd.Variable(` (note `view(1,-1)`, not `view(-1,1)`); that works for me.
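For context (an explanatory aside, assuming a recent PyTorch): `nn.CrossEntropyLoss` expects an input of shape `(N, C)` and a target of shape `(N,)`, while `forward()` here returns a 1-D tensor of `vocab_size` scores, which is what triggers the dimension error. `view(1, -1)` reshapes it into a batch of one:

```python
log_probs = net(context_var)  # 1-D tensor of shape (vocab_size,)
loss = loss_func(
    log_probs.view(1, -1),    # reshape to (1, vocab_size): a batch of one
    autograd.Variable(torch.LongTensor([word_to_ix[target]]))  # shape (1,)
)
```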
Why is `context_size` unused?
Hi, thanks for the nice code! Quick question: why is the gradient reset at every context window with `net.zero_grad()`, and not at every epoch? Should it be this instead?