@GavinXing
Created August 6, 2017 01:44
A complete word2vec (CBOW) implementation based on the PyTorch word embeddings tutorial
# encoding=utf-8
# Project: learn-pytorch
# Author: xingjunjie github: @gavinxing
# Create Time: 29/07/2017 11:58 AM on PyCharm
# Basic template from http://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F

class CBOW(nn.Module):

    def __init__(self, context_size=2, embedding_size=100, vocab_size=None):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear1 = nn.Linear(embedding_size, vocab_size)

    def forward(self, inputs):
        lookup_embeds = self.embeddings(inputs)
        embeds = lookup_embeds.sum(dim=0)
        out = self.linear1(embeds)
        out = F.log_softmax(out)
        return out
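# Shape note: for a 4-word context, inputs has shape (4,); the embedding
# lookup gives (4, embedding_size), the sum over dim=0 collapses that to
# (embedding_size,), and linear1 maps it to (vocab_size,), i.e. a 1-D output.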
# create your model and train. here are some functions to help you make
# the data ready for use by your module
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)
# print(make_context_vector(data[0][0], word_to_ix)) # example
if __name__ == '__main__':
    CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
    EMBEDDING_SIZE = 10

    raw_text = """We are about to study the idea of a computational process.
    Computational processes are abstract beings that inhabit computers.
    As they evolve, processes manipulate other abstract things called data.
    The evolution of a process is directed by a pattern of rules
    called a program. People create programs to direct processes. In effect,
    we conjure the spirits of the computer with our spells.""".split()

    # By deriving a set from `raw_text`, we deduplicate the array
    vocab = set(raw_text)
    vocab_size = len(vocab)
    word_to_ix = {word: i for i, word in enumerate(vocab)}

    data = []
    for i in range(2, len(raw_text) - 2):
        context = [raw_text[i - 2], raw_text[i - 1],
                   raw_text[i + 1], raw_text[i + 2]]
        target = raw_text[i]
        data.append((context, target))
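    # e.g. data[0] == (['We', 'are', 'to', 'study'], 'about')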
    loss_func = nn.CrossEntropyLoss()
    net = CBOW(CONTEXT_SIZE, embedding_size=EMBEDDING_SIZE, vocab_size=vocab_size)
    optimizer = optim.SGD(net.parameters(), lr=0.01)

    for epoch in range(100):
        total_loss = 0
        for context, target in data:
            context_var = make_context_vector(context, word_to_ix)
            net.zero_grad()
            log_probs = net(context_var)
            loss = loss_func(log_probs, autograd.Variable(
                torch.LongTensor([word_to_ix[target]])
            ))
            loss.backward()
            optimizer.step()
            total_loss += loss.data
        print(total_loss)
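After training, the learned word vectors live in the embedding table that the optimizer has been updating. A minimal sketch of reading one out (the word chosen here is just for illustration):

# net.embeddings.weight is the (vocab_size, embedding_size) parameter matrix
# whose rows are the trained word vectors
vector = net.embeddings.weight[word_to_ix['computer']]
print(vector)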
@austinmw

austinmw commented Jul 1, 2018

Btw, CONTEXT_SIZE is unused.

@zpaines

zpaines commented Sep 14, 2018

Where do you update the embeddings? It seems to me that this is simply training to predict a word given the context, but I don't see where the embeddings are updated (or even what they would be updated with).

@satvikmashkaria

Running the code gives "RuntimeError: dimension out of range (expected to be in range of [-1, 0], but got 1)". What could be the reason?

@authman

authman commented Apr 18, 2019

To make it work, in CBOW.forward() comment out line 24: out = F.log_softmax(out). Also update line 74 to read loss = loss_func(log_probs.view(-1,1), autograd.Variable(.

@insublee

insublee commented Aug 3, 2020

To make it work, in CBOW.forward() comment out line 24: out = F.log_softmax(out). Also update line 74 to read loss = loss_func(log_probs.view(-1,1), autograd.Variable(.

Changing line 74 to read loss = loss_func(log_probs.view(1,-1), autograd.Variable( (i.e. view(1,-1) rather than view(-1,1)) works for me.
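
Putting the two fixes together, a sketch of the corrected forward (assuming a PyTorch version where nn.CrossEntropyLoss expects a (batch, vocab_size) input and applies log_softmax itself):

def forward(self, inputs):
    lookup_embeds = self.embeddings(inputs)
    embeds = lookup_embeds.sum(dim=0)
    out = self.linear1(embeds)
    # return raw logits shaped (1, vocab_size); CrossEntropyLoss applies
    # log_softmax internally, so keeping the F.log_softmax call here would
    # apply it twice
    return out.view(1, -1)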

@fkurushin

Why is context_size unused?
