"""
My proposal to the exercise in the tutorial about Deep Learning for NLP with Pytorch
This is one is about Word Embeddings that encodes Lexical Semantics.
Continuous Bag-of-Words model (CBOW) is model that tries to predict a word given the context
of a few words before and after the target.
.. _Source:
http://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html#exercise-computing-word-embeddings-continuous-bag-of-words
"""
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)
# We will be considering two words before and after the target
CONTEXT_SIZE = 2
# This is the embedding space dimension, meaning we want to represent each word
# by a 10-dimensional vector. The CBOW model will learn these representations.
EMBEDDING_DIM = 10
# Initial data
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()
vocab = set(raw_text)
vocab_size = len(vocab)
# 1. Pre-process the dataset
# We associate each word with a unique index
word_to_ix = {word: i for i, word in enumerate(vocab)}
# Building our training set
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])
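# Quick sanity check (a sketch based on the raw text above): the first pair is
# the context ['We', 'are', 'to', 'study'] paired with the target 'about'.
assert data[0] == (['We', 'are', 'to', 'study'], 'about')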
# 2. Design of our model
class CBOW(nn.Module):
    """CBOW model

    Two-layer CBOW model.

    Attributes:
        embeddings: vocab_size x embedding_dim matrix holding the word embedding representations
        linear1: First layer that maps the concatenated embedding vector to a 128-d space
        linear2: Final layer that maps the 128-d vector to the vocabulary-size space
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        """
        Args:
            vocab_size (int): Size of our vocabulary
            embedding_dim (int): Dimension of the embedding space
            context_size (int): Number of words to consider on each side of the target word
        """
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim * context_size * 2, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        # Look up the context embeddings and concatenate them into a single row vector
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
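
# A minimal shape-check sketch (kept commented out so it does not consume the
# random seed before training): with 2*CONTEXT_SIZE context word indices as
# input, the model should return log-probabilities of shape (1, vocab_size).
# _check_model = CBOW(vocab_size, EMBEDDING_DIM, CONTEXT_SIZE)
# _check_out = _check_model(torch.LongTensor([0, 1, 2, 3]))
# assert _check_out.shape == (1, vocab_size)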
def make_context_vector(context, word_to_ix):
    """Generate from the context a variable to be used with our model

    Args:
        context(list): List of words surrounding the target
        word_to_ix(dict): Dictionary with words as keys and indices as values

    Returns:
        autograd.Variable: A variable holding the context as word indices
    """
    idxs = [word_to_ix[w] for w in context]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)
# Example of running make_context_vector
# make_context_vector(data[0][0], word_to_ix)
# Let's train!
losses = []
loss_function = nn.NLLLoss()
model = CBOW(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)
# We will go through the whole dataset 100 times
for epoch in range(100):
    total_loss = torch.Tensor([0])
    for context, target in data:
        # Preparing the input of the model:
        # turn the words into indices, then wrap them in a variable
        context_var = make_context_vector(context, word_to_ix)
        # Since PyTorch accumulates gradients, we zero them before every iteration
        model.zero_grad()
        # Forward pass
        log_probs = model(context_var)
        # Get the target as a tensor holding its index
        target_val = torch.tensor([word_to_ix[target]], dtype=torch.long)
        # Compute the loss, i.e. how far we are from being correct
        loss = loss_function(log_probs, target_val)
        # Backward pass and parameter update
        loss.backward()
        optimizer.step()
        total_loss += loss.data
    losses.append(total_loss)
print(losses)
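# If training went well, the accumulated loss should shrink across the epochs;
# a rough check (not a strict guarantee for every run) would be:
# assert losses[-1].item() < losses[0].item()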
def do_inference(context, word_to_ix, model):
    """Predict a word given its context.

    Args:
        context(list): List of words surrounding the target
        word_to_ix(dict): Dictionary with words as keys and indices as values
        model(CBOW): Trained model

    Returns:
        str: The predicted target word
    """
    print(f"Context: {context}")
    context_var = make_context_vector(context, word_to_ix)
    # Forward pass
    log_probs = model(context_var)
    # Find the index with the highest log-probability
    predicted_target_idx = int(log_probs.argmax())
    # Map the index back to its word
    predicted_target = [key for key, val in word_to_ix.items() if val == predicted_target_idx][0]
    return predicted_target
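
# Example usage (a sketch, output not guaranteed): with this tiny corpus and the
# fixed seed, the model will often recover the original target 'about' for the
# first training context, but a short training run may still mispredict it.
print(do_inference(data[0][0], word_to_ix, model))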