"""
My proposal to the exercise in the tutorial about Deep Learning for NLP with Pytorch
This is one is about Word Embeddings that encodes Lexical Semantics.
Continuous Bag-of-Words model (CBOW) is model that tries to predict a word given the context
of a few words before and after the target.
.. _Source:
http://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html#exercise-computing-word-embeddings-continuous-bag-of-words
"""
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)
# We will be considering two words before and after the target
CONTEXT_SIZE = 2
# This is the embedding space dimension, meaning we want to represent each word
# by a 10-dimensional vector. The CBOW model will learn these representations.
EMBEDDING_DIM = 10
# Initial data
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()
vocab = set(raw_text)
vocab_size = len(vocab)
# 1. Pre-process the dataset
# We associate each word with a unique index
word_to_ix = {word: i for i, word in enumerate(vocab)}
# Building our training set
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])
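# Quick sanity check (a sketch based on the raw text above): the first pair is
# the context ['We', 'are', 'to', 'study'] paired with the target 'about'.
assert data[0] == (['We', 'are', 'to', 'study'], 'about')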
# 2. Design of our model
class CBOW(nn.Module):
    """CBOW model

    Two-layer CBOW model.

    Attributes:
        embeddings: vocab_size x embedding_dim matrix holding the word embedding representations
        linear1: First layer that maps the concatenated embedding vector to a 128-d space
        linear2: Final layer that maps the 128-d vector to the vocabulary-size space
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        """
        Args:
            vocab_size (int): Size of our vocabulary
            embedding_dim (int): Dimension of the embedding space
            context_size (int): Number of words to consider on each side of the target word
        """
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim * context_size * 2, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        # Look up the context embeddings and concatenate them into a single row vector
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
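
# A minimal shape-check sketch (kept commented out so it does not consume the
# random seed before training): with 2*CONTEXT_SIZE context word indices as
# input, the model should return log-probabilities of shape (1, vocab_size).
# _check_model = CBOW(vocab_size, EMBEDDING_DIM, CONTEXT_SIZE)
# _check_out = _check_model(torch.LongTensor([0, 1, 2, 3]))
# assert _check_out.shape == (1, vocab_size)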
def make_context_vector(context, word_to_ix):
    """Generate from the context a variable to be used with our model

    Args:
        context(list): List of words surrounding the target
        word_to_ix(dict): Dictionary with words as keys and indices as values

    Returns:
        autograd.Variable: A variable holding the context as word indices
    """
    idxs = [word_to_ix[w] for w in context]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)
# Example of running make_context_vector
# make_context_vector(data[0][0], word_to_ix)
# Let's train!
losses = []
loss_function = nn.NLLLoss()
model = CBOW(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)
# We will go through the whole dataset 100 times
for epoch in range(100):
    total_loss = torch.Tensor([0])
    for context, target in data:
        # Preparing the input of the model:
        # turn the words into indices, then wrap them in a variable
        context_var = make_context_vector(context, word_to_ix)
        # Since PyTorch accumulates gradients, we zero them before every iteration
        model.zero_grad()
        # Forward pass
        log_probs = model(context_var)
        # Get the target as a tensor holding its index
        target_val = torch.tensor([word_to_ix[target]], dtype=torch.long)
        # Compute the loss, i.e. how far we are from being correct
        loss = loss_function(log_probs, target_val)
        # Backward pass and parameter update
        loss.backward()
        optimizer.step()
        total_loss += loss.data
    losses.append(total_loss)
print(losses)
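# If training went well, the accumulated loss should shrink across the epochs;
# a rough check (not a strict guarantee for every run) would be:
# assert losses[-1].item() < losses[0].item()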
def do_inference(context, word_to_ix, model):
    """Predict a word given its context.

    Args:
        context(list): List of words surrounding the target
        word_to_ix(dict): Dictionary with words as keys and indices as values
        model(CBOW): Trained model

    Returns:
        str: The predicted target word
    """
    print(f"Context: {context}")
    context_var = make_context_vector(context, word_to_ix)
    # Forward pass
    log_probs = model(context_var)
    # Find the index with the highest log-probability
    predicted_target_idx = int(log_probs.argmax())
    # Map the index back to its word
    predicted_target = [key for key, val in word_to_ix.items() if val == predicted_target_idx][0]
    return predicted_target
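
# Example usage (a sketch, output not guaranteed): with this tiny corpus and the
# fixed seed, the model will often recover the original target 'about' for the
# first training context, but a short training run may still mispredict it.
print(do_inference(data[0][0], word_to_ix, model))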