Code stub for a simple text classifier
# encoding=utf-8
# --- Adapted From ---
# Project: learn-pytorch
# Author: xingjunjie github: @gavinxing
# Create Time: 29/07/2017 11:58 AM on PyCharm
# Original code at: https://gist.github.com/GavinXing/9954ea846072e115bb07d9758892382c
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F

class CBOW(nn.Module):
    def __init__(self, vocab_size, num_classes=2, embedding_size=100):
        super(CBOW, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_size)
        self.class_embeddings = nn.Linear(embedding_size, num_classes)

    def forward(self, inputs):
        # inputs: LongTensor of word indices, shape (sentence_length,)
        input_embeddings = self.word_embeddings(inputs)
        # sum the word embeddings into a single sentence embedding,
        # keeping a batch dimension of 1: shape (1, embedding_size)
        sent_embedding = input_embeddings.sum(dim=0, keepdim=True)
        out = self.class_embeddings(sent_embedding)
        # log-probabilities over the classes, shape (1, num_classes)
        out = F.log_softmax(out, dim=1)
        return out

def make_sentence_vector(sentence, word_to_ix):
    # map each word of the sentence to its index in the vocabulary
    idxs = [word_to_ix[w] for w in sentence]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)
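
# Not part of the original stub: a minimal sketch of how the downloaded review
# data could be loaded, assuming the aclImdb archive from
# http://ai.stanford.edu/~amaas/data/sentiment/ has been extracted so that the
# reviews live in <data_dir>/pos/*.txt and <data_dir>/neg/*.txt
# (e.g. aclImdb/train). The whitespace tokenization mirrors the toy examples below.
def load_reviews(data_dir):
    import os
    positive, negative = [], []
    for label_dir, target in (("pos", positive), ("neg", negative)):
        folder = os.path.join(data_dir, label_dir)
        for filename in os.listdir(folder):
            if not filename.endswith(".txt"):
                continue
            with open(os.path.join(folder, filename), encoding="utf-8") as f:
                # one review per file; lowercase it and split on whitespace
                target.append(f.read().lower().split())
    return positive, negative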

if __name__ == '__main__':
    EMBEDDING_SIZE = 10

    # TODO: here is some example training data, but we need more.
    # Download some movie review data from http://ai.stanford.edu/~amaas/data/sentiment/
    # and load it so you can train the classifier!
    # (The load_reviews sketch above shows one possible way to read the
    # extracted files.)
    positive = ["' stanley and iris ' show the triumph of the human spirit.".split(),
                "what a fun movie !".split()]
    negative = ["there are times when finishing a film one wishes to have a refund for the time just spent .".split(),
                "this movie was so unrelentingly bad , I could hardly believe I was watching it .".split()]

    # label each sentence: 1 for positive, 0 for negative
    data = [(s, 1) for s in positive] + [(s, 0) for s in negative]

    # extract the vocabulary and map each word to an integer index
    vocab = set(sum(positive, []) + sum(negative, []))
    vocab_size = len(vocab)
    word_to_ix = {word: i for i, word in enumerate(vocab)}

    # the network already returns log-probabilities (log_softmax), so use
    # NLLLoss here; CrossEntropyLoss would apply log_softmax a second time
    loss_func = nn.NLLLoss()
    net = CBOW(num_classes=2, embedding_size=EMBEDDING_SIZE, vocab_size=vocab_size)
    optimizer = optim.SGD(net.parameters(), lr=0.01)

    for epoch in range(200):
        total_loss = 0.0
        for sentence, label in data:
            # build a vector with the index of each word in the sentence
            sentence_var = make_sentence_vector(sentence, word_to_ix)
            # compute the log-probabilities for each class
            log_probs = net(sentence_var)
            # compute the loss against the gold label
            loss = loss_func(log_probs, autograd.Variable(
                torch.LongTensor([label])
            ))
            net.zero_grad()   # reset gradients
            loss.backward()   # compute gradients
            optimizer.step()  # update the embeddings and classifier weights
            total_loss += loss.item()
        print("loss =", total_loss)

    # Sanity check that we fitted the training set, but there is no glory in that!
    # We will need (much) more training data to generalize to new sentences
    # (and a way to handle unseen words; see the sketch at the end of the file).
    sentence_var = make_sentence_vector("what a fun movie".split(), word_to_ix)
    print("Positive prediction: ", net(sentence_var).exp())
    sentence_var = make_sentence_vector("this movie was so unrelentingly bad".split(), word_to_ix)
    print("Negative prediction: ", net(sentence_var).exp())