PyTorch implementation of a sentiment analysis classifier using embeddings
import collections

import numpy as np
import torch
import torch.nn as nn
from keras import preprocessing
from keras.datasets import imdb
from torch.utils import data

# Get data: keep only the 10,000 most frequent words
(X_train, Y_train), (X_test, Y_test) = imdb.load_data(seed=7, num_words=10000)

# Pad/truncate every review to 100 tokens
x_train = preprocessing.sequence.pad_sequences(X_train, maxlen=100)
y_train = Y_train[..., np.newaxis]

# Convert to torch tensors (long for embedding indices, float for BCE targets)
x_train = torch.from_numpy(x_train).long()
y_train = torch.from_numpy(y_train).float()

# Create a data loader; shuffle batches each epoch for SGD
train = data.TensorDataset(x_train, y_train)
datagen = data.DataLoader(train, batch_size=32, shuffle=True)

# PyTorch model
class SentimentNet(nn.Module):
    def __init__(self, num_words, input_size, embedding_size):
        super(SentimentNet, self).__init__()
        self.embedd1 = nn.Embedding(num_words, embedding_size)
        self.linear1 = nn.Linear(input_size * embedding_size, 1)

    def forward(self, inputs):
        embed1 = self.embedd1(inputs)
        flatt1 = embed1.view(-1, self.num_flat_features(embed1))
        layer1 = self.linear1(flatt1)
        # Sigmoid, not log_softmax: log_softmax over a single logit is
        # identically zero, so the model would never learn with BCELoss.
        output = torch.sigmoid(layer1)
        return output

    def num_flat_features(self, x):
        """Multiplies all dimensions except batch to flatten a layer."""
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

# Instantiate a model
modelT = SentimentNet(num_words=10000, input_size=100, embedding_size=8)

# Training time and metric recorder
num_epochs = 5
train_hist = collections.defaultdict(list)

# PyTorch loss and optimizer
cost_func = nn.BCELoss()
optimizer = torch.optim.SGD(modelT.parameters(), lr=1e-3, momentum=0.9)

# Training routine
for epoch in range(num_epochs):
    print("Epoch {} / {}".format(epoch + 1, num_epochs))
    running_loss = 0.0
    for i, data_batch in enumerate(datagen, 0):
        inputs, labels = data_batch
        optimizer.zero_grad()  # reset gradients
        pred = modelT(inputs)
        loss = cost_func(pred, labels)
        loss.backward()
        optimizer.step()
        # print statistics every 200 mini-batches
        running_loss += loss.item()
        if i % 200 == 199:
            train_hist['loss'].append(running_loss / 200)
            print('batch {} --> loss: {:.4f}'.format(i + 1, running_loss / 200))
            running_loss = 0.0

print('Finished Training')
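The held-out test split loaded above is never used. As a minimal evaluation sketch (assuming the test reviews get the same 100-token padding as the training set), something like the following should work:

# Evaluation sketch -- assumes the same preprocessing as the training data
x_test = preprocessing.sequence.pad_sequences(X_test, maxlen=100)
x_test = torch.from_numpy(x_test).long()
y_test = torch.from_numpy(Y_test[..., np.newaxis]).float()

modelT.eval()  # switch to inference mode
with torch.no_grad():  # no gradients needed for evaluation
    probs = modelT(x_test)         # sigmoid probabilities in [0, 1]
    preds = (probs > 0.5).float()  # threshold at 0.5 for class labels
    accuracy = (preds == y_test).float().mean().item()
print('test accuracy: {:.3f}'.format(accuracy))

With only five epochs of SGD over a flattened embedding layer, accuracy will be modest. A common, numerically stabler variant is to return the raw logits from forward (dropping the sigmoid) and train with nn.BCEWithLogitsLoss instead of nn.BCELoss.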