Created
December 18, 2019 13:38
-
-
Save lironsade/2411e3bb37a9f56da96921c614c8f838 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
import torch.optim as optim | |
import numpy as np | |
import os | |
from torch.utils.data import DataLoader, TensorDataset, Dataset | |
import operator | |
import data_loader | |
import pickle | |
import tqdm | |
# Important setting: make float64 the default floating point dtype for every
# tensor / module parameter created after this point.
torch.set_default_dtype(torch.float64)

# ------------------------------------------- Constants ----------------------------------------

# Fixed sequence length passed to sentence_to_embedding by the DataManager.
SEQ_LEN = 52
# Dimension of a single word2vec embedding vector.
W2V_EMBEDDING_DIM = 300

# Supported values for the DataManager `data_type` argument.
ONEHOT_AVERAGE, W2V_AVERAGE, W2V_SEQUENCE = "onehot_average", "w2v_average", "w2v_sequence"

# Names of the data splits.
TRAIN, VAL, TEST = "train", "val", "test"
# ------------------------------------------ Helper methods and classes -------------------------- | |
def get_available_device():
    """
    Pick the device computations should run on: the GPU when torch reports that CUDA is
    available, otherwise the CPU.
    The returned device can be handed to module.to(device) and criterion.to(device) so that
    the computations are done on that device.
    :return: torch.device('cuda') or torch.device('cpu')
    """
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')
def save_pickle(obj, path):
    """
    Pickle an object into a file, replacing the file if it already exists.
    :param obj: the object to serialize
    :param path: path of the file to write
    """
    with open(path, "wb") as out_file:
        pickle.Pickler(out_file).dump(obj)
def load_pickle(path):
    """
    Read back an object that was written with pickle.
    NOTE: unpickling can execute arbitrary code, so only use this on files you trust.
    :param path: path of the pickle file to read
    :return: the unpickled object
    """
    with open(path, "rb") as in_file:
        loaded = pickle.Unpickler(in_file).load()
    return loaded
def save_model(model, path, epoch, optimizer):
    """
    Write a training checkpoint to disk so training or evaluation can be resumed later on.
    The checkpoint is a dict with the keys 'epoch', 'model_state_dict' and
    'optimizer_state_dict'.
    :param model: torch module representing the model
    :param path: path to save the checkpoint into
    :param epoch: epoch number stored alongside the weights
    :param optimizer: torch optimizer used for training the module
    """
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(checkpoint, path)
def load(model, path, optimizer):
    """
    Restore the state (weights, parameters...) of a model and its optimizer from a checkpoint
    that was written by save_model.
    :param model: should be the same model as the one which was saved in the path
    :param path: path to the saved checkpoint
    :param optimizer: should be the same optimizer as the one which was saved in the path
    :return: tuple of (model, optimizer, epoch stored in the checkpoint)
    """
    state = torch.load(path)
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    return model, optimizer, state['epoch']
# ------------------------------------------ Data utilities ---------------------------------------- | |
def load_word2vec():
    """ Load Word2Vec Vectors
    Downloads (or reuses the gensim cache of) the "word2vec-google-news-300" model and prints
    a short summary of its vocabulary.
    Return:
        wv_from_bin: All 3 million embeddings, each length 300
    """
    import gensim.downloader as api
    wv_from_bin = api.load("word2vec-google-news-300")
    # gensim >= 4.0 removed KeyedVectors.vocab (accessing it raises AttributeError) in favour
    # of key_to_index; fall back to .vocab so that older gensim versions keep working.
    vocab_map = getattr(wv_from_bin, "key_to_index", None)
    if vocab_map is None:
        vocab_map = wv_from_bin.vocab
    vocab = list(vocab_map.keys())
    print(vocab_map[vocab[0]])
    print("Loaded vocab size %i" % len(vocab))
    return wv_from_bin
def create_or_load_slim_w2v(words_list, cache_w2v=True):
    """
    Returns a word2vec dict restricted to the words which appear in the dataset.
    When the cache file "w2v_dict.pkl" exists it is loaded as is; otherwise the full word2vec
    model is loaded and filtered down to the given words.
    :param words_list: list of words to use for the w2v dict
    :param cache_w2v: whether to save locally the small w2v dictionary
    :return: dictionary which maps the known words to their vectors
    """
    w2v_path = "w2v_dict.pkl"
    if os.path.exists(w2v_path):
        return load_pickle(w2v_path)

    full_w2v = load_word2vec()
    slim_w2v = {}
    for word in words_list:
        # words the full model does not know are simply left out
        if word in full_w2v:
            slim_w2v[word] = full_w2v[word]
    if cache_w2v:
        save_pickle(slim_w2v, w2v_path)
    return slim_w2v
def get_w2v_average(sent, word_to_vec, embedding_dim):
    """
    Averages the embeddings of the words of a sentence. Words which have no entry in
    word_to_vec are ignored; when none of the words is known the zero vector is returned.
    :param sent: the sentence object (its tokens are read from sent.text)
    :param word_to_vec: a dictionary mapping words to their vector embeddings
    :param embedding_dim: the dimension of the word embedding vectors
    :return The average embedding vector as numpy ndarray.
    """
    known_vectors = [word_to_vec[token] for token in sent.text if token in word_to_vec]
    total = np.zeros(embedding_dim)
    for vector in known_vectors:
        total += vector
    if not known_vectors:
        return total
    return total / len(known_vectors)
def get_one_hot(size, ind):
    """
    Builds a one-hot vector: all zeros except for a single 1 in the requested entry.
    :param size: the size of the vector
    :param ind: the entry index to turn to 1
    :return: numpy ndarray which represents the one-hot vector
    """
    one_hot = np.zeros(size)
    one_hot[ind] = 1
    return one_hot
def average_one_hots(sent, word_to_ind):
    """
    this method gets a sentence, and a mapping between words to indices, and returns the average
    one-hot embedding of the tokens in the sentence (i.e. entry i holds the fraction of the
    sentence tokens whose index is i).
    :param sent: a sentence object (its tokens are read from sent.text).
    :param word_to_ind: a mapping between words to indices
    :return: numpy ndarray of length len(word_to_ind); all zeros for an empty sentence
    :raises KeyError: if a token of the sentence is missing from word_to_ind
    """
    counts = np.zeros(len(word_to_ind))
    tokens = sent.text
    # An empty sentence has no tokens to average; return zeros instead of dividing by zero.
    if len(tokens) == 0:
        return counts
    # Count directly into the result instead of allocating a vocabulary-sized one-hot
    # vector per token.
    for token in tokens:
        counts[word_to_ind[token]] += 1
    return counts / len(tokens)
def get_word_to_ind(words_list):
    """
    Assigns consecutive indices (starting at 0) to the distinct words of a list, in order of
    first appearance.
    :param words_list: a list of words
    :return: the dictionary mapping words to the index
    """
    word_to_ind = {}
    for word in words_list:
        # the next free index always equals the number of words registered so far
        word_to_ind.setdefault(word, len(word_to_ind))
    return word_to_ind
def sentence_to_embedding(sent, word_to_vec, seq_len, embedding_dim=300):
    """
    Maps a sentence to a fixed-length sequence of word embeddings. Sentences longer than
    seq_len are truncated, shorter ones are padded with zero vectors, and tokens which are
    missing from word_to_vec are represented by a zero vector as well.
    :param sent: a sentence object (its tokens are read from sent.text)
    :param word_to_vec: a word to vector mapping.
    :param seq_len: the fixed length for which the sentence will be mapped to.
    :param embedding_dim: the dimension of the w2v embedding
    :return: numpy ndarray of shape (seq_len, embedding_dim) with the representation of the sentence
    """
    # zip against range(seq_len) stops after at most seq_len tokens
    rows = [word_to_vec.get(token, np.zeros(embedding_dim))
            for token, _ in zip(sent.text, range(seq_len))]
    n_padding = seq_len - len(rows)
    rows.extend(np.zeros(embedding_dim) for _ in range(n_padding))
    return np.array(rows)
class OnlineDataset(Dataset):
    """
    A pytorch dataset which lazily converts sentences of SentimentTreeBank into model inputs:
    the conversion function runs every time an item is requested.
    """

    def __init__(self, sent_data, sent_func, sent_func_kwargs):
        """
        :param sent_data: list of sentences from SentimentTreeBank
        :param sent_func: Function which converts a sentence to an input datapoint
        :param sent_func_kwargs: fixed keyword arguments for the sent_func
        """
        self.data, self.sent_func, self.sent_func_kwargs = sent_data, sent_func, sent_func_kwargs

    def __len__(self):
        """
        :return: the number of sentences in the dataset
        """
        return len(self.data)

    def __getitem__(self, idx):
        """
        :param idx: index of the requested sentence
        :return: tuple of (input built by sent_func for the sentence, its sentiment_class)
        """
        sentence = self.data[idx]
        return self.sent_func(sentence, **self.sent_func_kwargs), sentence.sentiment_class
class DataManager():
    """
    Utility class for handling all data management task. Can be used to get iterators for training and
    evaluation.
    """

    def __init__(self, data_type=ONEHOT_AVERAGE, use_sub_phrases=True, dataset_path="stanfordSentimentTreebank", batch_size=50,
                 embedding_dim=None):
        """
        builds the data manager used for training and evaluation.
        :param data_type: one of ONEHOT_AVERAGE, W2V_AVERAGE and W2V_SEQUENCE
        :param use_sub_phrases: if true, training data will include all sub-phrases plus the full sentences
        :param dataset_path: path to the dataset directory
        :param batch_size: number of examples per batch
        :param embedding_dim: relevant only for the W2V data types; when left as None the
                              module constant W2V_EMBEDDING_DIM is used.
        :raises ValueError: if data_type is not one of the supported types
        """
        # load the dataset
        self.sentiment_dataset = data_loader.SentimentTreeBank(dataset_path, split_words=True)

        # map data splits to sentences lists
        self.sentences = {}
        if use_sub_phrases:
            self.sentences[TRAIN] = self.sentiment_dataset.get_train_set_phrases()
        else:
            self.sentences[TRAIN] = self.sentiment_dataset.get_train_set()
        self.sentences[VAL] = self.sentiment_dataset.get_validation_set()
        self.sentences[TEST] = self.sentiment_dataset.get_test_set()

        # map data splits to sentence input preparation functions
        words_list = list(self.sentiment_dataset.get_word_counts().keys())
        if data_type == ONEHOT_AVERAGE:
            self.sent_func = average_one_hots
            self.sent_func_kwargs = {"word_to_ind": get_word_to_ind(words_list)}
        elif data_type in (W2V_SEQUENCE, W2V_AVERAGE):
            # Passing None on to np.zeros(embedding_dim) would fail once the first example
            # is built, so fall back to the standard word2vec dimension.
            if embedding_dim is None:
                embedding_dim = W2V_EMBEDDING_DIM
            word_to_vec = create_or_load_slim_w2v(words_list)
            if data_type == W2V_SEQUENCE:
                self.sent_func = sentence_to_embedding
                self.sent_func_kwargs = {"seq_len": SEQ_LEN,
                                         "word_to_vec": word_to_vec,
                                         "embedding_dim": embedding_dim
                                         }
            else:
                self.sent_func = get_w2v_average
                self.sent_func_kwargs = {"word_to_vec": word_to_vec,
                                         "embedding_dim": embedding_dim
                                         }
        else:
            raise ValueError("invalid data_type: {}".format(data_type))

        # map data splits to torch datasets and iterators (only the train split is shuffled)
        self.torch_datasets = {k: OnlineDataset(sentences, self.sent_func, self.sent_func_kwargs) for
                               k, sentences in self.sentences.items()}
        self.torch_iterators = {k: DataLoader(dataset, batch_size=batch_size, shuffle=k == TRAIN)
                                for k, dataset in self.torch_datasets.items()}

    def get_torch_iterator(self, data_subset=TRAIN):
        """
        :param data_subset: one of TRAIN VAL and TEST
        :return: torch batches iterator for this part of the dataset
        """
        return self.torch_iterators[data_subset]

    def get_labels(self, data_subset=TRAIN):
        """
        :param data_subset: one of TRAIN VAL and TEST
        :return: numpy array with the labels of the requested part of the dataset in the same order of the
                 examples.
        """
        return np.array([sent.sentiment_class for sent in self.sentences[data_subset]])

    def get_input_shape(self):
        """
        :return: the shape of a single example from this dataset (only of x, ignoring y the label).
        """
        return self.torch_datasets[TRAIN][0][0].shape
# ------------------------------------ Models ---------------------------------------------------- | |
class LSTM(nn.Module):
    """
    A bidirectional LSTM for sentiment analysis: the final hidden states of the two directions
    of the last layer are concatenated and mapped by a linear layer to a single logit.
    """

    def __init__(self, embedding_dim, hidden_dim, n_layers, dropout):
        """
        :param embedding_dim: size of each input embedding vector
        :param hidden_dim: size of the hidden state of each direction
        :param n_layers: number of stacked LSTM layers
        :param dropout: dropout probability applied between stacked LSTM layers
        """
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers and emits a warning when a
        # non-zero value is given for a single layer; passing 0 there changes nothing else.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=lstm_dropout,
                            bidirectional=True, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_dim * 2, out_features=1)

    def forward(self, x):
        """
        :param x: batch of embedded sentences, batch dimension first
        :return: tensor of shape (batch, 1) holding the raw (pre-sigmoid) logits
        """
        # nn.LSTM returns (output, (h_n, c_n)): the hidden state comes first, the cell
        # state second. Only the hidden state is used for classification.
        _, (h_n, _) = self.lstm(x)
        # h_n is ordered layer by layer, so its last two entries are the forward and the
        # backward direction of the top layer regardless of n_layers.
        last_hidden = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.linear(last_hidden)

    def predict(self, x):
        """
        :param x: batch of embedded sentences, batch dimension first
        :return: tensor of shape (batch, 1) with the sigmoid of the logits
        """
        # torch.sigmoid is the functional form; nn.Sigmoid is a module class and cannot be
        # called with the tensor as constructor argument.
        return torch.sigmoid(self.forward(x))
class LogLinear(nn.Module):
    """
    general class for the log-linear models for sentiment analysis: a single linear layer
    which maps the input vector to one logit.
    """

    def __init__(self, embedding_dim):
        """
        :param embedding_dim: size of the input vectors
        """
        super().__init__()
        # Initialize Model
        self.linear1 = nn.Linear(in_features=embedding_dim, out_features=1)

    def forward(self, x):
        """
        :param x: batch of input vectors
        :return: the raw (pre-sigmoid) logits, one per example
        """
        return self.linear1(x)

    def predict(self, x):
        """
        :param x: batch of input vectors
        :return: the sigmoid of the logits, one value per example
        """
        # torch.sigmoid is the functional form; nn.Sigmoid is a module class and cannot be
        # called with the tensor as constructor argument.
        return torch.sigmoid(self.forward(x))
# ------------------------- training functions ------------- | |
def binary_accuracy(preds, y):
    """
    Computes the accuracy of the predictions, relative to the labels. The first column of
    preds is turned into a hard 0/1 decision with a 0.5 threshold and compared to y.
    NOTE(review): the 0.5 threshold fits probabilities; raw logits should presumably go
    through a sigmoid before being passed in - confirm against the callers.
    :param preds: a two dimensional tensor of predictions, one row per example
    :param y: a vector of true labels
    :return: scalar value - (<number of accurate predictions> / <number of examples>)
    """
    hard_decisions = preds[:, 0] >= 0.5
    n_correct = (hard_decisions == y).sum(dtype=torch.float64)
    return (n_correct / y.shape[0]).item()
def train_epoch(model, data_iterator, optimizer, criterion):
    """
    This method operates one epoch (pass over the whole train set) of training of the given model,
    and returns the accuracy and loss for this epoch, both averaged over all the examples seen.
    :param model: the model we're currently training; its output is treated as raw logits
    :param data_iterator: an iterator, iterating over the training data for the model.
    :param optimizer: the optimizer object for the training process.
    :param criterion: the criterion object for the training process.
    :return: tuple of (average accuracy, average loss) over the epoch
    :raises ValueError: if data_iterator yields no examples
    """
    model.train()
    n_examples = 0
    acc_sum = 0.0
    loss_sum = 0.0
    for x, y in data_iterator:
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits[:, 0], y)
        loss.backward()
        optimizer.step()

        batch_size = y.shape[0]
        n_examples += batch_size
        # Weight by batch size so that a smaller final batch does not skew the averages
        # (assumes the criterion returns a per-batch mean, as nn.BCEWithLogitsLoss does
        # by default).
        loss_sum += loss.item() * batch_size
        # binary_accuracy thresholds at 0.5, so it has to be given probabilities and not
        # the raw logits.
        acc_sum += binary_accuracy(torch.sigmoid(logits.detach()), y) * batch_size
    if n_examples == 0:
        raise ValueError("data_iterator yielded no examples")
    return acc_sum / n_examples, loss_sum / n_examples
def evaluate(model, data_iterator, criterion):
    """
    evaluate the model performance on the given data. The model is switched to eval mode and
    no gradients are tracked.
    :param model: one of our models; its output is treated as raw logits
    :param data_iterator: torch data iterator for the relevant subset
    :param criterion: the loss criterion used for evaluation
    :return: tuple of (average accuracy over all examples, average loss over all examples)
    :raises ValueError: if data_iterator yields no examples
    """
    model.eval()
    n_examples = 0
    acc_sum = 0.0
    loss_sum = 0.0
    # Evaluation never calls backward(), so skip building the autograd graph.
    with torch.no_grad():
        for x, y in data_iterator:
            logits = model(x)
            loss = criterion(logits[:, 0], y)

            batch_size = y.shape[0]
            n_examples += batch_size
            # Weight by batch size so that a smaller final batch does not skew the averages
            # (assumes the criterion returns a per-batch mean, as nn.BCEWithLogitsLoss does
            # by default).
            loss_sum += loss.item() * batch_size
            # binary_accuracy thresholds at 0.5, so it has to be given probabilities and
            # not the raw logits.
            acc_sum += binary_accuracy(torch.sigmoid(logits), y) * batch_size
    if n_examples == 0:
        raise ValueError("data_iterator yielded no examples")
    return acc_sum / n_examples, loss_sum / n_examples
def get_predictions_for_data(model, data_iter):
    """
    Iterates over all batches of examples from data_iter and returns all of the model's
    predictions, in the same order as the examples returned by data_iter. The model is
    switched to eval mode and no gradients are tracked.
    :param model: one of the models implemented in the exercise; its output is treated as
                  raw logits with one row per example
    :param data_iter: torch iterator as given by the DataManager (yields (x, y) batches)
    :return: one dimensional torch tensor with the predicted probability (sigmoid of the
             logit) of every example; an empty tensor when data_iter yields nothing
    """
    model.eval()
    batch_predictions = []
    with torch.no_grad():
        for x, _ in data_iter:
            # The sigmoid is applied directly to the logits, so only forward() is needed.
            batch_predictions.append(torch.sigmoid(model(x))[:, 0])
    if not batch_predictions:
        return torch.empty(0)
    return torch.cat(batch_predictions)
def train_model(model, data_manager, n_epochs, lr, weight_decay=0.):
    """
    Runs the full training procedure for the given model: every epoch trains on the TRAIN
    split, evaluates on the VAL split and prints the validation accuracy. The optimization is
    done using the Adam optimizer with all parameters but learning rate and weight decay set
    to default, and nn.BCEWithLogitsLoss as the criterion.
    :param model: module of one of the models implemented in the exercise
    :param data_manager: the DataManager object
    :param n_epochs: number of times to go over the whole training set
    :param lr: learning rate to be used for optimization
    :param weight_decay: parameter for l2 regularization
    :return: [[train accuracies, train losses], [validation accuracies, validation losses]],
             each inner list holding one value per epoch
    """
    optimizer = optim.Adam(params=model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.BCEWithLogitsLoss()

    # per-split history: index 0 collects accuracies, index 1 collects losses
    history = {TRAIN: ([], []), VAL: ([], [])}
    for epoch in range(n_epochs):
        train_acc, train_loss = train_epoch(model, data_manager.get_torch_iterator(TRAIN), optimizer, criterion)
        val_acc, val_loss = evaluate(model, data_manager.get_torch_iterator(VAL), criterion)
        history[TRAIN][0].append(train_acc)
        history[TRAIN][1].append(train_loss)
        history[VAL][0].append(val_acc)
        history[VAL][1].append(val_loss)
        print(f'epoch {epoch} validatin acc {val_acc}')
    return [list(history[TRAIN]), list(history[VAL])]
def train_log_linear_with_one_hot():
    """
    Trains and evaluates the log linear model with the averaged one hot representation
    (20 epochs, batches of 64, learning rate 0.01, weight decay 0.0001).
    :return: the per-epoch accuracy / loss history as returned by train_model
    """
    training_params = dict(n_epochs=20, lr=0.01, weight_decay=0.0001)
    manager = DataManager(batch_size=64)
    # the input dimension of the model is the length of a single example vector
    input_dim = manager.get_input_shape()[0]
    classifier = LogLinear(input_dim)
    return train_model(classifier, manager, **training_params)
def train_log_linear_with_w2v():
    """
    Trains and evaluates the log linear model with the averaged word embeddings
    representation (20 epochs, batches of 64, learning rate 0.01, weight decay 0.0001).
    :return: the per-epoch accuracy / loss history as returned by train_model
    """
    training_params = dict(n_epochs=20, lr=0.01, weight_decay=0.0001)
    manager = DataManager(batch_size=64, data_type="w2v_average", embedding_dim=300)
    # the input dimension of the model is the length of a single example vector
    input_dim = manager.get_input_shape()[0]
    classifier = LogLinear(input_dim)
    return train_model(classifier, manager, **training_params)
def train_lstm_with_w2v():
    """
    Trains and evaluates the LSTM model on sequences of word embeddings
    (20 epochs, batches of 64, learning rate 0.01, weight decay 0.0001; a single layer
    LSTM with hidden size 100 and dropout 0.5 over 300 dimensional embeddings).
    :return: the per-epoch accuracy / loss history as returned by train_model
    """
    # Training Params
    training_params = dict(n_epochs=20, lr=0.01, weight_decay=0.0001)
    manager = DataManager(batch_size=64, data_type="w2v_sequence", embedding_dim=300)
    # Model Params
    network = LSTM(embedding_dim=300, hidden_dim=100, n_layers=1, dropout=0.5)
    return train_model(network, manager, **training_params)
if __name__ == "__main__":
    # Script entry point: only the LSTM experiment runs. The two log linear experiments
    # are kept here, disabled, so they can be switched back on:
    # print(train_log_linear_with_one_hot())
    # print(train_log_linear_with_w2v())
    train_lstm_with_w2v()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment