Created
February 15, 2019 03:18
Star
You must be signed in to star a gist
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ToxicDataset(Dataset):
    """Dataset of tokenized texts, each padded to the longest text's length.

    Every item is a dict with three entries:

    * ``'x'``     -- 1-D tensor of vocabulary indices, right-padded with
      ``PAD_IDX`` up to ``max_len``.
    * ``'y'``     -- the label tensor stored for that data point.
    * ``'x_len'`` -- the number of real (unpadded) tokens in ``'x'``.
    """

    # Reserved vocabulary entries. The pad id doubles as the fill value used
    # when padding, and the unk id is what out-of-vocabulary tokens map to.
    PAD_TOKEN = '<pad>'
    UNK_TOKEN = '<unk>'
    PAD_IDX = 0
    UNK_IDX = 1

    def __init__(self, texts, labels):
        """
        :param texts: Tokenized documents, one sequence of tokens per item.
        :type texts: iterable of list(str)
        :param labels: Labels, one entry per document; converted with
            ``torch.tensor``.
        """
        # NOTE(review): items are fetched with ``texts[index]``; if ``texts``
        # is a pandas Series that lookup is label-based -- confirm its index
        # is 0..n-1.
        self.texts = texts
        # Build the vocabulary once, then reserve the special-token ids.
        self.vocab = Dictionary(texts)
        self.vocab.patch_with_special_tokens(
            {self.PAD_TOKEN: self.PAD_IDX, self.UNK_TOKEN: self.UNK_IDX}
        )
        # Vectorize labels.
        self.labels = torch.tensor(labels)
        # Keep track of how many data points.
        self._len = len(texts)
        # Find the longest text in the data (0 for an empty corpus).
        self.max_len = max((len(txt) for txt in texts), default=0)

    def __getitem__(self, index):
        """Return the padded indices, label and true length for one item."""
        vectorized_sent = self.vectorize(self.texts[index])
        # Record the length BEFORE padding; afterwards every sentence has
        # length max_len and the real length would be lost.
        sent_len = len(vectorized_sent)
        # Pad left = 0; pad right = max_len - len of sent.
        pad_dim = (0, self.max_len - sent_len)
        vectorized_sent = F.pad(vectorized_sent, pad_dim, 'constant',
                                value=self.PAD_IDX)
        return {'x': vectorized_sent,
                'y': self.labels[index],
                'x_len': sent_len}

    def __len__(self):
        """Return the number of data points."""
        return self._len

    def vectorize(self, tokens):
        """Convert tokens to a 1-D ``torch.long`` tensor of vocabulary ids.

        Tokens missing from the vocabulary map to ``UNK_IDX``.

        :param tokens: Tokens that should be vectorized.
        :type tokens: list(str)
        """
        # See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2idx
        indices = self.vocab.doc2idx(tokens, unknown_word_index=self.UNK_IDX)
        # Explicit dtype so an empty token list still yields an integer tensor.
        return torch.tensor(indices, dtype=torch.long)

    def unvectorize(self, indices):
        """Convert vocabulary ids back to their tokens.

        :param indices: Indices to convert back to tokens.
        :type indices: list(int) or 1-D integer tensor
        """
        # int() turns 0-d tensor elements into plain ints so they work as
        # lookup keys in the vocabulary.
        return [self.vocab[int(i)] for i in indices]
# The six label columns read from the training frame.
label_column_names = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# Wrap the tokenized comments and their label matrix in a ToxicDataset.
# NOTE(review): the column key below is spelled 'tokenzied' in the original;
# it has to match the DataFrame's actual column name, so it is kept as-is.
toxic_data = ToxicDataset(
    df_train['comment_text_tokenzied'],
    df_train[label_column_names].values,
)
# Fetch a single item (notebook-style inspection of the result).
toxic_data[113]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment