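A minimal PyTorch Dataset for multi-label toxic comment classification: it builds a gensim Dictionary vocabulary over the pre-tokenized texts, maps tokens to index tensors, and right-pads each sentence to the length of the longest one in the corpus.
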
from gensim.corpora import Dictionary

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset


class ToxicDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        # Build the vocabulary from the tokenized texts, then reserve
        # index 0 for padding and index 1 for unknown tokens.
        special_tokens = {'<pad>': 0, '<unk>': 1}
        self.vocab = Dictionary(texts)
        self.vocab.patch_with_special_tokens(special_tokens)
        # Vectorize labels.
        self.labels = torch.tensor(labels)
        # Keep track of how many data points there are.
        self._len = len(texts)
        # Find the longest text in the data.
        self.max_len = max(len(txt) for txt in texts)

    def __getitem__(self, index):
        vectorized_sent = self.vectorize(self.texts[index])
        # Record the true length before padding.
        sent_len = len(vectorized_sent)
        # To pad the sentence:
        # Pad left = 0; pad right = max_len - len of sent.
        # 'constant' mode fills with 0, i.e. the '<pad>' index.
        pad_dim = (0, self.max_len - sent_len)
        vectorized_sent = F.pad(vectorized_sent, pad_dim, 'constant')
        return {'x': vectorized_sent,
                'y': self.labels[index],
                'x_len': sent_len}

    def __len__(self):
        return self._len

    def vectorize(self, tokens):
        """
        :param tokens: Tokens that should be vectorized.
        :type tokens: list(str)
        """
        # See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2idx
        # Map out-of-vocabulary tokens to the '<unk>' index and cast the
        # list of indices into a torch tensor directly.
        return torch.tensor(self.vocab.doc2idx(tokens, unknown_word_index=1))

    def unvectorize(self, indices):
        """
        :param indices: Indices that should be converted back to tokens.
        :type indices: list(int)
        """
        return [self.vocab[i] for i in indices]

label_column_names = "toxic severe_toxic obscene threat insult identity_hate".split()
toxic_data = ToxicDataset(df_train['comment_text_tokenzied'],
                          df_train[label_column_names].values)
toxic_data[113]
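
As a usage sketch: because every 'x' is padded to the same max_len, the dataset drops straight into a torch DataLoader with the default collate function. The batch size and the tokens below are arbitrary, illustrative choices.

from torch.utils.data import DataLoader

dataloader = DataLoader(toxic_data, batch_size=32, shuffle=True)
batch = next(iter(dataloader))
print(batch['x'].shape)    # (32, toxic_data.max_len)
print(batch['y'].shape)    # (32, 6), one column per label in label_column_names
print(batch['x_len'][:5])  # true (unpadded) sentence lengths

# Round-trip sanity check through the vocabulary (hypothetical tokens):
indices = toxic_data.vectorize(['this', 'comment', 'is', 'fine'])
print(toxic_data.unvectorize(indices.tolist()))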