Created
December 15, 2021 07:05
-
-
Save konverner/34ca1b13812827f4fdd3fc1b34279850 to your computer and use it in GitHub Desktop.
functions for converting text into encoded tokens for a dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_vocab(texts):
    """Build id<->word lookup tables from tokenized texts.

    Ids 0 and 1 are reserved for the special tokens ``"<pad>"`` and
    ``"<unk>"``; real words are lower-cased and numbered from 2 in order
    of first appearance.

    Args:
        texts: Iterable of texts, each an iterable of word strings.

    Returns:
        A tuple ``(id2word, word2id)`` of dicts that are exact inverses
        of each other.
    """
    id2word = {0: "<pad>", 1: "<unk>"}
    word2id = {"<pad>": 0, "<unk>": 1}
    # Start after the reserved ids so no word overwrites "<unk>".
    next_id = len(id2word)
    for text in texts:
        for word in text:
            word = word.lower()
            if word not in word2id:
                word2id[word] = next_id
                id2word[next_id] = word
                next_id += 1
    return id2word, word2id
def encode(text, word2id, size=64):
    """Encode a tokenized text as a tensor of word ids, padded to ``size``.

    Each word is lower-cased before lookup. Words missing from
    ``word2id`` map to the ``"<unk>"`` id (1 if the vocabulary has no
    such key) instead of raising ``KeyError``.

    Args:
        text: Iterable of word strings.
        word2id: Mapping from lower-cased word to integer id.
        size: Minimum output length; shorter inputs are right-padded
            with the ``"<pad>"`` id (0 if the vocabulary has no such key).

    Returns:
        A 1-D ``torch.Tensor`` of ids. Inputs longer than ``size`` are
        not truncated, so the result may exceed ``size``.
    """
    unk_id = word2id.get("<unk>", 1)
    pad_id = word2id.get("<pad>", 0)
    encoded = [word2id.get(word.lower(), unk_id) for word in text]
    # A negative repeat count yields an empty list, so long inputs pass through.
    encoded.extend([pad_id] * (size - len(encoded)))
    return torch.tensor(encoded)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment