Created
July 19, 2021 18:28
-
-
Save lazuxd/c74a8264f78708d66f2a6d323320ea88 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
EOS = "\n"  # End-of-sentence marker: the newline character (code point 10)
def build_vocabulary() -> list:
    """Build the character vocabulary from a range of ASCII code points.

    Returns:
        A list of single-character strings for code points 10 through 127
        inclusive, in ascending order (118 entries). Index 0 is therefore
        the newline character ``chr(10)``.
    """
    return [chr(code_point) for code_point in range(10, 128)]
def word2index(vocabulary: list, word: str) -> int:
    """Return the position of ``word`` in ``vocabulary``.

    Args:
        vocabulary: List of known tokens.
        word: Token to look up.

    Returns:
        The index of the first entry in ``vocabulary`` equal to ``word``.

    Raises:
        ValueError: If ``word`` does not occur in ``vocabulary``.
    """
    return vocabulary.index(word)
def words2onehot(vocabulary: list, words: list) -> np.ndarray:
    """Encode ``words`` as a one-hot matrix over ``vocabulary``.

    Args:
        vocabulary: Ordered list of known tokens. A token's column is the
            index of its first occurrence in this list.
        words: Tokens to encode, one per output row.

    Returns:
        A float array of shape ``(len(words), len(vocabulary))`` where row
        ``i`` is all zeros except for a 1 in the column of ``words[i]``.
        An empty ``words`` yields an array of shape ``(0, len(vocabulary))``.

    Raises:
        ValueError: If any word is not present in ``vocabulary``.
    """
    # Build the token -> column lookup once instead of scanning the whole
    # vocabulary for every word. setdefault keeps the first occurrence,
    # which matches what list.index would return.
    column_of = {}
    for column, token in enumerate(vocabulary):
        column_of.setdefault(token, column)

    try:
        # The explicit integer dtype matters for empty input: np.array([])
        # defaults to float64, which NumPy rejects as an index array.
        indices = np.array(
            [column_of[word] for word in words], dtype=np.intp
        )
    except KeyError as err:
        # Keep the exception type callers got from list.index.
        raise ValueError(f"{err.args[0]!r} is not in vocabulary") from None

    n_words = len(words)
    onehot = np.zeros((n_words, len(vocabulary)))
    onehot[np.arange(n_words), indices] = 1
    return onehot
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment