Skip to content

Instantly share code, notes, and snippets.

@mmmayo13
Last active November 7, 2019 11:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mmmayo13/4fb6adb245532622d3e9ec12a8bb1f29 to your computer and use it in GitHub Desktop.
Save mmmayo13/4fb6adb245532622d3e9ec12a8bb1f29 to your computer and use it in GitHub Desktop.
class Vocabulary:
    """Bidirectional word <-> integer-index mapping built from sentences.

    Indices 0, 1 and 2 are reserved for the special "PAD", "SOS" and "EOS"
    entries; words added later receive consecutive indices starting at 3.
    Besides the mapping, the object tracks how often each word was added,
    how many sentences were added, and the token count of the longest one.
    """

    PAD_token = 0  # Used for padding short sentences
    SOS_token = 1  # Start-of-sentence token
    EOS_token = 2  # End-of-sentence token

    def __init__(self, name: str) -> None:
        """Create an empty vocabulary labelled *name*."""
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # The token constants live on the class, so they must be reached
        # through ``self``; a bare ``PAD_token`` is not visible from inside
        # a method and raises NameError.
        self.index2word = {
            self.PAD_token: "PAD",
            self.SOS_token: "SOS",
            self.EOS_token: "EOS",
        }
        # Next free index == number of entries so far (the special tokens).
        self.num_words = len(self.index2word)
        self.num_sentences = 0
        self.longest_sentence = 0

    def add_word(self, word: str) -> None:
        """Register one occurrence of *word*.

        A word seen for the first time gets the next free index and a count
        of 1; a known word only has its count incremented.
        """
        if word in self.word2index:
            # Word exists; increase word count
            self.word2count[word] += 1
            return
        # First entry of word into vocabulary
        index = self.num_words
        self.word2index[word] = index
        self.word2count[word] = 1
        self.index2word[index] = word
        self.num_words = index + 1

    def add_sentence(self, sentence: str) -> None:
        """Add every token of *sentence* and update the sentence statistics.

        Tokens are obtained with ``sentence.split(' ')``, i.e. split on
        single space characters only. Consecutive spaces (or an empty
        string) therefore produce empty-string tokens, which are added and
        counted like any other word. This is kept as-is so that callers
        tokenizing the same way can look up every token they produce.
        """
        words = sentence.split(' ')
        for word in words:
            self.add_word(word)
        # Remember the token count of the longest sentence seen so far.
        self.longest_sentence = max(self.longest_sentence, len(words))
        # Count the number of sentences
        self.num_sentences += 1

    def to_word(self, index: int) -> str:
        """Return the word stored at *index*.

        Raises:
            KeyError: if *index* has not been assigned.
        """
        return self.index2word[index]

    def to_index(self, word: str) -> int:
        """Return the index assigned to *word*.

        Raises:
            KeyError: if *word* was never added. The special "PAD", "SOS"
                and "EOS" entries are only present in the index -> word
                direction, so they raise here as well.
        """
        return self.word2index[word]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment