Last active
September 25, 2020 18:20
-
-
Save edumunozsala/cce0574fbabe957a943a0e255491bddb to your computer and use it in GitHub Desktop.
Code to create the vocabulary dictionaries for a character-level text generator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class CharVocab:
    """Character-level vocabulary mapping characters to integer ids and back.

    Attributes:
        type: Free-form label for the kind of vocabulary, stored as given.
        int2char: List of tokens; the position of a token is its integer id.
            The special tokens that are not ``None`` come first, in the
            order pad, eos, unk.
        char2int: Dict mapping each token in ``int2char`` to its index.
            It is empty until the instance has been called with some text.
    """

    def __init__(self, type_vocab, pad_token='<PAD>', eos_token='<EOS>', unk_token='<UNK>'):
        """Set up the vocabulary with its optional special tokens.

        Args:
            type_vocab: Label describing the vocabulary; stored in ``self.type``.
            pad_token: Padding token, or ``None`` to leave it out.
            eos_token: End-of-sequence token, or ``None`` to leave it out.
            unk_token: Unknown-character token, or ``None`` to leave it out.
        """
        self.type = type_vocab
        # Special tokens take the lowest ids; a token passed as None is skipped.
        self.int2char = [
            token
            for token in (pad_token, eos_token, unk_token)
            if token is not None
        ]
        self.char2int = {}

    def __call__(self, text):
        """Add the characters found in ``text`` to the vocabulary.

        Args:
            text: A string or an iterable of strings (e.g. a list of
                sentences). All strings are joined and their unique
                characters are collected.

        New characters are appended in sorted order so that the resulting
        ids are reproducible across runs (plain ``set`` iteration order for
        strings is not). Characters already in the vocabulary are skipped,
        so calling the instance repeatedly never creates duplicate entries.
        """
        already_known = set(self.int2char)
        new_chars = set(''.join(text)) - already_known
        self.int2char += sorted(new_chars)
        # Rebuild the reverse mapping so it always mirrors int2char.
        self.char2int = {char: ind for ind, char in enumerate(self.int2char)}
# Build a character vocabulary with only the unknown-character token enabled
# (padding and end-of-sequence tokens are switched off by passing None).
vocab = CharVocab('char', pad_token=None, eos_token=None, unk_token='<UNK>')
# `sentences` is expected to be defined earlier in the file
# (presumably the list of training sentences — confirm against the caller).
vocab(sentences)

# Show the vocabulary size and both lookup tables.
print('Length of vocabulary: ', len(vocab.int2char))
print('Int to Char: ', vocab.int2char)
print('Char to Int: ', vocab.char2int)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment