-
-
Save rajy4683/9116308120efb87b17f1da15330a4ada to your computer and use it in GitHub Desktop.
tokenizer utilities to be passed to Field
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def tokenize_de(text):
    """
    Split a German string into tokens and return them as a list of strings.

    Relies on the module-level ``spacy_de`` object, whose ``tokenizer`` is
    applied to ``text``; the ``.text`` of every resulting token is collected.
    """
    tokens = spacy_de.tokenizer(text)
    return [token.text for token in tokens]
def tokenize_en(text):
    """
    Split an English string into tokens and return them as a list of strings.

    Relies on the module-level ``spacy_en`` object, whose ``tokenizer`` is
    applied to ``text``; the ``.text`` of every resulting token is collected.
    """
    tokens = spacy_en.tokenizer(text)
    return [token.text for token in tokens]
# Source-side field: German sentences, split with the custom German tokenizer.
SRC = Field(
    tokenize=tokenize_de,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,  # convert to lower case
    batch_first=True,
)

# Target-side field: English sentences, split with the custom English tokenizer.
TRG = Field(
    tokenize=tokenize_en,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,  # convert to lower case
    batch_first=True,
)
# Multi30k already ships with a train (29000) / valid (1014) / test (1000) split,
# so the three datasets are obtained in a single call. German ('.de') is paired
# with SRC and English ('.en') with TRG.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)
# Each dataset object holds a list of Example objects in `.examples`; every
# Example carries the actual data in its `src` and `trg` attributes.
sample_example = train_data.examples[100]
sample_src = sample_example.src
sample_trg = sample_example.trg

# Print both sides of the same example together with their token counts.
# The second line reports the target (English) side, so it is labelled "Target".
print("Sample Source sentence:{} Length: {}".format(sample_src, len(sample_src)))
print("Sample Target sentence:{} Length: {}".format(sample_trg, len(sample_trg)))

# Sample output (the source list is wrapped here for readability):
#
# Sample Source sentence:['männliches', 'kleinkind', 'in', 'einem', 'roten', 'hut', ',', 'das', 'sich', 'an',
#  'einem', 'geländer', 'festhält', '.'] Length: 14
# Sample Target sentence:['toddler', 'boy', 'in', 'a', 'red', 'hat', 'holding', 'on', 'to', 'some', 'railings', '.'] Length: 12
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment