rajy4683/convs2s.py Secret

## convs2s.py
def tokenize_de(text):
    """
    Split a German string into a list of token strings.

    The text is passed through ``spacy_de.tokenizer`` and the ``text``
    attribute of every resulting token is collected, in order.
    """
    doc = spacy_de.tokenizer(text)
    return [token.text for token in doc]

def tokenize_en(text):
    """
    Split an English string into a list of token strings.

    The text is passed through ``spacy_en.tokenizer`` and the ``text``
    attribute of every resulting token is collected, in order.
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

## Source-side field: German text, tokenized with the custom spaCy tokenizer
SRC = Field(
    tokenize=tokenize_de,  ## custom tokenizer for German
    init_token='<sos>',    ## start-of-sequence marker
    eos_token='<eos>',     ## end-of-sequence marker
    lower=True,            ## convert to lower case
    batch_first=True,
)


## Target-side field: English text, tokenized with the custom spaCy tokenizer
TRG = Field(
    tokenize=tokenize_en,  ## custom tokenizer for English
    init_token='<sos>',    ## start-of-sequence marker
    eos_token='<eos>',     ## end-of-sequence marker
    lower=True,            ## convert to lower case
    batch_first=True,
)

### Multi30K already has train(29000), valid(1014) and test(1000) data splits.
### German ('.de') files are read into SRC and English ('.en') files into TRG.
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(SRC, TRG))

### Each split has a list of "Example" objects that contain the actual data;
### an example exposes its tokenized sentences as `.src` and `.trg`.
sample_example = train_data.examples[100]
sample_src = sample_example.src
sample_trg = sample_example.trg
print("Sample Source sentence:{} Length: {}".format(sample_src, len(sample_src)))
### The second line shows the target (English) side, so label it as such
print("Sample Target sentence:{} Length: {}".format(sample_trg, len(sample_trg)))

### Sample output
"""
Sample Source sentence:['männliches', 'kleinkind', 'in', 'einem', 'roten', 'hut', ',', 'das', 'sich', 'an',
                         'einem', 'geländer', 'festhält', '.'] Length: 14
Sample Target sentence:['toddler', 'boy', 'in', 'a', 'red', 'hat', 'holding', 'on', 'to', 'some', 'railings', '.'] Length: 12
"""
	def tokenize_de(text):
	    """
	    Tokenizes German text from a string into a list of strings.

	    Each element is the ``text`` of one token produced by
	    ``spacy_de.tokenizer``, in order.
	    """
	    return [tok.text for tok in spacy_de.tokenizer(text)]

	def tokenize_en(text):
	    """
	    Tokenizes English text from a string into a list of strings.

	    Each element is the ``text`` of one token produced by
	    ``spacy_en.tokenizer``, in order.
	    """
	    return [tok.text for tok in spacy_en.tokenizer(text)]

	## Source-side field: German text, tokenized with the custom spaCy tokenizer
	SRC = Field(
	    tokenize=tokenize_de,  ## custom tokenizer for German
	    init_token='<sos>',    ## start-of-sequence marker
	    eos_token='<eos>',     ## end-of-sequence marker
	    lower=True,            ## convert to lower case
	    batch_first=True,
	)


	## Target-side field: English text, tokenized with the custom spaCy tokenizer
	TRG = Field(
	    tokenize=tokenize_en,  ## custom tokenizer for English
	    init_token='<sos>',    ## start-of-sequence marker
	    eos_token='<eos>',     ## end-of-sequence marker
	    lower=True,            ## convert to lower case
	    batch_first=True,
	)

	### Multi30K already has train(29000), valid(1014) and test(1000) data splits.
	### German ('.de') files are read into SRC and English ('.en') files into TRG.
	train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
	                                                    fields=(SRC, TRG))

	### Each split has a list of "Example" objects that contain the actual data;
	### an example exposes its tokenized sentences as `.src` and `.trg`.
	sample_example = train_data.examples[100]
	sample_src = sample_example.src
	sample_trg = sample_example.trg
	print("Sample Source sentence:{} Length: {}".format(sample_src, len(sample_src)))
	### The second line shows the target (English) side, so label it as such
	print("Sample Target sentence:{} Length: {}".format(sample_trg, len(sample_trg)))

	### Sample output
	"""
	Sample Source sentence:['männliches', 'kleinkind', 'in', 'einem', 'roten', 'hut', ',', 'das', 'sich', 'an',
	'einem', 'geländer', 'festhält', '.'] Length: 14
	Sample Target sentence:['toddler', 'boy', 'in', 'a', 'red', 'hat', 'holding', 'on', 'to', 'some', 'railings', '.'] Length: 12
	"""