# eileen-code4fun/translation_prepro.py

## translation_prepro.py
def standardize(text):
    """Clean up raw text and wrap it in ``[START]`` / ``[END]`` markers.

    The steps run in this order: NFKD-normalize, lowercase, delete every
    character other than space, ``a``-``z`` and ``. ? ! , ¿``, surround
    each of those punctuation marks with spaces, strip the ends, and
    finally join ``[START]``, the cleaned text and ``[END]`` with single
    spaces.

    Args:
        text: The string input handed to ``tf_text.normalize_utf8``
            (presumably a ``tf.string`` tensor -- confirm against caller).

    Returns:
        The result of ``tf.strings.join``: the cleaned text with the
        start and end markers attached.
    """
    # NFKD splits accented characters into a base letter plus separate
    # accent marks; the marks are not in the allowed set below, so the
    # filter removes them and only the plain letter remains.
    decomposed = tf_text.normalize_utf8(text, 'NFKD')
    lowered = tf.strings.lower(decomposed)

    # Allowed characters: space, a to z, and a few punctuation marks.
    filtered = tf.strings.regex_replace(lowered, '[^ a-z.?!,¿]', '')

    # Pad each punctuation mark with spaces so it stands apart from the
    # neighbouring word (\0 re-inserts the matched character).
    padded = tf.strings.regex_replace(filtered, '[.?!,¿]', r' \0 ')

    # Drop leading/trailing whitespace before attaching the markers.
    trimmed = tf.strings.strip(padded)

    return tf.strings.join(['[START]', trimmed, '[END]'], separator=' ')

# One TextVectorization layer per language. Both use the custom
# `standardize` function above and cap the vocabulary at 5000 tokens.
eng_text_processor = tf.keras.layers.TextVectorization(
    standardize=standardize,
    max_tokens=5000,
)
spa_text_processor = tf.keras.layers.TextVectorization(
    standardize=standardize,
    max_tokens=5000,
)

# Adapt each layer on its own dataset, fed in batches of 128 examples.
eng_text_processor.adapt(eng_dataset.batch(128))
spa_text_processor.adapt(spa_dataset.batch(128))
	def standardize(text):
		"""Clean up raw text and wrap it in ``[START]`` / ``[END]`` markers.

		Applies, in order: NFKD normalization, lowercasing, removal of every
		character other than space, ``a``-``z`` and ``. ? ! , ¿``, spacing
		around those punctuation marks, whitespace stripping, and finally
		joining ``[START]``, the text and ``[END]`` with single spaces.

		Args:
			text: The string input handed to ``tf_text.normalize_utf8``
				(presumably a ``tf.string`` tensor -- confirm against caller).

		Returns:
			The result of ``tf.strings.join``: the cleaned text with the
			start and end markers attached.
		"""
		# Split accented characters (the accent marks are removed by the
		# character filter below because they are not in the kept set).
		text = tf_text.normalize_utf8(text, 'NFKD')
		text = tf.strings.lower(text)
		# Keep space, a to z, and select punctuation.
		text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
		# Add spaces around punctuation (\0 re-inserts the matched character).
		text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
		# Strip whitespace.
		text = tf.strings.strip(text)

		text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
		return text

	# One TextVectorization layer per language. Both use the `standardize`
	# callable and cap the vocabulary at 5000 tokens.
	eng_text_processor = tf.keras.layers.TextVectorization(
		standardize=standardize,
		max_tokens=5000,
	)
	spa_text_processor = tf.keras.layers.TextVectorization(
		standardize=standardize,
		max_tokens=5000,
	)

	# Adapt each layer on its own dataset, fed in batches of 128 examples.
	eng_text_processor.adapt(eng_dataset.batch(128))
	spa_text_processor.adapt(spa_dataset.batch(128))