HuangWeiKulish/transformer: pre-processing.py

## transformer: pre-processing.py
def tokenizer(source, target):
    """
    Create and fit a pair of Keras tokenizers, one per side of the corpus.

    Relies on ``tf`` (TensorFlow) being in scope in the enclosing module.

    :param source: texts for the input side, e.g. a list of sentence strings
        (a list of token lists is also accepted by ``fit_on_texts``)
    :param target: texts for the output side, same form as ``source``
    :return: tk_in, tk_out -- the fitted input and output tokenizers
    """
    new_tokenizer = tf.keras.preprocessing.text.Tokenizer

    # filters='' switches off Keras' default character filtering (which
    # includes '<' and '>'), so '<start>' / '<end>' survive as whole tokens.
    source_tk = new_tokenizer(filters='')
    source_tk.fit_on_texts(source)

    target_tk = new_tokenizer(filters='')
    # The markers are fitted as extra texts so they receive vocabulary ids
    # even though they never occur in the raw target sentences.
    for corpus in (target, ['<start>', '<end>']):
        target_tk.fit_on_texts(corpus)

    return source_tk, target_tk


def preprocess(source, target, tk_in, tk_out):
    """
    Turn raw text pairs into post-padded integer id sequences.

    Relies on ``tf`` (TensorFlow) being in scope in the enclosing module.

    :param source: texts for the input side, passed to
        ``tk_in.texts_to_sequences``
    :param target: list of strings; every element is concatenated with the
        '<start> ' / ' <end>' markers, so each one must be a ``str``
    :param tk_in: tf.keras.preprocessing.text.Tokenizer used for ``source``
    :param tk_out: tf.keras.preprocessing.text.Tokenizer used for ``target``
    :return: x_en, x_de_in, x_de_out -- padded sequences for ``source``, for
        the '<start>'-prefixed targets and for the '<end>'-suffixed targets
    """
    def encode(tk, texts):
        # Texts -> id sequences, then pad at the end of each sequence up to
        # the longest sequence in this call.
        id_sequences = tk.texts_to_sequences(texts)
        return tf.keras.preprocessing.sequence.pad_sequences(
            id_sequences, padding='post')

    with_start = ['<start> ' + sentence for sentence in target]
    with_end = [sentence + ' <end>' for sentence in target]

    x_en = encode(tk_in, source)
    x_de_in = encode(tk_out, with_start)
    x_de_out = encode(tk_out, with_end)
    return x_en, x_de_in, x_de_out


"""
source = ['I read books', 'I love you', 'sweet cake']
target = ['Я читаю книги', 'Я люблю тебе', 'солодкий торт']

tk_in, tk_out = tokenizer(source, target)
x_en, x_de_in, x_de_out = preprocess(source, target, tk_in, tk_out)
"""
	def tokenizer(source, target):
		"""
		Create and fit one Keras tokenizer for each side of the corpus.

		Relies on ``tf`` (TensorFlow) being in scope in the enclosing module.

		:param source: texts for the input side, e.g. a list of sentence
			strings (a list of token lists is also accepted by
			``fit_on_texts``)
		:param target: texts for the output side, same form as ``source``
		:return: tk_in, tk_out -- the fitted input and output tokenizers
		"""
		# filters='' switches off Keras' default character filtering (which
		# includes '<' and '>'), so '<start>' / '<end>' stay whole tokens.
		tk_in = tf.keras.preprocessing.text.Tokenizer(filters='')
		tk_out = tf.keras.preprocessing.text.Tokenizer(filters='')
		tk_in.fit_on_texts(source)
		tk_out.fit_on_texts(target)
		# Fit the markers separately so they get vocabulary ids even though
		# they never occur in the raw target sentences.
		tk_out.fit_on_texts(['<start>', '<end>'])
		return tk_in, tk_out


	def preprocess(source, target, tk_in, tk_out):
		"""
		Turn raw text pairs into post-padded integer id sequences.

		Relies on ``tf`` (TensorFlow) being in scope in the enclosing module.

		:param source: texts for the input side, passed to
			``tk_in.texts_to_sequences``
		:param target: list of strings; every element is concatenated with
			the '<start> ' / ' <end>' markers, so each one must be a ``str``
		:param tk_in: tf.keras.preprocessing.text.Tokenizer for ``source``
		:param tk_out: tf.keras.preprocessing.text.Tokenizer for ``target``
		:return: x_en, x_de_in, x_de_out -- padded sequences for ``source``,
			for the '<start>'-prefixed targets and for the '<end>'-suffixed
			targets
		"""
		tar_in = ['<start> ' + tg for tg in target]
		tar_out = [tg + ' <end>' for tg in target]
		# padding='post' pads at the end of each sequence, up to the longest
		# sequence within each call.
		x_en = tf.keras.preprocessing.sequence.pad_sequences(tk_in.texts_to_sequences(source), padding='post')
		x_de_in = tf.keras.preprocessing.sequence.pad_sequences(tk_out.texts_to_sequences(tar_in), padding='post')
		x_de_out = tf.keras.preprocessing.sequence.pad_sequences(tk_out.texts_to_sequences(tar_out), padding='post')
		return x_en, x_de_in, x_de_out


	"""
	source = ['I read books', 'I love you', 'sweet cake']
	target = ['Я читаю книги', 'Я люблю тебе', 'солодкий торт']

	tk_in, tk_out = tokenizer(source, target)
	x_en, x_de_in, x_de_out = preprocess(source, target, tk_in, tk_out)
	"""