Skip to content

Instantly share code, notes, and snippets.

@HuangWeiKulish
Last active October 4, 2020 05:29
Show Gist options
  • Save HuangWeiKulish/bd2beeddb9fe3161d2f10297ad2ad2d5 to your computer and use it in GitHub Desktop.
Save HuangWeiKulish/bd2beeddb9fe3161d2f10297ad2ad2d5 to your computer and use it in GitHub Desktop.
def tokenizer(source, target):
    """Fit word-level Keras tokenizers on the source and target corpora.

    :param source: list of strings — source-language sentences whose
        tokens are separated by whitespace (e.g. ``'I read books'``)
    :param target: list of strings — target-language sentences
    :return: ``(tk_in, tk_out)`` — fitted ``Tokenizer`` objects for the
        source and target languages.  ``tk_out`` additionally contains the
        ``'<start>'`` and ``'<end>'`` markers that ``preprocess()`` uses to
        frame the decoder input/output sequences.
    """
    # filters='' disables Keras' default punctuation stripping, so the
    # angle brackets in '<start>'/'<end>' survive tokenization intact.
    tk_in = tf.keras.preprocessing.text.Tokenizer(filters='')
    tk_out = tf.keras.preprocessing.text.Tokenizer(filters='')
    tk_in.fit_on_texts(source)
    tk_out.fit_on_texts(target)
    # Register the decoder start/end markers in the target vocabulary so
    # texts_to_sequences() can map them to ids later.
    tk_out.fit_on_texts(['<start>', '<end>'])
    return tk_in, tk_out
def preprocess(source, target, tk_in, tk_out):
    """Turn raw sentence pairs into padded integer sequences for training.

    :param source: list of strings — source-language sentences
    :param target: list of strings — target-language sentences
    :param tk_in: fitted ``tf.keras.preprocessing.text.Tokenizer`` for the
        source language (see ``tokenizer()``)
    :param tk_out: fitted ``tf.keras.preprocessing.text.Tokenizer`` for the
        target language; must already know ``'<start>'`` and ``'<end>'``
    :return: ``(x_en, x_de_in, x_de_out)`` — post-padded integer arrays:
        encoder input, decoder input (``'<start>'`` + sentence) and decoder
        target (sentence + ``'<end>'``), shifted by one token for teacher
        forcing.
    """
    # Frame each target sentence for teacher forcing: the decoder consumes
    # '<start> ...' and is trained to predict '... <end>'.
    tar_in = ['<start> ' + tg for tg in target]
    tar_out = [tg + ' <end>' for tg in target]
    # padding='post' appends zeros so all rows in a batch share one length.
    x_en = tf.keras.preprocessing.sequence.pad_sequences(
        tk_in.texts_to_sequences(source), padding='post')
    x_de_in = tf.keras.preprocessing.sequence.pad_sequences(
        tk_out.texts_to_sequences(tar_in), padding='post')
    x_de_out = tf.keras.preprocessing.sequence.pad_sequences(
        tk_out.texts_to_sequences(tar_out), padding='post')
    return x_en, x_de_in, x_de_out
"""
source = ['I read books', 'I love you', 'sweet cake']
target = ['Я читаю книги', 'Я люблю тебе', 'солодкий торт']
tk_in, tk_out = tokenizer(source, target)
x_en, x_de_in, x_de_out = preprocess(source, target, tk_in, tk_out)
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment