@phisad
Created March 15, 2019 09:12
How to encode and pad texts for machine learning using Keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import hashing_trick


def preprocessing(questions, questions_max_length, vocabulary_size):
    """
    Stateless preprocessing of text questions: each question is hash-encoded
    into a sequence of integer word indices and padded to the maximal question
    length. The hash space is 1.3 times the vocabulary size to reduce hash
    collisions. Padding zeros are appended at the end of each question
    ("post" padding).

    @param questions: the text questions as a list of strings
    @param questions_max_length: the (global) maximal question length in words
    @param vocabulary_size: the (global) number of known words
    @return: the encoded and padded questions as a 2-D array
    """
    encoded_questions = [hashing_trick(question,
                                       round(vocabulary_size * 1.3),
                                       hash_function="md5")
                         for question in questions]
    return pad_sequences(encoded_questions,
                         maxlen=questions_max_length,
                         padding="post")
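To see what the two Keras helpers are doing without pulling in TensorFlow, here is a minimal, dependency-free sketch. It assumes whitespace tokenization and mirrors the md5-based hashing trick (map each word into the range 1..n-1, reserving 0 for padding) followed by "post" padding; the function and variable names are illustrative, not part of the Keras API.

```python
import hashlib


def hash_word(word, hash_space):
    # md5-based hashing trick: map a word to an integer in [1, hash_space - 1],
    # keeping index 0 free for the padding value
    digest = hashlib.md5(word.encode("utf-8")).hexdigest()
    return int(digest, 16) % (hash_space - 1) + 1


def preprocess_sketch(questions, max_length, hash_space):
    # encode each question as a list of hashed word indices
    encoded = [[hash_word(w, hash_space) for w in q.lower().split()]
               for q in questions]
    # truncate to max_length, then append zeros at the end ("post" padding)
    return [q[:max_length] + [0] * (max_length - len(q[:max_length]))
            for q in encoded]


padded = preprocess_sketch(["what color is the cat", "is it raining"],
                           max_length=6, hash_space=130)
```

Every row now has the same length, with zeros trailing the shorter question, which is exactly the shape a Keras embedding layer (with `mask_zero=True`) expects.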