Created
May 9, 2020 13:49
-
-
Save ThibaudLamothe/debbf94eabee6e1c68658ac2109d9e1f to your computer and use it in GitHub Desktop.
Preparation of a batch corpus with maximum size
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def prepare_batch_corpus(corpus, max_caracter=5000, joiner=" "):
    """Group consecutive sentences of *corpus* into joined text batches.

    Each output batch is the ``joiner``-join of as many consecutive
    sentences as fit under ``max_caracter`` characters.  A single sentence
    longer than the limit becomes a batch of its own (rather than being
    dropped, as the original implementation did).

    Parameters
    ----------
    corpus : list[str]
        Sentences to batch, in order.
    max_caracter : int, optional
        Soft upper bound on the character length of one joined batch.
        (Name kept as-is for backward compatibility with existing callers.)
    joiner : str, optional
        Separator inserted between sentences inside a batch.  Previously
        this was an undefined free variable; it is now an explicit
        parameter defaulting to a single space.

    Returns
    -------
    list[str]
        The joined batches; every input sentence appears in exactly one
        batch, and order is preserved.
    """
    batch_corpus = []   # finished batches (the function's output)
    batch = []          # sentences accumulated for the current batch
    batch_length = 0    # character count of the current batch once joined

    for sentence in corpus:
        # Cost of adding this sentence: its length plus one separator,
        # except for the first sentence of a batch (no separator needed).
        added = len(sentence) + (len(joiner) if batch else 0)

        if batch and batch_length + added >= max_caracter:
            # Current batch is full: finalize it and start a fresh batch
            # seeded with this sentence.  (The original code dropped the
            # sentence that triggered finalization.)
            batch_corpus.append(joiner.join(batch))
            batch = []
            batch_length = 0
            added = len(sentence)

        batch.append(sentence)
        batch_length += added

    # Flush the trailing partial batch, if any (fixes the lost-tail bug).
    if batch:
        batch_corpus.append(joiner.join(batch))

    return batch_corpus
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment