This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create overlapping token windows (seq_len + 1 tokens each) from a text.
def create_seq(text, seq_len=5):
    """Split ``text`` into overlapping windows of ``seq_len + 1`` tokens.

    The text is tokenized on whitespace with ``str.split()``. A window is
    emitted for every position from ``seq_len`` to the last token, each
    holding that token and the ``seq_len`` tokens before it; consecutive
    windows therefore overlap by ``seq_len`` tokens.

    NOTE(review): the extra token is presumably the prediction target that
    follows ``seq_len`` context tokens -- confirm against the caller.

    Args:
        text: String to split into windows.
        seq_len: Number of tokens preceding the final token of each window.
            Defaults to 5, giving 6-token windows.

    Returns:
        A list of strings. When ``text`` has more than ``seq_len`` tokens,
        each string is one window with its tokens joined by single spaces
        (so the original whitespace is normalized). Otherwise the list
        holds ``text`` itself, unmodified.
    """
    # Tokenize once; re-splitting the text for the length check, the range
    # bound and every slice made the function quadratic in token count.
    tokens = text.split()

    # Too few tokens for a single full window: hand the text back untouched.
    if len(tokens) <= seq_len:
        return [text]

    # `end` is the index of the last token in each window.
    return [
        " ".join(tokens[end - seq_len:end + 1])
        for end in range(seq_len, len(tokens))
    ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment