Created
July 28, 2020 06:42
-
-
Save prateekjoshi565/5c7bfc79f2f04bf7ab637a73481297ce to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# create overlapping token windows (seq_len + 1 tokens each) from a text
def create_seq(text: str, seq_len: int = 5) -> list:
    """Split *text* into overlapping windows of ``seq_len + 1`` tokens.

    Tokens are obtained with ``str.split()`` (whitespace-separated). A window
    is emitted for every position, sliding one token at a time, and each
    window is re-joined with single spaces.

    Args:
        text: The input string to cut into token windows.
        seq_len: Window size minus one; every returned window holds
            ``seq_len + 1`` tokens. Defaults to 5 (i.e. 6-token windows).
            NOTE(review): presumably the extra token is the prediction
            target for a language model -- confirm against the caller.

    Returns:
        A list of space-joined windows when *text* has more than ``seq_len``
        tokens; otherwise a one-element list containing *text* unchanged
        (original whitespace preserved).
    """
    # Tokenize once: re-splitting inside the loop made the work quadratic.
    tokens = text.split()

    # Too short to form even one full window: hand the text back untouched.
    if len(tokens) <= seq_len:
        return [text]

    # The window ending at index `end` (inclusive) starts at `end - seq_len`.
    return [
        " ".join(tokens[end - seq_len:end + 1])
        for end in range(seq_len, len(tokens))
    ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment