Skip to content

Instantly share code, notes, and snippets.

@amrakm
Created October 4, 2022 15:04
Show Gist options
  • Save amrakm/db2db4288134a9609c2a83d5cc970312 to your computer and use it in GitHub Desktop.
Save amrakm/db2db4288134a9609c2a83d5cc970312 to your computer and use it in GitHub Desktop.
split reviews into multiple parts based on max_token size
# split reviews into multiple parts based on max_token size, appending one sentence at a time until the part hits the max token limit
def split_rev(rev: str, max_tokens: int = 384) -> list:
    """Split a review into parts of at most ``max_tokens`` tokens each.

    The review is cut into sentences on every ``'.'`` character and the
    sentences are packed greedily, in order, into parts.  A "token" is a
    whitespace-separated word; each period that joins two sentences inside
    a part is budgeted as one extra token.

    A sentence longer than ``max_tokens`` words is spread over several
    consecutive parts instead of being truncated, so no words are lost.

    Notes:
        * Splitting is purely on ``'.'``, so a decimal such as ``"3.5"`` is
          treated as a sentence boundary and comes back as ``"3. 5"``.
        * The period that falls exactly on a boundary between two parts is
          not emitted (the preceding part ends without a trailing ``'.'``).
        * Runs of whitespace are collapsed to single spaces.

    Args:
        rev: The review text to split.
        max_tokens: Maximum number of tokens per part; must be >= 1.

    Returns:
        A list of strings, one per part, in original order.  An input with
        no words yields an empty list.

    Raises:
        ValueError: If ``max_tokens`` is less than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    parts_list = []
    curr_part_tokens = []

    for sentence in rev.split('.'):
        words = sentence.split()

        # The sentence fits into the part being built: join it with a
        # standalone '.' token (budgeted as +1) and keep going.
        if curr_part_tokens and len(curr_part_tokens) + len(words) + 1 <= max_tokens:
            curr_part_tokens.append('.')
            curr_part_tokens.extend(words)
            continue

        # It does not fit (or there is no open part): close the open part.
        if curr_part_tokens:
            parts_list.append(" ".join(curr_part_tokens))

        # Start a new part with this sentence.  If the sentence alone exceeds
        # the budget, emit every full-size chunk right away and keep the last
        # chunk open so following sentences can still be appended to it.
        chunks = [
            words[start:start + max_tokens]
            for start in range(0, len(words), max_tokens)
        ]
        parts_list.extend(" ".join(chunk) for chunk in chunks[:-1])
        curr_part_tokens = chunks[-1] if chunks else []

    if curr_part_tokens:
        parts_list.append(" ".join(curr_part_tokens))

    # Periods were inserted as separate tokens; re-attach them to the
    # preceding word ("a . b" -> "a. b").
    return [part.replace(' .', '.') for part in parts_list]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment