Skip to content

Instantly share code, notes, and snippets.

@sudoaza
Created January 30, 2024 01:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sudoaza/5105777c9d0dd503353931321b4ae406 to your computer and use it in GitHub Desktop.
Save sudoaza/5105777c9d0dd503353931321b4ae406 to your computer and use it in GitHub Desktop.
A recursive text splitter, because LangChain's own splitter sucks!
def split_text(text, chunk_size=500, separators=('\n\n', '.\n', ':\n', '\n', '. ', ', ', " ", "")):
    """Split text into chunks of at most chunk_size characters.

    Separators are tried in order, coarsest first. For each separator the
    text is cut after the last occurrence that still fits in the first
    ``chunk_size`` characters. A stretch that has no usable occurrence is
    split recursively with the remaining (finer) separators. Reaching the
    empty-string separator, or running out of separators, falls back to
    fixed-width slicing so no text is ever dropped.

    Separators stay attached to the end of the chunk that precedes them, so
    ``"".join(split_text(text, n)) == text`` holds for every input.

    Args:
        text: The string to split.
        chunk_size: Maximum length of a returned chunk; must be >= 1.
        separators: Sliceable sequence of separator strings ordered from
            most to least preferred. An empty string means "split anywhere".

    Returns:
        A list of non-empty chunks, in original order. Empty for empty text.

    Raises:
        ValueError: If chunk_size is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    for next_index, separator in enumerate(separators, start=1):
        # Either the remainder already fits, or we reached the "split
        # anywhere" separator: both are handled by the slicing fallback.
        if len(text) <= chunk_size or separator == "":
            break

        sep_len = len(separator)
        finer_separators = separators[next_index:]
        while len(text) > chunk_size:
            # Last occurrence that lies entirely within the size budget.
            split_at = text.rfind(separator, 0, chunk_size)
            if split_at != -1:
                cut = split_at + sep_len
                chunks.append(text[:cut])
                text = text[cut:]
                continue

            first = text.find(separator)
            if first == -1:
                # Separator exhausted; let the next one handle the rest.
                break

            # The stretch before the separator is too long for one chunk:
            # break it up with finer separators, then re-attach the
            # separator so nothing is lost.
            pieces = split_text(text[:first], chunk_size, finer_separators)
            if pieces and len(pieces[-1]) + sep_len <= chunk_size:
                pieces[-1] += separator
            else:
                # Separator does not fit on the last piece; emit it on its
                # own, sliced in case it is itself longer than chunk_size.
                pieces.extend(
                    separator[i:i + chunk_size]
                    for i in range(0, sep_len, chunk_size)
                )
            chunks.extend(pieces)
            text = text[first + sep_len:]

    # Fallback: yields nothing for "", one chunk if the remainder fits, and
    # fixed-width slices otherwise.
    chunks.extend(text[i:i + chunk_size] for i in range(0, len(text), chunk_size))
    return chunks
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment