Skip to content

Instantly share code, notes, and snippets.

@cnmoro
Created March 13, 2024 23:35
Show Gist options
  • Save cnmoro/0b2bc4a228928d967f1e43116c77a38d to your computer and use it in GitHub Desktop.
Save cnmoro/0b2bc4a228928d967f1e43116c77a38d to your computer and use it in GitHub Desktop.
tiktoken_chunkenizer_with_overlap.py
import tiktoken
# Shared tokenizer for the "gpt-3.5-turbo-16k" model, created once at import
# time; chunk_text() uses it both to encode text into tokens and to decode
# each token chunk back into a string.
gpt_encoding = tiktoken.encoding_for_model("gpt-3.5-turbo-16k")
def chunk_text(
    full_text: str,
    tokens_per_chunk: int = 300,
    chunk_overlap: int = 20,
) -> list:
    """Split *full_text* into overlapping chunks measured in tokens.

    The text is tokenized with the module-level ``gpt_encoding``, cut into
    windows of at most ``tokens_per_chunk`` tokens, and each window is
    decoded back into a string. Every window after the first begins with
    the last ``chunk_overlap`` tokens of the previous window, so consecutive
    chunks share that much context.

    Args:
        full_text: The text to split.
        tokens_per_chunk: Maximum number of tokens in a chunk (overlap
            included). Must be positive.
        chunk_overlap: Number of tokens repeated at the start of each chunk
            after the first. Must satisfy
            ``0 <= chunk_overlap < tokens_per_chunk``.

    Returns:
        A list of decoded chunk strings in document order; an empty list
        when ``full_text`` is empty.

    Raises:
        ValueError: If ``tokens_per_chunk`` is not positive, or if
            ``chunk_overlap`` is negative or not smaller than
            ``tokens_per_chunk``.
    """
    # Validate before tokenizing: a non-positive window size or an overlap
    # that fills the whole window cannot make forward progress and would
    # otherwise yield degenerate, oversized or near-duplicate chunks.
    if tokens_per_chunk <= 0:
        raise ValueError("tokens_per_chunk must be a positive integer")
    if not 0 <= chunk_overlap < tokens_per_chunk:
        raise ValueError(
            "chunk_overlap must be non-negative and smaller than tokens_per_chunk"
        )

    # Nothing to chunk: return no chunks rather than a single empty string.
    if not full_text:
        return []

    tokens = gpt_encoding.encode(full_text)
    if not tokens:
        return []

    # Each window starts `stride` tokens after the previous one, which leaves
    # exactly `chunk_overlap` tokens shared between neighbours.
    stride = tokens_per_chunk - chunk_overlap

    # A window starting at or past this index would contain only tokens that
    # the previous window already emitted, so stop before it. The max(..., 1)
    # keeps the first window when the text is shorter than the overlap.
    start_limit = max(len(tokens) - chunk_overlap, 1)

    # NOTE(review): windows are cut on token boundaries; a character whose
    # bytes span several tokens may not decode cleanly at a cut — confirm
    # against tiktoken's decode behaviour if exact round-tripping matters.
    return [
        gpt_encoding.decode(tokens[start:start + tokens_per_chunk])
        for start in range(0, start_limit, stride)
    ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment