Skip to content

Instantly share code, notes, and snippets.

Last active December 28, 2023 10:12
Show Gist options
  • Save buanzo/7cdd2c34fc0bb25c71b857a16853c6fa to your computer and use it in GitHub Desktop.
Save buanzo/7cdd2c34fc0bb25c71b857a16853c6fa to your computer and use it in GitHub Desktop.
# This is a work in progress. There are still bugs. Once it is production-ready this will become a full repo.
import os
def count_tokens(text, model_name="gpt-3.5-turbo", debug=False):
    """
    Count the number of tokens in a given text string without using the OpenAI API.

    This function tries three methods in the following order:

    1. tiktoken (preferred): Accurate token counting similar to the OpenAI API.
    2. nltk: Token counting using the Natural Language Toolkit library.
    3. split: Simple whitespace-based token counting as a fallback.

    A method is skipped whenever it raises (library not installed, unknown
    model name, missing tokenizer data, ...), so the function always returns
    a result.

    Usage
    -----
    text = "Your text here"
    result = count_tokens(text, model_name="gpt-3.5-turbo", debug=True)

    Optional libraries
    ------------------
    - tiktoken: Install with 'pip install tiktoken'
    - nltk: Install with 'pip install nltk'

    Parameters
    ----------
    text : str
        The text string for which you want to count tokens.
    model_name : str, optional
        The OpenAI model for which you want to count tokens (default: "gpt-3.5-turbo").
        Only used by the tiktoken method.
    debug : bool, optional
        Set to True to print error messages (default: False).

    Returns
    -------
    result : dict
        A dictionary containing the number of tokens (key "n_tokens") and the
        method used for counting (key "method": "tiktoken", "nltk" or "split").
    """
    # Try using tiktoken
    try:
        import tiktoken

        encoding = tiktoken.encoding_for_model(model_name)
        num_tokens = len(encoding.encode(text))
        result = {"n_tokens": num_tokens, "method": "tiktoken"}
        return result
    except Exception as e:
        # Deliberately broad: any failure here just means "try the next method".
        if debug:
            print(f"Error using tiktoken: {e}")

    # Try using nltk
    try:
        import nltk

        try:
            tokens = nltk.word_tokenize(text)
        except LookupError:
            # Tokenizer data is missing: fetch it once and retry, instead of
            # hitting the network on every call.
            nltk.download("punkt")
            tokens = nltk.word_tokenize(text)
        result = {"n_tokens": len(tokens), "method": "nltk"}
        return result
    except Exception as e:
        # Deliberately broad, same reasoning as for tiktoken above.
        if debug:
            print(f"Error using nltk: {e}")

    # If nltk and tiktoken fail, use a simple split-based method
    tokens = text.split()
    result = {"n_tokens": len(tokens), "method": "split"}
    return result
class TokenBuffer:
    """Rolling text buffer that keeps at most ``max_tokens`` tokens.

    Every call to :meth:`update` appends one chunk of text. When the running
    token total exceeds ``max_tokens``, whole chunks are discarded oldest
    first until the total fits again. A single chunk that is larger than
    ``max_tokens`` on its own is therefore discarded as well, leaving the
    buffer empty rather than over the limit.

    Attributes
    ----------
    max_tokens : int
        Upper bound for ``token_count``.
    buffer : str
        Concatenation (no separator) of the chunks currently retained.
    token_lengths : list of int
        Token count of each retained chunk, oldest first.
    token_count : int
        Sum of ``token_lengths``.
    """

    def __init__(self, max_tokens=2048, token_counter=None):
        """
        Parameters
        ----------
        max_tokens : int, optional
            Maximum number of tokens kept in the buffer (default: 2048).
        token_counter : callable, optional
            Replacement for ``count_tokens``. It is called as
            ``token_counter(text, model_name=..., debug=...)`` and must
            return a mapping with an ``"n_tokens"`` entry. When omitted,
            the module-level ``count_tokens`` is used.
        """
        self.max_tokens = max_tokens
        self.buffer = ""
        self.token_lengths = []
        self.token_count = 0
        # Text of each retained chunk, kept parallel to token_lengths so that
        # eviction removes exactly the text whose tokens were counted.
        self._chunks = []
        self._token_counter = token_counter

    def update(self, text, model_name="gpt-3.5-turbo", debug=False):
        """Append ``text`` and evict the oldest chunks while over the limit.

        Parameters
        ----------
        text : str
            Chunk of text to add to the buffer.
        model_name : str, optional
            Forwarded to the token counter (default: "gpt-3.5-turbo").
        debug : bool, optional
            Forwarded to the token counter (default: False).
        """
        counter = self._token_counter
        if counter is None:
            # Resolved at call time so the class can be defined before
            # (or independently of) count_tokens.
            counter = count_tokens
        new_tokens = counter(text, model_name=model_name, debug=debug)["n_tokens"]

        self._chunks.append(text)
        self.token_lengths.append(new_tokens)
        self.token_count += new_tokens

        # Work out how many of the oldest chunks must go, then drop them in
        # one slice instead of repeatedly popping from the front.
        evict = 0
        while self.token_count > self.max_tokens and evict < len(self.token_lengths):
            self.token_count -= self.token_lengths[evict]
            evict += 1
        if evict:
            del self.token_lengths[:evict]
            del self._chunks[:evict]

        self.buffer = "".join(self._chunks)

    def get_buffer(self):
        """Return the text currently retained in the buffer."""
        return self.buffer
Copy link

buanzo commented Apr 11, 2023

Example usage for TokenBuffer:

from token_counter import TokenBuffer

# Initialize a TokenBuffer with a maximum token count of 30
buffer = TokenBuffer(max_tokens=30)

# Add a sentence to the buffer, then show the buffer contents and its size
buffer.update("Hello, how are you doing?")
print(buffer.get_buffer())
print("Token count:", buffer.token_count)

# Add another sentence to the buffer
buffer.update("I'm doing well, thank you!")
print(buffer.get_buffer())
print("Token count:", buffer.token_count)

# Add a longer sentence to the buffer
buffer.update("I've been working on a project and making great progress.")
print(buffer.get_buffer())
print("Token count:", buffer.token_count)

# Add one more sentence to the buffer; this pushes the total past
# max_tokens, so the oldest text is expected to be dropped
buffer.update("That's great to hear, keep up the good work!")
print(buffer.get_buffer())
print("Token count:", buffer.token_count)

Output (YMMV):

Hello, how are you doing?
Token count: 6
Hello, how are you doing?I'm doing well, thank you!
Token count: 11
Hello, how are you doing?I'm doing well, thank you!I've been working on a project and making great progress.
Token count: 24
I'm doing well, thank you!I've been working on a project and making great progress.That's great to hear, keep up the good work!
Token count: 30

Copy link

buanzo commented Apr 11, 2023

Tiktoken's GitHub repo: https://github.com/openai/tiktoken

Copy link

buanzo commented Apr 11, 2023

NLTK's GitHub repo: https://github.com/nltk/nltk

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment