-
-
Save renxida/7ea07acb3e77fc937f16150ba7a17161 to your computer and use it in GitHub Desktop.
tokc: a token counting utility
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
tokc: A Token Counter Utility | |
Description: | |
------------- | |
tokc is a tool designed to count tokens in text files or input streams. It's especially useful for calculating the cost of using text with various models. | |
Installation: | |
------------- | |
Please install the required package before using: | |
``` | |
pip install tiktoken | |
``` | |
Usage: | |
------ | |
- Count tokens for a single file: | |
``` | |
tokc file | |
``` | |
- Specify a model: | |
``` | |
tokc --model gpt-3.5-turbo file | |
``` | |
- Count tokens for multiple files: | |
``` | |
tokc file1 file2 | |
``` | |
- Count tokens for all files in a directory: | |
``` | |
tokc directory | |
``` | |
- Count tokens from a stream: | |
``` | |
journalctl | head -n 20 | tokc | |
``` | |
Outputs: | |
-------- | |
Example output: | |
``` | |
filename: 1410 tokens | |
Calculating cost for 1410 tokens | |
------------------------------------------------ | |
| Model | Cost for Input | Cost for Output | | |
------------------------------------------------ | |
| GPT-4 8K | $0.0423 | $0.0846 | | |
| GPT-4 32K | $0.0846 | $0.1692 | | |
| GPT-3.5 4K | $0.0021 | $0.0028 | | |
| GPT-3.5 16K | $0.0042 | $0.0056 | | |
| Babbage | $0.0023 | $0.0023 | | |
| Davinci | $0.0169 | $0.0169 | | |
------------------------------------------------ | |
grand total: 1410 tokens | |
``` | |
""" | |
import argparse | |
import os | |
import sys | |
import tiktoken | |
def num_tokens_from_messages(messages, model="gpt-4"):
    """Return the number of tokens used by a list of messages.

    Follows the OpenAI cookbook accounting scheme: a fixed per-message
    overhead, an optional adjustment when a "name" key is present, and a
    constant 3 tokens priming the assistant reply.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    # Per-model fixed overheads: (tokens_per_message, tokens_per_name).
    overheads = {
        "gpt-3.5-turbo-0613": (3, 1),
        "gpt-3.5-turbo-16k-0613": (3, 1),
        "gpt-4-0314": (3, 1),
        "gpt-4-32k-0314": (3, 1),
        "gpt-4-0613": (3, 1),
        "gpt-4-32k-0613": (3, 1),
        # every message follows <|start|>{role/name}\n{content}<|end|>\n;
        # if there's a name, the role is omitted
        "gpt-3.5-turbo-0301": (4, -1),
    }

    if model in overheads:
        tokens_per_message, tokens_per_name = overheads[model]
    elif "gpt-3.5-turbo" in model:
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )

    # Start at 3: every reply is primed with <|start|>assistant<|message|>.
    total = 3
    for message in messages:
        total += tokens_per_message
        for key, value in message.items():
            total += len(encoding.encode(value))
            if key == "name":
                total += tokens_per_name
    return total
def count_tokens_in_file(file_path, model):
    """Count and print the number of tokens in a single file.

    Reads the file as UTF-8, replacing undecodable bytes instead of raising,
    so that the recursive directory walk in main() does not abort with
    UnicodeDecodeError when it encounters a binary or oddly-encoded file.

    Returns the token count so callers can accumulate a grand total.
    """
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        content = f.read()
    # Wrap the raw text as a single chat message, matching the stdin path.
    num_tokens = num_tokens_from_messages([{"user": content}], model)
    print(f"{file_path}: {num_tokens} tokens")
    return num_tokens
def calculate_cost(total_tokens, rate_per_1k):
    """Convert a token count into dollars at *rate_per_1k* dollars per 1K tokens."""
    thousands_of_tokens = total_tokens / 1000
    return thousands_of_tokens * rate_per_1k
def print_cost_table(total):
    """Print an estimated cost table for *total* tokens across common OpenAI models.

    Replaces six copy-pasted per-model stanzas with a single data-driven loop;
    the printed output is identical to the original, line for line.

    Rates are dollars per 1K tokens (mid-2023 OpenAI list prices; the
    fine-tuning models charge the same rate for input and output).
    """
    # (label, input $/1K tokens, output $/1K tokens)
    rates = [
        ("GPT-4 8K", 0.03, 0.06),
        ("GPT-4 32K", 0.06, 0.12),
        ("GPT-3.5 4K", 0.0015, 0.002),
        ("GPT-3.5 16K", 0.003, 0.004),
        ("Babbage", 0.0016, 0.0016),
        ("Davinci", 0.012, 0.012),
    ]
    divider = "------------------------------------------------"

    print(f"Calculating cost for {total} tokens")
    print(divider)
    print("| Model | Cost for Input | Cost for Output |")
    print(divider)
    for name, in_rate, out_rate in rates:
        in_cost = (total / 1000) * in_rate
        out_cost = (total / 1000) * out_rate
        print(f"| {name} | ${in_cost:.4f} | ${out_cost:.4f} |")
    print(divider)
def main():
    """CLI entry point: count tokens in the given paths (or stdin) and print costs."""
    parser = argparse.ArgumentParser(description="Count tokens in files.")
    parser.add_argument("paths", nargs="*", default=[], help="Paths to files or directories.")
    parser.add_argument("--model", default="gpt-3.5-turbo-0613", help="Model to use for token counting.")
    args = parser.parse_args()

    if not args.paths:
        # No paths given: treat everything on stdin as one document.
        print("Reading from stdin...")
        total = num_tokens_from_messages([{"user": sys.stdin.read()}], args.model)
        print(f"stdin: {total} tokens")
    else:
        total = 0
        for path in args.paths:
            if not os.path.isdir(path):
                total += count_tokens_in_file(path, args.model)
                continue
            # Directories are counted recursively, one file at a time.
            for root, _, names in os.walk(path):
                for name in names:
                    total += count_tokens_in_file(os.path.join(root, name), args.model)

    print_cost_table(total)
    print(f"grand total: {total} tokens")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment