@renxida
Created October 19, 2023 20:07
tokc: a token counting utility
#!/usr/bin/env python
"""
tokc: A Token Counter Utility
Description:
------------
tokc counts the tokens in text files or input streams using OpenAI's tiktoken
library, and estimates what those tokens would cost as model input or output
across several OpenAI models.
Installation:
-------------
Please install the required package before using:
```
pip install tiktoken
```
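To verify the install, a quick check (a one-liner sketch; cl100k_base is the
encoding tiktoken uses for the GPT-3.5/GPT-4 family):
```
python -c "import tiktoken; print(tiktoken.get_encoding('cl100k_base').encode('hello world'))"
```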
Usage:
------
- Count tokens for a single file:
```
tokc file
```
- Specify a model:
```
tokc --model gpt-3.5-turbo file
```
- Count tokens for multiple files:
```
tokc file1 file2
```
- Count tokens for all files in a directory (recursively):
```
tokc directory
```
- Count tokens from a stream:
```
journalctl | head -n 20 | tokc
```
Outputs:
--------
Example output:
```
filename: 1410 tokens
Calculating cost for 1410 tokens
------------------------------------------------
| Model | Cost for Input | Cost for Output |
------------------------------------------------
| GPT-4 8K | $0.0423 | $0.0846 |
| GPT-4 32K | $0.0846 | $0.1692 |
| GPT-3.5 4K | $0.0021 | $0.0028 |
| GPT-3.5 16K | $0.0042 | $0.0056 |
| Babbage | $0.0023 | $0.0023 |
| Davinci | $0.0169 | $0.0169 |
------------------------------------------------
grand total: 1410 tokens
```
"""
import argparse
import os
import sys
import tiktoken

def num_tokens_from_messages(messages, model="gpt-4"):
    """Return the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model in {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
    }:
        tokens_per_message = 3
        tokens_per_name = 1
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif "gpt-3.5-turbo" in model:
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"num_tokens_from_messages() is not implemented for model {model}. "
            "See https://github.com/openai/openai-python/blob/main/chatml.md "
            "for information on how messages are converted to tokens."
        )
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens
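
# Worked example (a sketch; exact counts can vary with the tiktoken version):
# for gpt-4-0613, [{"role": "user", "content": "Hello world"}] counts as
#   3 tokens per message
# + 1 token for encoding the role value "user"
# + 2 tokens for encoding "Hello world"
# + 3 tokens of reply priming
# = 9 tokens.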

def count_tokens_in_file(file_path, model):
    # errors="replace" keeps directory walks from crashing on non-UTF-8 files
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        content = f.read()
    # Frame the file as a single user message, the format num_tokens_from_messages expects
    num_tokens = num_tokens_from_messages([{"role": "user", "content": content}], model)
    print(f"{file_path}: {num_tokens} tokens")
    return num_tokens

def calculate_cost(total_tokens, rate_per_1k):
    """Convert a token count to US dollars at a given rate per 1,000 tokens."""
    return (total_tokens / 1000) * rate_per_1k
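
# Example: the 1410-token file from the docstring, priced as GPT-4 8K input
# ($0.03 per 1K tokens), costs (1410 / 1000) * 0.03 = $0.0423, the value
# shown in the sample table above.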

def print_cost_table(total):
    """Print estimated costs, using OpenAI's per-1K-token prices as published in October 2023."""
    print(f"Calculating cost for {total} tokens")
    print("------------------------------------------------")
    print("| Model | Cost for Input | Cost for Output |")
    print("------------------------------------------------")
    # GPT-4
    gpt4_8k_input = calculate_cost(total, 0.03)
    gpt4_8k_output = calculate_cost(total, 0.06)
    print(f"| GPT-4 8K | ${gpt4_8k_input:.4f} | ${gpt4_8k_output:.4f} |")
    gpt4_32k_input = calculate_cost(total, 0.06)
    gpt4_32k_output = calculate_cost(total, 0.12)
    print(f"| GPT-4 32K | ${gpt4_32k_input:.4f} | ${gpt4_32k_output:.4f} |")
    # GPT-3.5 Turbo
    gpt3_4k_input = calculate_cost(total, 0.0015)
    gpt3_4k_output = calculate_cost(total, 0.002)
    print(f"| GPT-3.5 4K | ${gpt3_4k_input:.4f} | ${gpt3_4k_output:.4f} |")
    gpt3_16k_input = calculate_cost(total, 0.003)
    gpt3_16k_output = calculate_cost(total, 0.004)
    print(f"| GPT-3.5 16K | ${gpt3_16k_input:.4f} | ${gpt3_16k_output:.4f} |")
    # Fine-tuning models
    babbage_input = calculate_cost(total, 0.0016)
    babbage_output = calculate_cost(total, 0.0016)
    print(f"| Babbage | ${babbage_input:.4f} | ${babbage_output:.4f} |")
    davinci_input = calculate_cost(total, 0.012)
    davinci_output = calculate_cost(total, 0.012)
    print(f"| Davinci | ${davinci_input:.4f} | ${davinci_output:.4f} |")
    print("------------------------------------------------")

def main():
    parser = argparse.ArgumentParser(description="Count tokens in files.")
    parser.add_argument("paths", nargs="*", default=[], help="Paths to files or directories.")
    parser.add_argument("--model", default="gpt-3.5-turbo-0613", help="Model to use for token counting.")
    args = parser.parse_args()
    if not args.paths:
        print("Reading from stdin...")
        content = sys.stdin.read()
        total = num_tokens_from_messages([{"role": "user", "content": content}], args.model)
        print(f"stdin: {total} tokens")
        print_cost_table(total)
        print(f"grand total: {total} tokens")
        return
    total = 0
    for path in args.paths:
        if os.path.isdir(path):
            # Walk directories recursively, counting every file found
            for root, _, files in os.walk(path):
                for file in files:
                    file_path = os.path.join(root, file)
                    total += count_tokens_in_file(file_path, args.model)
        else:
            total += count_tokens_in_file(path, args.model)
    print_cost_table(total)
    print(f"grand total: {total} tokens")

if __name__ == "__main__":
    main()