Skip to content

Instantly share code, notes, and snippets.

@burke
Last active June 28, 2023 21:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save burke/5de4e85c4fe4188bc1b365bef9d2c3bb to your computer and use it in GitHub Desktop.
Save burke/5de4e85c4fe4188bc1b365bef9d2c3bb to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import sys
import fire
from itertools import cycle
from transformers import AutoTokenizer
def cycle_colors(tokenizer_name):
    """Print stdin with every token highlighted in a rotating background color.

    Reads all of standard input, tokenizes it with the ``transformers``
    tokenizer loaded via ``AutoTokenizer.from_pretrained(tokenizer_name)``,
    and writes each token to stdout wrapped in ANSI SGR escapes (foreground
    code 30 on a background cycling through codes 41-47). Each newline inside
    a token is rendered as a pilcrow followed by a color reset and a real
    line break. A final summary line reports the token count and the average
    number of characters per token.

    Args:
        tokenizer_name: Name or path handed to ``AutoTokenizer.from_pretrained``.
    """
    text = sys.stdin.read()
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    tokens = tokenizer.tokenize(text)
    colors = cycle(['41', '42', '43', '44', '45', '46', '47'])
    total_chars = 0
    for token in tokens:
        token_str = tokenizer.convert_tokens_to_string([token])
        # Count the decoded token itself, before display-only markup (the
        # pilcrow and reset escape) is inserted, so the average is not inflated.
        total_chars += len(token_str)
        # Reset the color before the line break so the background does not
        # bleed to the end of the terminal line.
        shown = token_str.replace('\n', '¶\033[0m\n')
        print(f'\033[30;{next(colors)}m{shown}\033[0m', end='')
    # Empty input yields zero tokens; report 0.0 instead of dividing by zero.
    chars_per_token = total_chars / len(tokens) if tokens else 0.0
    print(f"\033[0m\033[J\n{len(tokens)} tokens, {chars_per_token} chars/token\033[0m")
def main():
    """Run the script as a command-line tool by handing ``cycle_colors`` to ``fire.Fire``."""
    fire.Fire(cycle_colors)
# Only start the CLI when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment