Skip to content

Instantly share code, notes, and snippets.

@SkySung
Created May 14, 2024 16:11
Show Gist options
  • Save SkySung/e43d35b9f282bc46de2d553d76af13d4 to your computer and use it in GitHub Desktop.
Save SkySung/e43d35b9f282bc46de2d553d76af13d4 to your computer and use it in GitHub Desktop.
ChatGPT4o_encoder
# Report the longest Chinese tokens in tiktoken's "o200k_base" vocabulary.
#
# Every token id is decoded to text, the tokens are ranked by character
# length (longest first), and the first TOP_N whose text langdetect
# classifies as Chinese are printed.

ENCODING_NAME = "o200k_base"

# How many of the longest Chinese tokens to report.
TOP_N = 10

# langdetect reports Simplified and Traditional Chinese under separate codes;
# both count as "Chinese" here.
CHINESE_LANG_CODES = frozenset({"zh-cn", "zh-tw"})


def decodable_token_texts(decode_token_bytes, vocab_size):
    """Return ``{token_id: text}`` for every id below *vocab_size* that decodes.

    *decode_token_bytes* is called with a single token id and must return that
    token's raw bytes, raising ``KeyError`` for ids that are not assigned to
    any token (such ids are skipped).  Bytes that are not valid UTF-8 on their
    own are decoded with U+FFFD replacement characters, mirroring what
    ``Encoding.decode`` does by default.
    """
    texts = {}
    for token_id in range(vocab_size):
        try:
            raw = decode_token_bytes(token_id)
        except KeyError:
            # The id range can contain gaps; an unassigned id is not an error.
            continue
        texts[token_id] = raw.decode("utf-8", errors="replace")
    return texts


def longest_matching_tokens(token_texts, is_match, limit=TOP_N):
    """Yield ``(token_id, length, text)`` for the longest matching tokens.

    Tokens from the *token_texts* mapping are visited from longest to shortest
    text; tokens of equal length keep the mapping's iteration order.  At most
    *limit* tokens for which ``is_match(text)`` is true are yielded, and
    iteration stops as soon as the limit is reached so that an expensive
    predicate is not run on the rest of the vocabulary.
    """
    if limit <= 0:
        return
    # sorted() is stable even with reverse=True, so ties keep their order.
    ranked = sorted(
        token_texts.items(), key=lambda item: len(item[1]), reverse=True
    )
    found = 0
    for token_id, text in ranked:
        if not is_match(text):
            continue
        yield token_id, len(text), text
        found += 1
        if found >= limit:
            return


def main():
    """Print the TOP_N longest Chinese tokens of the configured encoding."""
    # Imported here so the helpers above stay importable (and testable)
    # without the third-party packages installed.
    import langdetect
    import tiktoken

    # langdetect is randomized by default; a fixed seed makes runs repeatable.
    langdetect.DetectorFactory.seed = 0

    enc = tiktoken.get_encoding(ENCODING_NAME)
    token_texts = decodable_token_texts(
        enc.decode_single_token_bytes, enc.n_vocab
    )

    def is_chinese(text):
        try:
            return langdetect.detect(text) in CHINESE_LANG_CODES
        except langdetect.LangDetectException:
            # Raised when the text has nothing to classify (e.g. whitespace,
            # digits or punctuation only); such tokens are simply not Chinese.
            return False

    for token_id, length, text in longest_matching_tokens(token_texts, is_chinese):
        print(f"Token ID: {token_id}, Length: {length}, Text: {text}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment