Skip to content

Instantly share code, notes, and snippets.

@opparco
Created August 21, 2023 04:59
Show Gist options
  • Save opparco/d86ef144604b38e594de3a1bb3730e4c to your computer and use it in GitHub Desktop.
Save opparco/d86ef144604b38e594de3a1bb3730e4c to your computer and use it in GitHub Desktop.
debug tokenizer of lmsys/vicuna-13b-v1.3
#
# Debug the tokenizer of lmsys/vicuna-13b-v1.3: print the token ids, the decoded text and its UTF-8 bytes for a sample string.
#
from transformers import AutoTokenizer
# Shared tokenizer instance used by every helper in this script.
tokenizer = AutoTokenizer.from_pretrained(
    "lmsys/vicuna-13b-v1.3",
)
def encode_decode(string: str) -> None:
    """Round-trip ``string`` through the tokenizer and print the result.

    The text is encoded with ``add_special_tokens=False``, the whole id
    sequence is decoded back in a single call, and one line is printed
    containing the ids, the decoded text and the decoded text's UTF-8 bytes.

    Args:
        string: Text to tokenize.
    """
    ids = tokenizer.encode(string, add_special_tokens=False)
    decoded = tokenizer.decode(ids)
    # The byte form exposes exactly what came back from decode(), independent
    # of how the console renders the characters.
    u8 = decoded.encode('utf-8')
    print(ids, decoded, u8)
def encode_decode_each_id(string: str) -> None:
    """Tokenize ``string`` and print the decoding of each token id separately.

    The text is encoded with ``add_special_tokens=False``; then, for every
    resulting id, one line is printed containing the id, the text obtained by
    decoding that id alone, and that text's UTF-8 bytes.

    Args:
        string: Text to tokenize.
    """
    ids = tokenizer.encode(string, add_special_tokens=False)
    for i in ids:
        # Decode a one-element list so each id is decoded on its own rather
        # than as part of the full sequence.
        decoded = tokenizer.decode([i])
        u8 = decoded.encode('utf-8')
        print(i, decoded, u8)
# Sample input: first the whole-sequence round trip, then the per-id breakdown.
string = '神々の黄昏'
for _debug_fn in (encode_decode, encode_decode_each_id):
    _debug_fn(string)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment