Skip to content

Instantly share code, notes, and snippets.

@opparco
Created August 18, 2023 14:44
Show Gist options
  • Save opparco/86572881f3e75287606540cef7bd7dcf to your computer and use it in GitHub Desktop.
Save opparco/86572881f3e75287606540cef7bd7dcf to your computer and use it in GitHub Desktop.
Debug the tokenizer of matsuo-lab/weblab-10b: list the token IDs produced for each Hiragana code point (U+3040–U+309F)
from transformers import AutoTokenizer
# Dump how the matsuo-lab/weblab-10b tokenizer encodes every code point in the
# range U+3040..U+309F, one row per code point, into a UTF-8 text file.
tokenizer = AutoTokenizer.from_pretrained("matsuo-lab/weblab-10b")
file_path = "hiragana.txt"

# Row layout: code point in hex, decoded text, number of token ids, the ids.
row_template = "U+{:x} {} [{:d}] {}\n"

with open(file_path, "w", encoding="utf-8") as out:
    # Header line naming the columns of the rows that follow.
    out.write("# unicode decoded [num-ids] id, ...\n")

    code_point = 0x3040
    while code_point <= 0x309F:
        # Encode the bare character only; no special tokens are added.
        token_ids = tokenizer.encode(chr(code_point), add_special_tokens=False)
        # Decode the ids back to text so the row shows what they map to.
        text = tokenizer.decode(token_ids)
        id_list = ', '.join(str(token_id) for token_id in token_ids)
        out.write(row_template.format(code_point, text, len(token_ids), id_list))
        code_point += 1

message = "saved {}".format(file_path)
print(message)
# unicode decoded [num-ids] id, ...
U+3040 ぀ [2] 765, 211
U+3041 ぁ [2] 765, 212
U+3042 あ [1] 14010
U+3043 ぃ [2] 765, 214
U+3044 い [1] 5151
U+3045 ぅ [2] 765, 216
U+3046 う [1] 9935
U+3047 ぇ [2] 765, 218
U+3048 え [1] 19857
U+3049 ぉ [2] 765, 220
U+304a お [1] 21322
U+304b か [1] 9146
U+304c が [1] 6957
U+304d き [1] 14321
U+304e ぎ [2] 765, 225
U+304f く [1] 14016
U+3050 ぐ [2] 765, 227
U+3051 け [1] 17531
U+3052 げ [1] 48756
U+3053 こ [1] 10446
U+3054 ご [1] 46843
U+3055 さ [1] 13129
U+3056 ざ [2] 765, 233
U+3057 し [1] 5891
U+3058 じ [1] 31355
U+3059 す [1] 7149
U+305a ず [1] 41581
U+305b せ [1] 24617
U+305c ぜ [2] 765, 239
U+305d そ [1] 15936
U+305e ぞ [2] 765, 241
U+305f た [1] 6218
U+3060 だ [1] 13973
U+3061 ち [1] 23826
U+3062 ぢ [2] 765, 97
U+3063 っ [1] 9736
U+3064 つ [1] 20691
U+3065 づ [2] 765, 100
U+3066 て [1] 6261
U+3067 で [1] 6344
U+3068 と [1] 6088
U+3069 ど [1] 20603
U+306a な [1] 6686
U+306b に [1] 5444
U+306c ぬ [2] 765, 107
U+306d ね [1] 40404
U+306e の [1] 3917
U+306f は [1] 6418
U+3070 ば [1] 30723
U+3071 ぱ [2] 765, 111
U+3072 ひ [2] 765, 112
U+3073 び [1] 46674
U+3074 ぴ [2] 765, 114
U+3075 ふ [2] 765, 115
U+3076 ぶ [2] 765, 116
U+3077 ぷ [2] 765, 117
U+3078 へ [2] 765, 118
U+3079 べ [1] 43645
U+307a ぺ [2] 765, 120
U+307b ほ [1] 46989
U+307c ぼ [2] 765, 122
U+307d ぽ [2] 765, 123
U+307e ま [1] 8509
U+307f み [1] 26725
U+3080 む [2] 1357, 211
U+3081 め [1] 20584
U+3082 も [1] 10786
U+3083 ゃ [1] 47343
U+3084 や [1] 21918
U+3085 ゅ [2] 1357, 216
U+3086 ゆ [2] 1357, 217
U+3087 ょ [1] 43972
U+3088 よ [1] 15275
U+3089 ら [1] 11039
U+308a り [1] 11473
U+308b る [1] 5832
U+308c れ [1] 9345
U+308d ろ [1] 34198
U+308e ゎ [2] 1357, 225
U+308f わ [1] 23872
U+3090 ゐ [2] 1357, 227
U+3091 ゑ [2] 1357, 228
U+3092 を [1] 6449
U+3093 ん [1] 13639
U+3094 ゔ [2] 1357, 231
U+3095 ゕ [2] 1357, 232
U+3096 ゖ [2] 1357, 233
U+3097 ゗ [2] 1357, 234
U+3098 ゘ [2] 1357, 235
U+3099 ゙ [2] 1357, 236
U+309a ゚ [2] 1357, 237
U+309b ゛ [2] 1357, 238
U+309c ゜ [2] 1357, 239
U+309d ゝ [2] 1357, 240
U+309e ゞ [2] 1357, 241
U+309f ゟ [2] 1357, 242
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment