Skip to content

Instantly share code, notes, and snippets.

@so298
Last active May 14, 2024 07:32
Show Gist options
  • Save so298/9cb485ec8d7d7a279979527257a56abf to your computer and use it in GitHub Desktop.
Save so298/9cb485ec8d7d7a279979527257a56abf to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"## Top 100 longest Japanese tokens in GPT-4o\n",
"\n",
"GPT-4oに利用されている日本語トークンの長さ順リストトップ100\n",
"\n",
"The list was generated with code adapted from this gist:\n",
"https://gist.github.com/ctlllll/4451e94f3b2ca415515f3ee369c8c374"
],
"metadata": {
"id": "WdQwtFrc-lod"
}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "eunta-zC37cR",
"outputId": "8fb782c4-ae0c-460d-ef6d-be9b43a61728"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting tiktoken\n",
" Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting langdetect\n",
" Downloading langdetect-1.0.9.tar.gz (981 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m981.5/981.5 kB\u001b[0m \u001b[31m42.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken) (2023.12.25)\n",
"Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.10/dist-packages (from tiktoken) (2.31.0)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from langdetect) (1.16.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken) (3.7)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken) (2024.2.2)\n",
"Building wheels for collected packages: langdetect\n",
" Building wheel for langdetect (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=0888dfdaf677335a3dd5a031e5b8843fd4aecaa85dba4f26b3b0566e7ba141dd\n",
" Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106\n",
"Successfully built langdetect\n",
"Installing collected packages: langdetect, tiktoken\n",
"Successfully installed langdetect-1.0.9 tiktoken-0.7.0\n"
]
}
],
"source": [
"%pip install tiktoken==0.7.0 langdetect==1.0.9"
]
},
{
"cell_type": "code",
"source": [
"import tiktoken\n",
"import langdetect\n",
"\n",
"# langdetect is non-deterministic on short or ambiguous text; fixing the seed\n",
"# before the first detect() call makes reruns list the same tokens.\n",
"langdetect.DetectorFactory.seed = 0\n",
"\n",
"TOP_N = 100  # number of Japanese tokens to print\n",
"\n",
"T = tiktoken.get_encoding(\"o200k_base\")\n",
"\n",
"# Decode every token id exactly once and reuse the text below.\n",
"# Ids with no vocabulary entry raise KeyError and are skipped.\n",
"# errors=\"replace\" mirrors the default of T.decode(): a token that is only a\n",
"# partial UTF-8 sequence becomes U+FFFD instead of raising.\n",
"decoded = {}\n",
"for i in range(T.n_vocab):\n",
"    try:\n",
"        decoded[i] = T.decode_single_token_bytes(i).decode(\"utf-8\", errors=\"replace\")\n",
"    except KeyError:\n",
"        continue\n",
"\n",
"# Sort by length (characters, longest first); the sort is stable, so ties keep\n",
"# ascending token-id order.\n",
"length_dict = {i: len(text) for i, text in decoded.items()}\n",
"length_dict = dict(sorted(length_dict.items(), key=lambda item: -item[1]))\n",
"\n",
"# Print the top 100 japanese words\n",
"tot = 0\n",
"for token_id in length_dict:\n",
"    text = decoded[token_id]\n",
"    try:\n",
"        is_japanese = langdetect.detect(text) == \"ja\"\n",
"    except langdetect.LangDetectException:\n",
"        # Raised when the text has no usable features (e.g. whitespace-only tokens).\n",
"        is_japanese = False\n",
"    if is_japanese:\n",
"        print(token_id, text)\n",
"        tot += 1\n",
"    if tot == TOP_N:\n",
"        break\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8rPUq7RS4E78",
"outputId": "68418b00-f628-4c90-c436-5a39867f9793"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"106378                 \n",
"113862 ありがとうございました\n",
"93926 ありがとうございます\n",
"147058 VIPがお送りします\n",
"33552         \n",
"108576 風吹けば名無し\n",
"170050   \n",
"\n",
"  \n",
"\n",
"\n",
"171403 スーパーコピー\n",
"196368 】【。】【”】【\n",
"77298 @お腹いっぱい\n",
"87123 風吹けば名無し\n",
"118712 トラックバック\n",
"123086 がお送りします\n",
"139807        \n",
"177976 ご了承ください\n",
"44948 名無しさん\n",
"106409 ございました\n",
"109574 レディース\n",
"109585 ラックバック\n",
"119029       \n",
"136526 お問い合わせ\n",
"137015 お願いします\n",
"146906 の名無しさん\n",
"148746 してください\n",
"169195 中央値との差\n",
"192583 しております\n",
"194528 ・・・。\n",
"\n",
"\n",
"195002 プロフィール\n",
"195972 】【。】\n",
"\n",
"\n",
"196993 されています\n",
"26139 名無しさん\n",
"44832 】【。】【\n",
"55300 ありがとう\n",
"59809 ございます\n",
"76286 ・・・\n",
"\n",
"\n",
"90602 @恐縮です\n",
"92559 ランキング\n",
"94092      \n",
"95839 こんにちは\n",
"105097 問い合わせ\n",
"105931 続きを読む\n",
"111060 】【、】【\n",
"120362 送りします\n",
"121986 スーパー\n",
"123622 スポンサー\n",
"124871 しています\n",
"143814 名無しの\n",
"144531 ありません\n",
"159656 ブラック\n",
"164454 があります\n",
"168396 平均との差\n",
"173276 になります\n",
"178655 オンライン\n",
"178757 こんばんは\n",
"180276 ブランド\n",
"185024 カテゴリー\n",
"190841 っています\n",
"191902 コメント\n",
"197513 @おーぷん\n",
"12754     \n",
"41460 コメント\n",
"44900 ください\n",
"45955   \n",
"\n",
"\n",
"69920 いっぱい\n",
"72905    \n",
"75206 ・・・・\n",
"77798 いました\n",
"79854 しました\n",
"86165 風吹けば\n",
"89941 ディース\n",
"90288 恐縮です\n",
"92994 ポイント\n",
"94408 について\n",
"95113  \n",
"95197 ケース\n",
"98560 ・・・\n",
"\n",
"101560 あります\n",
"104099 コピー\n",
"110484 レビュー\n",
"111674 ブランド\n",
"115557 メンズ\n",
"116908 はこちら\n",
"117391 この記事\n",
"117789 おすすめ\n",
"120447 。しかし\n",
"122223 ています\n",
"124256 サービス\n",
"125442 アクセス\n",
"127204 おります\n",
"127850 フォーム\n",
"128237 ニュース\n",
"131888 よろしく\n",
"132285 カテゴリ\n",
"143402 サイズ\n",
"146281 クリック\n",
"146381 ッション\n",
"146703 りました\n",
"147131 」という\n",
"147589 ーション\n",
"152546 できます\n"
]
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment