Created
May 22, 2022 00:10
-
-
Save nariaki3551/cca40b4011c3b656df9cb1fa487612a7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "98ca052b", | |
"metadata": {}, | |
"source": [ | |
"# Vocabulary" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "ef5bdb24", | |
"metadata": {}, | |
"source": [ | |
"## torchtext.vocab.vocab\n", | |
"\n", | |
"Vocabulary manager" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "3c7377b0", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import torchtext\n", | |
"\n", | |
"# 空のVocabオブジェクトを生成\n", | |
"vocab = torchtext.vocab.vocab(dict())\n", | |
"\n", | |
"# vocabularyサイズ\n", | |
"len(vocab)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "ebe9c236", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"2" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# tokenの追加\n", | |
"vocab.append_token(\"dog\")\n", | |
"vocab.append_token(\"cat\")\n", | |
"\n", | |
"# vocabularyサイズ\n", | |
"len(vocab)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "cd3fbffb", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['dog', 'cat']" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# tokenリストの取得\n", | |
"vocab.get_itos()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "d7181b20", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'cat': 1, 'dog': 0}\n" | |
] | |
} | |
], | |
"source": [ | |
"# token → idの紐付けを表示\n", | |
"print(vocab.get_stoi())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "4d7d4330", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# tokenをid列に変換\n", | |
"vocab[\"cat\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "d231835e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[1, 0]" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# token列をid列に変換\n", | |
"vocab.lookup_indices([\"cat\", \"dog\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "aad4f672", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[1, 0]" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# lookup_indicesと同様\n", | |
"vocab.forward([\"cat\", \"dog\"]) # token → idの紐付けを表示" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "5126f62b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'cat'" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# token列をid列に変換\n", | |
"vocab.lookup_token(1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "12a1b1ff", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['cat', 'dog']" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# token列をid列に変換\n", | |
"vocab.lookup_tokens([1, 0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "dddcf150", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'cat': 2, 'dog': 1, 'house': 0}\n" | |
] | |
} | |
], | |
"source": [ | |
"# tokenの挿入\n", | |
"vocab.insert_token(\"house\", 0)\n", | |
"\n", | |
"# token → idの紐付けを表示\n", | |
"print(vocab.get_stoi())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "517e11af", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# 存在確認\n", | |
"\"dog\" in vocab" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "4cb7af2f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# 取得\n", | |
"vocab[\"dog\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "c438788b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"-1" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# defaultを-1に変更\n", | |
"vocab.set_default_index(-1)\n", | |
"vocab[\"out of vocab\"]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "482b46e2", | |
"metadata": {}, | |
"source": [ | |
"### initial parameters of vocab " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "209c692c", | |
"metadata": {}, | |
"source": [ | |
"- ordered_dict – Ordered Dictionary トークンのその頻出数の割り当て辞書\n", | |
"- min_freq – vocabraryに含めるための頻出最小数\n", | |
"- specials – Special symbols\n", | |
"- special_first – specialsをidの割り当ての先頭に持ってくるかどうか" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "a9c960a3", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'<eos>': 5, '<bos>': 6, '<unk>': 4, '<pad>': 3, 'hot': 2, 'is': 1, 'today': 0}" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"PAD = \"<pad>\"\n", | |
"UNK = \"<unk>\"\n", | |
"EOS = \"<eos>\"\n", | |
"BOS = \"<bos>\"\n", | |
"vocab = torchtext.vocab.vocab(\n", | |
" ordered_dict={\"today\": 1, \"is\": 1, \"hot\": 2},\n", | |
" min_freq=1,\n", | |
" specials=[PAD, UNK, EOS, BOS],\n", | |
" special_first=False\n", | |
")\n", | |
"vocab.get_stoi()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "a2b48e6f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'hot': 6, 'is': 5, 'today': 4, '<pad>': 0, '<unk>': 1, '<bos>': 3, '<eos>': 2}" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# change special_first to True\n", | |
"vocab = torchtext.vocab.vocab(\n", | |
" ordered_dict={\"today\": 1, \"is\": 1, \"hot\": 3},\n", | |
" min_freq=1,\n", | |
" specials=[PAD, UNK, EOS, BOS],\n", | |
" special_first=True\n", | |
")\n", | |
"vocab.get_stoi()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "0282dcf0", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'<unk>': 2, '<bos>': 4, '<eos>': 3, '<pad>': 1, 'hot': 0}" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# change min_freq to 2\n", | |
"vocab = torchtext.vocab.vocab(\n", | |
" ordered_dict={\"today\": 1, \"is\": 1, \"hot\": 3},\n", | |
" min_freq=2,\n", | |
" specials=[PAD, UNK, EOS, BOS],\n", | |
" special_first=False\n", | |
")\n", | |
"vocab.get_stoi()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "d0f9eb04", | |
"metadata": {}, | |
"source": [ | |
"## build_vocab_from_iterator\n", | |
"\n", | |
"Factory of Vocab instance\n", | |
"\n", | |
"- iterator – Iterator used to build Vocab. Must yield list or iterator of tokens.\n", | |
"- min_freq – 指定されていればmin_freq以上の頻出度のtokenのみ登録する\n", | |
"- specials – Special symbols\n", | |
"- special_first – specialsをidの割り当ての先頭に持ってくるかどうか\n", | |
"- max_tokens – 指定されていれば max_tokens - len(specials) の数のtokenのみ登録する(頻出度の高い方から)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "04f10d82", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'hot': 3, 'cold': 2, 'today': 1, 'is': 0}" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vocab = torchtext.vocab.build_vocab_from_iterator(\n", | |
" iterator=[[\"today\", \"is\", \"hot\"], [\"today\", \"is\", \"cold\"]]\n", | |
")\n", | |
"vocab.get_stoi()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "e592dba5", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'cold': 2, 'today': 1, 'is': 0}" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vocab = torchtext.vocab.build_vocab_from_iterator(\n", | |
" iterator=[[\"today\", \"is\", \"hot\"], [\"today\", \"is\", \"cold\"]],\n", | |
" max_tokens=3\n", | |
")\n", | |
"vocab.get_stoi()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "836bea71", | |
"metadata": {}, | |
"source": [ | |
"# Vectors\n", | |
"\n", | |
"単語の分散表現" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "d04c935b", | |
"metadata": {}, | |
"source": [ | |
"## Pretrained Word Embeddings\n", | |
"\n", | |
"事前学習済みのword embeddingの利用" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "d44eb5f1", | |
"metadata": {}, | |
"source": [ | |
"### GloVe(Gloval Vectors)\n", | |
"\n", | |
"\"Glove (Gloval Vectors)[1] は,スタンフォード大のManning先生の研究室から提案された,単語分散表現である \"\n", | |
"(引用元: https://cvml-expertguide.net/terms/nlp/distributed-represnetation/glove/)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "5a97b3ec", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"2196017" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vectors = torchtext.vocab.GloVe()\n", | |
"len(vectors)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "ae0d4a48", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"torch.Size([300])" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# 単語をvectorに変換\n", | |
"vec = vectors.get_vecs_by_tokens(\"犬\")\n", | |
"vec.shape" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "04cb49db", | |
"metadata": {}, | |
"source": [ | |
"### FastText\n", | |
"\n", | |
"\"「fastText」とは2016年にFacebookが公開した自然言語処理ライブラリです\"\n", | |
"(引用元: https://service.plan-b.co.jp/blog/tech/14298/)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"id": "6a7aea0f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"580000" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vectors = torchtext.vocab.FastText(language=\"ja\")\n", | |
"len(vectors) # 単語数" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"id": "ce840c6f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"torch.Size([300])" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# 単語をvectorに変換\n", | |
"vec = vectors.get_vecs_by_tokens(\"犬\")\n", | |
"vec.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"id": "8ea7d8e1", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"torch.Size([3, 300])" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# 複数の単語をvectorに変換\n", | |
"vecs = vectors.get_vecs_by_tokens([\"犬\", \"は\", \"元気\"])\n", | |
"vecs.shape" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "08f5f7ef", | |
"metadata": {}, | |
"source": [ | |
"### CharNGram" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"id": "86824be6", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"874474" | |
] | |
}, | |
"execution_count": 25, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vectors = torchtext.vocab.CharNGram()\n", | |
"len(vectors) # 単語数" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"id": "bc4663af", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"torch.Size([1, 100])" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# 単語をvectorに変換\n", | |
"vec = vectors.get_vecs_by_tokens(\"犬\")\n", | |
"vec.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "1e51b1f0", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment