Skip to content

Instantly share code, notes, and snippets.

@nariaki3551
Created May 22, 2022 00:10
Show Gist options
  • Save nariaki3551/cca40b4011c3b656df9cb1fa487612a7 to your computer and use it in GitHub Desktop.
Save nariaki3551/cca40b4011c3b656df9cb1fa487612a7 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "98ca052b",
"metadata": {},
"source": [
"# Vocabulary"
]
},
{
"cell_type": "markdown",
"id": "ef5bdb24",
"metadata": {},
"source": [
"## torchtext.vocab.vocab\n",
"\n",
"Vocabulary manager"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "3c7377b0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import torchtext\n",
"\n",
"# 空のVocabオブジェクトを生成\n",
"vocab = torchtext.vocab.vocab(dict())\n",
"\n",
"# vocabularyサイズ\n",
"len(vocab)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ebe9c236",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# tokenの追加\n",
"vocab.append_token(\"dog\")\n",
"vocab.append_token(\"cat\")\n",
"\n",
"# vocabularyサイズ\n",
"len(vocab)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cd3fbffb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['dog', 'cat']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# tokenリストの取得\n",
"vocab.get_itos()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d7181b20",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'cat': 1, 'dog': 0}\n"
]
}
],
"source": [
"# token → idの紐付けを表示\n",
"print(vocab.get_stoi())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4d7d4330",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# tokenをid列に変換\n",
"vocab[\"cat\"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d231835e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1, 0]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# token列をid列に変換\n",
"vocab.lookup_indices([\"cat\", \"dog\"])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "aad4f672",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1, 0]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# lookup_indicesと同様\n",
"vocab.forward([\"cat\", \"dog\"]) # token → idの紐付けを表示"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "5126f62b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'cat'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# token列をid列に変換\n",
"vocab.lookup_token(1)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "12a1b1ff",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['cat', 'dog']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# token列をid列に変換\n",
"vocab.lookup_tokens([1, 0])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "dddcf150",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'cat': 2, 'dog': 1, 'house': 0}\n"
]
}
],
"source": [
"# tokenの挿入\n",
"vocab.insert_token(\"house\", 0)\n",
"\n",
"# token → idの紐付けを表示\n",
"print(vocab.get_stoi())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "517e11af",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 存在確認\n",
"\"dog\" in vocab"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "4cb7af2f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 取得\n",
"vocab[\"dog\"]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "c438788b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-1"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# defaultを-1に変更\n",
"vocab.set_default_index(-1)\n",
"vocab[\"out of vocab\"]"
]
},
{
"cell_type": "markdown",
"id": "482b46e2",
"metadata": {},
"source": [
"### initial parameters of vocab "
]
},
{
"cell_type": "markdown",
"id": "209c692c",
"metadata": {},
"source": [
"- ordered_dict – Ordered Dictionary トークンのその頻出数の割り当て辞書\n",
"- min_freq – vocabraryに含めるための頻出最小数\n",
"- specials – Special symbols\n",
"- special_first – specialsをidの割り当ての先頭に持ってくるかどうか"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "a9c960a3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'<eos>': 5, '<bos>': 6, '<unk>': 4, '<pad>': 3, 'hot': 2, 'is': 1, 'today': 0}"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"PAD = \"<pad>\"\n",
"UNK = \"<unk>\"\n",
"EOS = \"<eos>\"\n",
"BOS = \"<bos>\"\n",
"vocab = torchtext.vocab.vocab(\n",
" ordered_dict={\"today\": 1, \"is\": 1, \"hot\": 2},\n",
" min_freq=1,\n",
" specials=[PAD, UNK, EOS, BOS],\n",
" special_first=False\n",
")\n",
"vocab.get_stoi()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "a2b48e6f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'hot': 6, 'is': 5, 'today': 4, '<pad>': 0, '<unk>': 1, '<bos>': 3, '<eos>': 2}"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# change special_first to True\n",
"vocab = torchtext.vocab.vocab(\n",
" ordered_dict={\"today\": 1, \"is\": 1, \"hot\": 3},\n",
" min_freq=1,\n",
" specials=[PAD, UNK, EOS, BOS],\n",
" special_first=True\n",
")\n",
"vocab.get_stoi()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "0282dcf0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'<unk>': 2, '<bos>': 4, '<eos>': 3, '<pad>': 1, 'hot': 0}"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# change min_freq to 2\n",
"vocab = torchtext.vocab.vocab(\n",
" ordered_dict={\"today\": 1, \"is\": 1, \"hot\": 3},\n",
" min_freq=2,\n",
" specials=[PAD, UNK, EOS, BOS],\n",
" special_first=False\n",
")\n",
"vocab.get_stoi()"
]
},
{
"cell_type": "markdown",
"id": "d0f9eb04",
"metadata": {},
"source": [
"## build_vocab_from_iterator\n",
"\n",
"Factory of Vocab instance\n",
"\n",
"- iterator – Iterator used to build Vocab. Must yield list or iterator of tokens.\n",
"- min_freq – 指定されていればmin_freq以上の頻出度のtokenのみ登録する\n",
"- specials – Special symbols\n",
"- special_first – specialsをidの割り当ての先頭に持ってくるかどうか\n",
"- max_tokens – 指定されていれば max_tokens - len(specials) の数のtokenのみ登録する(頻出度の高い方から)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "04f10d82",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'hot': 3, 'cold': 2, 'today': 1, 'is': 0}"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab = torchtext.vocab.build_vocab_from_iterator(\n",
" iterator=[[\"today\", \"is\", \"hot\"], [\"today\", \"is\", \"cold\"]]\n",
")\n",
"vocab.get_stoi()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "e592dba5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'cold': 2, 'today': 1, 'is': 0}"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab = torchtext.vocab.build_vocab_from_iterator(\n",
" iterator=[[\"today\", \"is\", \"hot\"], [\"today\", \"is\", \"cold\"]],\n",
" max_tokens=3\n",
")\n",
"vocab.get_stoi()"
]
},
{
"cell_type": "markdown",
"id": "836bea71",
"metadata": {},
"source": [
"# Vectors\n",
"\n",
"単語の分散表現"
]
},
{
"cell_type": "markdown",
"id": "d04c935b",
"metadata": {},
"source": [
"## Pretrained Word Embeddings\n",
"\n",
"事前学習済みのword embeddingの利用"
]
},
{
"cell_type": "markdown",
"id": "d44eb5f1",
"metadata": {},
"source": [
"### GloVe(Gloval Vectors)\n",
"\n",
"\"Glove (Gloval Vectors)[1] は,スタンフォード大のManning先生の研究室から提案された,単語分散表現である \"\n",
"(引用元: https://cvml-expertguide.net/terms/nlp/distributed-represnetation/glove/)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "5a97b3ec",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2196017"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectors = torchtext.vocab.GloVe()\n",
"len(vectors)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "ae0d4a48",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([300])"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 単語をvectorに変換\n",
"vec = vectors.get_vecs_by_tokens(\"犬\")\n",
"vec.shape"
]
},
{
"cell_type": "markdown",
"id": "04cb49db",
"metadata": {},
"source": [
"### FastText\n",
"\n",
"\"「fastText」とは2016年にFacebookが公開した自然言語処理ライブラリです\"\n",
"(引用元: https://service.plan-b.co.jp/blog/tech/14298/)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "6a7aea0f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"580000"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectors = torchtext.vocab.FastText(language=\"ja\")\n",
"len(vectors) # 単語数"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "ce840c6f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([300])"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 単語をvectorに変換\n",
"vec = vectors.get_vecs_by_tokens(\"犬\")\n",
"vec.shape"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "8ea7d8e1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([3, 300])"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 複数の単語をvectorに変換\n",
"vecs = vectors.get_vecs_by_tokens([\"犬\", \"は\", \"元気\"])\n",
"vecs.shape"
]
},
{
"cell_type": "markdown",
"id": "08f5f7ef",
"metadata": {},
"source": [
"### CharNGram"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "86824be6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"874474"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectors = torchtext.vocab.CharNGram()\n",
"len(vectors) # 単語数"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "bc4663af",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([1, 100])"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 単語をvectorに変換\n",
"vec = vectors.get_vecs_by_tokens(\"犬\")\n",
"vec.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e51b1f0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment