nariaki3551/torchtext.vocab.ipynb

## torchtext.vocab.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "98ca052b",
   "metadata": {},
   "source": [
    "# Vocabulary"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ef5bdb24",
   "metadata": {},
   "source": [
    "## torchtext.vocab.vocab\n",
    "\n",
    "Vocabulary manager"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3c7377b0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torchtext\n",
    "\n",
    "# 空のVocabオブジェクトを生成\n",
    "vocab = torchtext.vocab.vocab(dict())\n",
    "\n",
    "# vocabularyサイズ\n",
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ebe9c236",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# tokenの追加\n",
    "vocab.append_token(\"dog\")\n",
    "vocab.append_token(\"cat\")\n",
    "\n",
    "# vocabularyサイズ\n",
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "cd3fbffb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['dog', 'cat']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# tokenリストの取得\n",
    "vocab.get_itos()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d7181b20",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'cat': 1, 'dog': 0}\n"
     ]
    }
   ],
   "source": [
    "# token → idの紐付けを表示\n",
    "print(vocab.get_stoi())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4d7d4330",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# tokenをidに変換\n",
    "vocab[\"cat\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d231835e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 0]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# token列をid列に変換\n",
    "vocab.lookup_indices([\"cat\", \"dog\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "aad4f672",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 0]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# lookup_indicesと同様\n",
    "vocab.forward([\"cat\", \"dog\"])  # token列をid列に変換"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "5126f62b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'cat'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# idをtokenに変換\n",
    "vocab.lookup_token(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "12a1b1ff",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['cat', 'dog']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# id列をtoken列に変換\n",
    "vocab.lookup_tokens([1, 0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "dddcf150",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'cat': 2, 'dog': 1, 'house': 0}\n"
     ]
    }
   ],
   "source": [
    "# tokenの挿入\n",
    "vocab.insert_token(\"house\", 0)\n",
    "\n",
    "# token → idの紐付けを表示\n",
    "print(vocab.get_stoi())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "517e11af",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 存在確認\n",
    "\"dog\" in vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "4cb7af2f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 取得\n",
    "vocab[\"dog\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c438788b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-1"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# defaultを-1に変更\n",
    "vocab.set_default_index(-1)\n",
    "vocab[\"out of vocab\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "482b46e2",
   "metadata": {},
   "source": [
    "### initial parameters of vocab "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "209c692c",
   "metadata": {},
   "source": [
    "- ordered_dict – トークンとその出現頻度を対応付ける順序付き辞書 (Ordered Dictionary)\n",
    "- min_freq – vocabularyに含めるための最小出現頻度\n",
    "- specials – Special symbols\n",
    "- special_first – specialsをidの割り当ての先頭に持ってくるかどうか"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "a9c960a3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'<eos>': 5, '<bos>': 6, '<unk>': 4, '<pad>': 3, 'hot': 2, 'is': 1, 'today': 0}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "PAD = \"<pad>\"\n",
    "UNK = \"<unk>\"\n",
    "EOS = \"<eos>\"\n",
    "BOS = \"<bos>\"\n",
    "vocab = torchtext.vocab.vocab(\n",
    "    ordered_dict={\"today\": 1, \"is\": 1, \"hot\": 2},\n",
    "    min_freq=1,\n",
    "    specials=[PAD, UNK, EOS, BOS],\n",
    "    special_first=False\n",
    ")\n",
    "vocab.get_stoi()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "a2b48e6f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'hot': 6, 'is': 5, 'today': 4, '<pad>': 0, '<unk>': 1, '<bos>': 3, '<eos>': 2}"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# change special_first to True\n",
    "vocab = torchtext.vocab.vocab(\n",
    "    ordered_dict={\"today\": 1, \"is\": 1, \"hot\": 3},\n",
    "    min_freq=1,\n",
    "    specials=[PAD, UNK, EOS, BOS],\n",
    "    special_first=True\n",
    ")\n",
    "vocab.get_stoi()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "0282dcf0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'<unk>': 2, '<bos>': 4, '<eos>': 3, '<pad>': 1, 'hot': 0}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# change min_freq to 2\n",
    "vocab = torchtext.vocab.vocab(\n",
    "    ordered_dict={\"today\": 1, \"is\": 1, \"hot\": 3},\n",
    "    min_freq=2,\n",
    "    specials=[PAD, UNK, EOS, BOS],\n",
    "    special_first=False\n",
    ")\n",
    "vocab.get_stoi()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d0f9eb04",
   "metadata": {},
   "source": [
    "## build_vocab_from_iterator\n",
    "\n",
    "Factory of Vocab instance\n",
    "\n",
    "- iterator – Iterator used to build Vocab. Must yield list or iterator of tokens.\n",
    "- min_freq – 指定されていればmin_freq以上の頻出度のtokenのみ登録する\n",
    "- specials – Special symbols\n",
    "- special_first – specialsをidの割り当ての先頭に持ってくるかどうか\n",
    "- max_tokens – 指定されていれば max_tokens - len(specials) の数のtokenのみ登録する(頻出度の高い方から)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "04f10d82",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'hot': 3, 'cold': 2, 'today': 1, 'is': 0}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab = torchtext.vocab.build_vocab_from_iterator(\n",
    "    iterator=[[\"today\", \"is\", \"hot\"], [\"today\", \"is\", \"cold\"]]\n",
    ")\n",
    "vocab.get_stoi()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "e592dba5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'cold': 2, 'today': 1, 'is': 0}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab = torchtext.vocab.build_vocab_from_iterator(\n",
    "    iterator=[[\"today\", \"is\", \"hot\"], [\"today\", \"is\", \"cold\"]],\n",
    "    max_tokens=3\n",
    ")\n",
    "vocab.get_stoi()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "836bea71",
   "metadata": {},
   "source": [
    "# Vectors\n",
    "\n",
    "単語の分散表現"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d04c935b",
   "metadata": {},
   "source": [
    "## Pretrained Word Embeddings\n",
    "\n",
    "事前学習済みのword embeddingの利用"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d44eb5f1",
   "metadata": {},
   "source": [
    "### GloVe (Global Vectors)\n",
    "\n",
    "\"Glove (Gloval Vectors)[1] は，スタンフォード大のManning先生の研究室から提案された，単語分散表現である \"\n",
    "(引用元: https://cvml-expertguide.net/terms/nlp/distributed-represnetation/glove/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "5a97b3ec",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2196017"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectors = torchtext.vocab.GloVe()\n",
    "len(vectors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "ae0d4a48",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([300])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 単語をvectorに変換\n",
    "vec = vectors.get_vecs_by_tokens(\"犬\")\n",
    "vec.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "04cb49db",
   "metadata": {},
   "source": [
    "### FastText\n",
    "\n",
    "\"「fastText」とは2016年にFacebookが公開した自然言語処理ライブラリです\"\n",
    "(引用元: https://service.plan-b.co.jp/blog/tech/14298/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "6a7aea0f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "580000"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectors = torchtext.vocab.FastText(language=\"ja\")\n",
    "len(vectors)  # 単語数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "ce840c6f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([300])"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 単語をvectorに変換\n",
    "vec = vectors.get_vecs_by_tokens(\"犬\")\n",
    "vec.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "8ea7d8e1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([3, 300])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 複数の単語をvectorに変換\n",
    "vecs = vectors.get_vecs_by_tokens([\"犬\", \"は\", \"元気\"])\n",
    "vecs.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "08f5f7ef",
   "metadata": {},
   "source": [
    "### CharNGram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "86824be6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "874474"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectors = torchtext.vocab.CharNGram()\n",
    "len(vectors)  # 単語数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "bc4663af",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([1, 100])"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 単語をvectorに変換\n",
    "vec = vectors.get_vecs_by_tokens(\"犬\")\n",
    "vec.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e51b1f0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "98ca052b",
	"metadata": {},
	"source": [
	"# Vocabulary"
	]
	},
	{
	"cell_type": "markdown",
	"id": "ef5bdb24",
	"metadata": {},
	"source": [
	"## torchtext.vocab.vocab\n",
	"\n",
	"Vocabulary manager"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "3c7377b0",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0"
	]
	},
	"execution_count": 1,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import torchtext\n",
	"\n",
	"# 空のVocabオブジェクトを生成\n",
	"vocab = torchtext.vocab.vocab(dict())\n",
	"\n",
	"# vocabularyサイズ\n",
	"len(vocab)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "ebe9c236",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"2"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# tokenの追加\n",
	"vocab.append_token(\"dog\")\n",
	"vocab.append_token(\"cat\")\n",
	"\n",
	"# vocabularyサイズ\n",
	"len(vocab)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "cd3fbffb",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['dog', 'cat']"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# tokenリストの取得\n",
	"vocab.get_itos()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "d7181b20",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"{'cat': 1, 'dog': 0}\n"
	]
	}
	],
	"source": [
	"# token → idの紐付けを表示\n",
	"print(vocab.get_stoi())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "4d7d4330",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# tokenをidに変換\n",
	"vocab[\"cat\"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "d231835e",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[1, 0]"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# token列をid列に変換\n",
	"vocab.lookup_indices([\"cat\", \"dog\"])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "aad4f672",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[1, 0]"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# lookup_indicesと同様\n",
	"vocab.forward([\"cat\", \"dog\"]) # token列をid列に変換"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "5126f62b",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'cat'"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# idをtokenに変換\n",
	"vocab.lookup_token(1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "12a1b1ff",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['cat', 'dog']"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# id列をtoken列に変換\n",
	"vocab.lookup_tokens([1, 0])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"id": "dddcf150",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"{'cat': 2, 'dog': 1, 'house': 0}\n"
	]
	}
	],
	"source": [
	"# tokenの挿入\n",
	"vocab.insert_token(\"house\", 0)\n",
	"\n",
	"# token → idの紐付けを表示\n",
	"print(vocab.get_stoi())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"id": "517e11af",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# 存在確認\n",
	"\"dog\" in vocab"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"id": "4cb7af2f",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# 取得\n",
	"vocab[\"dog\"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"id": "c438788b",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"-1"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# defaultを-1に変更\n",
	"vocab.set_default_index(-1)\n",
	"vocab[\"out of vocab\"]"
	]
	},
	{
	"cell_type": "markdown",
	"id": "482b46e2",
	"metadata": {},
	"source": [
	"### initial parameters of vocab "
	]
	},
	{
	"cell_type": "markdown",
	"id": "209c692c",
	"metadata": {},
	"source": [
	"- ordered_dict – トークンとその出現頻度を対応付ける順序付き辞書 (Ordered Dictionary)\n",
	"- min_freq – vocabularyに含めるための最小出現頻度\n",
	"- specials – Special symbols\n",
	"- special_first – specialsをidの割り当ての先頭に持ってくるかどうか"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"id": "a9c960a3",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'<eos>': 5, '<bos>': 6, '<unk>': 4, '<pad>': 3, 'hot': 2, 'is': 1, 'today': 0}"
	]
	},
	"execution_count": 14,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"PAD = \"<pad>\"\n",
	"UNK = \"<unk>\"\n",
	"EOS = \"<eos>\"\n",
	"BOS = \"<bos>\"\n",
	"vocab = torchtext.vocab.vocab(\n",
	" ordered_dict={\"today\": 1, \"is\": 1, \"hot\": 2},\n",
	" min_freq=1,\n",
	" specials=[PAD, UNK, EOS, BOS],\n",
	" special_first=False\n",
	")\n",
	"vocab.get_stoi()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"id": "a2b48e6f",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'hot': 6, 'is': 5, 'today': 4, '<pad>': 0, '<unk>': 1, '<bos>': 3, '<eos>': 2}"
	]
	},
	"execution_count": 15,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# change special_first to True\n",
	"vocab = torchtext.vocab.vocab(\n",
	" ordered_dict={\"today\": 1, \"is\": 1, \"hot\": 3},\n",
	" min_freq=1,\n",
	" specials=[PAD, UNK, EOS, BOS],\n",
	" special_first=True\n",
	")\n",
	"vocab.get_stoi()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"id": "0282dcf0",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'<unk>': 2, '<bos>': 4, '<eos>': 3, '<pad>': 1, 'hot': 0}"
	]
	},
	"execution_count": 16,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# change min_freq to 2\n",
	"vocab = torchtext.vocab.vocab(\n",
	" ordered_dict={\"today\": 1, \"is\": 1, \"hot\": 3},\n",
	" min_freq=2,\n",
	" specials=[PAD, UNK, EOS, BOS],\n",
	" special_first=False\n",
	")\n",
	"vocab.get_stoi()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "d0f9eb04",
	"metadata": {},
	"source": [
	"## build_vocab_from_iterator\n",
	"\n",
	"Factory of Vocab instance\n",
	"\n",
	"- iterator – Iterator used to build Vocab. Must yield list or iterator of tokens.\n",
	"- min_freq – 指定されていればmin_freq以上の頻出度のtokenのみ登録する\n",
	"- specials – Special symbols\n",
	"- special_first – specialsをidの割り当ての先頭に持ってくるかどうか\n",
	"- max_tokens – 指定されていれば max_tokens - len(specials) の数のtokenのみ登録する(頻出度の高い方から)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"id": "04f10d82",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'hot': 3, 'cold': 2, 'today': 1, 'is': 0}"
	]
	},
	"execution_count": 17,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"vocab = torchtext.vocab.build_vocab_from_iterator(\n",
	" iterator=[[\"today\", \"is\", \"hot\"], [\"today\", \"is\", \"cold\"]]\n",
	")\n",
	"vocab.get_stoi()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"id": "e592dba5",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'cold': 2, 'today': 1, 'is': 0}"
	]
	},
	"execution_count": 18,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"vocab = torchtext.vocab.build_vocab_from_iterator(\n",
	" iterator=[[\"today\", \"is\", \"hot\"], [\"today\", \"is\", \"cold\"]],\n",
	" max_tokens=3\n",
	")\n",
	"vocab.get_stoi()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "836bea71",
	"metadata": {},
	"source": [
	"# Vectors\n",
	"\n",
	"単語の分散表現"
	]
	},
	{
	"cell_type": "markdown",
	"id": "d04c935b",
	"metadata": {},
	"source": [
	"## Pretrained Word Embeddings\n",
	"\n",
	"事前学習済みのword embeddingの利用"
	]
	},
	{
	"cell_type": "markdown",
	"id": "d44eb5f1",
	"metadata": {},
	"source": [
	"### GloVe (Global Vectors)\n",
	"\n",
	"\"Glove (Gloval Vectors)[1] は，スタンフォード大のManning先生の研究室から提案された，単語分散表現である \"\n",
	"(引用元: https://cvml-expertguide.net/terms/nlp/distributed-represnetation/glove/)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"id": "5a97b3ec",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"2196017"
	]
	},
	"execution_count": 19,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"vectors = torchtext.vocab.GloVe()\n",
	"len(vectors)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"id": "ae0d4a48",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"torch.Size([300])"
	]
	},
	"execution_count": 20,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# 単語をvectorに変換\n",
	"vec = vectors.get_vecs_by_tokens(\"犬\")\n",
	"vec.shape"
	]
	},
	{
	"cell_type": "markdown",
	"id": "04cb49db",
	"metadata": {},
	"source": [
	"### FastText\n",
	"\n",
	"\"「fastText」とは2016年にFacebookが公開した自然言語処理ライブラリです\"\n",
	"(引用元: https://service.plan-b.co.jp/blog/tech/14298/)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"id": "6a7aea0f",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"580000"
	]
	},
	"execution_count": 21,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"vectors = torchtext.vocab.FastText(language=\"ja\")\n",
	"len(vectors) # 単語数"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"id": "ce840c6f",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"torch.Size([300])"
	]
	},
	"execution_count": 22,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# 単語をvectorに変換\n",
	"vec = vectors.get_vecs_by_tokens(\"犬\")\n",
	"vec.shape"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"id": "8ea7d8e1",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"torch.Size([3, 300])"
	]
	},
	"execution_count": 24,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# 複数の単語をvectorに変換\n",
	"vecs = vectors.get_vecs_by_tokens([\"犬\", \"は\", \"元気\"])\n",
	"vecs.shape"
	]
	},
	{
	"cell_type": "markdown",
	"id": "08f5f7ef",
	"metadata": {},
	"source": [
	"### CharNGram"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"id": "86824be6",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"874474"
	]
	},
	"execution_count": 25,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"vectors = torchtext.vocab.CharNGram()\n",
	"len(vectors) # 単語数"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"id": "bc4663af",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"torch.Size([1, 100])"
	]
	},
	"execution_count": 26,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# 単語をvectorに変換\n",
	"vec = vectors.get_vecs_by_tokens(\"犬\")\n",
	"vec.shape"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "1e51b1f0",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.12"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}